In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.adapt import MLkNN

from load_data import *
from preprocessing.feature_analysis import feature_analysis
from preprocessing.intents_preprocess import intents_preprocessing
from preprocessing.text_preprocess import *

import importlib
import preprocessing.feature_analysis
importlib.reload(preprocessing.feature_analysis)
from preprocessing.feature_analysis import *

# Chargement des données et preprocessing

In [2]:
X_train, labels_train, dico_labels = load_data("train",127)
X_val, labels_val, dico_labels_val = load_data("val",20)
X_test, labels_test,dico_labels_test = load_data("test",34)

In [3]:
binarizer=MultiLabelBinarizer()

### Preprocessing train

In [4]:
##labels
labels_preprocessed_train = intents_preprocessing(labels_train)
#labels flatten
labels_train=sum(labels_preprocessed_train,[])
#binarization for BinaryRelevance
labels_train=binarizer.fit_transform(labels_train)

In [5]:
##data
X_train_preprocess = [[preprocessing(text) for text in dialogue] for dialogue in X_train]
features_train = feature_analysis(X_train_preprocess)
#features flatten
features_train=sum(features_train,[])
#vectorizer
vec = DictVectorizer()
features_train = vec.fit_transform(features_train)

  0%|          | 0/16142 [00:00<?, ?it/s]

### Preprocessing val

In [6]:
##labels
labels_preprocessed_val = intents_preprocessing(labels_val)
labels_val=sum(labels_preprocessed_val,[])
labels_val=binarizer.fit_transform(labels_val)

##data
X_val_preprocess = [[preprocessing(text) for text in dialogue] for dialogue in X_val]
features_val = feature_analysis(X_val_preprocess)
features_val=sum(features_val,[])
vec = DictVectorizer()
features_val = vec.fit_transform(features_val)

  0%|          | 0/2482 [00:00<?, ?it/s]

### Preprocessing test

In [7]:
##labels
labels_preprocessed_test = intents_preprocessing(labels_test)
labels_test=sum(labels_preprocessed_test,[])
labels_test=binarizer.fit_transform(labels_test)

##data
X_test_preprocess = [[preprocessing(text) for text in dialogue] for dialogue in X_test]
features_test = feature_analysis(X_test_preprocess)
features_test=sum(features_test,[])
vec = DictVectorizer()
features_test = vec.fit_transform(features_test)

  0%|          | 0/4201 [00:00<?, ?it/s]

In [8]:
from time import time
import preprocessing.feature_analysisv1 as manon
import preprocessing.feature_analysis as candice

start = time()
features = manon.feature_analysis(X_test_preprocess)
print(time() - start)

start = time()
features = candice.feature_analysis(X_test_preprocess)
print(time() - start)

  0%|          | 0/4201 [00:00<?, ?it/s]

325.68599343299866


  0%|          | 0/4201 [00:00<?, ?it/s]

321.5447585582733


## Naive Bayes

In [10]:
##we use validation set to fine-tune the model hyperparameters
#we use binaryRelevance()

params_NB=[
    { 'classifier':[MultinomialNB(),ComplementNB()],
      'classifier__alpha': np.logspace(0,-9, num=50)
    },
    {
       'classifier':[GaussianNB()],
        'classifier__var_smoothing': np.logspace(0,-9, num=50)
    }
]
gs_NB = GridSearchCV(BinaryRelevance(), param_grid=params_NB,scoring='accuracy')
gs_NB.fit(features_train, labels_train)
gs_NB.fit(features_val, labels_val)
print (gs_NB.best_params_,gs_NB.best_score_)

{'classifier': GaussianNB(var_smoothing=0.004094915062380423), 'classifier__var_smoothing': 0.004094915062380423} 0.21218534775077633


In [11]:
##we use Test Dataset to evaluate the model
pred = gs_NB.predict(features_test)
acc=round(accuracy_score(labels_test,pred),4)
pre=round(precision_score(labels_test,pred,average='micro'),4)
rec=round(recall_score(labels_test,pred,average='micro'),4)
f1=round(f1_score(labels_test,pred,average='micro'),4)

scores={'method':'NaiveBayes','acc':acc, 'precision':pre,'recall':rec,'f1':f1}
print(scores)

{'method': 'NaiveBayes', 'acc': 0.207, 'precision': 0.3779, 'recall': 0.5517, 'f1': 0.4486}


In [8]:
scores=[{'method': 'NaiveBayes', 'acc': 0.207, 'precision': 0.3779, 'recall': 0.5517, 'f1': 0.4486}]

## ML-knn

In [None]:
params_KNN={'k':range(1,3),'s':[0.1,0.01,0.001,0.0001]}
gs_KNN = GridSearchCV(knn, param_grid=params_KNN,scoring='accuracy')
gs_KNN.fit(features_val, labels_val)
print (gs_KNN.best_params_,gs_KNN.best_score_)



In [None]:
pred = gs_KNN.predict(features_test)
acc=round(accuracy_score(labels_test,pred),4)
pre=round(precision_score(labels_test,pred,average='micro'),4)
rec=round(recall_score(labels_test,pred,average='micro'),4)
f1=round(f1_score(labels_test,pred,average='micro'),4)

score = {'method':'ML-kNN','acc':acc, 'precision':pre,'recall':rec,'f1':f1}
scores.append(score)
print(score)

## RandomForest

In [None]:
RF=BinaryRelevance(RandomForestClassifier())
RF.fit(features_train, labels_train)
pred = RF.predict(features_test)
acc=round(accuracy_score(labels_test,pred),4)
print(acc)

In [None]:
params_RF=[
    { 'classifier':[RandomForestClassifier()],
      'classifier__n_estimators': [100,500,1000,1500],
      'classifier__max_depth': np.arange[5,15,35,50],
      'classifier__min_samples_split': [5,25,50,75,100],
      'classifier__min_samples_leaf': [2,4,6,8,10],
      'classifier__max_features': ['auto','sqrt']
    }
]
gs_RF = GridSearchCV(BinaryRelevance(), param_grid=params_RF,scoring='accuracy')
gs_RF.fit(features_train, labels_train)
gs_RF.fit(features_val, labels_val)
print (gs_RF.best_params_,gs_RF.best_score_)

In [None]:
pred = gs_RF.predict(features_test)
acc=round(accuracy_score(labels_test,pred),4)
pre=round(precision_score(labels_test,pred,average='micro'),4)
rec=round(recall_score(labels_test,pred,average='micro'),4)
f1=round(f1_score(labels_test,pred,average='micro'),4)

score={'method':'RandForest','acc':acc, 'precision':pre,'recall':rec,'f1':f1}
scores.append(score)
print(score)

## Adaboost

In [None]:
params_AB=[
    { 'classifier':[AdaBoostClassifier()],
      'classifier__n_estimators': [100,500,1000,1500,2000],
      'classifier__learning_rate': [0.1,0.01,0.001,0.0001],
      'classifier__algorithm':['SAMME','SAMME.R']
    }
]
gs_AB = GridSearchCV(BinaryRelevance(), param_grid=params_AB,scoring='accuracy')
gs_AB.fit(features_train, labels_train)
gs_AB.fit(features_val, labels_val)
print (gs_AB.best_params_,gs_AB.best_score_)

In [None]:
pred = gs_AB.predict(features_test)
acc=round(accuracy_score(labels_test,pred),4)
pre=round(precision_score(labels_test,pred,average='micro'),4)
rec=round(recall_score(labels_test,pred,average='micro'),4)
f1=round(f1_score(labels_test,pred,average='micro'),4)

score={'method':'Adaboost','acc':acc, 'precision':pre,'recall':rec,'f1':f1}
scores.append(score)
print(score)

## SVM

In [None]:
params_SVM=[
    { 'classifier':[SVC()],
      'classifier__kernel': ['rbf','linear'],
      'classifier__C': [0.1,0.01,0.001,0.0001,0.00001],
      'classifier__gamma': [0.1,0.01,0.001,0.0001,0.00001]
    }
]
gs_SVM = GridSearchCV(BinaryRelevance(), param_grid=params_SVM,scoring='accuracy')
gs_SVM.fit(features_train, labels_train)
gs_SVM.fit(features_val, labels_val)
print (gs_SVM.best_params_,gs_SVM.best_score_)

In [None]:
pred = gs_SVM.predict(features_test)
acc=round(accuracy_score(labels_test,pred),4)
pre=round(precision_score(labels_test,pred,average='micro'),4)
rec=round(recall_score(labels_test,pred,average='micro'),4)
f1=round(f1_score(labels_test,pred,average='micro'),4)

scores=[scores]
score={'method':'SVM','acc':acc, 'precision':pre,'recall':rec,'f1':f1}
scores.append(score)
print(score)

## Comparatif des algorithmes

In [None]:
print(scores)

### expérimentations sur un jeu de données plus petit

In [4]:
X_val, labels_val, dico_labels_val = load_data("val",20)

In [5]:
labels_preprocessed_val = intents_preprocessing(labels_val)
X_val_preprocess = [[preprocessing(text) for text in dialogue] for dialogue in X_val]

In [14]:
features_val = preprocessing.feature_analysis(X_val_preprocess)

  0%|          | 0/2482 [00:00<?, ?it/s]

In [15]:
#utiliser flatten ?
features_val=sum(features_val,[])

In [16]:
print(len(features_val))
print(features_val[0])
vec = DictVectorizer()
features = vec.fit_transform(features_val)
print(vec.get_feature_names_out())

48726
{'cos_initial_utterance': 1.0, 'cos_entire_dialogue': 1.755, 'question': 0.0, 'same': 0.0, 'pos': 0.0, 'nor_pos': 0.0, 'num_w': 9.0, 'sw': 9.0, 'sw_stem': 9.0, 'is_starter': 1.0, 'thank': 0.0, 'exc': 0.0, 'neg': 0.0, 'positif': 0.085, 'negatif': 0.0, 'neutre': 0.915}
['cos_entire_dialogue' 'cos_initial_utterance' 'exc' 'is_starter' 'neg'
 'negatif' 'neutre' 'nor_pos' 'num_w' 'pos' 'positif' 'question' 'same'
 'sw' 'sw_stem' 'thank']


In [18]:
clf = BinaryRelevance(classifier=MultinomialNB())
labels_val=sum(labels_preprocessed_val,[])
print(len(labels_val))
print(labels_val[:10])
binarizer=MultiLabelBinarizer()
labels_val=binarizer.fit_transform(labels_val)

48726
[['INFORM', 'INFORM_INTENT'], ['REQUEST'], ['INFORM'], ['CONFIRM'], ['REQUEST', 'AFFIRM'], ['INFORM', 'NOTIFY_SUCCESS'], ['REQUEST'], ['INFORM'], ['THANK_YOU'], ['REQ_MORE']]


In [19]:
print(labels_val[0])
print(labels_val[1])

[0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]


In [20]:
clf.fit(features, labels_val)

BinaryRelevance(classifier=MultinomialNB(), require_dense=[True, True])