> # Traitement Automatique de texte en IA projet
> 
> > Antoine Vidal-Mazuy - Yann Brault
> > 28/12/2021 Université Côtes d'azur

<br>

# Table of content
[Pre Processing](#pre-processing)

> ## Introduction

<br>

> blablzqdbd bzqjkdb qz

<br>
<br>

> ## Datasets utilisés
>
> > #### Datasets de base:

<br>

> Nous avons décidé de construire nous même notre base de donnée. Pour cela nous avons conçu un petit scraper su site Jeux-Video.com. <br>
> Le code source du scraper se trouve dans le lien suivant [JVCScraper](https://github.com/Brotherta/JVCScraper).

<br>

> Nous avons récupéré tous les commentaires et notes d'utilisateurs sur environ 50 jeux, pour un total de 102000 avis. Puis nous les avons sauvegarder sous forme de fichiers csv. <br>
> Nous avons dû nettoyer les avis de tous les charactères spéciaux, des mots de liaisons, les urls et tout ce qui n'apportait rien à la compréhension de l'avis.
> Afin de nettoyer plus facilement les commentaires nous avons créer une classe **CleanData** qui nous permettait d'effectuer plusieurs tâches de pré-processing.

In [8]:
import re
from spellchecker import SpellChecker

import pandas as pd
from pandas.core.frame import DataFrame
from tqdm import trange



class CleanData:

    def __init__(self, max_words, df: DataFrame = None) -> None:
        self.max_words = max_words
        self.df = df
        self.unused_chars = ',|;|\&|\#|\@|\%|\:|\>|\<|\(|\)|\{|\}|\=|\+|\_|\[|\}|\^|\*|\!|\?|\/|\¨|\~|\\\|\§|\||[0-9]|\[|\]|\"'
        self.connecting_words = [
            "c'est", "ces", "ses", "s'est", "a", "de", "du", 
            "et", "le", "les", "un", "une", "pour", "sur", "etc", "est", "c",
            'la', "jeu", "que", "des", "en", "ce", "qu", "ca", "y", "je", "sa", "son",
            "au", "ai", "mon", "ma", "mes", "qui", "je", "tu", "il", "ils", "elles", "elle", "vous", "nous",
            "qu'il", "qu'elle", "qu'ils", "qu'elles", "qu'on",
            "on", "se", "par"]
        self.urls = r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\-)*\b'
            
        self.spell = SpellChecker(language='fr')

    def correction_spelling(self, review):
        """ 
        try to elimiminates the unknown words of a sentence, and 
        replacing it by a correct word. 
        """
        
        review_list = review.split(" ")
        
        bad = []
        for word in review_list:
            if word != " ":
                bad = self.spell.unknown(review_list)

        new_list = []
        for word in review_list:
            if word in bad:
                new_list.append(self.spell.correction(word))
            else:
                new_list.append(word)
        
        return ' '.join(new_list)

    def replace_nan(self):
        """
        Replace nan by 'bon' or 'mauvais' in the dataframe.
        """
        to_drop = []
        for i in self.df.index:
            r = self.df['avis'][i]
            if pd.isna(r) or r in ['nan', 'Nan'] or type(r) == float:
                to_drop.append(i)
        print("droped nan : ", len(to_drop))
        self.df = self.df.drop(to_drop)

    def remove_urls(self, review):
        review = re.sub(self.urls, '', review, flags=re.MULTILINE)
        return(review)

    def clean_str(self, review):
        """
        Remove special characters from the string.
        """

        if len(review) > 0 or review != None:
            review = re.sub(self.unused_chars, ' ', review)
            review = review.replace('.', ' ').replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
            review = review.lower()
            review = re.sub(' +', ' ', review)
            review = re.sub(r' (?! ) ', '', review) #removing single characters
       
        return review
    
    def clean_stop_words(self, review):
        review_list = review.split(" ")

        new_list = []
        for word in review_list:
            if word != " " and not word in self.connecting_words and len(word) > 1:
                new_list.append(word)

        return ' '.join(new_list)

    def clean_review(self, review):
        review = self.clean_str(review)
        review = self.clean_stop_words(review)
        review = self.correction_spelling(review)
        
        return review

    def clean_dataset(self):
        """
        Main method call to prepare a text to be vectorized.
        """

        self.df = self.replace_nan()
        for i in trange(self.df.shape[0]):
            review = self.df.at[i, 'avis']
            review = self.clean_str(review)
            review = self.clean_stop_words(review)
            review = self.correction_spelling(review)
                
            self.df.at[i, 'avis'] = review

    def filter_long_review(self):
        """
        Filter the string with too many words.
        """

        to_drop = []
        for i in self.df.index:
            review = self.df['avis'][i]
            review_list = review.split()

            if len(review_list) > self.max_words:
                to_drop.append(i)

        self.df = self.df.drop(to_drop)
        print(f"dropped {len(to_drop)} lines")

    def fix_repartition_for_4_classes(self):
        """
        Fix the bad repartitions of the dataset, by removing randomly good reviews.
        """

        d = self.df.groupby(['classe_bon_mauvais'], as_index=False).count()
        nb_bad = d['avis'][0]

        nb_good = d['avis'][2]
        to_remove = nb_good - nb_bad

        while(to_remove > 0):
            row: DataFrame = self.df.sample()
            index = row.first_valid_index()
            print(f"{to_remove}")

            if row['classe_bon_mauvais'][index] == 2:
                self.df.drop(index, inplace=True)
                to_remove -= 1


        nb_good = d['avis'][3]
        to_remove = nb_good - nb_bad

        while(to_remove > 0):
            row: DataFrame = self.df.sample()
            index = row.first_valid_index()
            print(f"{to_remove}")

            if row['classe_bon_mauvais'][index] == 3:
                self.df.drop(index, inplace=True)
                to_remove -= 1
        
        d = self.df.groupby(['classe_bon_mauvais'], as_index=False).count()
        nb_bad = d['avis'][0]
        nb_good = d['avis'][1]
        print("2", nb_bad, nb_good)

    def fix_repartition(self):
        """
        Fix the bad repartitions of the dataset, by removing randomly good advice.
        """

        d = self.df.groupby(['classe_bon_mauvais'], as_index=False).count()
        nb_bad = d['avis'][0]
        nb_good = d['avis'][1]
        print(nb_bad, nb_good)

        to_remove = nb_good - nb_bad
        
        while(to_remove > 0):
            row: DataFrame = self.df.sample()
            index = row.first_valid_index()
            print(f"{to_remove}")

            if row['classe_bon_mauvais'][index] == 1:
                self.df.drop(index, inplace=True)
                to_remove -= 1
        
        d = self.df.groupby(['classe_bon_mauvais'], as_index=False).count()
        nb_bad = d['avis'][0]
        nb_good = d['avis'][1]
        print("2", nb_bad, nb_good)

    def save_data(self, path):
        self.df.to_csv(path, index=False)


> ## Pre-processing
> Données avant pre processing

In [9]:
import pandas as pd

df = pd.read_csv('dataset/csv/dataset_original.csv')

df['avis'][3]

"Aller, un bon 10 parce que ca reste jouable, mais systeme de paiement si tu veut plus, un jeu copier coller de l'ancienne version ,et on recommence ! pas besoin d'en dire plus"

> Données après pre-processing :

In [10]:
import pandas as pd

df = pd.read_csv('dataset/csv/dataset_0-1.csv')

df['avis'][3]

"aller bon parce reste jouable mais systeme paiement si veut plus copier coller l'ancienne version recommence pas besoin d'en dire plus"

> Nous avons calculer les 30 mots les plus utilisés que l'on a par la suite enlevé. parmis eux :

```
"c'est", "ces", "ses", "s'est", "a", "de", "du", "et", "le", "les", "un", "une", "pour", "sur", "etc", "est", "c",
'la', "jeu", "que", "des", "en", "ce", "qu", "ca", "y", "je", "sa", "son","au", "ai", "mon", "ma", "mes", "qui", "je", 
"tu", "il", "ils", "elles", "elle", "vous", "nous","qu'il", "qu'elle", "qu'ils", "qu'elles", "qu'on", "on", "se", "par"
```

> Les avis trop longs posent aussi problèmes. Certains avis comportaient plus de 1000 mots, pour une moyenne beaucoup plus basse. <br>
> Cela pose problème pour l'entrainement mais aussi pour la représentation des avis.

<br>

> ## Classes des notes
> > Pour représenter les notes, nous avons instaurer 2 classes différentes, 0 pour les notes < 12, et 1 pour les notes >= 12.

<br>

> Dataset 2 classes :

In [11]:
import pandas as pd

df = pd.read_csv('dataset/csv/dataset_0-1.csv')

df.sample(frac=1)

Unnamed: 0,classe_bon_mauvais,avis
26642,0,j'ai acheté portal après avoir vu tous comment...
3013,1,très bon jeux moins bugé origin gigantesque op...
3883,1,jeux parfaitement représente jeux vidéos moi t...
332,0,pokemon honte vidéoludique avoir autant rabais...
15477,0,j'ai créé compte jv juste poster commentaire j...
...,...,...
23849,0,qu'est passé pourquoi avoir sorti grosse démo ...
1361,0,très bon avant c'était tuerie mais maintenant ...
11926,0,sérieusement l'ai emprunté pote ben garde dégo...
32386,0,multijoueurs assez monotone campagne très liné...


> Dataset 4 classes :

In [12]:
import pandas as pd

df = pd.read_csv('dataset/csv/dataset_0-3.csv')

df.sample(frac=1)

Unnamed: 0,classe_bon_mauvais,avis
109,3,bon des très beaux graphismes pokémon les musi...
1305,2,je joue dofus depuis ans ne voit trés rarement...
21932,2,il sera fait attendre titre malgré tout après ...
14589,3,je suis désolé vraiment mais fait partie très ...
23458,0,encore réchauffé encore toujours pouriiiiiiiii...
...,...,...
27459,0,n'achetez surtout pas ce jeu j'ai testé car pe...
16025,2,les
8035,3,oh mais qu'est j'en marre compare deux jeux so...
22179,0,tout négatif dans bugs impossibilité choisir c...


> ## Corretions des mots
> > Afin de tenir comptes des mots mal orthographiés, nous avons utilisés la librairie [spellchecker](https://pypi.org/project/pyspellchecker/). <br>
> > Malheureusement nous avons fait une erreur dans le code. En effet nous avons transformé tous les points et toutes les virgules par du vide. <br>
> > Nous avons perdu par ce biais beaucoup de mots car inutilisables pour la méthode Word2Vec.

<br>

> ## Base line:
>
> > #### Prédicteurs de base:
> > > Pour commencer notre projet, nous avons fait deux systèmes de prédictions simples. <br>
> > > De base, une classe est assignée à chaque avis. Si la note attribuée est supérieure ou égale à 12 alors on assigne l'avis à la classe 1 comme signe de bon jeu. <br>
> > > En revanche, si la note est strictement inférieure à 12 alors, l'avis est catégorisé par la classe 0, celle des mauvais jeux. <br>
> > > ##### Prédicteur par comptage de mots:
> > > L'idée ici est plutôt naïve. Si le commentaire contient plus de mots au sens négatifs, alors on prédit le jeu comme étant mauvais. À l'inverse, <br>
> > > si la condition énoncée n'est pas validée alors le jeu est considéré comme étant bon. <br>
> > > Le code marche comme ceci:


In [13]:
from tqdm import trange
import pandas as pd

def predict(data: pd.DataFrame) -> tuple[list[int], list[int]]:
    classes_predicted = []
    classes_base = []

    for i in trange(data.shape[0]):
        row = data.iloc[i]
        neg = row['negative_words']
        pos = row['positive_words']
        
        predict = 1
        if neg > pos: 
            predict = 0
        
        base = row['classe_bon_mauvais']
        classes_base.append(int(base))
        classes_predicted.append(int(predict))
    
    return classes_base, classes_predicted
    
def compute_confusion_matrix(classes_base, classes_predicted):
    
    M = [[0, 0], [0, 0]]

    for i in range(len(classes_base)):
        M[classes_base[i]][classes_predicted[i]] += 1
    
    return M

def accuracy(TN, TP, FN, FP):
    size_list = len(TN)
    accuracy_sum = 0
    for i in range(size_list):
        accuracy_sum += (TP[i] + TN[i]) / (TP[i] + TN[i] + FN[i] + FP[i])
    return accuracy_sum / size_list

def recall(TP, FN):
    size_list = len(TP)
    recall_sum = 0
    for i in range(size_list):
        recall_sum += TP[i] / (TP[i] + FN[i])
    return recall_sum / size_list

if __name__ == "__main__":
    data = pd.read_csv('dataset/csv/base_predictor.csv')
    classes_base, classes_predicted = predict(data)
    M = compute_confusion_matrix(classes_base, classes_predicted)
    print(M)
    TP = [0,0]
    TN = [0,0]
    FP = [0,0]
    FN = [0,0]
    Total = M[0][0] + M[0][1] + M[1][0] + M[1][1]

    for i in range(2):
        TP[i] = M[i][i]
        for j in range(2):
            FN[i] += M[i][j]
            FP[i] += M[j][i]
        
        FN[i] -= M[i][i]
        FP[i] -= M[i][i]
        TN[i] = Total - FP[i] - FN[i] + TP[i]

    for i in range(2):
        print(f"cat {i}: TP:{TP[i]}, TN:{TN[i]}, FP:{FP[i]}, FN:{FN[i]}\n")

    print (f"accuracy: {accuracy(TN, TP, FN, FP)}\n")
    print (f"recall: {recall(TP, FN)}\n")


100%|██████████| 102478/102478 [00:09<00:00, 10778.85it/s]

[[15110, 4847], [48369, 34152]]
cat 0: TP:15110, TN:64372, FP:48369, FN:4847

cat 1: TP:34152, TN:83414, FP:4847, FN:48369

accuracy: 0.64368358122073

recall: 0.585493057720152






> > > ##### Prédicteur toujours bon:
> > > L'idée ici est plus que simple. Peu importe que la classe de base soit bonne ou mauvaise, on prédit toujours que le jeu est bon. <br>

In [14]:
def predict_class(data): #here we will assign a class number according to the rate. 1 for good games and 2 for bad games.
    classes_predicted = []
    classes_base = []
    predict = 1
    for i in range(data.shape[0]):
        row = data.iloc[i]
        base = row['classe_bon_mauvais']
        classes_base.append(int(base))
        classes_predicted.append(int(predict))
    return classes_base, classes_predicted

def accuracy(TN, TP, FN, FP):
    size_list = len(TN)
    accuracy_sum = 0
    for i in range(size_list):
        accuracy_sum += (TP[i] + TN[i]) / (TP[i] + TN[i] + FN[i] + FP[i])
    return accuracy_sum / size_list

def recall(TP, FN):
    size_list = len(TP)
    recall_sum = 0
    for i in range(size_list):
        recall_sum += TP[i] / (TP[i] + FN[i])
    return recall_sum / size_list

def CM(classes_base, classes_predicted):
    M = [[0, 0], [0, 0]]

    for i in range(len(classes_base)):
        M[classes_base[i]][classes_predicted[i]] += 1

    TP = [0,0]
    TN = [0,0]
    FP = [0,0]
    FN = [0,0]
    Total = M[0][0] + M[0][1] + M[1][0] + M[1][1]

    for i in range(2):
        TP[i] = M[i][i]
        for j in range(2):
            FN[i] += M[i][j]
            FP[i] += M[j][i]
        
        FN[i] -= M[i][i]
        FP[i] -= M[i][i]
        TN[i] = Total - FP[i] - FN[i] + TP[i]

    for i in range(2):
        print(f"cat {i}: TP:{TP[i]}, TN:{TN[i]}, FP:{FP[i]}, FN:{FN[i]}\n")

    accu = accuracy(TN, TP, FN, FP)
    rec = recall(TP, FN)

    print(f"the globla accuracy is {accu:.2f}\n")
    print(f"the globla recall is {rec:.2f}\n")

if __name__ == '__main__': 
    my_data = pd.read_csv("./dataset/csv/base_predictor.csv")
    classes_base, classes_predicted = predict_class(my_data)
    CM(classes_base, classes_predicted)

cat 0: TP:0, TN:82521, FP:0, FN:19957

cat 1: TP:82521, TN:165042, FP:19957, FN:0

the globla accuracy is 0.87

the globla recall is 0.50



> ## CLassifiers Classes
> > #### Classifier Base

In [15]:
from abc import abstractmethod
from enum import Enum


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import src.utils.utils as u
from joblib import dump, load
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)

from tqdm import trange


class ClassifierType(Enum):
    WORD2VEC = 1
    NAIVES_BAYES = 2
    WORD2VEC_MIX = 3
    TFIDF_MNB = 4
    TFIDF_LogReg = 5
    TFIDF_MLP = 6
    
    
class Classifier:
    """
    Model Class of all Classifiers.
    """
    
    def __init__(self, data) -> None:
        self.data = data
        self.verify_data()
    
        self.classifier = None
        self.predictions = None
        
        self.X_train = None
        self.y_train = None
    
        self.X_test = None
        self.y_test = None
    
    def verify_data(self):
        print("Verifying data ... ")
        to_drop = []
        for i in self.data.index:
            r = self.data['avis'][i]
            if pd.isna(r) or r in ['nan', 'Nan'] or type(r) == float:
                to_drop.append(i)
        print("droped nan : ", len(to_drop))
        self.data = self.data.drop(to_drop)
    
    def show_repartition(self) -> None:
        print(self.data.groupby(['classe_bon_mauvais'], as_index=False).count())
    
    def save(self, path, features = None):
        dump(self.classifier, path)
    
    def load(self, model_path: str, features_path: str = None):
        self.classifier = load(model_path)
    
    def train(self):
        print("Training on data...")
        self.classifier.fit(self.X_train, self.y_train)
    
    def predict(self):
        print("Prediciton on tests...")
        self.predictions = self.classifier.predict(self.X_test)

    def show_results(self):
        print('==========================Classifier Results============================')
        M = confusion_matrix(self.y_test, self.predictions)
        print(M)

        print('\n Accuracy: ', accuracy_score(self.y_test, self.predictions))
        print('\n Score: ', self.classifier.score(self.X_test, self.y_test))

        print(u.compute_metrics(2, M))
        print(classification_report(self.y_test, self.predictions))

    # Abstract methods

    def get_accuracy(self):
        return accuracy_score(self.y_test, self.predictions)

    def get_precisions(self, c):
        d = classification_report(self.y_test, self.predictions, output_dict=True)
        c_str = str(c)

        return d[c_str]['precision']


    def plot_matrix_classification_report(self, title, cp_path, matrix_path, classes):
        y_test = self.y_test
        predic = self.predictions

        confm = confusion_matrix(y_test, predic)
        df_cm = pd.DataFrame(confm, index=classes, columns=classes)

        fig, ax = plt.subplots(figsize=(12,10))
        ax.set_title('Confusion matrix for '+ title)
        sb.heatmap(df_cm, cmap='YlOrRd', annot=True, fmt='g', ax=ax)
        plt.savefig(matrix_path)

        c_r = classification_report(self.y_test, self.predictions)
        f = open(cp_path, 'a')
        f.write(c_r)
        f.close()
        
        

    def plot_accuracy_precisions(self, title, acc_path, prec_path, label, params, classes: list[int], accuracies: list[int], precisions: list[list[int]]):
        
        # Accuracy
        plt.title('Accuracy for ' + title)
        plt.xlabel(label)
        plt.ylabel('accuracy')
        plt.plot(params, accuracies)
        plt.savefig(acc_path)
        plt.show()

        # Precisions
        plt.title('Precisions for ' + title)
        plt.xlabel(label)
        plt.ylabel('precisions')
        
        color = ['r', 'b', 'g', 'y']
        for i in range(len(precisions)):
            plt.plot(params, precisions[i], color=color[i], label=classes[i])
        plt.legend(loc="upper right", title='classes')

        plt.savefig(prec_path)
        plt.show()



    @abstractmethod
    def fit_transform_data(self):
        pass
    
    @abstractmethod
    def init_sets(self):
        pass
    
    @abstractmethod
    def init_classifier(self):
        pass
    
    @abstractmethod
    def predict_input(self, review: str):
        pass


> ## PipeLine des Classifiers:

In [None]:
class PipelineClassifier:

    def __init__(self, classifier_type, data, vec_bin=None, max_features=None, nb_word_n=None, max_word=None, max_iter=None, test_size=None, layers=None, vec_dim=None, reg=None, alpha=None) -> None:
        self.classifier = None
        self.classifier_type = classifier_type
        self.data = data
        self.max_features = max_features
        self.vec_bin = vec_bin
        self.nb_word_n = nb_word_n
        self.max_word = max_word
        self.max_iter = max_iter
        self.test_size = test_size
        self.layers = layers
        self.vec_dim = vec_dim
        self.reg = reg
        self.alpha = alpha

        self.__init_classifier()

    def __init_classifier(self):
        if self.classifier_type == ClassifierType.WORD2VEC:
            self.classifier = ClassifierWord2Vec(
                data=self.data,
                word2vec_bin=self.vec_bin,
                max_iter=self.max_iter,
                layers=self.layers,
                vec_dim=self.vec_dim,
                test_size=self.test_size
            )

        elif self.classifier_type == ClassifierType.NAIVES_BAYES:
            self.classifier = NaivesBayes(
                data=self.data,
                nb_word=self.nb_word_n,
                test_size=self.test_size
            )

        elif self.classifier_type == ClassifierType.WORD2VEC_MIX:
            self.classifier = ClassifierWord2VecMix(
                data=self.data,
                word2vec_bin=self.vec_bin,
                max_iter=self.max_iter,
                layers=self.layers,
                vec_dim=self.vec_dim,
                test_size=self.test_size
            )
        
        elif self.classifier_type == ClassifierType.TFIDF_LogReg:
            self.classifier = TFIDF_LogReg(
                data=self.data,
                test_size=self.test_size,
                max_iter=self.max_iter,
                regularization=self.reg,
                max_features=self.max_features
            )

        elif self.classifier_type == ClassifierType.TFIDF_MNB:
            self.classifier = TFIDF_MNB(
                data=self.data,
                test_size=self.test_size,
                alpha=self.alpha,
                max_features=self.max_features
            )
        elif self.classifier_type == ClassifierType.TFIDF_MLP:
            self.classifier = TFIDF_MLP(
                data=self.data,
                test_size=self.test_size,
                max_iter=self.max_iter,
                layers=self.layers,
                max_features=self.max_features
            )
            
    
    def load(self, model_path, features_path=None):
        self.classifier.load(model_path, features_path)
        self.classifier.init_sets()
        self.classifier.X_train = None
        self.classifier.y_train = None
        self.classifier.fit_transform_data()

    def save(self, model_path, features_path=None):
        self.classifier.save(model_path, features_path)

    def train(self):
        self.classifier.init_sets()
        self.classifier.init_classifier()
        self.classifier.fit_transform_data()
        self.classifier.train()

    def transform_data(self):
        self.classifier.init_sets()
        self.classifier.fit_transform_data()
    
    def train_without_transform(self):
        self.classifier.train()
    
    def predict(self):
        self.classifier.show_repartition()
        self.classifier.predict()
        self.classifier.show_results()

    def predict_input(self):
        review = input("Write a review to predict: \n")
        c = CleanData(self.max_word)
        review = c.clean_review(review)
        review = self.classifier.predict_input(review)


> #### Classifier Word2Vec Multi-layer Perceptron classifier

In [16]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from gensim.models.keyedvectors import KeyedVectors

from src.utils.utils import compute_metrics
from src.classifiers.Classifier import Classifier


class ClassifierWord2Vec(Classifier):

    def __init__(self, data, word2vec_bin = None, max_iter=0, test_size=0, layers=0, vec_dim=0, create = True) -> None:
        super().__init__(data)
        
        self.max_iter = max_iter
        self.test_size = test_size
        self.layers = layers
        self.vec_dim = vec_dim
        self.word2vec_bin = word2vec_bin
        
    
        self.words_dictionary = None

        self.load_dictionnary_from_bin()

    def load_dictionnary_from_bin(self):
        print(f"Loading dictionary from binary {self.word2vec_bin}...")
        self.words_dictionnary: KeyedVectors = KeyedVectors.load_word2vec_format(self.word2vec_bin, binary=True)

    def init_sets(self):
        print("Initialization of train and test sets...")
        X = self.data['avis'].copy()
        y = self.data['classe_bon_mauvais'].copy()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=self.test_size)
    
    def init_classifier(self):
        print(f"Initialization of the MLP Classifier with {self.layers} layers and {self.max_iter} max iteration.")
        
        self.classifier = MLPClassifier(hidden_layer_sizes=self.layers, max_iter=self.max_iter)

    def _transform_to_vec(self, set_to_transform):
        vectors_list = []
        len_set = len(set_to_transform)
        acc = 1
        for review in set_to_transform:
            print(f"{acc}/{len_set}", end='\r')
            acc+=1
            try:
                reviews_list = review.split()
            except:
                print('exception !', review)
                if (type(review) == float):
                    print(type(review))
                exit()
            reviews_vec = []
            
            for word in reviews_list:
                try:
                    word_vec = self.words_dictionnary[word]
                    reviews_vec.append(word_vec)
                except:
                    pass

            for i in range(len(reviews_vec), self.vec_dim):
                reviews_vec.append(np.zeros(self.vec_dim, dtype=np.float32))

            tot_vec = []
            for i in range(self.vec_dim):
                sum = 0.0
                for vec in reviews_vec[i]:
                    sum += vec
                tot_vec.append(sum)

            vectors_list.append(tot_vec)
        
        return vectors_list

    def fit_transform_data(self):
        if self.X_test is not None:
            print(f"Transforming reviews of the test set into vectors...")
            self.X_test = self._transform_to_vec(self.X_test)
        
        if self.X_train is not None:
            print(f"Transforming reviews of the train set into vectors...")
            self.X_train = self._transform_to_vec(self.X_train)
        
    def train(self):
        print("Training on data...")
        self.classifier.fit(self.X_train, self.y_train)
    
    def predict(self):
        print("Prediciton on tests...")
        self.predictions = self.classifier.predict(self.X_test)

    def show_results(self):
        M = confusion_matrix(self.y_test, self.predictions)

        print(M)
        print(compute_metrics(2, M))

        print(classification_report(self.y_test, self.predictions))

> #### Classifier Word2Vec Multi-layer Perceptron classifier mélange de model pré-entrainé et auto-entrainé

In [17]:
import os
import tempfile
from typing import DefaultDict
import pandas as pd

import numpy as np
from src.classifiers.Classifier import Classifier

from src.classifiers.ClassifierWord2Vec import ClassifierWord2Vec
from gensim.models import FastText  


class ClassifierWord2VecMix(ClassifierWord2Vec):

    def __init__(self, data, word2vec_bin=None, max_word=0, max_iter=0, test_size=0, layers=0, vec_dim=0) -> None:
        super().__init__(
            data,
            word2vec_bin=word2vec_bin,
            max_word=max_word,
            max_iter=max_iter,
            test_size=test_size,
            layers=layers,
            vec_dim=vec_dim
            )

        self.words_dictionary_self = None
        
        self.create_dictionnary()

    def create_dictionnary(self):
        sentences = [s.split() for s in self.data['avis']]

        print("Training model")
        model = FastText(sentences=sentences, vector_size=self.vec_dim)
        print('Training done')
        self.words_dictionary_self = model.wv

    def _transform_to_vec(self, set_to_transform):
        vectors_list = []
        len_set = len(set_to_transform)
        acc = 1
        for review in set_to_transform:
            print(f"{acc}/{len_set}", end='\r')
            acc+=1
            reviews_list = review.split()
            reviews_vec = []
            
            for word in reviews_list:
                try:
                    word_vec = self.words_dictionnary[word]
                    reviews_vec.append(word_vec)
                except:
                    word_vec = self.words_dictionary_self[word]
                    reviews_vec.append(word_vec)

            for i in range(len(reviews_vec), self.vec_dim):
                reviews_vec.append(np.zeros(self.vec_dim, dtype=np.float32))

            tot_vec = []
            for i in range(self.vec_dim):
                sum = 0.0
                for vec in reviews_vec[i]:
                    sum += vec
                tot_vec.append(sum)

            vectors_list.append(tot_vec)
        
        return vectors_list

> #### Classifier Word Features with Naives Bayes

In [18]:
import collections

from sklearn.model_selection import train_test_split

from src.classifiers.Classifier import Classifier
from src.utils.clean_data import CleanData
from joblib import dump, load
import json

import nltk


class NaivesBayes(Classifier):


    def __init__(self, data, nb_word, test_size=0) -> None:
        super().__init__(data)

        self.test_size = test_size

        self.train_set = None
        self.test_set = None

        self.nb_word = nb_word

        self.refsets = None
        self.testsets = None
        self.predictions_labels = None
        self.predictions = None

        self.word_features = None
    
    def _review_features(self, review):
        review_words = set(review)
        features = {}
        for word in self.word_features:
            features['contains({})'.format(word)] = (word in review_words)
        return features

    def _compute_word_features(self):
        all_words = nltk.FreqDist()
    
        print("Computes words frequencies ...")
        acc = 0
        size = self.data.shape[0]
        for avis in self.data['avis']:
            acc+=1
            print(acc, "/", size, sep='', end='\r')
            for word in avis.split():
                all_words[word] += 1

        self.word_features = list(all_words)[:self.nb_word]

    def _compute_features(self, tuple_to_compute):
        print("get Features out of review ...")
        
        featuresets = []
        acc = 0
        size = len(tuple_to_compute)
        for (r, c) in tuple_to_compute:
            acc+=1
            print(acc, "/", size, sep='', end='\r')
            featuresets.append((self._review_features(r), c))
        
        return featuresets

    def _get_tuples(self, input, output):
        print("Get all tuples of reviews ...")
        reviews = []
        size = len(input)
        for i in input.index:
            print(i, "/", size, sep='', end='\r')
            reviews.append((input[i].split(), output[i]))

        return reviews

    def load(self, model_path: str, features_path: str = None):
        self.classifier = load(model_path)

        with open(features_path, 'r', encoding='utf8') as fp:
            data = json.load(fp)
            self.word_features = data['features']

    def save(self, path, features = None):
        data = {}
        data['features'] = self.word_features
        dump(self.classifier, path)
        with open(features, 'w', encoding='utf8') as fp:
            json.dump(data, fp, indent=4, ensure_ascii=False)

    def init_classifier(self):
        print("Initialization of the word features dictionnary")
        self._compute_word_features()

    def init_sets(self):
        print("Initialization of train and test sets...")
        X = self.data['avis'].copy()
        y = self.data['classe_bon_mauvais'].copy()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=self.test_size)
        
    def fit_transform_data(self):
        if self.X_test is not None:
            print(f"Transforming reviews of the test set into tuples of features...")
            self.test_set = self._get_tuples(self.X_test, self.y_test)
            self.test_set = self._compute_features(self.test_set)
        
        if self.X_train is not None:
            print(f"Transforming reviews of the train set into tuples of features...")
            self.train_set = self._get_tuples(self.X_train, self.y_train)
            self.train_set = self._compute_features(self.train_set)

    def train(self):
        print("Training on data...")
        self.classifier = nltk.NaiveBayesClassifier.train(self.train_set)

    def predict(self):
        print("Prediciton on tests...")

        self.predictions = []
        self.predictions_labels = []
        self.refsets = collections.defaultdict(set)
        self.testsets = collections.defaultdict(set)
        
        for i, (feats, label) in enumerate(self.test_set):
            print(f"{i}/{len(self.test_set)}", end='\r')
            self.refsets[label].add(i)
            observed = self.classifier.classify(feats)
            self.testsets[observed].add(i)
            self.predictions_labels.append(label)
            self.predictions.append(observed)
    
    def show_results(self):
        print("Confusion Matrix:\n", nltk.ConfusionMatrix(self.predictions_labels, self.predictions))
        print("accuracy:", nltk.accuracy(self.predictions_labels, self.predictions))
        
        for i in range(2):
            print (f"class {i}:")
            print("f1_score:", nltk.f_measure(self.refsets[i], self.testsets[i]))
            print("recall:", nltk.recall(self.refsets[i], self.testsets[i]))
            print("precision:", nltk.precision(self.refsets[i], self.testsets[i]))

    def predict_input(self, review: str):
        test = self._review_features(review.split())
        print(self.classifier.classify(test))
        


> #### Classifier TF-IDF avec Logistic Regression

In [19]:
from src.classifiers.Classifier import Classifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression



class TFIDF_LogReg(Classifier):

    def __init__(self, data, test_size, max_iter, regularization, max_features) -> None:
        super().__init__(data)

        self.test_size = test_size
        self.max_iter = max_iter
        self.regularization = regularization
        self.max_features = max_features

    def init_sets(self):
        print("Initialization of train and test sets...")
        td = TfidfVectorizer(max_features=self.max_features) 
        X = self.data['avis'].copy()
        X = td.fit_transform(X).toarray()
        y = self.data['classe_bon_mauvais'].copy()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=self.test_size, random_state=0)

    def init_classifier(self):
        print(f"Initialization of the LogReg Classifier with a reg of {self.regularization} and {self.max_iter} max iteration.")
        self.classifier = LogisticRegression(C=self.regularization, max_iter=self.max_iter)


> #### Classifier TF-IDF avec Multinomial Naives Bayes

In [20]:
from src.classifiers.Classifier import Classifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB


class TFIDF_MNB(Classifier):

    def __init__(self, data, test_size, alpha, max_features) -> None:
        super().__init__(data)

        self.alpha = alpha

        self.test_size = test_size
        self.max_features = max_features

    def init_sets(self):
        print("Initialization of train and test sets...")
        td = TfidfVectorizer(max_features=self.max_features) 
        X = self.data['avis'].copy()
        X = td.fit_transform(X).toarray()
        y = self.data['classe_bon_mauvais'].copy()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=self.test_size, random_state=0)

    def init_classifier(self):
        print(f"Initialization of the LogReg Classifier with a alpha of {self.alpha}.")
        self.classifier = MultinomialNB(alpha=self.alpha)



> #### Classifier TF-IDF avec Multi-layer Perceptron

In [21]:
from src.classifiers.Classifier import Classifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier


class TFIDF_MLP(Classifier):

    def __init__(self, data, test_size, max_iter, layers, max_features) -> None:
        super().__init__(data)

        self.test_size = test_size
        self.max_iter = max_iter
        self.layers = layers
        self.max_features = max_features

    def init_sets(self):
        print("Initialization of train and test sets...")
        td = TfidfVectorizer(max_features=self.max_features) 
        X = self.data['avis'].copy()
        X = td.fit_transform(X).toarray()
        y = self.data['classe_bon_mauvais'].copy()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=self.test_size, random_state=0)

    def init_classifier(self):
        #print(f"Initialization of the LogReg Classifier with a reg of {self.regularization} and {self.max_iter} max iteration.")
        self.classifier = MLPClassifier(hidden_layer_sizes=self.layers, max_iter=self.max_iter)

> ## Expérimentations
> > Nous avons effectué plusieurs expérimentations afin de trouver les meilleurs paramètres de chaque modèle.

<br>

> ## Différences entre les datasets:
> > Pour les expériences suivantes nous utiliserons le modèles MultinomialNB.

<br>

> #### Performances sur le dataset original :

In [None]:

TEST_SIZE = 1/3
ALPHA = 1.0
MAX_FEATURES = 10000

CLASSIFIER = ClassifierType.TFIDF_MNB

DATASET = 'dataset/csv/data_original.csv'

PLOT_MATRIX_PATH = 'assets/data_analysis/data_original.plot.png'
CP_PATH = 'assets/data_analysis/data_original_cp.txt'

TILE_CM = 'data_original_CM'

CLASSES = [0,1]


df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, alpha=ALPHA, max_features=MAX_FEATURES)

p.train()
p.predict()

p.classifier.plot_matrix_classification_report(TILE_CM, CP_PATH, PLOT_MATRIX_PATH, CLASSES)

> ### Results :

```
              precision    recall  f1-score   support

           0       0.87      0.37      0.52      5758
           1       0.89      0.99      0.93     28402

    accuracy                           0.89     34160
   macro avg       0.88      0.68      0.73     34160
weighted avg       0.88      0.89      0.87     34160
```


<img src='assets\data_analysis\data_original.plot.png' width='750'>

> #### Performances sur le dataset original avec répartition :

In [None]:

TEST_SIZE = 1/3
ALPHA = 1.0
MAX_FEATURES = 10000

CLASSIFIER = ClassifierType.TFIDF_MNB

DATASET = 'dataset/csv/data_original_rep.csv'

PLOT_MATRIX_PATH = 'assets/data_analysis/data_original_rep.plot.png'
CP_PATH = 'assets/data_analysis/data_original_rep_cp.txt'

TILE_CM = 'data_original_rep_CM'

CLASSES = [0,1]


df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, alpha=ALPHA, max_features=MAX_FEATURES)

p.train()
p.predict()

p.classifier.plot_matrix_classification_report(TILE_CM, CP_PATH, PLOT_MATRIX_PATH, CLASSES)

> ### Results :

<br>

```
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      5838
           1       0.85      0.85      0.85      5875

    accuracy                           0.85     11713
   macro avg       0.85      0.85      0.85     11713
weighted avg       0.85      0.85      0.85     11713
```

<img src='assets/data_analysis/data_original_rep.plot.png' width='750'>

> #### Performances sur le dataset original en nettoyant les caractères spéciaux.

In [None]:

TEST_SIZE = 1/3
ALPHA = 1.0
MAX_FEATURES = 10000


CLASSIFIER = ClassifierType.TFIDF_MNB
DATASET = 'dataset/csv/data_clean_str.csv'


PLOT_MATRIX_PATH = 'assets/data_analysis/data_clean_str.plot.png'
CP_PATH = 'assets/data_analysis/data_clean_str_cp.txt'
TILE_CM = 'data_clean_str_CM'

CLASSES = [0,1]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, alpha=ALPHA, max_features=MAX_FEATURES)

p.train()
p.predict()

p.classifier.plot_matrix_classification_report(TILE_CM, CP_PATH, PLOT_MATRIX_PATH, CLASSES)

> ### Results :

<br>

```
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      5838
           1       0.85      0.85      0.85      5875

    accuracy                           0.85     11713
   macro avg       0.85      0.85      0.85     11713
weighted avg       0.85      0.85      0.85     11713
```

<img src='assets/data_analysis/data_clean_str.plot.png' width='750'>

> #### Performances sur le dataset original en nettoyant les "stop words"

In [None]:

TEST_SIZE = 1/3
ALPHA = 1.0
MAX_FEATURES = 10000

CLASSIFIER = ClassifierType.TFIDF_MNB
DATASET = 'dataset/csv/data_clean_str_stop_words.csv'

PLOT_MATRIX_PATH = 'assets/data_analysis/data_clean_str_stop_words.plot.png'
CP_PATH = 'assets/data_analysis/data_clean_str_stop_words_cp.txt'
TILE_CM = 'data_clean_str_stop_words_CM'

CLASSES = [0,1]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, alpha=ALPHA, max_features=MAX_FEATURES)

p.train()
p.predict()

p.classifier.plot_matrix_classification_report(TILE_CM, CP_PATH, PLOT_MATRIX_PATH, CLASSES)

> ### Results :

<br>

```
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      5817
           1       0.86      0.85      0.86      5880

    accuracy                           0.86     11697
   macro avg       0.86      0.86      0.86     11697
weighted avg       0.86      0.86      0.86     11697
```

<img src='assets/data_analysis/data_clean_str_stop_words.plot.png' width='750'>

> #### Performances sur le dataset original en corrigeant les fautes d'ortographes

In [None]:
TEST_SIZE = 1/3
ALPHA = 1.0
MAX_FEATURES = 10000

CLASSIFIER = ClassifierType.TFIDF_MNB
DATASET = 'dataset/csv/data_with_correction_spell.csv'

PLOT_MATRIX_PATH = 'assets/data_analysis/data_with_correction_spell.plot.png'
CP_PATH = 'assets/data_analysis/data_with_correction_spell_cp.txt'
TILE_CM = 'data_with_correction_spell_CM'

CLASSES = [0,1]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, alpha=ALPHA, max_features=MAX_FEATURES)

p.train()
p.predict()

p.classifier.plot_matrix_classification_report(TILE_CM, CP_PATH, PLOT_MATRIX_PATH, CLASSES)

> ### Results :

<br>

```
              precision    recall  f1-score   support

         0.0       0.83      0.85      0.84      6475
         1.0       0.85      0.83      0.84      6651

    accuracy                           0.84     13126
   macro avg       0.84      0.84      0.84     13126
weighted avg       0.84      0.84      0.84     13126
```

<img src='assets/data_analysis/data_with_correction_spell.plot.png' width='750'>

> #### Performances avec le dataset original réparti en 2 classes 0 et 1

In [None]:
TEST_SIZE = 1/3
ALPHA = 1.0
MAX_FEATURES = 10000

CLASSIFIER = ClassifierType.TFIDF_MNB
DATASET = 'dataset/csv/dataset_0-1.csv'

PLOT_MATRIX_PATH = 'assets/data_analysis/dataset_0-1.plot.png'
CP_PATH = 'assets/data_analysis/dataset_0-1_cp.txt'
TILE_CM = 'dataset_0-1_CM'

CLASSES = [0,1]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, alpha=ALPHA, max_features=MAX_FEATURES)

p.train()
p.predict()

p.classifier.plot_matrix_classification_report(TILE_CM, CP_PATH, PLOT_MATRIX_PATH, CLASSES)

> ### Results :

<br>

```
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      5817
           1       0.86      0.85      0.86      5880

    accuracy                           0.86     11697
   macro avg       0.86      0.86      0.86     11697
weighted avg       0.86      0.86      0.86     11697
```

<img src='assets/data_analysis/dataset_0-1.plot.png' width='750'>

> #### Performances avec le dataset original réparti en 4 classes 0, 1, 2 et 3

In [None]:
TEST_SIZE = 1/3
ALPHA = 1.0
MAX_FEATURES = 10000

CLASSIFIER = ClassifierType.TFIDF_MNB
DATASET = 'dataset/csv/dataset_0-3.csv'

PLOT_MATRIX_PATH = 'assets/data_analysis/dataset_0-3.plot.png'
CP_PATH = 'assets/data_analysis/dataset_0-3_cp.txt'
TILE_CM = 'dataset_0-3_CM'

CLASSES = [0,1,2,3]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, alpha=ALPHA, max_features=MAX_FEATURES)

p.train()
p.predict()

p.classifier.plot_matrix_classification_report(TILE_CM, CP_PATH, PLOT_MATRIX_PATH, CLASSES)

> ### Results :

<br>

```
              precision    recall  f1-score   support

           0       0.59      0.74      0.66      2672
           1       0.48      0.07      0.13      1955
           2       0.47      0.65      0.55      2644
           3       0.74      0.74      0.74      2643

    accuracy                           0.58      9914
   macro avg       0.57      0.55      0.52      9914
weighted avg       0.58      0.58      0.54      9914

```

<img src='assets/data_analysis/dataset_0-3.plot.png' width='750'>

> ## Différences entre les méthodes:
> > ### MutlinomialNB en TFIDF

<br>

> #### Variation du test size (10% -> 50% par pas de 5%)

In [None]:
ALPHA = 1.0
MAX_FEATURES = 10000

CLASSIFIER = ClassifierType.TFIDF_MNB

DATASET = 'dataset/csv/dataset_0-1.csv'

PLOT_ACC_PATH = 'assets/tfidf/mnb/mnb_test_size_dataset_0-1_acc.plot.png'
PLOT_PREC_PATH = 'assets/tfidf/mnb/mnb_test_size_dataset_0-1_prec.plot.png'
TITLE_PREC_ACC = 'mnb_test_size_dataset_0-1_Prec_Acc'

CLASSES = [0,1]
        
df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

params = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

accuracies = []
precisions = [[], []]

for i in range(len(params)):

    p = PipelineClassifier(CLASSIFIER, df, test_size=params[i], alpha=ALPHA, max_features=MAX_FEATURES)
    p.train()
    p.predict()
    
    accuracies.append(p.classifier.get_accuracy())
    for c in CLASSES:
        precisions[c].append(p.classifier.get_precisions(c))

p.classifier.plot_accuracy_precisions(TITLE_PREC_ACC, PLOT_ACC_PATH, PLOT_PREC_PATH, TITLE_PREC_ACC, params, CLASSES, accuracies, precisions)

> ## Results
> > ### Accuracy

<br>

<img src='assets/tfidf/mnb/mnb_test_size_dataset_0-1_acc.plot.png' width='500'>

<br>
<br>

> #### Precision

<br>

<img src='assets/tfidf/mnb/mnb_test_size_dataset_0-1_prec.plot.png' width='500'>

> ## Différences entre les méthodes:
> > ### MutlinomialNB en TFIDF

<br>

> #### Variation de paramètre alpha (1 -> 6, par pas de 1)

In [None]:

TEST_SIZE = 1/4
MAX_FEATURES = 10000

CLASSIFIER = ClassifierType.TFIDF_MNB
DATASET = 'dataset/csv/dataset_0-1.csv'

PLOT_ACC_PATH = 'assets/tfidf/mnb/mnb_alpha_dataset_0-1_acc.plot.png'
PLOT_PREC_PATH = 'assets/tfidf/mnb/mnb_alpha_dataset_0-1_prec.plot.png'

TITLE_PREC_ACC = 'mnb_alpha_dataset_0-1_Prec_Acc'

CLASSES = [0,1]


df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

params = [1,2,3,4,5,6]

accuracies = []
precisions = [[], []]

for i in range(len(params)):

    p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, alpha=params[i], max_features=MAX_FEATURES)
    p.train()
    p.predict()
    
    accuracies.append(p.classifier.get_accuracy())
    for c in CLASSES:
        precisions[c].append(p.classifier.get_precisions(c))

p.classifier.plot_accuracy_precisions(TITLE_PREC_ACC, PLOT_ACC_PATH, PLOT_PREC_PATH, TITLE_PREC_ACC, params, CLASSES, accuracies, precisions)  

> ## Results
> > ### Accuracy

<br>

<img src='assets/tfidf/mnb/mnb_alpha_dataset_0-1_acc.plot.png' width='500'>

<br>
<br>

> #### Precision

<br>

<img src='assets/tfidf/mnb/mnb_alpha_dataset_0-1_prec.plot.png' width='500'>

> ## Différences entre les méthodes:
> > ### Logistic Regression en TFIDF

<br>

> #### Variation de paramètre max_iter (250 -> 1000, par pas de 250)

In [None]:
TEST_SIZE = 1/4
REG = 1.0
MAX_FEATURES = 10000

CLASSIFIER = ClassifierType.TFIDF_LogReg
DATASET = 'dataset/csv/dataset_0-1.csv'

PLOT_ACC_PATH = 'assets/tfidf/log_reg/log_reg_max_iter_dataset_0-1_acc.plot.png'
PLOT_PREC_PATH = 'assets/tfidf/log_reg/log_reg_max_iter_dataset_0-1_prec.plot.png'

TITLE_PREC_ACC = 'log_reg_max_iter_dataset_0-1_Prec_Acc'

CLASSES = [0,1]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

params = [250, 500, 750, 1000]

accuracies = []
precisions = [[], []]

for i in range(len(params)):

    p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, max_iter=params[i], reg=REG, max_features=MAX_FEATURES)
    p.train()
    p.predict()
    
    accuracies.append(p.classifier.get_accuracy())
    for c in CLASSES:
        precisions[c].append(p.classifier.get_precisions(c))

p.classifier.plot_accuracy_precisions(TITLE_PREC_ACC, PLOT_ACC_PATH, PLOT_PREC_PATH, TITLE_PREC_ACC, params, CLASSES, accuracies, precisions) 

> ## Results
> > ### Accuracy

<br>

<img src='assets/tfidf/log_reg/log_reg_max_iter_dataset_0-1_acc.plot.png' width='500'>

<br>
<br>

> #### Precision

<br>

<img src='assets/tfidf/log_reg/log_reg_max_iter_dataset_0-1_prec.plot.png' width='500'>

> ## Différences entre les méthodes:
> > ### Logistic Regression en TFIDF

<br>

> #### Variation de paramètre de regularisation (0.25, 0.50, 0.75, 1.0, 12.0)

In [None]:
MAX_ITER = 250
TEST_SIZE = 1/4
ALPHA = 1.0
MAX_FEATURES = 10000

CLASSIFIER = ClassifierType.TFIDF_LogReg
DATASET = 'dataset/csv/dataset_0-1.csv'

PLOT_ACC_PATH = 'assets/tfidf/log_reg/log_reg_reg_dataset_0-1_acc.plot.png'
PLOT_PREC_PATH = 'assets/tfidf/log_reg/log_reg_reg_dataset_0-1_prec.plot.png'
TITLE_PREC_ACC = 'log_reg_reg_dataset_0-1_Prec_Acc'

CLASSES = [0,1]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

params = [0.25, 0.5, 0.75, 1.0, 12.0]

accuracies = []
precisions = [[], []]

for i in range(len(params)):

    p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, max_iter=MAX_ITER, reg=params[i], max_features=MAX_FEATURES)
    p.train()
    p.predict()
    
    accuracies.append(p.classifier.get_accuracy())
    for c in CLASSES:
        precisions[c].append(p.classifier.get_precisions(c))

p.classifier.plot_accuracy_precisions(TITLE_PREC_ACC, PLOT_ACC_PATH, PLOT_PREC_PATH, TITLE_PREC_ACC, params, CLASSES, accuracies, precisions)

> ## Results
> > ### Accuracy

<br>

<img src='assets/tfidf/log_reg/log_reg_reg_dataset_0-1_acc.plot.png' width='500'>

<br>
<br>

> #### Precision

<br>

<img src='assets/tfidf/log_reg/log_reg_reg_dataset_0-1_prec.plot.png' width='500'>

> ## Différences entre les méthodes:
> > ### Logistic Regression en TFIDF

<br>

> #### Variation de paramètre des max features (4500 -> 15000, par pas de 1500)

In [None]:
MAX_ITER = 250
TEST_SIZE = 1/4
REG = 1.0

CLASSIFIER = ClassifierType.TFIDF_LogReg
DATASET = 'dataset/csv/dataset_0-1.csv'

PLOT_ACC_PATH = 'assets/tfidf/log_reg/log_reg_max_features_dataset_0-1_acc.plot.png'
PLOT_PREC_PATH = 'assets/tfidf/log_reg/log_reg_max_features_dataset_0-1_prec.plot.png'
TITLE_PREC_ACC = 'log_reg_max_features_dataset_0-1_Prec_Acc'

CLASSES = [0,1]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

params = [4500, 6000, 7500, 9000, 10500, 12000, 13500, 15000]

accuracies = []
precisions = [[], []]

for i in range(len(params)):

    p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, max_iter=MAX_ITER, reg=REG, max_features=params[i])
    p.train()
    p.predict()
    
    accuracies.append(p.classifier.get_accuracy())
    for c in CLASSES:
        precisions[c].append(p.classifier.get_precisions(c))

p.classifier.plot_accuracy_precisions(TITLE_PREC_ACC, PLOT_ACC_PATH, PLOT_PREC_PATH, TITLE_PREC_ACC, params, CLASSES, accuracies, precisions)

> ## Results
> > ### Accuracy

<br>

<img src='assets/tfidf/log_reg/log_reg_max_features_dataset_0-1_acc.plot.png' width='500'>

<br>
<br>

> #### Precision

<br>

<img src='assets/tfidf/log_reg/log_reg_max_features_dataset_0-1_prec.plot.png' width='500'>

> ## Différences entre les méthodes:
> > ### Multi-layer perceptron en Word2Vec avec un modèle pré-entrainé

<br>

> #### Variation de paramètre des max iter (250 -> 1000, par pas de 250)

In [None]:
TEST_SIZE = 0.2
LAYERS = (13, 13, 13)

VEC_DIM = 200
VEC_BIN = 'dataset/vectors/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin'

CLASSIFIER = ClassifierType.WORD2VEC
DATASET = 'dataset/csv/dataset_0-1.csv'

PLOT_ACC_PATH = 'assets/w2v/mlp/word2vec_max_iter_acc_0-1.plot.png'
PLOT_PREC_PATH = 'assets/w2v/mlp/word2vec_max_iter_prec_0-1.plot.png'

TITLE_PREC_ACC = 'max_iteration'

CLASSES = [0,1]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

params = [250, 500, 750, 1000]

accuracies = []
precisions = [[], []]

for i in range(len(params)):

    p = PipelineClassifier(CLASSIFIER, data=df, vec_bin=VEC_BIN, max_iter=params[i], layers=LAYERS, vec_dim=VEC_DIM, test_size=TEST_SIZE)
    p.train()
    p.predict()
    
    accuracies.append(p.classifier.get_accuracy())
    for c in CLASSES:
        precisions[c].append(p.classifier.get_precisions(c))

p.classifier.plot_accuracy_precisions(TITLE_PREC_ACC, PLOT_ACC_PATH, PLOT_PREC_PATH, TITLE_PREC_ACC, params, CLASSES, accuracies, precisions)

> ## Results
> > ### Accuracy

<br>

<img src='assets/w2v/mlp/word2vec_max_iter_acc_0-1.plot.png' width='500'>

<br>
<br>

> #### Precision

<br>

<img src='assets/w2v/mlp/word2vec_max_iter_prec_0-1.plot.png' width='500'>

> ## Différences entre les méthodes:
> > ### Multi-layer perceptron en Word2Vec avec un modèle pré-entrainé

<br>

> #### Variation de paramètre de la taille des layers (1 -> 9 , par pas de 1)

In [None]:
MAX_ITER = 500
TEST_SIZE = 0.2

VEC_DIM = 200
VEC_BIN = 'dataset/vectors/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin'

CLASSIFIER = ClassifierType.WORD2VEC
DATASET = 'dataset/csv/dataset_0-1.csv'

PLOT_ACC_PATH = 'assets/w2v/mlp/word2vec_layers_acc_0-1.plot.png'
PLOT_PREC_PATH = 'assets/w2v/mlp/word2vec_layers_prec_0-1.plot.png'
TITLE_PREC_ACC = 'hidden layers'

CLASSES = [0,1]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

params = [1,2,3,4,5,6,7,8,9]

accuracies = []
precisions = [[], []]

p = PipelineClassifier(CLASSIFIER, data=df, vec_bin=VEC_BIN, max_iter=MAX_ITER, layers=None, vec_dim=VEC_DIM, test_size=TEST_SIZE)
p.transform_data()

for i in range(len(params)):
    
    p.classifier.classifier = ClassifierWord2Vec(
        data=df,
        word2vec_bin=VEC_BIN,
        max_iter=MAX_ITER,
        layers= [13] * params[i],
        vec_dim=VEC_DIM,
        test_size=TEST_SIZE
    )
    p.train_without_transform()
    p.predict()
    
    accuracies.append(p.classifier.get_accuracy())
    for c in CLASSES:
        precisions[c].append(p.classifier.get_precisions(c))

p.classifier.plot_accuracy_precisions(TITLE_PREC_ACC, PLOT_ACC_PATH, PLOT_PREC_PATH, TITLE_PREC_ACC, params, CLASSES, accuracies, precisions)



> ## Results
> > ### Accuracy

<br>

<img src='assets/w2v/mlp/word2vec_layers_acc_0-1.plot.png' width='500'>

<br>
<br>

> #### Precision

<br>

<img src='assets/w2v/mlp/word2vec_layers_prec_0-1.plot.png' width='500'>

> ## Différences entre les méthodes:
> > ### Naives Bayes avec un dictionnaires de features

<br>

> #### Variation de paramètre du nombre d'entrées du dictionnaire (1500 -> 6000 , par pas de 1500)

In [None]:
TEST_SIZE = 0.2

CLASSIFIER = ClassifierType.NAIVES_BAYES
DATASET = 'dataset/csv/dataset_0-1.csv'

PLOT_ACC_PATH = 'assets/features/nb/naives_bayes_nb_word_acc_0-1.plot.png'
PLOT_PREC_PATH = 'assets/features/nb/naives_bayes_nb_word_prec_0-1.plot.png'
TITLE_PREC_ACC = 'number of words'

CLASSES = [0,1]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

params = [1500, 3000, 4500, 6000]

accuracies = []
precisions = [[], []]

for i in range(len(params)):

    p = PipelineClassifier(CLASSIFIER, data=df, nb_word_n=params[i], test_size=TEST_SIZE)
    p.train()
    p.predict()
    
    accuracies.append(p.classifier.get_accuracy())
    for c in CLASSES:
        precisions[c].append(p.classifier.get_precisions(c))

p.classifier.plot_accuracy_precisions(TITLE_PREC_ACC, PLOT_ACC_PATH, PLOT_PREC_PATH, TITLE_PREC_ACC, params, CLASSES, accuracies, precisions)

> ## Results
> > ### Accuracy

<br>

<img src='assets/features/nb/naives_bayes_nb_word_acc_0-1.plot.png' width='500'>

<br>
<br>

> #### Precision

<br>

<img src='assets/features/nb/naives_bayes_nb_word_prec_0-1.plot.png' width='500'>

> ## Cross Validation 
> > Nous avons utilisé la cross-validation afin de trouver la meilleur combinaison de paramètres pour chaque modèle.

> ### Cross Validation des modèles TFIDF en Logistic Regression, MultinomialNB, et Mutli-layer percreptron

In [None]:
from pprint import pprint
from time import time

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

def grid(classifier, params, X_train, X_test, y_train, y_test, classes, matrix_path, cp_path, title):

    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    grid_search = GridSearchCV(classifier, cv=5, param_grid=params)

    print("Performing grid search...")
    print(f"classifier : {classifier}")
    print(f"parameters : {params}")
    
    initial_t = time()
    # on fit les datas à notre grid search
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - initial_t))
    print()
    print("---------------SCORE-------------")
    print(grid_search.cv_results_)

    # On récupère le meilleur score de prédiction ( à priori équivalent à la précision)

    print("Best CV score : %0.3f" % grid_search.best_score_)
    print("Best parameters set: ")
    # On donne également les meilleurs paramètres
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(params.keys()) :
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    print("Test score with best_estimator_ : %0.3f" % grid_search.best_estimator_.score(X_test, y_test))
    print("\n")
    print("Classification Report Test Data")
    print(classification_report(y_test, grid_search.best_estimator_.predict(X_test)))

    y_test = y_test
    predic = grid_search.best_estimator_.predict(X_test)

    confm = confusion_matrix(y_test, predic)
    df_cm = pd.DataFrame(confm, index=classes, columns=classes)

    fig, ax = plt.subplots(figsize=(12,10))
    ax.set_title('Confusion matrix for '+ title)
    sb.heatmap(df_cm, cmap='YlOrRd', annot=True, fmt='g', ax=ax)
    plt.savefig(matrix_path)

    c_r = classification_report(y_test, predic)
    f = open(cp_path, 'a')
    f.write(c_r)
    f.close()

def replace_nan(df):
    for i in range(df.shape[0]):
        if pd.isna(df["avis"][i]):
            if df["classe_bon_mauvais"][i] == 1: 
                df.at[i, "avis"] = "good"
            else:
                df.at[i, "avis"] = "bad"
    return df


df = pd.read_csv("./dataset/csv/dataset_0-1.csv")

df = replace_nan(df)

X = df['avis'].copy()  # X correspond aux reviews
y = df['classe_bon_mauvais'].copy()  # y correspond aux classes comme c'est ce que l'on cherche à prévoir 

# Extractions des features
td = TfidfVectorizer(max_features=9000)
X = td.fit_transform(X).toarray()

# On split les datas en différents ensemble d'entrainement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/4, random_state=0)

logreg_classifier = LogisticRegression(C=1, max_iter=250).fit(X_train, y_train) # max_iter à 100 de base, faut monter ici car trop de data
logreg_score = logreg_classifier.score(X_test, y_test)

mnb_classifier = MultinomialNB().fit(X_train, y_train)
mnb_score = mnb_classifier.score(X_test, y_test)

mlp_classifier = MLPClassifier(hidden_layer_sizes=[13,13,13], max_iter=500).fit(X_train, y_train)
mlp_score = mlp_classifier.score(X_test, y_test)

print('======================================================')
# print(f"\n LogReg score {logreg_score}")
# print(f"\n MNB score {mnb_score}")
print(f"\n MLPC score {mlp_score}")
print('======================================================')

param_logreg_grid_ = {
    'C': [0.75, 1.0],
    'max_iter' : [250, 500]
    }
# as for the alpha param mnb, c is the hyperparameter of log reg
param_mnb_grid_ ={'alpha': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}

param_mlp_grid_ = {
    'hidden_layer_sizes' : [[13,13,13,13], [13,13,13,13,13]],
    'max_iter' : [750, 1000]
}

classes = [0,1]
PLOT_MATRIX_PATH_LR = 'assets/tfidf/grid_search/log_reg/grid_search_logreg_dataset_0-1.plot.png'
CP_PATH_LR = 'assets/tfidf/grid_search/log_reg/grid_search_logreg_dataset_0-1_cp.txt'
TILE_CM_LR = 'grid_search_logreg_dataset_0-1_CM'

PLOT_MATRIX_PATH_MNB = 'assets/tfidf/grid_search/mnb/grid_search_mnb_dataset_0-1.plot.png'
CP_PATH_MNB = 'assets/tfidf/grid_search/mnb/grid_search_mnb_dataset_0-1_cp.txt'
TILE_CM_MNB = 'grid_search_mnb_dataset_0-1_CM'

PLOT_MATRIX_PATH_MLP = 'assets/tfidf/grid_search/mlp/grid_search_mlp_dataset_0-1.plot.png'
CP_PATH_MLP = 'assets/tfidf/grid_search/mlp/grid_search_mlp_dataset_0-1_cp.txt'
TILE_CM_MLP = 'grid_search_mlp_dataset_0-1_CM'

# to understand alpha param https://stackoverflow.com/questions/33830959/multinomial-naive-bayes-parameter-alpha-setting-scikit-learn
grid(mnb_classifier, param_mnb_grid_, X_train, X_test, y_train, y_test, classes, PLOT_MATRIX_PATH_MNB, CP_PATH_MNB, TILE_CM_MNB)

grid(logreg_classifier, param_logreg_grid_, X_train, X_test, y_train, y_test, classes, PLOT_MATRIX_PATH_LR, CP_PATH_LR, TILE_CM_LR)

grid(mlp_classifier, param_mlp_grid_, X_train, X_test, y_train, y_test, classes, PLOT_MATRIX_PATH_MLP, CP_PATH_MLP, TILE_CM_MLP)

> # Conclusion
> > Meilleurs modèles de TFIDF: 

> ## Logistic Regression

<br>

<img src='assets/tfidf/grid_search/log_reg/grid_search_logreg_dataset_0-1.plot.png' width='500'>

<br>

```
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      4325
           1       0.87      0.85      0.86      4460

    accuracy                           0.86      8785
   macro avg       0.86      0.86      0.86      8785
weighted avg       0.86      0.86      0.86      8785

```

<br>

> ## Multinomial Naives Bayes

<br>

<img src='assets/tfidf/grid_search/mnb/grid_search_mnb_dataset_0-1.plot.png' width='500'>

<br>

```
              precision    recall  f1-score   support

           0       0.84      0.85      0.85      4325
           1       0.86      0.85      0.85      4460

    accuracy                           0.85      8785
   macro avg       0.85      0.85      0.85      8785
weighted avg       0.85      0.85      0.85      8785

```

<br>

> ## Multinomial Multi-layer perceptron

<br>

<img src='assets/tfidf/grid_search/mlp/grid_search_mlp_dataset_0-1.plot.png' width='500'>

<br>

```
              precision    recall  f1-score   support

           0       0.83      0.82      0.83      4361
           1       0.82      0.83      0.83      4412

    accuracy                           0.83      8773
   macro avg       0.83      0.83      0.83      8773
weighted avg       0.83      0.83      0.83      8773

```

<br>



> ## Meilleurs score : TFIDF Logistique Regression

In [None]:
MAX_ITER = 250
TEST_SIZE = 1/4
REG = 1.0
MAX_FEATURES = 9000

CLASSIFIER = ClassifierType.TFIDF_LogReg
DATASET = 'dataset/csv/dataset_0-3.csv'

PLOT_MATRIX_PATH = 'assets/tfidf/log_reg/log_reg_best_params_dataset_0-3.plot.png'
CP_PATH = 'assets/tfidf/log_reg/log_reg_best_params_dataset_0-3_cp.txt'
TILE_CM = 'log_reg_best_params_dataset_0-3_CM'

CLASSES = [0,1,2,3]

df = pd.read_csv(DATASET)[['classe_bon_mauvais', 'avis']]

p = PipelineClassifier(CLASSIFIER, df, test_size=TEST_SIZE, max_iter=MAX_ITER, reg=REG, max_features=MAX_FEATURES)

p.train()
p.predict()

p.classifier.plot_matrix_classification_report(TILE_CM, CP_PATH, PLOT_MATRIX_PATH, CLASSES)

> ### Results :

<br>

```
              precision    recall  f1-score   support

           0       0.60      0.71      0.65      1998
           1       0.40      0.27      0.33      1465
           2       0.51      0.55      0.53      1959
           3       0.76      0.74      0.75      2014

    accuracy                           0.59      7436
   macro avg       0.57      0.57      0.57      7436
weighted avg       0.58      0.59      0.58      7436

```

<img src='assets/tfidf/log_reg/log_reg_best_params_dataset_0-3.plot.png' width='750'>