# Modèle de classification bayésienne

### Import des données

In [None]:
# Importation des modules nécessaires
import nltk
import re
import time
import seaborn
import pickle
import numpy as np
import pandas as pd 
import simplemma as sp  
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from unidecode import unidecode

# Download the NLTK resources this notebook relies on
for resource_name in ("punkt", "omw-1.4", "stopwords"):
    nltk.download(resource_name)

In [None]:
!pip install simplemma | pip install unidecode

In [None]:
# Load the dataset
data = pd.read_excel("../talks_complet.xlsx")

# Binary encoding of the labels: 'negative' -> 0, 'positive' -> 1.
# The result is assigned back to the column instead of calling
# mask(..., inplace=True) on `data['tag']`: that selection may be a temporary
# copy (it always is under pandas copy-on-write), which would leave `data`
# unchanged.
data["tag"] = data["tag"].mask(data["tag"] == "negative", 0)
data["tag"] = data["tag"].mask(data["tag"] == "positive", 1)

### Pré-traitement des données

In [None]:
# Strip accents and convert every message to lowercase.
# Both steps are applied to the same string: previously the second
# comprehension re-read data["query"], which threw the lowercasing away.
# Accents are stripped first, then the text is lowercased, in the same order
# as predict_anonymize.
x_lowered = [unidecode(text).lower() for text in data["query"]]

print(x_lowered[0])

In [None]:
# Split every message into word tokens
x_tokenized = list(map(nltk.word_tokenize, x_lowered))

print(x_tokenized[0])

In [None]:
# Lemmatization: replace each token by its French lemma, message by message
x_lemmatized = [
    [sp.lemmatize(token, lang="fr") for token in tokens]
    for tokens in x_tokenized
]

print(x_lemmatized[0])

In [None]:
# Stopword removal.
# The messages had their accents stripped earlier, so the stopword list is
# extended with its unaccented forms; otherwise the accented entries of the
# list could never match a token. A set keeps each membership test O(1).
stopwords = nltk.corpus.stopwords.words("french")
stopword_set = set(stopwords) | {unidecode(word) for word in stopwords}
x_prepared = [
    [word for word in text if word not in stopword_set]
    for text in x_lemmatized
]

print(x_prepared[0])

### Vectorisation par Bag of Words

In [None]:
# Bag-of-words vectorization: rebuild one space-joined string per message,
# then count term occurrences with a vocabulary capped at 20000 features.
doc = [" ".join(tokens) for tokens in x_prepared]
vectorizer = CountVectorizer(max_features=20000)
bag_of_words = vectorizer.fit_transform(doc)
x = bag_of_words.toarray()

### Création et entraînement du modèle

In [None]:
# Split into a training set (80%) and a testing set (20%) with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x,
    np.asarray(data["tag"]),
    test_size=0.2,
    random_state=0,
)

In [None]:
# Cast the labels to integers, then fit the Gaussian naive Bayes model
y_train = y_train.astype("int")
y_test = y_test.astype("int")
NB = GaussianNB()
NB.fit(x_train, y_train)

### Prédiction et métriques

In [None]:
# Mean accuracy of the model on the testing set
NB.score(X=x_test, y=y_test)

In [None]:
# Predict the labels of the testing set
y_pred = NB.predict(x_test)

# Confusion matrix, drawn as an annotated heatmap
conf = confusion_matrix(y_true=y_test, y_pred=y_pred)
palette = seaborn.cubehelix_palette(
    n_colors=4,
    start=0,
    rot=0.4,
    gamma=1.0,
    hue=0.8,
    light=0.85,
    dark=0.15,
    reverse=False,
    as_cmap=False,
)
seaborn.heatmap(
    conf,
    cmap=palette,
    annot=True,
    fmt=".1f",
    linewidths=1.5,
    cbar=True,
)
plt.show()

In [None]:
# Per-class precision, recall, f-score and support on the testing set
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred)

### Sauvegarde du modèle

In [None]:
# Persist the trained model and the fitted vectorizer as pickle files
with open("model.pckl", "wb") as model_file:
    pickle.dump(NB, model_file)

with open("vectorizer.pckl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# Fonction de prédiction
def predict_anonymize(message):
    """Classify a single message with the saved model.

    The message is prepared like the training data (accent stripping,
    lowercasing, tokenization, lemmatization, stopword removal) and then
    vectorized with the saved vectorizer before being passed to the model.

    Args:
        message: Raw text of the message to classify.

    Returns:
        The predicted label for the message, i.e. the first element of the
        array returned by the model's ``predict``.
    """
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only load artifacts written by this notebook or a trusted source.
    # The context managers make sure both files are closed after loading.
    with open("model.pckl", mode="rb") as model_file:
        model = pickle.load(model_file)
    with open("vectorizer.pckl", mode="rb") as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)

    # A set keeps the per-token membership test O(1).
    stopwords = set(nltk.corpus.stopwords.words('french'))

    tokens = nltk.word_tokenize(unidecode(message).lower())
    # Lemmatize before filtering stopwords, as the training pipeline does:
    # the vectorizer vocabulary was built from lemmas, so inflected forms
    # would otherwise be ignored at prediction time.
    tokens = [sp.lemmatize(token, lang="fr") for token in tokens]
    tokens = [token for token in tokens if token not in stopwords]

    vector = vectorizer.transform([" ".join(tokens)])
    decision = model.predict(vector.toarray())

    return decision[0]

### Test de prédiction

In [None]:
# Try the prediction function on a sample question
predict_anonymize(message="quels sont les horaires du festival ?")