In [None]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

### DATASET AVEC STOPWORDS OCCURRENCES/TF-IDF

In [None]:
# Load the pre-separated training/testing files. Each CSV is read without a
# header row and only its first column is kept, so every name below is a
# pandas Series.
train_text = pd.read_csv(
    '../Ressources/separated_data/scenarii_training_text_sans_chiffres.csv',
    encoding="ISO-8859-1", header=None,
).iloc[:, 0]
train_tags = pd.read_csv(
    '../Ressources/separated_data/scenarii_training_tags_sans_chiffres.csv',
    encoding="ISO-8859-1", header=None,
).iloc[:, 0]
test_text = pd.read_csv(
    '../Ressources/separated_data/scenarii_testing_text_sans_chiffres.csv',
    encoding="ISO-8859-1", header=None,
).iloc[:, 0]
test_tags = pd.read_csv(
    '../Ressources/separated_data/scenarii_testing_tags_sans_chiffres.csv',
    encoding="ISO-8859-1", header=None,
).iloc[:, 0]

In [None]:
# Token counts -> TF-IDF -> multinomial Naive Bayes. The pipeline is fitted on
# the training split and evaluated on the testing split with macro-averaged
# precision, recall and F-measure.
pipeline_steps = [
    ('vect', CountVectorizer()),
    # Comment out the next line to train on raw occurrence counts; keep it to use TF-IDF scores.
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)
text_clf.fit(train_text, train_tags)
predicted = text_clf.predict(test_text)

for label, metric in (("Precision: ", precision_score),
                      ("Recall: ", recall_score),
                      ("F measure: ", f1_score)):
    print(label + str(metric(test_tags, predicted, average='macro')))

#### CROSS VALIDATION

In [None]:
# Full corpus used for cross-validation: labels come from the 'tag' column and
# features are built from the 'texte' column.
df = pd.read_csv("../Ressources/scenarii.csv", encoding="ISO-8859-1", sep=";", header=0)
tags = df['tag']
# Keep the term counts sparse while they are being transformed: densifying
# them before TF-IDF only costs memory, because TfidfTransformer operates on
# sparse count matrices.
data = CountVectorizer().fit_transform(df['texte'])
# Comment out the next line to keep raw occurrence counts; leave it in to use TF-IDF scores.
data = TfidfTransformer().fit_transform(data)
# Densify once, at the end, so `data` is a NumPy array whichever option is active.
data = data.toarray()
# NOTE(review): the vocabulary and IDF weights are fitted on the whole corpus
# before it is cross-validated, so each validation fold has already influenced
# the features. Cross-validating a Pipeline on the raw text would avoid that.

In [None]:
# 5-fold cross-validation of multinomial Naive Bayes on `data` / `tags`.
# Each macro-averaged metric is evaluated in turn and its mean over the folds
# is printed with a label, so the three numbers cannot be confused with one
# another. The previously defined `scoring` dict was never passed to
# cross_val_score and has been removed.
clf = MultinomialNB()
cv_metrics = (("F measure", 'f1_macro'),
              ("Recall", 'recall_macro'),
              ("Precision", 'precision_macro'))
for label, scorer in cv_metrics:
    scores = cross_val_score(clf, data, tags, cv=5, scoring=scorer)
    print(label + " (5-fold CV mean): " + str(np.mean(scores)))

### DATASET SANS STOPWORDS OCCURRENCES/TF-IDF


In [None]:
'''
Build the Naive Bayes model for the data without stopwords.
'''
# NLTK's French stopword list is handed to the vectorizer so those tokens are
# excluded from the vocabulary.
stopwords_fr = stopwords.words('french')
pipeline_steps = [
    ('vect', CountVectorizer(stop_words = stopwords_fr)),
    # Comment out the next line to train on raw occurrence counts; keep it to use TF-IDF scores.
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)

text_clf.fit(train_text, train_tags)
predicted = text_clf.predict(test_text)

# Macro-averaged scores on the testing split.
for label, metric in (("Precision: ", precision_score),
                      ("Recall: ", recall_score),
                      ("F measure: ", f1_score)):
    print(label + str(metric(test_tags, predicted, average='macro')))

In [None]:
# Full corpus used for cross-validation, vectorized without French stopwords
# (`stopwords_fr` is defined in the preceding model cell).
df = pd.read_csv("../Ressources/scenarii.csv", encoding="ISO-8859-1", sep=";", header=0)
tags = df['tag']
# Keep the term counts sparse while they are being transformed: densifying
# them before TF-IDF only costs memory, because TfidfTransformer operates on
# sparse count matrices.
data = CountVectorizer(stop_words = stopwords_fr).fit_transform(df['texte'])
# Comment out the next line to keep raw occurrence counts; leave it in to use TF-IDF scores.
data = TfidfTransformer().fit_transform(data)
# Densify once, at the end, so `data` is a NumPy array whichever option is active.
data = data.toarray()
# NOTE(review): the vocabulary and IDF weights are fitted on the whole corpus
# before it is cross-validated, so each validation fold has already influenced
# the features. Cross-validating a Pipeline on the raw text would avoid that.

In [None]:
# 5-fold cross-validation of multinomial Naive Bayes on `data` / `tags`.
# Each macro-averaged metric is evaluated in turn and its mean over the folds
# is printed with a label, so the three numbers cannot be confused with one
# another. The previously defined `scoring` dict was never passed to
# cross_val_score and has been removed.
clf = MultinomialNB()
cv_metrics = (("F measure", 'f1_macro'),
              ("Recall", 'recall_macro'),
              ("Precision", 'precision_macro'))
for label, scorer in cv_metrics:
    scores = cross_val_score(clf, data, tags, cv=5, scoring=scorer)
    print(label + " (5-fold CV mean): " + str(np.mean(scores)))

### DATASET AVEC VERBES OCCURRENCES/TF-IDF


In [None]:
# Verb-only training/testing data.
# NOTE(review): the training matrix is read with its first row as a header
# while the testing matrix is read with no header row -- confirm that this
# matches the layout of the two files.
train_text = pd.read_csv("../Ressources/data_verbs_only/training_matrix_with_verbs.csv",
                         encoding="ISO-8859-1", sep=",", header=0)
test_text = pd.read_csv("../Ressources/data_verbs_only/testing_text_with_verbs.csv",
                        encoding="ISO-8859-1", sep=",", header=None)

# Labels: keep only the first column so that scikit-learn receives a 1-D
# target (as is done for the raw-text dataset) instead of an (n, 1) DataFrame,
# which it only accepts by emitting a DataConversionWarning.
train_tags = pd.read_csv("../Ressources/data_verbs_only/training_tags_with_verbs.csv",
                         encoding="ISO-8859-1", sep=",", header=None).iloc[:, 0]
test_tags = pd.read_csv("../Ressources/data_verbs_only/testing_tags_with_verbs.csv",
                        encoding="ISO-8859-1", sep=",", header=None).iloc[:, 0]

In [None]:
'''
Build the Naive Bayes model for the data containing verbs only.
'''
# There is no vectorizer step here: the pipeline is fitted directly on the
# matrices loaded from CSV.
pipeline_steps = [
    # Comment out the next line to train on raw occurrence counts; keep it to use TF-IDF scores.
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)
text_clf.fit(train_text, train_tags)
predicted = text_clf.predict(test_text)

# Macro-averaged scores on the testing split.
for label, metric in (("Precision: ", precision_score),
                      ("Recall: ", recall_score),
                      ("F measure: ", f1_score)):
    print(label + str(metric(test_tags, predicted, average='macro')))

#### CROSS VALIDATION



In [None]:
# Complete verb-only dataset used for cross-validation.
data = pd.read_csv("../Ressources/data_verbs_only/all_data_with_verbs.csv",
                   encoding="ISO-8859-1", sep=";", header=0)

# Labels: keep only the first column so that scikit-learn receives a 1-D
# target instead of an (n, 1) DataFrame, which it only accepts by emitting a
# DataConversionWarning.
# NOTE(review): these files are read with sep=";" and the tags without a
# header, whereas the action-verb equivalents use sep="," and header=0 --
# confirm that both match the files on disk.
tags = pd.read_csv("../Ressources/data_verbs_only/all_tags.csv",
                   encoding="ISO-8859-1", sep=";", header=None).iloc[:, 0]

In [None]:
# Comment out the code in this cell to keep raw occurrence counts; run it to use TF-IDF scores.
# NOTE(review): `data` is rebound to its own transform, so running this cell a
# second time without reloading `data` applies TF-IDF to already weighted values.
tfidf_weighter = TfidfTransformer()
data = tfidf_weighter.fit_transform(data).toarray()

In [None]:
'''
Build the Naive Bayes model for the data containing verbs only, using cross-validation.
'''
# Each macro-averaged metric is evaluated with 5-fold cross-validation and its
# mean over the folds is printed with a label, so the three numbers cannot be
# confused with one another. The previously defined `scoring` dict was never
# passed to cross_val_score and has been removed.
clf = MultinomialNB()
cv_metrics = (("F measure", 'f1_macro'),
              ("Recall", 'recall_macro'),
              ("Precision", 'precision_macro'))
for label, scorer in cv_metrics:
    scores = cross_val_score(clf, data, tags, cv=5, scoring=scorer)
    print(label + " (5-fold CV mean): " + str(np.mean(scores)))

### DATASET AVEC VERBES D'ACTION OCCURRENCES/TF-IDF

In [None]:
# Action-verb-only training/testing data.
# NOTE(review): the training matrix is read with its first row as a header
# while the testing matrix is read with no header row -- confirm that this
# matches the layout of the two files.
train_text = pd.read_csv("../Ressources/data_action_verbs_only/training_matrix_with_action_verbs.csv",
                         encoding="ISO-8859-1", sep=",", header=0)
test_text = pd.read_csv("../Ressources/data_action_verbs_only/testing_text_with_action_verbs.csv",
                        encoding="ISO-8859-1", sep=",", header=None)

# Labels: keep only the first column so that scikit-learn receives a 1-D
# target (as is done for the raw-text dataset) instead of an (n, 1) DataFrame,
# which it only accepts by emitting a DataConversionWarning.
train_tags = pd.read_csv("../Ressources/data_action_verbs_only/training_tags_with_action_verbs.csv",
                         encoding="ISO-8859-1", sep=",", header=None).iloc[:, 0]
test_tags = pd.read_csv("../Ressources/data_action_verbs_only/testing_tags_with_action_verbs.csv",
                        encoding="ISO-8859-1", sep=",", header=None).iloc[:, 0]

In [None]:
'''
Build the Naive Bayes model for the data containing action verbs only.
'''
# There is no vectorizer step here: the pipeline is fitted directly on the
# matrices loaded from CSV.
pipeline_steps = [
    # Comment out the next line to train on raw occurrence counts; keep it to use TF-IDF scores.
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)
text_clf.fit(train_text, train_tags)
predicted = text_clf.predict(test_text)

# Macro-averaged scores on the testing split.
for label, metric in (("Precision: ", precision_score),
                      ("Recall: ", recall_score),
                      ("F measure: ", f1_score)):
    print(label + str(metric(test_tags, predicted, average='macro')))

#### CROSS VALIDATION

In [None]:
# Complete action-verb-only dataset used for cross-validation.
data = pd.read_csv("../Ressources/data_action_verbs_only/all_data_with_action_verbs.csv",
                   encoding="ISO-8859-1", sep=",", header=0)

# Labels: keep only the first column so that scikit-learn receives a 1-D
# target instead of an (n, 1) DataFrame, which it only accepts by emitting a
# DataConversionWarning.
# NOTE(review): the tags are read with header=0 here, while the verb-only
# equivalent reads its tags with header=None -- confirm that the first row of
# this file really is a header and not a label.
tags = pd.read_csv("../Ressources/data_action_verbs_only/all_tags.csv",
                   encoding="ISO-8859-1", sep=",", header=0).iloc[:, 0]

In [None]:
'''
Build the Naive Bayes model for the data containing action verbs only, using cross-validation.
'''
# Each macro-averaged metric is evaluated with 5-fold cross-validation and its
# mean over the folds is printed with a label, so the three numbers cannot be
# confused with one another.
# NOTE(review): unlike the verb-only section, no TF-IDF cell sits between
# loading `data` and this cell, so the scores are computed on the values as
# loaded -- confirm that this is intended.
clf = MultinomialNB()
cv_metrics = (("F measure", 'f1_macro'),
              ("Recall", 'recall_macro'),
              ("Precision", 'precision_macro'))
for label, scorer in cv_metrics:
    scores = cross_val_score(clf, data, tags, cv=5, scoring=scorer)
    print(label + " (5-fold CV mean): " + str(np.mean(scores)))