In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

### DATASET AVEC STOPWORDS OCCURRENCES/TF-IDF

In [2]:
def _first_column(csv_path):
    """Read an ISO-8859-1 encoded CSV and return its first column as a Series."""
    frame = pd.read_csv(csv_path, encoding="ISO-8859-1")
    return frame.iloc[:, 0]


# Pre-split dataset: only the first column of each file is kept.
train_text = _first_column('../Ressources/separated_data/scenarii_training_text.csv')
train_tags = _first_column('../Ressources/separated_data/scenarii_training_tags.csv')
test_text = _first_column('../Ressources/separated_data/scenarii_testing_text.csv')
test_tags = _first_column('../Ressources/separated_data/scenarii_testing_tags.csv')

In [3]:
# Linear SVM on bag-of-words features; no stop-word list is given to the vectorizer.
pipeline_steps = [
    ('vect', CountVectorizer()),
    # Comment out the next line to train on raw occurrence counts instead of TF-IDF scores.
    ('tfidf', TfidfTransformer()),
    ('clf', svm.SVC(C=1, kernel='linear')),
]
text_clf_svm = Pipeline(pipeline_steps)
text_clf_svm.fit(train_text, train_tags)
predicted_svm = text_clf_svm.predict(test_text)

# Macro-averaged scores of the predictions against the test labels.
for label, scorer in (("Precision: ", precision_score),
                      ("Recall: ", recall_score),
                      ("F measure: ", f1_score)):
    print(label + str(scorer(test_tags, predicted_svm, average='macro')))

Precision: 0.9342449225160298
Recall: 0.9504956462729549
F measure: 0.9421355566825083


#### CROSS VALIDATION

In [4]:
# Whole corpus for cross-validation: ';'-separated file whose first row is the header.
df = pd.read_csv("../Ressources/scenarii.csv", sep=";", header=0, encoding="ISO-8859-1")
tags = df['tag']
# Dense document-term count matrix built from the 'texte' column.
vectorizer = CountVectorizer()
data = vectorizer.fit_transform(df['texte']).toarray()
# Comment out the next line to keep raw occurrence counts instead of TF-IDF scores.
# NOTE(review): vocabulary and IDF weights are fitted on the whole corpus, before the
# cross-validation folds are formed in the following cell - confirm this is intended.
data = TfidfTransformer().fit_transform(data).toarray()

In [5]:
# 5-fold cross-validation of a linear SVM on the `data` / `tags` prepared in the previous cell.
clf = svm.SVC(kernel='linear', C=1)
# Macro-averaged scorers; their mean CV scores are printed in this order.
scoring = {'f1_macro': 'f1_macro',
           'recall_macro': 'recall_macro',
           'precision_macro': 'precision_macro'}
# A single cross_validate call fits each fold once and evaluates every scorer on it,
# instead of refitting the same folds once per metric.
cv_results = cross_validate(clf, data, tags, cv=5, scoring=scoring)
for metric in scoring:
    # cross_validate stores per-fold test scores under 'test_<scorer name>'.
    scores = cv_results['test_' + metric]
    print(np.mean(scores))

0.9105579858100141
0.8987427718480351
0.9380797210238289


### DATASET SANS STOPWORDS OCCURRENCES/TF-IDF

In [6]:
# Build the SVM model for the data with French stop words removed.
stopwords_fr = stopwords.words('french')
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words = stopwords_fr)),
                         # Comment out the next line to train on raw occurrence counts instead of TF-IDF scores.
                         ('tfidf', TfidfTransformer()),
                         ('clf', svm.SVC(kernel='linear', C=1))])

text_clf_svm.fit(train_text, train_tags)

# Save the fitted pipeline to disk.
# scikit-learn 0.23 removed the vendored `sklearn.externals.joblib`; use the standalone
# package and fall back to the vendored copy only on older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
filename = 'Models/svm_bow_without_stopwords.pkl'
# joblib.dump does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(text_clf_svm, filename)  # text_clf_svm is the classifier being saved

predicted_svm = text_clf_svm.predict(test_text)

# Macro-averaged scores of the predictions against the test labels.
print("Precision: " + str(precision_score(test_tags, predicted_svm, average='macro')))
print("Recall: " + str(recall_score(test_tags, predicted_svm, average='macro')))
print("F measure: " + str(f1_score(test_tags, predicted_svm, average='macro')))

Precision: 0.937981957603639
Recall: 0.9628161322871076
F measure: 0.9498376975621559


#### CROSS VALIDATION

In [7]:
# Whole corpus for cross-validation: ';'-separated file whose first row is the header.
df = pd.read_csv("../Ressources/scenarii.csv", sep=";", header=0, encoding="ISO-8859-1")
tags = df['tag']
# Dense document-term count matrix of the 'texte' column, ignoring the words in `stopwords_fr`.
vectorizer = CountVectorizer(stop_words=stopwords_fr)
data = vectorizer.fit_transform(df['texte']).toarray()
# Comment out the next line to keep raw occurrence counts instead of TF-IDF scores.
# NOTE(review): vocabulary and IDF weights are fitted on the whole corpus, before the
# cross-validation folds are formed in the following cell - confirm this is intended.
data = TfidfTransformer().fit_transform(data).toarray()

In [8]:
# 5-fold cross-validation of a linear SVM on the `data` / `tags` prepared in the previous cell.
clf = svm.SVC(kernel='linear', C=1)
# Macro-averaged scorers; their mean CV scores are printed in this order.
scoring = {'f1_macro': 'f1_macro',
           'recall_macro': 'recall_macro',
           'precision_macro': 'precision_macro'}
# A single cross_validate call fits each fold once and evaluates every scorer on it,
# instead of refitting the same folds once per metric.
cv_results = cross_validate(clf, data, tags, cv=5, scoring=scoring)
for metric in scoring:
    # cross_validate stores per-fold test scores under 'test_<scorer name>'.
    scores = cv_results['test_' + metric]
    print(np.mean(scores))

0.9111913354572934
0.9020158963843175
0.9371963677330749


### DATASET AVEC VERBES OCCURRENCES/TF-IDF

In [9]:
# Verb-only dataset. The training matrix has a header row; the other three files are
# read without one. (These frames are fed straight to TfidfTransformer in the next
# cell, so they are presumably term-count matrices - confirm against the data files.)
train_text = pd.read_csv("../Ressources/data_verbs_only/training_matrix_with_verbs.csv", 
                   encoding="ISO-8859-1", sep=",", header=0)
# Keep the labels as a 1-D Series (first column) rather than a one-column DataFrame:
# scikit-learn estimators and metrics expect a 1-D target.
train_tags = pd.read_csv("../Ressources/data_verbs_only/training_tags_with_verbs.csv", 
                         encoding="ISO-8859-1", sep=",", header = None).iloc[:, 0]
test_text = pd.read_csv("../Ressources/data_verbs_only/testing_text_with_verbs.csv", 
                   encoding="ISO-8859-1", sep=",", header=None)
test_tags = pd.read_csv("../Ressources/data_verbs_only/testing_tags_with_verbs.csv", 
                   encoding="ISO-8859-1", sep=",", header=None).iloc[:, 0]

In [10]:
# Build the SVM model for the verb-only data (inputs are already numeric matrices,
# so the pipeline has no vectorizer step).
pipeline_steps = [
    # Comment out the next line to train on raw occurrence counts instead of TF-IDF scores.
    ('tfidf', TfidfTransformer()),
    ('clf', svm.SVC(C=1, kernel='linear')),
]
text_clf_svm = Pipeline(pipeline_steps)
text_clf_svm.fit(train_text, train_tags)
predicted_svm = text_clf_svm.predict(test_text)

# Macro-averaged scores of the predictions against the test labels.
for label, scorer in (("Precision: ", precision_score),
                      ("Recall: ", recall_score),
                      ("F measure: ", f1_score)):
    print(label + str(scorer(test_tags, predicted_svm, average='macro')))

Precision: 0.7894495762453508
Recall: 0.7408789846044828
F measure: 0.7390533276050665


#### CROSS VALIDATION


In [11]:
# Full verb-only dataset for cross-validation (';'-separated, header row present).
data = pd.read_csv("../Ressources/data_verbs_only/all_data_with_verbs.csv", 
                   encoding="ISO-8859-1", sep=";", header=0)

# Labels are read without a header row; keep only the first column so the target is a
# 1-D Series rather than a one-column DataFrame, as scikit-learn expects.
tags = pd.read_csv("../Ressources/data_verbs_only/all_tags.csv", 
                   encoding="ISO-8859-1", sep=";", header=None).iloc[:, 0]

In [12]:
# Comment out the next line to keep raw occurrence counts instead of TF-IDF scores.
# NOTE(review): `data` is overwritten with its own transform, so re-running this cell
# on its own applies the TF-IDF weighting to already-weighted values.
data = TfidfTransformer().fit_transform(data).toarray()

In [13]:
# Build the SVM model for the verb-only data and evaluate it with 5-fold cross-validation.
clf = svm.SVC(kernel='linear', C=1)
# Macro-averaged scorers; their mean CV scores are printed in this order.
scoring = {'f1_macro': 'f1_macro',
           'recall_macro': 'recall_macro',
           'precision_macro': 'precision_macro'}
# A single cross_validate call fits each fold once and evaluates every scorer on it,
# instead of refitting the same folds once per metric.
cv_results = cross_validate(clf, data, tags, cv=5, scoring=scoring)
for metric in scoring:
    # cross_validate stores per-fold test scores under 'test_<scorer name>'.
    scores = cv_results['test_' + metric]
    print(np.mean(scores))

0.7690194942453921
0.7780153180153182
0.7872809942754716


### DATASET AVEC VERBES D'ACTION

In [14]:
# Action-verb-only dataset. The training matrix has a header row; the other three
# files are read without one. (These frames are fed straight to TfidfTransformer in the
# next cell, so they are presumably term-count matrices - confirm against the data files.)
train_text = pd.read_csv("../Ressources/data_action_verbs_only/training_matrix_with_action_verbs.csv", 
                   encoding="ISO-8859-1", sep=",", header=0)
# Keep the labels as a 1-D Series (first column) rather than a one-column DataFrame:
# scikit-learn estimators and metrics expect a 1-D target.
train_tags = pd.read_csv("../Ressources/data_action_verbs_only/training_tags_with_action_verbs.csv", 
                         encoding="ISO-8859-1", sep=",", header = None).iloc[:, 0]
test_text = pd.read_csv("../Ressources/data_action_verbs_only/testing_text_with_action_verbs.csv", 
                   encoding="ISO-8859-1", sep=",", header=None)
test_tags = pd.read_csv("../Ressources/data_action_verbs_only/testing_tags_with_action_verbs.csv", 
                   encoding="ISO-8859-1", sep=",", header=None).iloc[:, 0]

In [15]:
# Build the SVM model for the action-verb-only data (inputs are already numeric
# matrices, so the pipeline has no vectorizer step).
pipeline_steps = [
    # Comment out the next line to train on raw occurrence counts instead of TF-IDF scores.
    ('tfidf', TfidfTransformer()),
    ('clf', svm.SVC(C=1, kernel='linear')),
]
text_clf_svm = Pipeline(pipeline_steps)
text_clf_svm.fit(train_text, train_tags)
predicted_svm = text_clf_svm.predict(test_text)

# Macro-averaged scores of the predictions against the test labels.
for label, scorer in (("Precision: ", precision_score),
                      ("Recall: ", recall_score),
                      ("F measure: ", f1_score)):
    print(label + str(scorer(test_tags, predicted_svm, average='macro')))

Precision: 0.7178591305850834
Recall: 0.491617727727171
F measure: 0.5116184982675399


#### CROSS VALIDATION

In [16]:
# Full action-verb-only dataset for cross-validation (','-separated, header row present).
data = pd.read_csv("../Ressources/data_action_verbs_only/all_data_with_action_verbs.csv", 
                   encoding="ISO-8859-1", sep=",", header=0)

# Keep only the first column so the target is a 1-D Series rather than a one-column
# DataFrame, as scikit-learn expects.
tags = pd.read_csv("../Ressources/data_action_verbs_only/all_tags.csv", 
                   encoding="ISO-8859-1", sep=",", header=0).iloc[:, 0]

# Comment out the next line to keep raw occurrence counts instead of TF-IDF scores.
# NOTE(review): IDF weights are fitted on the whole dataset, before the cross-validation
# folds are formed in the following cell - confirm this is intended.
data = TfidfTransformer().fit_transform(data).toarray()

In [17]:
# Build the SVM model for the action-verb-only data and evaluate it with 5-fold cross-validation.
clf = svm.SVC(kernel='linear', C=1)
# Macro-averaged scorers; their mean CV scores are printed in this order.
scoring = {'f1_macro': 'f1_macro',
           'recall_macro': 'recall_macro',
           'precision_macro': 'precision_macro'}
# A single cross_validate call fits each fold once and evaluates every scorer on it,
# instead of refitting the same folds once per metric.
cv_results = cross_validate(clf, data, tags, cv=5, scoring=scoring)
for metric in scoring:
    # cross_validate stores per-fold test scores under 'test_<scorer name>'.
    scores = cv_results['test_' + metric]
    print(np.mean(scores))

0.5063782415736953
0.48783348676301397
0.7602583097208973
