import packages

In [1]:
import wheel
import fasttext
import pandas as pd
import numpy as np

import data

In [2]:
df = pd.read_csv('../data/training_data.csv')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
labelencoder = LabelEncoder()
df['le_difficulty'] = labelencoder.fit_transform(df['difficulty'])
df.head(5)

X = df["sentence"]
y = df["le_difficulty"]
# best test size for linera
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.02416669999999996, random_state=5)

In [4]:
def evaluate(true, pred):
    precision = precision_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")
    print(f"CONFUSION MATRIX:\n{confusion_matrix(true, pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(true, pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n\tPrecision: {precision:.4f}\n\tRecall: {recall:.4f}\n\tF1_Score: {f1:.4f}")

Cleaners

In [5]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#nltk.download('punkt')

# Fonction pour enlever les mots inutiles (stop words) d'une phrase
def remove_stop_words(sentence):
    # Tokeniser la phrase et enlever les stop words
    words = nltk.word_tokenize(sentence)
    filtered_words = [word for word in words if word not in stopwords.words('french')]

    # Rejoindre les mots filtrés en une phrase
    return ' '.join(filtered_words)



# Fonction pour lemmatiser les mots d'une phrase
def lemmatize(sentence):
    # Tokeniser la phrase et lemmatiser les mots
    words = nltk.word_tokenize(sentence)
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word) for word in words]

    # Rejoindre les lemmas en une phrase
    return ' '.join(lemmas)

def data_cleaner(sms):

    # Define stopwords
    stop_words = stopwords.words('french')

    # Define tokenizer and stemmer
    from nltk.tokenize import word_tokenize
    from nltk.stem import PorterStemmer

    # Remove digits
    sms = re.sub(r"\d+","", sms)

    # Lowercase
    sms = sms.lower()

    # Remove punctuation
    sms = str(re.sub(r"[^\w\s\d]","", sms))

    # Remove stop words
    sms = sms.split()
    sms = " ".join([word for word in sms if not word in stop_words])

    # Tokenize
    sms = word_tokenize(sms)

    # Stemming
    ps = PorterStemmer()
    sms = [ps.stem(word) for word in sms]

    return str(sms)

def empty(sentence):

    return sentence

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
# Combining the three models into an ensemble
from sklearn.ensemble import VotingClassifier







In [7]:
# Définir les méthodes de vectorisation, de classification et de nettoyage de données à tester
#vectorizers = [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()]
#classifiers = [LogisticRegression(max_iter=1000,solver="lbfgs",random_state=50), KNeighborsClassifier(n_neighbors=5),SVC()]
#cleaners = [data_cleaner,remove_stop_words, lemmatize]

vectorizers = [TfidfVectorizer()]
classifiers = [LinearSVC()]
cleaners = [empty]

In [8]:
estimators = []
for clf in classifiers:
    estimators.append((clf.__class__.__name__,clf))

#classifiers.append((VotingClassifier(estimators=estimators, voting='hard')))

In [9]:
def otpimize_test_size(X,y,method,parameter_grid):
    results = []

    #print("iterations : "+str(parameter_grid["test_size"]))
    for test_size in parameter_grid["test_size"]:
        X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=test_size, random_state=0)

        tfidf = TfidfVectorizer()
        X_train = tfidf.fit_transform(X_train)
        X_test = tfidf.transform(X_test)

        print(X_train.shape)
        print(y_train.shape)

        method.fit(X_train,y_train)

        y_pred = method.predict(X_test)

        score = accuracy_score(y_test,y_pred)


        results.append({
            "test_size":test_size,
            "score":score,
        })

    results.sort(key=lambda x: x["score"],reverse=True)

    best_params = {"test_size":results[0]["test_size"]}
    best_score = results[0]['score']

    return best_params,best_score

def otpimize_randomstate(X,y,method,parameter_grid,test_size):
    results = []

    #print("iterations : "+str(parameter_grid["test_size"]))
    for random_state in parameter_grid["random_state"]:
        X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=test_size, random_state=random_state)

        tfidf = TfidfVectorizer()
        X_train = tfidf.fit_transform(X_train)
        X_test = tfidf.transform(X_test)

        method.fit(X_train,y_train)

        y_pred = method.predict(X_test)

        score = accuracy_score(y_test,y_pred)


        results.append({
            "random_state":random_state,
            "score":score,
        })

    results.sort(key=lambda x: x["score"],reverse=True)

    best_params = {"random_state":results[0]["random_state"]}
    best_score = results[0]['score']

    return best_params,best_score

SEARCH BEST TEST SIZE PARAM

In [10]:
grid = {"test_size": np.arange(0.081,0.082,0.0001)}

#best param
print(X.shape)
print(y.shape)
best_param, best_score = otpimize_test_size(X,y,classifiers[0],grid)
print("best param ", best_param["test_size"])
print("best score ", best_score)

(4800,)
(4800,)
(4411, 13856)
(4411,)
(4410, 13849)
(4410,)
(4410, 13849)
(4410,)
(4409, 13846)
(4409,)
(4409, 13846)
(4409,)
(4408, 13846)
(4408,)
(4408, 13846)
(4408,)
(4407, 13845)
(4407,)
(4407, 13845)
(4407,)
(4406, 13845)
(4406,)
(4406, 13845)
(4406,)
best param  0.081
best score  0.5141388174807198


In [76]:
grid = {"test_size": np.arange(0.03,0.035,0.0001)}

#best param 0.03291999999999988

best_param, best_score = otpimize_test_size(X,y,MultinomialNB(),grid)
print("best param ", best_param["test_size"])
print("best score ", best_score)


KeyboardInterrupt: 

In [29]:
grid = {"test_size": np.arange(0.024166,0.024267,0.0000001)}

# best param 0.02416669999999996

best_param, best_score, best_precision = otpimize_test_size(X,y,LinearSVC(),grid)
print("best param ", best_param["test_size"])
print("best score ", best_score)
print("best precision", best_precision)

KeyboardInterrupt: 

In [13]:
grid = {"random_state": np.arange(0,10)}

# best param 0.02416669999999996
print(X)



best_param, best_score = otpimize_randomstate(X,y,LinearSVC(),grid,0.02416669999999996)
print("best param ", best_param["random_state"])
print("best score ", best_score)

0       Les coûts kilométriques réels peuvent diverger...
1       Le bleu, c'est ma couleur préférée mais je n'a...
2       Le test de niveau en français est sur le site ...
3                Est-ce que ton mari est aussi de Boston?
4       Dans les écoles de commerce, dans les couloirs...
                              ...                        
4795    C'est pourquoi, il décida de remplacer les hab...
4796    Il avait une de ces pâleurs splendides qui don...
4797    Et le premier samedi de chaque mois, venez ren...
4798    Les coûts liés à la journalisation n'étant pas...
4799    Sur le sable, la mer haletait de toute la resp...
Name: sentence, Length: 4800, dtype: object
best param  5
best score  0.5982905982905983


In [42]:
grid = {"test_size": np.arange(0.01,1,0.01)}

# best param 0.023999999999999987

best_param, best_score = otpimize_test_size(X,y,LinearSVC(),grid)
print("best param ", best_param["test_size"])
print("best score ", best_score)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

In [154]:
grid = {"test_size": np.arange(0.01,1,0.001)}

# best param

best_param, best_score = otpimize_test_size(X,y,LogisticRegression(max_iter=1000,solver="lbfgs",random_state=0),grid)
print("best param ", best_param["test_size"])
print("best score ", best_score)

iterations : [0.01  0.011 0.012 0.013 0.014 0.015 0.016 0.017 0.018 0.019 0.02  0.021
 0.022 0.023 0.024 0.025 0.026 0.027 0.028 0.029 0.03  0.031 0.032 0.033
 0.034 0.035 0.036 0.037 0.038 0.039 0.04  0.041 0.042 0.043 0.044 0.045
 0.046 0.047 0.048 0.049 0.05  0.051 0.052 0.053 0.054 0.055 0.056 0.057
 0.058 0.059 0.06  0.061 0.062 0.063 0.064 0.065 0.066 0.067 0.068 0.069
 0.07  0.071 0.072 0.073 0.074 0.075 0.076 0.077 0.078 0.079 0.08  0.081
 0.082 0.083 0.084 0.085 0.086 0.087 0.088 0.089 0.09  0.091 0.092 0.093
 0.094 0.095 0.096 0.097 0.098 0.099 0.1   0.101 0.102 0.103 0.104 0.105
 0.106 0.107 0.108 0.109 0.11  0.111 0.112 0.113 0.114 0.115 0.116 0.117
 0.118 0.119 0.12  0.121 0.122 0.123 0.124 0.125 0.126 0.127 0.128 0.129
 0.13  0.131 0.132 0.133 0.134 0.135 0.136 0.137 0.138 0.139 0.14  0.141
 0.142 0.143 0.144 0.145 0.146 0.147 0.148 0.149 0.15  0.151 0.152 0.153
 0.154 0.155 0.156 0.157 0.158 0.159 0.16  0.161 0.162 0.163 0.164 0.165
 0.166 0.167 0.168 0.169 0.17  0.171 0

In [19]:
# Initialiser une liste pour stocker les résultats de chaque combinaison de méthodes
results = []



max_algo_iter = len(vectorizers)*len(classifiers)*len(cleaners)
current_iter = 0

# Pour chaque combinaison de méthodes de vectorisation, de classification et de nettoyage de données
for vec in vectorizers:
    for clf in classifiers:
        for cleaner in cleaners:
            current_iter += 1
            print("===")
            print("iter number "+str(current_iter)+" sur "+str(max_algo_iter))
            print('vectorizer: '+ vec.__class__.__name__+ "\n"+
            'classifier: '+ clf.__class__.__name__+ "\n"+
            'cleaner: '+ cleaner.__name__)
            print("---")
            # Appliquer la méthode de nettoyage de données aux données d'entraînement et de test
            X_train_cleaned = [cleaner(sentence) for sentence in X_train]
            X_test_cleaned = [cleaner(sentence) for sentence in X_test]

            # Convertir les phrases en vecteurs en utilisant la méthode de vectorisation
            X_train_vec = vec.fit_transform(X_train_cleaned)
            X_test_vec = vec.transform(X_test_cleaned)

            # Entraîner un modèle de classification sur les données d'entraînement
            clf.fit(X_train_vec, y_train)

            # Prédire les étiquettes pour les données de test
            y_pred = clf.predict(X_test_vec)

            # Calculer la précision du modèle sur les données de test
            accuracy = accuracy_score(y_test, y_pred)

            precision = precision_score(y_test,y_pred,average="macro")

            # Ajouter les résultats à la liste de résultats
            results.append({
                'vectorizer': vec.__class__.__name__,
                'classifier': clf.__class__.__name__,
                'cleaner': cleaner.__name__,
                'accuracy': accuracy,
                "precision": precision
            })

# Trier les résultats par précision décroissante
results.sort(key=lambda x: x['accuracy'], reverse=True)

# Afficher les meilleurs résultats
print("Best results:")
for result in results[:5]:
    print("Vectorizer:", result['vectorizer'])
    print("Classifier:", result['classifier'])
    print("Cleaner:", result['cleaner'])
    print("Accuracy:", result['accuracy'])
    print()

===
iter number 1 sur 1
vectorizer: TfidfVectorizer
classifier: LinearSVC
cleaner: empty
---
Best results:
Vectorizer: TfidfVectorizer
Classifier: LinearSVC
Cleaner: empty
Accuracy: 0.5982905982905983



GridSearchs

In [14]:
tfidf = TfidfVectorizer()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


In [140]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'alpha': np.arange(0.591,0.593,0.0001),
    "fit_prior":[True,False]
}



grid_search = GridSearchCV(MultinomialNB(),param_grid)

grid_search.fit(X_train_vec, y_train)

print("best params: ",grid_search.best_params_)
print("pécision :", grid_search.best_score_)
print("pécision avec test :", grid_search.score(X_test_vec,y_test))

best params:  {'alpha': 0.5912, 'fit_prior': True}
pécision : 0.4419270350024127
pécision avec test : 0.5157232704402516


Accuracy: 0.564%
ACCURACY SCORE:
0.5641


AttributeError: 'LogisticRegression' object has no attribute '_class'

KeyboardInterrupt: 

FASTTEXT

In [1]:

linSVC = LinearSVC()
linSVC.fit(X_train_vec,y_train)


y_pred = linSVC.predict(X_train_vec)

print("precision: ",accuracy_score(y_test,y_pred))

NameError: name 'LinearSVC' is not defined

Return to CSV

In [163]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LinearSVC())
])

# Définition de la grille de recherche des hyperparamètres
param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vectorizer__min_df': [2, 3, 4],
    'vectorizer__max_df': [0.9, 0.95, 1.0],
    'vectorizer__max_features': [1000, 2000, 3000],
    'classifier__C': [0.1, 1, 10]
}

# Création de l'objet GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Entraînement du modèle sur les données d'entraînement
grid_search.fit(X_train, y_train)

# Prédiction des étiquettes pour les données de test
predictions = grid_search.predict(X_test)

# Calcul de la précision du modèle
accuracy = accuracy_score(y_test, predictions)
print("Précision:", accuracy)

# Affichage des meilleurs hyperparamètres trouvés par la grille de recherche
print("Meilleurs hyperparamètres:", grid_search.best_params_)

Précision: 0.45
Meilleurs hyperparamètres: {'classifier__C': 0.1, 'vectorizer__max_df': 0.9, 'vectorizer__max_features': 3000, 'vectorizer__min_df': 2, 'vectorizer__ngram_range': (1, 1)}


In [21]:



unlabelled_test_data = pd.read_csv("../data/unlabelled_test_data.csv")
unlabelled_test_data_tfidf = tfidf.transform(unlabelled_test_data["sentence"])
linSVC = LinearSVC().fit(X_train_vec,y_train)
y_pred = linSVC.predict(unlabelled_test_data_tfidf)



unlabelled_test_data["difficulty"] = y_pred

unlabelled_test_data["difficulty"] = labelencoder.inverse_transform(y_pred)

unlabelled_test_data.drop(columns=["sentence"],inplace=True)

unlabelled_test_data.to_csv("../data/submit9.csv",index=False)



ValueError: could not convert string to float: 'Nous dûmes nous excuser des propos que nous eûmes prononcés'