In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import os

In [2]:
nltk.download('stopwords')
stop_words = list(set(stopwords.words('french')))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/beatriz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
output_folder = "/home/beatriz/Documents/courses/mgt502/project/mgt502/outputs/plots/"
data = pd.read_csv(os.path.abspath("../available_datasets/training_data.csv"))

X = data['sentence']
y = data['difficulty']

In [4]:
from transformers import AutoTokenizer, TFFlaubertModel

import tensorflow as tf

tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_cased")

model = TFFlaubertModel.from_pretrained("flaubert/flaubert_base_cased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")

outputs = model(inputs)

last_hidden_states = outputs.last_hidden_state

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'tensorflow'

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, solver='sag'),# (solver='sag'),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

param_grid = {
    "Logistic Regression": {'C': [0.1, 1, 5, 10, 15, 20]},
    "KNN": {'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17]},
    "Decision Tree": {'max_depth': [2, 4, 6, 10, 15, 20]},
    "Random Forest": {'n_estimators': [10, 50, 100, 200, 500]}
}

vectorizer = TfidfVectorizer() # stop_words=stop_words)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

scaler = MaxAbsScaler()

X_train_scale = scaler.fit_transform(X_train_vec)
X_test_scale = scaler.transform(X_test_vec)

X_train_df_tfidf = pd.DataFrame(X_train_scale.toarray(), columns=vectorizer.get_feature_names_out())
X_test_df_tfidf = pd.DataFrame(X_test_scale.toarray(), columns=vectorizer.get_feature_names_out())

eval_dict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
best_parameters = {'model': [], 'best_parameters': []}
for name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='accuracy')
    grid_search.fit(X_train_scale, y_train)
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    
    model = grid_search.best_estimator_
    model.fit(X_train_scale, y_train)
    y_pred = model.predict(X_test_scale)

    # show confusion matrix and plot it
    print(f"Confusion matrix for {name}:")
    print(pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']))
    # plot confusion matrix
    plt.figure()
    plt.matshow(pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']), cmap='Blues')
    plt.title(name)
    plt.savefig(output_folder+name+".pdf")
    plt.close()
    

    print(f"Evaluation metrics for {name}:")
    print(classification_report(y_test, y_pred))
    eval_dict['model'].append(name)
    eval_dict['accuracy'].append(model.score(X_test_scale, y_test))
    eval_dict['precision'].append(classification_report(y_test, y_pred, output_dict=True)['weighted avg']['precision'])
    eval_dict['recall'].append(classification_report(y_test, y_pred, output_dict=True)['weighted avg']['recall'])
    eval_dict['f1'].append(classification_report(y_test, y_pred, output_dict=True)['weighted avg']['f1-score'])
    best_parameters['model'].append(name)
    best_parameters['best_parameters'].append(grid_search.best_params_)



Best parameters for Logistic Regression: {'C': 1}
Confusion matrix for Logistic Regression:
Predicted   A1  A2  B1  B2  C1  C2
Actual                            
A1         110  39   8   6   3   0
A2          50  61  33   5   5   4
B1          28  52  59   8   6  13
B2           9   6  14  69  29  26
C1           5  12  13  22  66  34
C2           5   5  14  19  30  92
Evaluation metrics for Logistic Regression:
              precision    recall  f1-score   support

          A1       0.53      0.66      0.59       166
          A2       0.35      0.39      0.37       158
          B1       0.42      0.36      0.38       166
          B2       0.53      0.45      0.49       153
          C1       0.47      0.43      0.45       152
          C2       0.54      0.56      0.55       165

    accuracy                           0.48       960
   macro avg       0.48      0.47      0.47       960
weighted avg       0.48      0.48      0.47       960

Best parameters for KNN: {'n_neighbors': 

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

In [35]:
# exclude the model column
df_eval = pd.DataFrame(eval_dict, index=eval_dict['model'])
df_eval = df_eval.T
# remove model row
df_eval = df_eval.drop('model')
df_eval

Unnamed: 0,Logistic Regression,KNN,Decision Tree,Random Forest
accuracy,0.476042,0.180208,0.290625,0.398958
precision,0.475604,0.476458,0.285611,0.399041
recall,0.476042,0.180208,0.290625,0.398958
f1,0.473248,0.088171,0.271956,0.381365


In [40]:
X_train_all = data['sentence']
y_train_all = data['difficulty']

vectorizer = TfidfVectorizer() # stop_words=stop_words)
X_train_all_vec = vectorizer.fit_transform(X_train_all)

model = LogisticRegression(C=1, max_iter=1000, solver='sag')
model.fit(X_train_all_vec, y_train_all)

# predict on unlabelled data 
data_unlabelled = pd.read_csv(os.path.abspath("../available_datasets/unlabelled_test_data.csv"))
X_unlabelled = data_unlabelled['sentence']

X_unlabelled_vec = vectorizer.transform(X_unlabelled)

scaler = MaxAbsScaler()

X_unlabelled_scale = scaler.fit_transform(X_unlabelled_vec)

X_unlabelled_df_tfidf = pd.DataFrame(X_unlabelled_scale.toarray(), columns=vectorizer.get_feature_names_out())

y_unlabelled_pred = model.predict(X_unlabelled_scale)

# create a new dataset with id from data_unlabelled and predicted difficulty. the dataset should have the columns "id" and "difficulty"
data_unlabelled['difficulty'] = y_unlabelled_pred
data_unlabelled_submit = data_unlabelled[['id', 'difficulty']]
data_unlabelled_submit.to_csv(os.path.abspath("../outputs/datasets/predicted_logistic_regression.csv"), index=False)

In [41]:
X_train_all = data['sentence']
y_train_all = data['difficulty']

vectorizer = TfidfVectorizer() # stop_words=stop_words)
X_train_all_vec = vectorizer.fit_transform(X_train_all)

model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train_all_vec, y_train_all)

# predict on unlabelled data 
data_unlabelled = pd.read_csv(os.path.abspath("../available_datasets/unlabelled_test_data.csv"))
X_unlabelled = data_unlabelled['sentence']

X_unlabelled_vec = vectorizer.transform(X_unlabelled)

scaler = MaxAbsScaler()

X_unlabelled_scale = scaler.fit_transform(X_unlabelled_vec)

X_unlabelled_df_tfidf = pd.DataFrame(X_unlabelled_scale.toarray(), columns=vectorizer.get_feature_names_out())

y_unlabelled_pred = model.predict(X_unlabelled_scale)

# create a new dataset with id from data_unlabelled and predicted difficulty. the dataset should have the columns "id" and "difficulty"
data_unlabelled['difficulty'] = y_unlabelled_pred
data_unlabelled_submit = data_unlabelled[['id', 'difficulty']]
data_unlabelled_submit.to_csv(os.path.abspath("../outputs/datasets/predicted_knn.csv"), index=False)

In [42]:
X_train_all = data['sentence']
y_train_all = data['difficulty']

vectorizer = TfidfVectorizer() # stop_words=stop_words)
X_train_all_vec = vectorizer.fit_transform(X_train_all)

model = DecisionTreeClassifier(max_depth=10)
model.fit(X_train_all_vec, y_train_all)

# predict on unlabelled data 
data_unlabelled = pd.read_csv(os.path.abspath("../available_datasets/unlabelled_test_data.csv"))
X_unlabelled = data_unlabelled['sentence']

X_unlabelled_vec = vectorizer.transform(X_unlabelled)

scaler = MaxAbsScaler()

X_unlabelled_scale = scaler.fit_transform(X_unlabelled_vec)

X_unlabelled_df_tfidf = pd.DataFrame(X_unlabelled_scale.toarray(), columns=vectorizer.get_feature_names_out())

y_unlabelled_pred = model.predict(X_unlabelled_scale)

# create a new dataset with id from data_unlabelled and predicted difficulty. the dataset should have the columns "id" and "difficulty"
data_unlabelled['difficulty'] = y_unlabelled_pred
data_unlabelled_submit = data_unlabelled[['id', 'difficulty']]
data_unlabelled_submit.to_csv(os.path.abspath("../outputs/datasets/predicted_tree.csv"), index=False)

In [43]:
X_train_all = data['sentence']
y_train_all = data['difficulty']

vectorizer = TfidfVectorizer() # stop_words=stop_words)
X_train_all_vec = vectorizer.fit_transform(X_train_all)

model = RandomForestClassifier(n_estimators=500)    
model.fit(X_train_all_vec, y_train_all)

# predict on unlabelled data 
data_unlabelled = pd.read_csv(os.path.abspath("../available_datasets/unlabelled_test_data.csv"))
X_unlabelled = data_unlabelled['sentence']

X_unlabelled_vec = vectorizer.transform(X_unlabelled)

scaler = MaxAbsScaler()

X_unlabelled_scale = scaler.fit_transform(X_unlabelled_vec)

X_unlabelled_df_tfidf = pd.DataFrame(X_unlabelled_scale.toarray(), columns=vectorizer.get_feature_names_out())

y_unlabelled_pred = model.predict(X_unlabelled_scale)

# create a new dataset with id from data_unlabelled and predicted difficulty. the dataset should have the columns "id" and "difficulty"
data_unlabelled['difficulty'] = y_unlabelled_pred
data_unlabelled_submit = data_unlabelled[['id', 'difficulty']]
data_unlabelled_submit.to_csv(os.path.abspath("../outputs/datasets/predicted_forest.csv"), index=False)