Importing libraries and modules.

In [37]:
import pandas as pd
import numpy as np

#--------------------- SKLEARN -----------------------------------

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline



#--------------------- OTHER -----------------------------------


from xgboost import XGBClassifier
from nltk.corpus import stopwords

# ------------------ Files --------------------------------
from functions import *
from LightningModel import *

Importing the data set and basic formatting. Then splitting the data into a training and testing set.

In [38]:
url = "https://raw.githubusercontent.com/Makxxx/data_dmmlProject/main/training_data.csv"
data = pd.read_csv(url, index_col=0, dtype=str)
data.sentence = data.sentence.apply(lambda s: s.replace("'", ' '))
labels = data.difficulty.unique()
num_labels = len(labels)
data.difficulty = LabelEncoder().fit_transform(data.difficulty)
#-------------------------------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(data.sentence, data.difficulty, test_size=0.2)

Creating pipelines and evaluating the models

In [39]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

def create_and_train_pipeline(classifier, X_train, y_train):
    pipe = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier()),
    ])

    pipe.fit(X_train, y_train)
    return pipe


def evaluate_clf(clf, X_test, y_test, labels, full=False):
    predicted = clf.predict(X_test)
    if full:
        return classification_report(y_test, predicted, target_names=labels)
    else:
        return np.mean(y_test == predicted)

In [42]:
lr_clf = create_and_train_pipeline(LogisticRegression, X_train, y_train)
knn_clf = create_and_train_pipeline(KNeighborsClassifier, X_train, y_train)
dt_clf = create_and_train_pipeline(DecisionTreeClassifier, X_train, y_train)
rf_clf = create_and_train_pipeline(RandomForestClassifier, X_train, y_train)
xgb_clf = create_and_train_pipeline(XGBClassifier, X_train, y_train)

print("logistic", evaluate_clf(lr_clf, X_test, y_test, labels, full=True))
print("knn", evaluate_clf(knn_clf, X_test, y_test, labels))
print("dt", evaluate_clf(dt_clf, X_test, y_test, labels))
print("rf", evaluate_clf(rf_clf, X_test, y_test, labels))
print("xgb", evaluate_clf(xgb_clf, X_test, y_test, labels))

logistic               precision    recall  f1-score   support

          C1       0.59      0.62      0.61       168
          A1       0.44      0.42      0.43       160
          B1       0.38      0.37      0.38       160
          A2       0.51      0.36      0.42       167
          B2       0.35      0.43      0.39       134
          C2       0.48      0.56      0.52       171

    accuracy                           0.46       960
   macro avg       0.46      0.46      0.46       960
weighted avg       0.46      0.46      0.46       960

knn 0.31875
dt 0.3145833333333333
rf 0.4114583333333333
xgb 0.42604166666666665


# Camambert

In [41]:
#from transformers import AutoTokenizer
#
#tokenizer = AutoTokenizer.from_pretrained('camembert-base')


