In [135]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
import pandas as pd
from itertools import product
from sklearn.metrics import classification_report
import warnings
from collections import defaultdict
warnings.filterwarnings('ignore')


In [118]:
import os
notebook_dir = os.path.abspath("")
base_dir = os.path.dirname(notebook_dir)
celva_data_folder = os.path.join(base_dir,"datasets", "CELVA")
celva_dataset_fp = os.path.join(celva_data_folder, "features_celva.csv")
idx_to_class_ = lambda v: {
         0: "A1",
         1: "A2",
         2: "B1",
         3: "B2",
         4: "C1",
         5: "C1",
}.get(v, None)
label_to_idx_ = lambda v: {
         "A1": 0,
         "A2": 1,
         "B1": 2,
         "B2": 3,
         "C1": 4,
         "C2": 4,
}.get(v, None)

In [119]:
df=pd.read_csv(celva_dataset_fp)

In [120]:
df

Unnamed: 0,pseudo,Voc_range,CECRL,nb_annees_L2,L1,Domaine_de_specialite,Sejours_duree_semaines,Sejours_frequence,Lang_exposition,L2,...,Texte_etudiant,Date_ajout,Section_renforcee,CEFR,cleaned_text,sentences,n_sentences,tokens_per_sentence,total_n_tokens,avg_n_tokens_per_sentence
0,030928d2a04fd0035312d8a75a2403acfe29ac41f07cdf...,B1,B1,11.0,French,Sciences et proprietes de la matiere,6.0,5,0.0,Anglais,...,being in a earth sciences domain at the beginn...,2019-03-22 11:14:00,0,2,being in a earth sciences domain at the beginn...,['being in a earth sciences domain at the begi...,15,"[['being', 'in', 'a', 'earth', 'sciences', 'do...",350,23.333333
1,09eec567ab0b705caf2353e10849bcf3579749d8e9ac84...,A2,A2,7.0,French,Information Communication,1.0,1,3.0,Anglais,...,alex dupont had an important role during ww2. ...,2022-09-14 14:51:00,0,1,alex dupont had an important role during ww2. ...,['alex dupont had an important role during ww2...,15,"[['alex', 'dupont', 'had', 'an', 'important', ...",279,18.600000
2,1296d11065e5441e2a97223619841ee5a204979254c0a6...,B1,B1,10.0,French,Medecine,2.0,1,2.0,Anglais,...,in the 20th century alex dupont discovered pen...,2018-10-24 10:21:00,1,2,in the 20th century alex dupont discovered pen...,['in the 20th century alex dupont discovered p...,20,"[['in', 'the', '20th', 'century', 'alex', 'dup...",249,12.450000
3,1f2dd72bc867a07a8a11579ddd2345b3d9851622289c5c...,B2,C1,12.0,French,Information Communication,0.0,0,10.0,Anglais,...,b b at the beginning phones were only used to ...,2022-02-21 15:19:00,1,4,b b at the beginning phones were only used to ...,['b b at the beginning phones were only used t...,27,"[['b', 'b', 'at', 'the', 'beginning', 'phones'...",510,18.888889
4,2ad05299cb18cb201810933a85f3157d8e885a4490156f...,B2,B2,9.0,French,Medecine,0.0,0,2.0,Anglais,...,as we know a lot of lives are saved everyday t...,2020-03-03 12:25:00,0,3,as we know a lot of lives are saved everyday t...,['as we know a lot of lives are saved everyday...,11,"[['as', 'we', 'know', 'a', 'lot', 'of', 'lives...",226,20.545455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1040,6103f5729ab058a7d71cee4245a776e3b3333be57b41fd...,C1,C1,13.0,French,Informatique et electronique,2.5,4,70.0,Anglais,...,since my childhood i have been educated with s...,2022-09-14 11:44:00,0,4,since my childhood i have been educated with s...,['since my childhood i have been educated with...,13,"[['since', 'my', 'childhood', 'i', 'have', 'be...",342,26.307692
1041,da55b42254c2c4764937c4af1544cd1f0520d75b47076a...,A1,A1,8.0,French,Sciences de la vie et de l'environnement,2.0,1,1.0,Anglais,...,experience of fibroblastes scientifics discove...,2018-11-22 10:51:00,0,0,experience of fibroblastes scientifics discove...,['experience of fibroblastes scientifics disco...,4,"[['experience', 'of', 'fibroblastes', 'scienti...",53,13.250000
1042,f468edb2f584f2bfc530953308d80e2b7c7b1e001a2e37...,B2,B2,8.0,French,Information Communication,0.0,0,0.0,Anglais,...,first of all i would like to talk about the in...,2021-01-29 10:28:00,0,3,first of all i would like to talk about the in...,['first of all i would like to talk about the ...,10,"[['first', 'of', 'all', 'i', 'would', 'like', ...",241,24.100000
1043,86aedd1988a47388c8edbfbd3a5fa26de2644f7beed390...,B1,B1,9.0,French,Sciences de l'education,0.0,0,0.0,Anglais,...,hello my name is alex dupont im 20 years old. ...,2022-09-12 14:53:00,0,2,hello my name is alex dupont im 20 years old. ...,['hello my name is alex dupont im 20 years old...,14,"[['hello', 'my', 'name', 'is', 'alex', 'dupont...",232,16.571429


In [137]:

# Create different classifiers.
classifiers = {
    "L2 logistic (Multinomial)": LogisticRegression(
        C=1, penalty="l2", solver="saga", max_iter=10000
    ),
    #"L2 logistic (OvR)": OneVsRestClassifier(
    #    LogisticRegression(C=C, penalty="l2", solver="saga", max_iter=10000)
    #)
}

In [141]:
features = {
    "n_sentences": ["n_sentences"],
    "n-sent+n-tokens": ["n_sentences","total_n_tokens"],
    "avg-sent+n-sent+n-tokens": ["n_sentences","total_n_tokens","avg_n_tokens_per_sentence"]
}

In [142]:
df.columns

Index(['pseudo', 'Voc_range', 'CECRL', 'nb_annees_L2', 'L1',
       'Domaine_de_specialite', 'Sejours_duree_semaines', 'Sejours_frequence',
       'Lang_exposition', 'L2', 'Note_dialang_ecrit', 'Lecture_regularite',
       'autre_langue', 'tache_ecrit', 'Texte_etudiant', 'Date_ajout',
       'Section_renforcee', 'CEFR', 'cleaned_text', 'sentences', 'n_sentences',
       'tokens_per_sentence', 'total_n_tokens', 'avg_n_tokens_per_sentence'],
      dtype='object')

In [152]:
target_column = "CEFR"
cv_results = defaultdict(lambda: defaultdict(dict))
for epoch_idx in range(100):
    dataset = df.sample(frac=1)
    pipes = product(classifiers.values(), features.values())
    for model, feature_list in pipes:
        columns = feature_list+[target_column]
        kf = KFold(n_splits=2)
        model_idx = str(model)+str(feature_list)
        for cv_idx,(train_idx, test_idx) in enumerate(kf.split(dataset)):
            X_train = dataset.iloc[train_idx][feature_list]
            X_test  = dataset.iloc[test_idx][feature_list]
            y_train = dataset.iloc[train_idx][target_column]
            y_test  = dataset.iloc[test_idx][target_column]
            model.fit(X_train, y_train)
            target_names = ['A1', 'A2', 'B1', 'B2', 'C1']
            y_pred = model.predict(X_test)
            results_dict = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)
            cv_results[model_idx]["accuracies"][str(epoch_idx)+str(cv_idx)] = results_dict["accuracy"]
            cv_results[model_idx]["results"][str(epoch_idx)+str(cv_idx)] = results_dict
            print(model_idx, results_dict["accuracy"])

        

LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences'] 0.372848948374761
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences'] 0.367816091954023
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens'] 0.34990439770554493
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens'] 0.36015325670498083
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens', 'avg_n_tokens_per_sentence'] 0.37858508604206503
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens', 'avg_n_tokens_per_sentence'] 0.40804597701149425
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences'] 0.3881453154875717
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences'] 0.36590038314176243
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens'] 0.35946462715105165
LogisticRegression(C=1, max_iter=10000, solver=

In [153]:
df.iloc[[2,4,10]][["L1"]]

Unnamed: 0,L1
2,French
4,French
10,French


In [154]:
for strat, data in cv_results.items():
    print(strat, sum(data["accuracies"].values())/len(data["accuracies"].values()))

LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences'] 0.37244137491483703
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens'] 0.3618688417104386
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens', 'avg_n_tokens_per_sentence'] 0.38900617568844637
