In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
import pandas as pd
from itertools import product
from sklearn.metrics import classification_report
import warnings
from collections import defaultdict
# Silence every Python warning for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn solver/convergence
# warnings raised during the model fits below — confirm that is intended.
warnings.filterwarnings('ignore')


In [5]:
import os

# Locate the CELVA feature file relative to the parent of the working directory.
notebook_dir = os.path.abspath("")
base_dir = os.path.dirname(notebook_dir)
celva_data_folder = os.path.join(base_dir, "datasets", "CELVA")
celva_dataset_fp = os.path.join(celva_data_folder, "features_celva.csv")


def idx_to_class_(v):
    """Return the CEFR label for class index ``v``, or ``None`` if unmapped.

    NOTE(review): indices 4 and 5 both map to "C1". ``label_to_idx_`` also
    folds "C2" into index 4, so this looks like a deliberate C1/C2 merge —
    confirm that 5 was not meant to be "C2".
    """
    labels_by_index = dict(enumerate(["A1", "A2", "B1", "B2", "C1", "C1"]))
    return labels_by_index.get(v, None)


def label_to_idx_(v):
    """Return the class index for CEFR label ``v``, or ``None`` if unmapped.

    "C1" and "C2" share index 4.
    """
    index_by_label = {"A1": 0, "A2": 1, "B1": 2, "B2": 3}
    index_by_label.update(dict.fromkeys(("C1", "C2"), 4))
    return index_by_label.get(v, None)

In [6]:
import os

# Locate the experiment data folder relative to the parent of the working directory.
notebook_dir = os.path.abspath("")
base_dir = os.path.dirname(notebook_dir)
experiment_data_folder = os.path.join(
    base_dir, "datasets", "NLP4CALL_2025_experiment", "experiments_data"
)


def _experiment_path(filename):
    """Join ``filename`` onto ``experiment_data_folder``."""
    return os.path.join(experiment_data_folder, filename)


# Paths of the EFCAMDAT 100k CSV files inside the experiment folder.
efcamdat_100k_fp = _experiment_path("efcamdat_100k_with_text_and_measures.csv")
efcamdat_100k_train_fp = _experiment_path("efcamdat_train_id.csv")
efcamdat_100k_test_fp = _experiment_path("efcamdat_test_id.csv")
efcamdat_100k_train_fe_fp = _experiment_path("andrew100ktrain_df_fe.csv")
efcamdat_100k_test_fe_fp = _experiment_path("andrew100ktest_df_fe.csv")

# Experiment configuration: text column name, per-dataset column mappings and
# the output path for the training features.
# NOTE(review): the "remainder_efcamdat" mapping holds placeholder values
# ("" and "?") — confirm whether it still needs to be filled in.
config = dict(
    text_column="text",
    column_mapping=dict(
        remainder_efcamdat=dict(text="", CEFR="?"),
        andrew100k=dict(text="text", CEFR="cefr_level"),
    ),
    train_output_fp=_experiment_path("andrew100k-train-fe.csv"),
)

In [7]:
# Read the CSV at efcamdat_100k_train_fe_fp (andrew100ktrain_df_fe.csv) into `df`.
# NOTE(review): the frame displayed right after this cell has a `cefr_level`
# column, whereas later cells use `df` with a "CEFR" column and list a
# different column set — `df` is presumably reassigned in a cell that is not
# part of this notebook any more; confirm before relying on Restart & Run All.
df=pd.read_csv(efcamdat_100k_train_fe_fp)

In [9]:
# Display the loaded frame (the recorded output is a truncated rows/columns preview).
df

Unnamed: 0,writing_id,cefr_level,measures.collocations.text_level.ratio_num_token,measures.collocations.text_level.ttr,measures.counts.acl,measures.counts.acl_ratio,measures.counts.acl:relcl,measures.counts.acl:relcl_ratio,measures.counts.ADJ,measures.counts.ADJ_ratio,...,measures.taassc.L2SCA.T_S,measures.taassc.L2SCA.VP_T,text,l1,cleaned_text,sentences,n_sentences,tokens_per_sentence,total_n_tokens,avg_n_tokens_per_sentence
0,115499,b1,0.108696,1.000000,2,0.021739,1,0.010870,3,0.032609,...,1.000000,1.500000,grandmas home remedies and recipes. Do you hav...,German,grandmas home remedies and recipes. Do you hav...,"['grandmas home remedies and recipes.', 'Do yo...",8,"[['grandmas', 'home', 'remedies', 'and', 'reci...",92,11.500000
1,1081381,a1,0.111111,1.000000,0,0.000000,0,0.000000,6,0.166667,...,3.000000,1.000000,My friend is very nice.She comes from Italy.Sh...,Italian,My friend is very nice.She comes from Italy.Sh...,['My friend is very nice.She comes from Italy....,2,"[['My', 'friend', 'is', 'very', 'nice.She', 'c...",32,16.000000
2,452770,b1,0.176000,1.000000,2,0.016000,1,0.008000,5,0.040000,...,9.000000,1.777778,"First, I will study a lot and finish my degree...",Portuguese,"First, I will study a lot and finish my degree...","['First, I will study a lot and finish my degr...",1,"[['First', ',', 'I', 'will', 'study', 'a', 'lo...",125,125.000000
3,412035,a1,0.050000,1.000000,0,0.000000,0,0.000000,2,0.050000,...,0.666667,1.500000,"Hy, my name's Andr. I have thirty one years ol...",Portuguese,"Hy, my name's Andr. I have thirty one years ol...","[""Hy, my name's Andr."", 'I have thirty one yea...",6,"[['Hy', ',', 'my', 'name', ""'s"", 'Andr', '.'],...",40,6.666667
4,132380,b1,0.061224,1.000000,0,0.000000,1,0.010204,6,0.061224,...,0.818182,1.777778,bello! I glad to congratulate you with the bes...,Russian,bello! I glad to congratulate you with the bes...,"['bello!', 'I glad to congratulate you with th...",11,"[['bello', '!'], ['I', 'glad', 'to', 'congratu...",95,8.636364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79993,1136689,b1,0.135922,0.857143,0,0.000000,0,0.000000,8,0.077670,...,1.500000,1.416667,"Dear Ali, I hope this letter finds you in a go...",Arabic,"Dear Ali, I hope this letter finds you in a go...","['Dear Ali, I hope this letter finds you in a ...",8,"[['Dear', 'Ali', ',', 'I', 'hope', 'this', 'le...",103,12.875000
79994,797042,a1,0.000000,0.000000,0,0.000000,1,0.027027,3,0.081081,...,0.666667,1.000000,My piano and Me I am not re ally into my piano...,Arabic,My piano and Me I am not re ally into my piano...,['My piano and Me I am not re ally into my pia...,3,"[['My', 'piano', 'and', 'Me', 'I', 'am', 'not'...",36,12.000000
79995,628121,a2,0.074074,1.000000,2,0.024691,1,0.012346,4,0.049383,...,1.333333,1.500000,"Dear Louis, I''m very happy with the new that ...",Portuguese,"Dear Louis, I''m very happy with the new that ...","[""Dear Louis, I''m very happy with the new tha...",3,"[['Dear', 'Louis', ',', 'I', ""''"", 'm', 'very'...",86,28.666667
79996,211344,a2,0.064516,1.000000,0,0.000000,1,0.010753,9,0.096774,...,2.200000,1.181818,"Yes , today I have a bad day,have a bad taste ...",Mandarin,"Yes , today I have a bad day,have a bad taste ...","['Yes , today I have a bad day,have a bad tast...",5,"[['Yes', ',', 'today', 'I', 'have', 'a', 'bad'...",91,18.200000


In [137]:

# Classifiers to evaluate, keyed by a human-readable label.
# Only the L2-penalised logistic regression below is active; a one-vs-rest
# variant (OneVsRestClassifier wrapping the same LogisticRegression settings)
# was sketched here previously and is currently disabled.
classifiers = {}
classifiers["L2 logistic (Multinomial)"] = LogisticRegression(
    max_iter=10000,
    solver="saga",
    penalty="l2",
    C=1,
)

In [141]:
# Feature sets to compare, keyed by a short label; each value is the list of
# `df` columns used as model inputs.
features = {}
features["n_sentences"] = ["n_sentences"]
features["n-sent+n-tokens"] = ["n_sentences", "total_n_tokens"]
features["avg-sent+n-sent+n-tokens"] = [
    "n_sentences",
    "total_n_tokens",
    "avg_n_tokens_per_sentence",
]

In [142]:
# List the columns of the frame currently bound to `df`.
# NOTE(review): the recorded output lists columns such as 'CECRL', 'CEFR' and
# 'Texte_etudiant', which do not appear in the frame displayed after the
# earlier read_csv cell — `df` was presumably rebound to another dataset in a
# cell that is no longer present; confirm.
df.columns

Index(['pseudo', 'Voc_range', 'CECRL', 'nb_annees_L2', 'L1',
       'Domaine_de_specialite', 'Sejours_duree_semaines', 'Sejours_frequence',
       'Lang_exposition', 'L2', 'Note_dialang_ecrit', 'Lecture_regularite',
       'autre_langue', 'tache_ecrit', 'Texte_etudiant', 'Date_ajout',
       'Section_renforcee', 'CEFR', 'cleaned_text', 'sentences', 'n_sentences',
       'tokens_per_sentence', 'total_n_tokens', 'avg_n_tokens_per_sentence'],
      dtype='object')

In [152]:
# Repeated 2-fold cross-validation of every (classifier, feature set) pair.
#
# For each epoch the rows of `df` are reshuffled, split into two folds, and
# each model is fitted on one fold and scored on the other. Results go into
# `cv_results[model_idx]`:
#   "accuracies": run key -> accuracy
#   "results":    run key -> full classification_report dict
# where the run key is str(epoch_idx) + str(cv_idx).
target_column = "CEFR"
target_names = ['A1', 'A2', 'B1', 'B2', 'C1']
n_epochs = 100
base_seed = 0  # shuffle seed for epoch i is base_seed + i, so re-runs reproduce the splits

# Fail fast with an explicit message if `df` is not the frame this cell
# expects, instead of an opaque KeyError deep inside the loop.
required_columns = {target_column}.union(*features.values())
missing_columns = sorted(required_columns.difference(df.columns))
if missing_columns:
    raise KeyError(
        f"`df` is missing required column(s) {missing_columns}; "
        "make sure the intended dataset is loaded into `df` before running this cell"
    )

cv_results = defaultdict(lambda: defaultdict(dict))
# Unshuffled KFold is fine here: the rows are reshuffled at every epoch.
kf = KFold(n_splits=2)
for epoch_idx in range(n_epochs):
    # Seeded shuffle: different order per epoch, identical across notebook runs.
    # NOTE(review): the saga solver has its own randomness, controlled by the
    # classifier's `random_state`; set it on the classifier for fully
    # deterministic fits.
    dataset = df.sample(frac=1, random_state=base_seed + epoch_idx)
    for model, feature_list in product(classifiers.values(), features.values()):
        model_idx = str(model)+str(feature_list)
        for cv_idx,(train_idx, test_idx) in enumerate(kf.split(dataset)):
            train_rows = dataset.iloc[train_idx]
            test_rows = dataset.iloc[test_idx]
            X_train, y_train = train_rows[feature_list], train_rows[target_column]
            X_test, y_test = test_rows[feature_list], test_rows[target_column]
            # fit() is called again on the same estimator instance for every fold.
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # NOTE(review): `target_names` is passed without `labels`, so this
            # presumably relies on every fold containing all five classes — confirm.
            results_dict = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)
            # Unambiguous only while cv_idx is a single digit (n_splits <= 10).
            run_key = str(epoch_idx)+str(cv_idx)
            cv_results[model_idx]["accuracies"][run_key] = results_dict["accuracy"]
            cv_results[model_idx]["results"][run_key] = results_dict
            print(model_idx, results_dict["accuracy"])

        

LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences'] 0.372848948374761
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences'] 0.367816091954023
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens'] 0.34990439770554493
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens'] 0.36015325670498083
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens', 'avg_n_tokens_per_sentence'] 0.37858508604206503
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens', 'avg_n_tokens_per_sentence'] 0.40804597701149425
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences'] 0.3881453154875717
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences'] 0.36590038314176243
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens'] 0.35946462715105165
LogisticRegression(C=1, max_iter=10000, solver=

In [153]:
# Show the "L1" column for the rows at positions 2, 4 and 10.
df[["L1"]].iloc[[2, 4, 10]]

Unnamed: 0,L1
2,French
4,French
10,French


In [154]:
# Print the mean accuracy over all recorded runs for each model/feature-set key.
for strat, data in cv_results.items():
    fold_accuracies = list(data["accuracies"].values())
    mean_accuracy = sum(fold_accuracies) / len(fold_accuracies)
    print(strat, mean_accuracy)

LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences'] 0.37244137491483703
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens'] 0.3618688417104386
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens', 'avg_n_tokens_per_sentence'] 0.38900617568844637
