In [17]:
import os
import posixpath
import warnings
from collections import defaultdict
from itertools import product

import pandas as pd
from rclone_python import rclone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warning raised while the
# models below are being fitted — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


In [21]:
# The kernel's working directory is used as the notebook's location; the
# project root is its parent, and the experiment data lives underneath it.
notebook_dir = os.path.abspath("")
local_base_dir = os.path.split(notebook_dir)[0]
local_experiment_data_folder = os.path.join(
    local_base_dir,
    "datasets",
    "NLP4CALL_2025_experiment",
    "experiments_data",
)


def remote_expected(config):
    """
    Check whether the configuration expects remote (rclone) storage.

    A side (inputs or outputs) counts as remote when
    config["storage"][<side>]["from"] == "rclone"; a remote side must provide
    its location under the "remote_base_dir" key.

    Returns a 3-tuple:
    - A boolean indicating if remote storage is expected for inputs or outputs.
    - The remote base directory (if remote storage is expected), otherwise None.
    - A dict {"input": bool, "output": bool} with the per-side remote flags.

    Raises:
    - KeyError if the "storage" / "inputs" / "outputs" / "from" keys are missing.
    - ValueError if a remote side has no "remote_base_dir", or if both sides
      are remote but point at different base directories.
    """
    storage = config["storage"]
    input_remote = storage["inputs"]["from"] == "rclone"
    output_remote = storage["outputs"]["from"] == "rclone"

    def _required_base_dir(section):
        # Fail here with a clear message instead of letting a None base
        # directory surface later as an obscure error in a path join.
        base_dir = storage[section].get("remote_base_dir")
        if not base_dir:
            raise ValueError(
                f'config["storage"]["{section}"] uses rclone but has no "remote_base_dir".'
            )
        return base_dir

    input_dir = _required_base_dir("inputs") if input_remote else None
    output_dir = _required_base_dir("outputs") if output_remote else None

    # If both input and output are remote, ensure they have the same base directory
    if input_remote and output_remote and input_dir != output_dir:
        raise ValueError("Input and output remote base directories must be the same when both are remote.")

    # When both sides are remote the two directories are equal, so either works.
    remote_base_dir = output_dir if output_remote else input_dir

    remote_flags = {
        "input": input_remote,
        "output": output_remote,
    }
    return (input_remote or output_remote), remote_base_dir, remote_flags


def _local_storage():
    """Return a fresh storage section describing local storage."""
    return {"from": "local"}


def _rclone_storage():
    """Return a fresh storage section describing the rclone remote."""
    return {
        "from": "rclone",
        "remote_base_dir": "insight-gdrive:/phd-experimental-data",
    }


def _make_config(inputs_storage, outputs_storage):
    """
    Build one experiment configuration.

    Only the storage sections differ between scenarios; every other setting is
    the same. A new dict (with new nested containers) is created on each call,
    so the resulting configurations never share mutable state.
    """
    return {
        "storage": {
            "inputs": inputs_storage,
            "outputs": outputs_storage,
        },
        "text_column": "text",
        "column_mapping": {
            "remainder_efcamdat": {"text": "", "CEFR": "?"},
            "andrew100k": {"text": "text", "CEFR": "cefr_level"},
        },
        "perplexity_models": ["gpt2"],
        "base_columns": ['writing_id', 'cefr_level', 'text', 'l1'],
    }


# One configuration per storage scenario: <inputs>_<outputs>.
config_local_local = _make_config(_local_storage(), _local_storage())
config_local_rclone = _make_config(_local_storage(), _rclone_storage())
config_rclone_rclone = _make_config(_rclone_storage(), _rclone_storage())


# Select the active storage scenario. `config` is the same object as the
# chosen dict, so the "remote_flags" entry added below is visible through
# both names.
config = config_rclone_rclone


# Derive the remote-storage flags and record them on the configuration.
is_remoted_expected, remote_base_dir, remote_flags = remote_expected(config)
config["remote_flags"] = remote_flags

# rclone remote paths use "/" as separator on every platform, so they are
# joined with posixpath; os.path.join would insert "\" on Windows.
if is_remoted_expected:
    remote_experiment_data_folder = posixpath.join(
        remote_base_dir, "NLP4CALL_2025_experiment", "experiments_data"
    )
else:
    # Always bind the name so a re-run with a local-only config does not keep
    # a stale remote folder from an earlier execution of this cell.
    remote_experiment_data_folder = None


# Local path of the feature-engineered training split.
andrew_100k_train_fe_local_fp = os.path.join(local_experiment_data_folder, "andrew100k-train-fe.csv")

# When inputs are remote, download the file once; an existing local copy is
# reused and no transfer is attempted.
if remote_flags["input"] and not os.path.exists(andrew_100k_train_fe_local_fp):
    # rclone remote paths use "/" on every platform, hence posixpath.
    # NOTE(review): the file name appears twice, i.e. the remote object is
    # expected inside a folder that is itself named "andrew100k-train-fe.csv".
    # That differs from the local layout — confirm it matches the remote.
    andrew_100k_train_fe_remote_fp = posixpath.join(
        remote_experiment_data_folder,
        "andrew100k-train-fe.csv",
        "andrew100k-train-fe.csv",
    )
    os.makedirs(local_experiment_data_folder, exist_ok=True)
    rclone.copyto(andrew_100k_train_fe_remote_fp, andrew_100k_train_fe_local_fp)

# This notebook does not configure any output paths yet (local or remote), so
# the output flag is only validated.
if remote_flags["output"] not in (True, False):
    raise Exception("unexpected output remote flag")



Output()

In [22]:
# Display the active configuration, including the "remote_flags" entry that
# was added to it after the storage scenario was selected.
config

{'storage': {'inputs': {'from': 'rclone',
   'remote_base_dir': 'insight-gdrive:/phd-experimental-data'},
  'outputs': {'from': 'rclone',
   'remote_base_dir': 'insight-gdrive:/phd-experimental-data'}},
 'text_column': 'text',
 'column_mapping': {'remainder_efcamdat': {'text': '', 'CEFR': '?'},
  'andrew100k': {'text': 'text', 'CEFR': 'cefr_level'}},
 'perplexity_models': ['gpt2'],
 'base_columns': ['writing_id', 'cefr_level', 'text', 'l1'],
 'remote_flags': {'input': True, 'output': True}}

In [23]:
# Lookup tables between CEFR labels and integer class indices.
# "C1" and "C2" both encode to index 4, and indices 4 and 5 both decode to
# "C1", so the two top levels are merged into a single class.
# NOTE(review): presumably intentional (the cross-validation report lists five
# classes) — confirm that index 5 should not decode to "C2".
_IDX_TO_CLASS = {
    0: "A1",
    1: "A2",
    2: "B1",
    3: "B2",
    4: "C1",
    5: "C1",
}
_LABEL_TO_IDX = {
    "A1": 0,
    "A2": 1,
    "B1": 2,
    "B2": 3,
    "C1": 4,
    "C2": 4,
}


def idx_to_class_(v):
    """Return the CEFR label for class index `v`, or None if `v` is unknown."""
    return _IDX_TO_CLASS.get(v, None)


def label_to_idx_(v):
    """
    Return the class index for CEFR label `v`, or None if `v` is unknown.

    The lookup is case-sensitive: only upper-case labels such as "B1" match,
    so a lower-case value like "b1" returns None.
    """
    return _LABEL_TO_IDX.get(v, None)

In [24]:
# Load the feature-engineered training split from the local CSV path.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# usually means the CSV was written together with its index — confirm whether
# it should be read with index_col=0 or dropped.
andrew100k_train_fe_df=pd.read_csv(andrew_100k_train_fe_local_fp)

In [25]:
# Preview the first rows to sanity-check the loaded columns.
andrew100k_train_fe_df.head()

Unnamed: 0,writing_id,cefr_level,text,l1,measures.andrew.collocations.text_level.ratio_num_token,measures.andrew.collocations.text_level.ttr,measures.andrew.counts.acl,measures.andrew.counts.acl_ratio,measures.andrew.counts.ADJ,measures.andrew.counts.ADJ_ratio,...,measures.andrew.taassc.L2SCA.MLS,measures.andrew.taassc.L2SCA.MLT,measures.andrew.taassc.L2SCA.T_S,measures.andrew.taassc.L2SCA.VP_T,cleaned_text,sentences,measures.basic.n_sentences,tokens_per_sentence,measures.basic.total_n_tokens,measures.basic.avg_n_tokens_per_sentence
0,115499,b1,grandmas home remedies and recipes. do you hav...,German,0.108696,1.0,2,0.021739,3,0.032609,...,11.5,11.5,1.0,1.5,grandmas home remedies and recipes. do you hav...,"['grandmas home remedies and recipes.', 'do yo...",7,"[['grandmas', 'home', 'remedies', 'and', 'reci...",90,12.857143
1,1081381,a1,my friend is very nice.she comes from italy.sh...,Italian,0.111111,1.0,0,0.0,6,0.166667,...,16.0,5.333333,3.0,1.0,my friend is very nice.she comes from italy.sh...,['my friend is very nice.she comes from italy....,2,"[['my', 'friend', 'is', 'very', 'nice.she', 'c...",28,14.0
2,452770,b1,first i will study a lot and finish my degree ...,Portuguese,0.176,1.0,2,0.016,5,0.04,...,125.0,13.888889,9.0,1.777778,first i will study a lot and finish my degree ...,['first i will study a lot and finish my degre...,1,"[['first', 'i', 'will', 'study', 'a', 'lot', '...",113,113.0
3,412035,a1,hy my names andr. i have thirty one years old....,Portuguese,0.05,1.0,0,0.0,2,0.05,...,6.666667,10.0,0.666667,1.5,hy my names andr. i have thirty one years old....,"['hy my names andr.', 'i have thirty one years...",6,"[['hy', 'my', 'names', 'andr', '.'], ['i', 'ha...",36,6.0
4,132380,b1,bello i glad to congratulate you with the best...,Russian,0.061224,1.0,0,0.0,6,0.061224,...,8.636364,10.555556,0.818182,1.777778,bello i glad to congratulate you with the best...,['bello i glad to congratulate you with the be...,6,"[['bello', 'i', 'glad', 'to', 'congratulate', ...",84,14.0


['measures.basic.n_sentences',
 'measures.basic.total_n_tokens',
 'measures.basic.avg_n_tokens_per_sentence']

In [50]:

# Classifiers to evaluate, keyed by a human-readable name.
# A one-vs-rest variant was sketched here earlier, but it referenced names
# (`OneVsRestClassifier`, `C`) that are not defined in the visible cells of
# this notebook; import/define them before adding it back.
classifiers = {
    # saga is a stochastic solver, so the seed is pinned to make repeated runs
    # of the notebook produce the same fitted model.
    "L2 logistic (Multinomial)": LogisticRegression(
        C=1, penalty="l2", solver="saga", max_iter=10000, random_state=42
    ),
}

In [53]:
def _columns_containing(frame, fragment):
    """Return the column labels of `frame` whose name contains `fragment`."""
    return [label for label in frame.columns if fragment in label]


# Feature sets to evaluate, keyed by a short name; each value is a list of
# column names. Only the "measures.andrew" family is active. Smaller sets that
# are currently disabled:
#   "n_sentences":              measures.basic.n_sentences
#   "n_sent+n_tokens":          + measures.basic.total_n_tokens
#   "avg-sent+n-sent+n-tokens": + measures.basic.avg_n_tokens_per_sentence
#   "measures.basic":           every column containing "measures.basic"
features = {
    "measures.andrew": _columns_containing(andrew100k_train_fe_df, "measures.andrew"),
}

In [54]:
# Cross-validate every (classifier, feature set) combination and collect the
# per-fold classification reports.
#
# cv_results[model_idx]["accuracies"][fold_key] -> accuracy of that fold
# cv_results[model_idx]["results"][fold_key]    -> full report dict of that fold
# where fold_key is str(epoch_idx) + str(cv_idx).
target_column = config["column_mapping"]["andrew100k"]["CEFR"]
cv_results = defaultdict(lambda: defaultdict(dict))
n_epochs = 1
# Base seed for the per-epoch shuffle, so a re-run reproduces the same folds
# while each epoch still gets a different ordering.
shuffle_seed = 42
# Display names for the report rows.
# NOTE(review): these are upper-case while the previewed cefr_level values are
# lower-case; they are passed as display names only — confirm the target
# column holds exactly these five classes, in this sorted order.
target_names = ['A1', 'A2', 'B1', 'B2', 'C1']
# The splitter holds no per-dataset state here, so one instance serves every
# model and epoch.
kf = KFold(n_splits=2)
for epoch_idx in range(n_epochs):
    dataset = andrew100k_train_fe_df.sample(frac=1, random_state=shuffle_seed + epoch_idx)
    for model, feature_list in product(classifiers.values(), features.values()):
        # The key is the estimator repr followed by the feature list.
        model_idx = str(model)+str(feature_list)
        for cv_idx,(train_idx, test_idx) in enumerate(kf.split(dataset)):
            X_train = dataset.iloc[train_idx][feature_list]
            X_test  = dataset.iloc[test_idx][feature_list]
            y_train = dataset.iloc[train_idx][target_column]
            y_test  = dataset.iloc[test_idx][target_column]
            # fit() is called on the same estimator object for every fold.
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            results_dict = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)
            fold_key = str(epoch_idx)+str(cv_idx)
            cv_results[model_idx]["accuracies"][fold_key] = results_dict["accuracy"]
            cv_results[model_idx]["results"][fold_key] = results_dict
            print(model_idx, results_dict["accuracy"])

        

LogisticRegression(C=1, max_iter=10000, solver='saga')['measures.andrew.collocations.text_level.ratio_num_token', 'measures.andrew.collocations.text_level.ttr', 'measures.andrew.counts.acl', 'measures.andrew.counts.acl_ratio', 'measures.andrew.counts.ADJ', 'measures.andrew.counts.ADJ_ratio', 'measures.andrew.counts.ADP', 'measures.andrew.counts.ADP_ratio', 'measures.andrew.counts.ADV', 'measures.andrew.counts.ADV_ratio', 'measures.andrew.counts.advcl', 'measures.andrew.counts.advcl_ratio', 'measures.andrew.counts.advmod', 'measures.andrew.counts.advmod_ratio', 'measures.andrew.counts.amod', 'measures.andrew.counts.amod_ratio', 'measures.andrew.counts.aux', 'measures.andrew.counts.aux_ratio', 'measures.andrew.counts.ccomp', 'measures.andrew.counts.ccomp_ratio', 'measures.andrew.counts.compound', 'measures.andrew.counts.compound_ratio', 'measures.andrew.counts.csubj', 'measures.andrew.counts.csubj_ratio', 'measures.andrew.counts.DET', 'measures.andrew.counts.DET_ratio', 'measures.andrew.

In [153]:
# Show the "L1" column for the rows at positions 2, 4 and 10.
# NOTE(review): `df` is not defined in any cell visible in this notebook, and
# the frame loaded earlier names this column "l1", not "L1" — this cell looks
# like a leftover from another session; confirm before relying on it.
df.iloc[[2,4,10]][["L1"]]

Unnamed: 0,L1
2,French
4,French
10,French


In [154]:
# Report the mean fold accuracy of every evaluated strategy.
for strategy_name, strategy_results in cv_results.items():
    fold_accuracies = list(strategy_results["accuracies"].values())
    mean_accuracy = sum(fold_accuracies) / len(fold_accuracies)
    print(strategy_name, mean_accuracy)

LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences'] 0.37244137491483703
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens'] 0.3618688417104386
LogisticRegression(C=1, max_iter=10000, solver='saga')['n_sentences', 'total_n_tokens', 'avg_n_tokens_per_sentence'] 0.38900617568844637
