In [1]:
import pandas as pd
import numpy as np
from IPython.display import clear_output

In [2]:
import os
notebook_dir = os.path.abspath("")
base_dir = os.path.dirname(notebook_dir)
experiment_data_folder = os.path.join(base_dir,"datasets", "NLP4CALL_2025_experiment","experiments_data","nlp4call2025_article_experiments")
# efcamdat_100k_with_text_and_measures.csv
efcamdat_100k_fp = os.path.join(experiment_data_folder, "efcamdat_100k_with_text_and_measures.csv")
efcamdat_100k_train_fp = os.path.join(experiment_data_folder, "efcamdat_train_id.csv")
efcamdat_100k_test_fp = os.path.join(experiment_data_folder, "efcamdat_test_id.csv")

In [17]:
from sklearn.metrics import classification_report

cefr_levels =["A1","A2","B1","B2","C1","C2"]

idx_to_class_ = lambda v: {
         0: "A1",
         1: "A2",
         2: "B1",
         3: "B2",
         4: "C1",
}.get(v, None)

class hard_predictions_eval:
    def __init__(self, hard_predictions_dict : dict, hard_gold_labels_dict : dict):
        self.hard_predictions_dict = hard_predictions_dict
        self.hard_gold_labels_dict = hard_gold_labels_dict
        self.y_pred = [v for v in self.hard_predictions_dict.values()], 
        self.y_true = [v for v in self.hard_gold_labels_dict.values()]
        print(self.y_pred, self.y_true)
    def accuracy(self):
        pass
    def precision(self):
        pass
    def recall(self):
        pass
    def report(self):
        # assuming dicts are aligned with all same ids
        self.report = classification_report(
            [v for v in self.hard_predictions_dict.values()], 
            [v for v in self.hard_gold_labels_dict.values()]
        )
        print(self.report)

def soft_predictions_eval(predictions, gold_labels):
    pass

## Expected folder structure
## 
```
./datasets
└── NLP4CALL_2025_experiment
    ├── experiments_data
    │   └── nlp4call2025_article_experiments
    │       ├── celva_1742_with_text_and_measures.csv
    │       ├── data_for_cefr_model_3efcamdat_to_1celva_with_ids_and_texts.csv
    │       ├── efcamdat_100k_with_text_and_measures.csv
    │       ├── efcamdat_test_with_id.csv
    │       └── efcamdat_train_with_id.csv
    └── experiments_data.zip
./notebooks
└── andrews-CEFR-benchmark-eval.ipynb
```

## Load dataset and get test split

In [4]:
andrews100kdf = pd.read_csv(efcamdat_100k_fp,index_col=0)
andrews100ktrainids = pd.read_csv(efcamdat_100k_train_fp)['writing_id']
andrews100ktestids = pd.read_csv(efcamdat_100k_test_fp)['writing_id']

In [5]:
andrews100kdf.shape

(100000, 725)

In [6]:
andrew100ktrain_df = pd.merge(andrews100kdf,andrews100ktrainids,on="writing_id")
andrew100ktest_df = pd.merge(andrews100kdf,andrews100ktestids,on="writing_id")

In [7]:
for df in [andrews100kdf, andrew100ktest_df, andrew100ktrain_df]:
    print(df.shape)
    print(df.columns)

(100000, 725)
Index(['writing_id', 'cefr_level',
       'measures.collocations.text_level.ratio_num_token',
       'measures.collocations.text_level.ttr', 'measures.counts.acl',
       'measures.counts.acl_ratio', 'measures.counts.acl:relcl',
       'measures.counts.acl:relcl_ratio', 'measures.counts.ADJ',
       'measures.counts.ADJ_ratio',
       ...
       'measures.taassc.L2SCA.CT_T', 'measures.taassc.L2SCA.DC_C',
       'measures.taassc.L2SCA.DC_T', 'measures.taassc.L2SCA.MLC',
       'measures.taassc.L2SCA.MLS', 'measures.taassc.L2SCA.MLT',
       'measures.taassc.L2SCA.T_S', 'measures.taassc.L2SCA.VP_T', 'text',
       'l1'],
      dtype='object', length=725)
(20002, 725)
Index(['writing_id', 'cefr_level',
       'measures.collocations.text_level.ratio_num_token',
       'measures.collocations.text_level.ttr', 'measures.counts.acl',
       'measures.counts.acl_ratio', 'measures.counts.acl:relcl',
       'measures.counts.acl:relcl_ratio', 'measures.counts.ADJ',
       'measures.c

### What is the distribution of CEFR level ?


In [8]:
for df in [andrews100kdf, andrew100ktest_df, andrew100ktrain_df]:
    print(df["cefr_level"].value_counts()/len(df["cefr_level"]))

cefr_level
A1    0.47215
A2    0.29701
B1    0.16172
B2    0.05497
C1    0.01415
Name: count, dtype: float64
cefr_level
A1    0.471853
A2    0.296820
B1    0.161984
B2    0.055594
C1    0.013749
Name: count, dtype: float64
cefr_level
A1    0.472224
A2    0.297057
B1    0.161654
B2    0.054814
C1    0.014250
Name: count, dtype: float64


## Loading models and doing predictions

In [9]:
from collections import defaultdict

models_predictions = defaultdict(lambda: {
                                    "model_name": None,
                                    "probas": None, # soft_predictions
                                    "hard_predictions": None,
                                         })

## Loading flat classifier

In [10]:
import sys
model_name="CefrFlatMultiClassLogisticRegressionModel"
sys.path.append(base_dir)
sys.path.append(f"{base_dir}{os.sep}modelling")
import flat_classifier as FC
import torch

In [11]:
model, model_dict = FC.load_model(f"{base_dir}{os.sep}modelling{os.sep}models{os.sep}{model_name}.pth")

  checkpoint = torch.load(model_save_path)


In [12]:
batch_size = andrew100ktest_df.shape[0]
with torch.no_grad():  # We don't need gradients for inference\n",
    random_input = torch.randn(batch_size, model_dict['model_architecture']['input_size'])  #  (3, input_size)\n",
    logits, probas= model(random_input)  # Get logits (predictions)\n",
    print("Model output (logits):", logits.shape)

Model output (logits): torch.Size([20002, 6])


In [13]:
flatclassifier_hard_predictions = np.argmax(probas,axis=1)
print([idx_to_class_(class_idx) for class_idx in flatclassifier_hard_predictions])
clear_output()

In [14]:
probas.shape

torch.Size([20002, 6])

In [15]:
model

FlatMultiClassLogisticRegressionModel(
  (linear): Linear(in_features=4, out_features=6, bias=True)
)

## Random Predictions

In [18]:
sample_size = andrew100ktest_df.shape[0]
random_probas = np.random.dirichlet(np.ones(5),size=sample_size)
'''predictions = {
    {
     "A1": 0.31,
     "A2": 0.54,
     "B1": 0.4,
     "B2": 0.74,
     "C1": 0.74,
    }
}'''
soft_random_predictions = {
    id_: {class_:proba for class_, proba in zip(cefr_levels, cefr_vector)}
    for id_, cefr_vector in zip(range(sample_size),random_probas.tolist())
}

hard_random_predictions = {k:idx_to_class_(v) for k,v in zip(range(sample_size), np.argmax(random_probas,axis=1).tolist())}

## Evaluating Predictions

In [19]:
'''
cefr_levels =["A1","A2","B1","B2","C1","C2"]
sample_size = 10000
dummy_test_set = andrews100kdf.sample(sample_size).reset_index()
hard_golden_labels=dummy_test_set["cefr_level"].to_dict()
print(hard_golden_labels)
clear_output()
hard_eval.report()
'''
hard_golden_labels=andrew100ktest_df["cefr_level"].to_dict()

In [20]:
hard_eval_random_predictions = hard_predictions_eval(hard_random_predictions,hard_golden_labels)
clear_output()

In [21]:
hard_eval_random_predictions.report()

              precision    recall  f1-score   support

          A1       0.20      0.47      0.28      3962
          A2       0.20      0.30      0.24      3985
          B1       0.21      0.16      0.18      4081
          B2       0.20      0.06      0.09      4016
          C1       0.17      0.01      0.02      3958

    accuracy                           0.20     20002
   macro avg       0.20      0.20      0.16     20002
weighted avg       0.20      0.20      0.16     20002



## DUMMY DATA

In [None]:
batch_size = 3
with torch.no_grad():  # We don't need gradients for inference
    random_input = torch.randn(batch_size, model_dict['model_architecture']['input_size'])  #  (3, input_size)
    logits, probas= model(random_input)  # Get logits (predictions)
    print("Model output (logits):", logits)
