In [31]:
import pandas as pd
import numpy as np
from IPython.display import clear_output

In [2]:
import os
notebook_dir = os.path.abspath("")
base_dir = os.path.dirname(notebook_dir)
experiment_data_folder = os.path.join(base_dir,"datasets", "NLP4CALL_2025_experiment","experiments_data","nlp4call2025_article_experiments")
# efcamdat_100k_with_text_and_measures.csv
efcamdat_100k_fp = os.path.join(experiment_data_folder, "efcamdat_100k_with_text_and_measures.csv")
efcamdat_100k_train_fp = os.path.join(experiment_data_folder, "efcamdat_test_with_id.csv")
efcamdat_100k_test_fp = os.path.join(experiment_data_folder, "efcamdat_train_with_id.csv")

In [3]:
from sklearn.metrics import classification_report

class hard_predictions_eval:
    def __init__(self, hard_predictions_dict : dict, hard_gold_labels_dict : dict):
        self.hard_predictions_dict = hard_predictions_dict
        self.hard_gold_labels_dict = hard_gold_labels_dict
        self.y_pred = [v for v in self.hard_predictions_dict.values()], 
        self.y_true = [v for v in self.hard_gold_labels_dict.values()]
        print(self.y_pred, self.y_true)
    def accuracy(self):
        pass
    def precision(self):
        pass
    def recall(self):
        pass
    def report(self):
        # assuming dicts are aligned with all same ids
        self.report = classification_report(
            [v for v in self.hard_predictions_dict.values()], 
            [v for v in self.hard_gold_labels_dict.values()]
        )
        print(self.report)

def soft_predictions_eval(predictions, gold_labels):
    pass

## Expected folder structure
## 
```
./datasets
└── NLP4CALL_2025_experiment
    ├── experiments_data
    │   └── nlp4call2025_article_experiments
    │       ├── celva_1742_with_text_and_measures.csv
    │       ├── data_for_cefr_model_3efcamdat_to_1celva_with_ids_and_texts.csv
    │       ├── efcamdat_100k_with_text_and_measures.csv
    │       ├── efcamdat_test_with_id.csv
    │       └── efcamdat_train_with_id.csv
    └── experiments_data.zip
./notebooks
└── andrews-CEFR-benchmark-eval.ipynb
```

In [4]:
andrews100kdf = pd.read_csv(efcamdat_100k_fp,index_col=0)
andrews100ktrainids = pd.read_csv(efcamdat_100k_train_fp)
andrews100ktestids = pd.read_csv(efcamdat_100k_test_fp)

In [5]:
andrews100ktrainids.columns

Index(['cefr_level', 'measures.collocations.text_level.ratio_num_token',
       'measures.collocations.text_level.ttr', 'measures.counts.acl',
       'measures.counts.acl_ratio', 'measures.counts.acl.relcl',
       'measures.counts.acl.relcl_ratio', 'measures.counts.ADJ',
       'measures.counts.ADJ_ratio', 'measures.counts.ADP',
       ...
       'measures.taassc.L2SCA.CP_C', 'measures.taassc.L2SCA.CP_T',
       'measures.taassc.L2SCA.CT_T', 'measures.taassc.L2SCA.DC_C',
       'measures.taassc.L2SCA.DC_T', 'measures.taassc.L2SCA.MLC',
       'measures.taassc.L2SCA.MLS', 'measures.taassc.L2SCA.MLT',
       'measures.taassc.L2SCA.T_S', 'measures.taassc.L2SCA.VP_T'],
      dtype='object', length=505)

In [32]:
cefr_levels =["A1","A2","B1","B2","C1","C2"]
sample_size = 10000
dummy_test_set = andrews100kdf.sample(sample_size).reset_index()
hard_golden_labels=dummy_test_set["cefr_level"].to_dict()
print(hard_golden_labels)
clear_output()

In [7]:
random_probas = np.random.dirichlet(np.ones(5),size=sample_size)
'''predictions = {
    {
     "A1": 0.31,
     "A2": 0.54,
     "B1": 0.4,
     "B2": 0.74,
     "C1": 0.74,
    }
}'''
soft_predictions = {
    id_: {class_:proba for class_, proba in zip(cefr_levels, cefr_vector)}
    for id_, cefr_vector in zip(range(sample_size),random_probas.tolist())
}
idx_to_class_ = lambda v: {
         0: "A1",
         1: "A2",
         2: "B1",
         3: "B2",
         4: "C1",
}.get(v, None)
hard_predictions = {k:idx_to_class_(v) for k,v in zip(range(sample_size), np.argmax(random_probas,axis=1).tolist())}

In [34]:
hard_eval = hard_predictions_eval(hard_predictions,hard_golden_labels)
clear_output()

In [9]:
hard_eval.report()

              precision    recall  f1-score   support

          A1       0.20      0.46      0.28      2012
          A2       0.20      0.29      0.24      2030
          B1       0.19      0.15      0.17      1989
          B2       0.20      0.06      0.09      1971
          C1       0.22      0.01      0.03      1998

    accuracy                           0.20     10000
   macro avg       0.20      0.20      0.16     10000
weighted avg       0.20      0.20      0.16     10000



### What is the distribution of CEFR level ?


In [10]:
andrews100kdf["cefr_level"].value_counts()/len(andrews100kdf["cefr_level"])

cefr_level
A1    0.47215
A2    0.29701
B1    0.16172
B2    0.05497
C1    0.01415
Name: count, dtype: float64

# Loading a model and doing predictions

In [11]:
import sys
model_name="CefrFlatMultiClassLogisticRegressionModel"
sys.path.append(base_dir)
sys.path.append(f"{base_dir}{os.sep}modelling")
import flat_classifier as FC
import torch

In [12]:
model, model_dict = FC.load_model(f"{base_dir}{os.sep}modelling{os.sep}models{os.sep}{model_name}.pth")

  checkpoint = torch.load(model_save_path)


In [19]:
batch_size = 3
with torch.no_grad():  # We don't need gradients for inference
    random_input = torch.randn(batch_size, model_dict['model_architecture']['input_size'])  #  (3, input_size)
    logits, probas= model(random_input)  # Get logits (predictions)
    print("Model output (logits):", logits)


Model output (logits): tensor([[-0.0335,  0.4973, -0.7702,  0.3604, -0.6651, -0.5276],
        [-0.0117,  0.5206, -0.6446, -0.0161, -0.6992, -0.1394],
        [-0.8555, -0.9051,  0.3068,  0.5986, -0.3094, -1.3785]])


In [14]:
# probas = torch.nn.functional.softmax(logits,dim=1)

In [24]:
probas

tensor([[0.1723, 0.2930, 0.0825, 0.2555, 0.0916, 0.1051],
        [0.1782, 0.3034, 0.0946, 0.1774, 0.0896, 0.1568],
        [0.0851, 0.0810, 0.2721, 0.3644, 0.1470, 0.0505]])

In [25]:
model

FlatMultiClassLogisticRegressionModel(
  (linear): Linear(in_features=4, out_features=6, bias=True)
)