In [1]:
import pandas as pd
import numpy as np
from IPython.display import clear_output

In [1]:
import os
notebook_dir = os.path.abspath("")
base_dir = os.path.dirname(notebook_dir)
experiment_data_folder = os.path.join(base_dir,"datasets", "NLP4CALL_2025_experiment","experiments_data")
# efcamdat_100k_with_text_and_measures.csv
efcamdat_100k_fp = os.path.join(experiment_data_folder, "efcamdat_100k_with_text_and_measures.csv")
efcamdat_100k_train_fp = os.path.join(experiment_data_folder, "efcamdat_train_id.csv")
efcamdat_100k_test_fp = os.path.join(experiment_data_folder, "efcamdat_test_id.csv")
full_cleaned_efcamdat = os.path.join(experiment_data_folder, "cleaned_efcamdat.csv")

In [1]:
from sklearn.metrics import classification_report

cefr_levels =["A1","A2","B1","B2","C1","C2"]

idx_to_class_ = lambda v: {
         0: "A1",
         1: "A2",
         2: "B1",
         3: "B2",
         4: "C1",
}.get(v, None)

label_to_idx_ = lambda v: {
         "A1": 0,
         "A2": 1,
         "B1": 2,
         "B2": 3,
         "C1": 4,
         "C2": 4,
}.get(v, None)

class hard_predictions_eval:
    def __init__(self, hard_predictions_dict : dict, hard_gold_labels_dict : dict):
        self.hard_predictions_dict = hard_predictions_dict
        self.hard_gold_labels_dict = hard_gold_labels_dict
        self.y_pred = [v for v in self.hard_predictions_dict.values()], 
        self.y_true = [v for v in self.hard_gold_labels_dict.values()]
        print(self.y_pred, self.y_true)
    def accuracy(self):
        pass
    def precision(self):
        pass
    def recall(self):
        pass
    def report(self):
        # assuming dicts are aligned with all same ids
        self._report = classification_report(
            [v for v in self.hard_predictions_dict.values()], 
            [v for v in self.hard_gold_labels_dict.values()]
        )
        print(self._report)

def soft_predictions_eval(predictions, gold_labels):
    pass

## Expected folder structure
## 
```
./datasets
└── NLP4CALL_2025_experiment
    ├── experiments_data
    │   └── nlp4call2025_article_experiments
    │       ├── celva_1742_with_text_and_measures.csv
    │       ├── data_for_cefr_model_3efcamdat_to_1celva_with_ids_and_texts.csv
    │       ├── efcamdat_100k_with_text_and_measures.csv
    │       ├── efcamdat_test_with_id.csv
    │       └── efcamdat_train_with_id.csv
    └── experiments_data.zip
./notebooks
└── andrews-CEFR-benchmark-eval.ipynb
```

## Load dataset and get test split

In [59]:
andrews100kdf = pd.read_csv(efcamdat_100k_fp,index_col=0)
andrews100ktrainids = pd.read_csv(efcamdat_100k_train_fp)['writing_id']
andrews100ktestids = pd.read_csv(efcamdat_100k_test_fp)['writing_id']

In [5]:
andrews100kdf.shape

(100000, 725)

In [6]:
andrew100ktrain_df = pd.merge(andrews100kdf,andrews100ktrainids,on="writing_id")
andrew100ktest_df = pd.merge(andrews100kdf,andrews100ktestids,on="writing_id").sample(1000)

In [7]:
for df in [andrews100kdf, andrew100ktest_df, andrew100ktrain_df]:
    print(df.shape)
    print(df.columns)

(100000, 725)
Index(['writing_id', 'cefr_level',
       'measures.collocations.text_level.ratio_num_token',
       'measures.collocations.text_level.ttr', 'measures.counts.acl',
       'measures.counts.acl_ratio', 'measures.counts.acl:relcl',
       'measures.counts.acl:relcl_ratio', 'measures.counts.ADJ',
       'measures.counts.ADJ_ratio',
       ...
       'measures.taassc.L2SCA.CT_T', 'measures.taassc.L2SCA.DC_C',
       'measures.taassc.L2SCA.DC_T', 'measures.taassc.L2SCA.MLC',
       'measures.taassc.L2SCA.MLS', 'measures.taassc.L2SCA.MLT',
       'measures.taassc.L2SCA.T_S', 'measures.taassc.L2SCA.VP_T', 'text',
       'l1'],
      dtype='object', length=725)
(1000, 725)
Index(['writing_id', 'cefr_level',
       'measures.collocations.text_level.ratio_num_token',
       'measures.collocations.text_level.ttr', 'measures.counts.acl',
       'measures.counts.acl_ratio', 'measures.counts.acl:relcl',
       'measures.counts.acl:relcl_ratio', 'measures.counts.ADJ',
       'measures.co

### What is the distribution of CEFR level ?


In [8]:
for df in [andrews100kdf, andrew100ktest_df, andrew100ktrain_df]:
    print(df["cefr_level"].value_counts()/len(df["cefr_level"]))

cefr_level
A1    0.47215
A2    0.29701
B1    0.16172
B2    0.05497
C1    0.01415
Name: count, dtype: float64
cefr_level
A1    0.506
A2    0.295
B1    0.141
B2    0.044
C1    0.014
Name: count, dtype: float64
cefr_level
A1    0.472224
A2    0.297057
B1    0.161654
B2    0.054814
C1    0.014250
Name: count, dtype: float64


## Loading models and doing predictions

In [9]:
from collections import defaultdict

models_predictions = defaultdict(lambda: {
                                    "model_name": None,
                                    "probas": None, # soft_predictions
                                    "hard_predictions": None,
                                         })

## Loading flat classifier

In [10]:
import sys
model_name="CefrFlatMultiClassLogisticRegressionModel"
sys.path.append(base_dir)
sys.path.append(f"{base_dir}{os.sep}modelling")
import flat_classifier as FC
import torch

In [11]:
model_path = f"{base_dir}{os.sep}models{os.sep}{model_name}.pth"
print(model_path)

/home/berstearns/p/garbage_collector/cefr-classifcation/cefr_classification/models/CefrFlatMultiClassLogisticRegressionModel.pth


In [12]:
model, model_dict = FC.load_model(model_path)

  checkpoint = torch.load(model_save_path)


In [13]:
batch_size = andrew100ktest_df.shape[0]
with torch.no_grad():  # We don't need gradients for inference\n",
    random_input = torch.randn(batch_size, model_dict['model_architecture']['input_size'])  #  (3, input_size)\n",
    logits, probas= model(random_input)  # Get logits (predictions)\n",
    print("Model output (logits):", logits.shape)

Model output (logits): torch.Size([1000, 5])


In [15]:
print(probas)

tensor([[0.0837, 0.3011, 0.1103, 0.2551, 0.2498],
        [0.1774, 0.4419, 0.0807, 0.1637, 0.1363],
        [0.2326, 0.0519, 0.1702, 0.4043, 0.1411],
        ...,
        [0.1845, 0.1376, 0.1124, 0.1480, 0.4175],
        [0.2636, 0.3519, 0.0631, 0.1039, 0.2175],
        [0.0340, 0.6691, 0.0722, 0.0628, 0.1619]])


In [32]:
probas.shape

torch.Size([1000, 5])

In [33]:
model

FlatMultiClassLogisticRegressionModel(
  (linear): Linear(in_features=4, out_features=5, bias=True)
)

### Evaluating


In [60]:
hard_golden_labels=andrew100ktest_df["cefr_level"].to_dict()
# hard_random_predictions = {k:idx_to_class_(v) for k,v in zip(range(sample_size), np.argmax(random_probas,axis=1).tolist())}
flatclassifier_hard_predictions = { 
                        idx: idx_to_class_(np.argmax(probas,axis=1).tolist()[0])
                                for idx in andrew100ktest_df["cefr_level"].to_dict().keys()
                                  }
#print(flatclassifier_hard_predictions.items())
#clear_output()
print(hard_golden_labels)

{16314: 'A1', 9311: 'A1', 8212: 'A1', 11607: 'A2', 15465: 'B1', 19009: 'A2', 6150: 'A1', 16163: 'A1', 11410: 'A2', 11586: 'A2', 5599: 'A2', 3264: 'A1', 16677: 'A1', 8626: 'A1', 4280: 'A1', 4644: 'A1', 8718: 'B1', 16124: 'B1', 7484: 'A1', 14241: 'A2', 19730: 'A1', 7715: 'A1', 10816: 'A2', 13159: 'A1', 1289: 'A1', 19240: 'A1', 6124: 'A2', 2116: 'A2', 13611: 'A1', 11341: 'A1', 18374: 'A1', 1294: 'A1', 4510: 'A1', 17789: 'B1', 15844: 'A2', 3347: 'A1', 7467: 'A1', 7028: 'A1', 1040: 'A1', 6964: 'A1', 1078: 'A1', 7890: 'A1', 2592: 'A1', 1576: 'A2', 7122: 'A1', 18031: 'C1', 17957: 'C1', 3800: 'A1', 11828: 'A2', 12999: 'A1', 15667: 'A1', 6505: 'A2', 2427: 'C1', 9422: 'A2', 2829: 'A1', 680: 'A1', 16740: 'A2', 420: 'A2', 17744: 'A1', 610: 'A2', 11647: 'A2', 18573: 'A1', 568: 'A1', 11129: 'B1', 17155: 'A2', 2112: 'A1', 3829: 'B2', 14896: 'B2', 3837: 'A2', 5474: 'A2', 2676: 'A2', 4360: 'A1', 18816: 'A1', 14985: 'A1', 13282: 'B1', 10513: 'A2', 2330: 'A2', 5164: 'A2', 9016: 'A1', 6791: 'A1', 2274: 'A

In [61]:
print([v for v in flatclassifier_hard_predictions.values()][0])

A2


In [62]:
hard_eval_FC_predictions = hard_predictions_eval(flatclassifier_hard_predictions,hard_golden_labels)
clear_output()

In [63]:
hard_eval_FC_predictions.report()

              precision    recall  f1-score   support

          A1       0.00      0.00      0.00         0
          A2       1.00      0.29      0.46      1000
          B1       0.00      0.00      0.00         0
          B2       0.00      0.00      0.00         0
          C1       0.00      0.00      0.00         0

    accuracy                           0.29      1000
   macro avg       0.20      0.06      0.09      1000
weighted avg       1.00      0.29      0.46      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Random Predictions

In [None]:
sample_size = andrew100ktest_df.shape[0]
random_probas = np.random.dirichlet(np.ones(5),size=sample_size)
'''predictions = {
    {
     "A1": 0.31,
     "A2": 0.54,
     "B1": 0.4,
     "B2": 0.74,
     "C1": 0.74,
    }
}'''
soft_random_predictions = {
    id_: {class_:proba for class_, proba in zip(cefr_levels, cefr_vector)}
    for id_, cefr_vector in zip(range(sample_size),random_probas.tolist())
}

hard_random_predictions = {k:idx_to_class_(v) for k,v in zip(range(sample_size), np.argmax(random_probas,axis=1).tolist())}

### Evaluating Predictions

In [None]:
'''
cefr_levels =["A1","A2","B1","B2","C1","C2"]
sample_size = 10000
dummy_test_set = andrews100kdf.sample(sample_size).reset_index()
hard_golden_labels=dummy_test_set["cefr_level"].to_dict()
print(hard_golden_labels)
clear_output()
hard_eval.report()
'''
hard_golden_labels=andrew100ktest_df["cefr_level"].to_dict()

In [None]:
hard_eval_random_predictions = hard_predictions_eval(hard_random_predictions,hard_golden_labels)
clear_output()

In [None]:
hard_eval_random_predictions.report()

## DUMMY DATA

In [None]:
batch_size = 3
with torch.no_grad():  # We don't need gradients for inference
    random_input = torch.randn(batch_size, model_dict['model_architecture']['input_size'])  #  (3, input_size)
    logits, probas= model(random_input)  # Get logits (predictions)
    print("Model output (logits):", logits)
