In [95]:
import tokenizers
import transformers
from transformers import BertTokenizer, BertForMaskedLM
import sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from transformers.pipelines import pipeline
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm


In [96]:
# Directory holding the probing CSVs. Defined once so the location can be
# changed in a single place instead of inside every read_csv call.
PROBING_DATA_DIR = "C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/probing_data"

# Sentences and linguistic-feature values used to fit the probes.
train_df = pd.read_csv(f"{PROBING_DATA_DIR}/probing_train.csv")
# NOTE: the frame used for validation is read from the file named "probing_test.csv".
val_df = pd.read_csv(f"{PROBING_DATA_DIR}/probing_test.csv")

In [97]:
# Keep only a small prefix of each split (first 500 training rows, first 250
# validation rows) and turn every row into a plain dict keyed by column name.
# Despite the "_dict" suffix, both variables are lists of dicts.
train_dict = train_df.iloc[:500].to_dict(orient="records")
val_dict = val_df.iloc[:250].to_dict(orient="records")

In [98]:
# Inspect the first training record to see which fields each sample carries.
train_dict[0]

{'id': 'isdt_tut-689',
 'sent': "Il diritto dell'usufruttuario non si estende al tesoro che si scopra durante l'usufrutto, salve le ragioni che gli possono competere come ritrovatore (932).",
 'category': 8,
 'n_tokens': 31,
 'char_per_tok': 4.81481481481482,
 'upos_dist_DET': 16.1290322580645,
 'upos_dist_ADV': 3.2258064516129,
 'upos_dist_PUNCT': 12.9032258064516,
 'upos_dist_NUM': 3.2258064516129,
 'upos_dist_PRON': 16.1290322580645,
 'upos_dist_ADP': 12.9032258064516,
 'upos_dist_PROPN': 0.0,
 'upos_dist_ADJ': 3.2258064516129,
 'upos_dist_VERB': 9.67741935483871,
 'upos_dist_NOUN': 19.3548387096774,
 'upos_dist_CCONJ': 0.0,
 'upos_dist_AUX': 3.2258064516129,
 'avg_links_len': 2.42307692307692,
 'max_links_len': 11,
 'avg_max_depth': 5,
 'dep_dist_obj': 0.0,
 'dep_dist_nsubj': 12.9032258064516,
 'subj_pre': 75.0,
 'subj_post': 25.0,
 'n_prepositional_chains': 1,
 'avg_prepositional_chain_len': 1.0,
 'avg_subordinate_chain_len': 1.0,
 'subordinate_proposition_dist': 66.6666666666667,

In [99]:
# Extract per-layer sentence embeddings for every sample.
def feature_extraction(samples, model_name, first_layer=1, last_layer=8,
                       tokenizer_name="dbmdz/bert-base-italian-cased"):
    """Add first-token embeddings of the selected hidden layers to each sample.

    Every sample is tokenized on its own (truncated to 128 tokens), passed
    through the masked-LM model with ``output_hidden_states=True``, and for each
    layer index in ``first_layer..last_layer`` (inclusive) the hidden state of
    the first token position is stored as a NumPy vector under
    ``sample[f'layer_{layer}'] = {'cls_embedding': vector}``.

    The sample dicts are modified IN PLACE and the same list is returned, so
    calling this again with another model overwrites the ``layer_*`` entries
    written by the previous call.

    Args:
        samples: iterable of dicts, each with a ``"sent"`` entry holding the
            sentence text.
        model_name: path or identifier handed to
            ``BertForMaskedLM.from_pretrained``.
        first_layer: first index into ``hidden_states`` to keep (default 1).
            Index 0 of ``hidden_states`` is the embedding output, so the
            default skips it.
        last_layer: last index into ``hidden_states`` to keep, inclusive
            (default 8).
        tokenizer_name: identifier handed to ``BertTokenizer.from_pretrained``.

    Returns:
        The ``samples`` object that was passed in, with the new entries added.

    Raises:
        ValueError: if ``first_layer`` is negative or greater than ``last_layer``.
        IndexError: if ``last_layer`` exceeds the number of hidden layers of
            the loaded model.
    """
    if first_layer < 0 or first_layer > last_layer:
        raise ValueError(
            f"invalid layer range: first_layer={first_layer}, last_layer={last_layer}"
        )

    # NOTE(review): the original author's comment asked whether the usual
    # tokenizer is the right one and mentioned "bert-base-italian-uncased",
    # while the code loads the *cased* vocabulary. Confirm which one the
    # probed checkpoints were trained with.
    tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
    model = BertForMaskedLM.from_pretrained(model_name)
    model.eval()  # make inference mode explicit (no dropout)

    # hidden_states holds num_hidden_layers + 1 tensors; fail before the
    # extraction loop starts rather than in the middle of it.
    available_layers = model.config.num_hidden_layers
    if last_layer > available_layers:
        raise IndexError(
            f"last_layer={last_layer} but the model only has {available_layers} hidden layers"
        )

    for sample in tqdm(samples, desc="Estrazione features", unit="sample"):
        encoded_sen = tokenizer(sample["sent"], padding=True, truncation=True, max_length=128, return_tensors='pt')
        with torch.no_grad():
            hidden_states = model(**encoded_sen, output_hidden_states=True).hidden_states
            for layer in range(first_layer, last_layer + 1):
                # One sentence per forward pass, so the batch axis has size 1:
                # index batch element 0 and token position 0 explicitly
                # instead of relying on an unqualified squeeze.
                cls_embedding = hidden_states[layer][0, 0, :].detach().cpu().numpy()
                sample[f'layer_{layer}'] = {'cls_embedding': cls_embedding}
    return samples

# Collect the probe inputs (embeddings) and targets (feature values).
def get_features_lables(samples, feature, layer):
    """Pair each sample's stored embedding with its value for one feature.

    Args:
        samples: iterable of dicts; each must contain ``sample[layer]`` with a
            ``"cls_embedding"`` entry and a ``sample[feature]`` entry.
        feature: key of the target value to read from every sample.
        layer: key of the per-layer entry to read (e.g. ``"layer_1"``).

    Returns:
        A tuple ``(X, y)`` of two lists in sample order: the embeddings and
        the corresponding target values.
    """
    # Read both fields in one pass so each sample is visited exactly once,
    # embedding first and target second.
    pairs = [(record[layer]["cls_embedding"], record[feature]) for record in samples]
    X = [embedding for embedding, _ in pairs]
    y = [target for _, target in pairs]
    return X, y


# Fit a probe on one layer's embeddings and predict the validation targets.
def train_eval(train_set, val_set, feature, layer):
    """Train a Ridge probe for one feature/layer and return its predictions.

    Embeddings are min-max scaled with statistics fitted on the training set
    only; the same fitted scaler is then applied to the validation set.

    Args:
        train_set: samples used to fit the scaler and the regressor.
        val_set: samples to predict on.
        feature: key of the target value in every sample.
        layer: key of the per-layer embedding entry (e.g. ``"layer_1"``).

    Returns:
        The regressor's predictions for ``val_set``. No metric is computed
        here; the validation targets are read but not used.
    """
    train_embeddings, train_targets = get_features_lables(train_set, feature, layer)
    val_embeddings, _val_targets = get_features_lables(val_set, feature, layer)

    train_matrix = np.array(train_embeddings)
    val_matrix = np.array(val_embeddings)

    feature_scaler = MinMaxScaler()
    train_matrix_scaled = feature_scaler.fit_transform(train_matrix)
    val_matrix_scaled = feature_scaler.transform(val_matrix)

    probe = sklearn.linear_model.Ridge(alpha=1.0)
    probe.fit(train_matrix_scaled, train_targets)
    return probe.predict(val_matrix_scaled)

In [100]:
def compute_metrics(y_real, y_pred):
    """Return accuracy, precision and recall for a set of predictions.

    NOTE(review): these are classification scores, while ``train_eval`` in this
    notebook fits a Ridge regressor and returns continuous predictions.
    Confirm the intended metric before wiring this function into the probing
    loop.

    Args:
        y_real: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"`` and ``"recall"``.
    """
    return {
        "accuracy": accuracy_score(y_real, y_pred),
        "precision": precision_score(y_real, y_pred),
        "recall": recall_score(y_real, y_pred),
    }

In [64]:
# Embed the validation records with the final ANTI_CURRICULUM model.
# feature_extraction returns the very list it receives, so `samples` and
# `val_dict` refer to the same (now enriched) records.
final_model_path = "C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/models/ANTI_CURRICULUM/final_pretrained_model"
samples = feature_extraction(samples=val_dict, model_name=final_model_path)

Estrazione features: 100%|██████████| 250/250 [00:10<00:00, 23.54sample/s]


In [65]:
# Show the first enriched record: the original fields plus the layer_* embeddings.
print(samples[0])

{'id': 'vit_VIT-1574', 'sent': 'E anche Catanzaro ha le sue difficoltà.', 'category': 1, 'n_tokens': 8, 'char_per_tok': 4.57142857142857, 'upos_dist_DET': 25.0, 'upos_dist_ADV': 12.5, 'upos_dist_PUNCT': 12.5, 'upos_dist_NUM': 0.0, 'upos_dist_PRON': 0.0, 'upos_dist_ADP': 0.0, 'upos_dist_PROPN': 12.5, 'upos_dist_ADJ': 0.0, 'upos_dist_VERB': 12.5, 'upos_dist_NOUN': 12.5, 'upos_dist_CCONJ': 12.5, 'upos_dist_AUX': 0.0, 'avg_links_len': 1.83333333333333, 'max_links_len': 3, 'avg_max_depth': 2, 'dep_dist_obj': 12.5, 'dep_dist_nsubj': 12.5, 'subj_pre': 100.0, 'subj_post': 0.0, 'n_prepositional_chains': 0, 'avg_prepositional_chain_len': 0.0, 'avg_subordinate_chain_len': 0.0, 'subordinate_proposition_dist': 0.0, 'avg_verb_edges': 3.0, 'layer_1': {'cls_embedding': array([-1.58320463e+00,  6.58254683e-01,  1.52717829e+00, -7.21148431e-01,
        9.39633250e-01, -3.79958123e-01,  1.51076829e+00,  7.05702245e-01,
       -6.10219538e-01, -1.24295795e+00,  5.09354293e-01,  6.94169760e-01,
       -2.3

In [92]:
# Smoke test of the probe on a tiny slice of the embedded records: fit on the
# first 50 and predict "n_tokens" for records 90-109 from the layer-1 vectors.
# Both slices come from `samples`, i.e. from the validation records.
pred = train_eval(
    train_set=samples[:50],
    val_set=samples[90:110],
    feature="n_tokens",
    layer="layer_1",
)

In [93]:
# Display the values predicted by the smoke-test probe.
pred

array([32.704872 , 46.76863  , 12.371323 , 34.933537 ,  5.1193733,
       13.003799 , 30.975391 ,  2.0613728, 62.330864 , 20.78401  ,
       36.800365 , 46.047382 , 16.529696 , 13.209357 , 10.965827 ,
       27.622496 , 18.091286 , 28.119837 ,  3.5392475, 14.891975 ],
      dtype=float32)

In [111]:
# TODO: the sweep must also iterate over all the checkpoints we actually use.
# Training steps to probe; probing_checkpoints treats the value 0 as
# "use the final pretrained model" rather than a numbered checkpoint.
checkpoints = [
    2,
    32,
    0,
]
# Linguistic features (columns of the probing CSVs) to predict.
ling_features = [
    "n_tokens",
    "upos_dist_ADP",
    "char_per_tok",
]

In [135]:
def probing_checkpoints(checkpoints, training_id, train_dict, val_dict, ling_features,
                        models_root="C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/models",
                        final_step=15449):
    """Probe several checkpoints of one training run, layer by layer.

    For every entry of ``checkpoints`` the matching model is used to embed the
    training and validation records. Then, for every linguistic feature and
    every layer from 1 to 8, a probe is fitted with ``train_eval`` and its
    validation predictions are stored.

    ``feature_extraction`` writes the embeddings into the record dicts
    themselves, so the ``layer_*`` entries of ``train_dict`` / ``val_dict`` are
    overwritten at every checkpoint.

    Args:
        checkpoints: iterable of training-step numbers. The value ``0`` selects
            ``<models_root>/<training_id>/final_pretrained_model``; any other
            value ``n`` selects
            ``<models_root>/<training_id>/checkpoints/checkpoint-step<n>``.
        training_id: name of the training run (sub-directory of
            ``models_root``); also written to the ``model`` column.
        train_dict: list of training records (dicts with a ``"sent"`` entry
            and the feature values).
        val_dict: list of validation records, same structure.
        ling_features: names of the features to predict.
        models_root: directory that contains one sub-directory per training
            run.
        final_step: step number reported in the ``step`` column for the final
            model (the ``0`` entry).

    Returns:
        A DataFrame with the columns ``model``, ``step``, ``ling_feature``,
        ``layer`` and ``preds``: one row per (checkpoint, feature, layer).
        The columns are present even when no row was produced.
    """
    first_layer = 1
    last_layer = 8
    columns = ["model", "step", "ling_feature", "layer", "preds"]
    # Rows are collected in a list and turned into a DataFrame once at the
    # end. Appending to a DataFrame inside the loop copies the whole frame
    # each time, and relied on the private DataFrame._append method.
    rows = []
    for n_step in checkpoints:
        if n_step == 0:
            model_name = f"{models_root}/{training_id}/final_pretrained_model"
            checkpoint = final_step
            print("Inizio probing per il modello finale")
        else:
            checkpoint = n_step
            model_name = f"{models_root}/{training_id}/checkpoints/checkpoint-step{n_step}"
            print(f"Inizio probing per il checkpoint {n_step}")
        print("Estrazione delle feature di training...")
        # Embed both splits with the current checkpoint.
        train_samples = feature_extraction(train_dict, model_name)
        print("Estrazione delle features di validation...")
        val_samples = feature_extraction(val_dict, model_name)
        for ling_feature in ling_features:
            print(f'Addestramento del modello sulla feature linguistica: {ling_feature}')
            for layer in range(first_layer, last_layer + 1):
                print(f"Training for layer {layer}/{last_layer}")
                # One Ridge probe per layer; its validation predictions are kept.
                layer_result = train_eval(train_samples, val_samples, ling_feature, f'layer_{layer}')
                rows.append({
                    "model": training_id,
                    "step": checkpoint,
                    "ling_feature": ling_feature,
                    "layer": layer,
                    "preds": layer_result,
                })
    return pd.DataFrame(rows, columns=columns)
        

In [136]:
# Run the whole probing sweep (every configured checkpoint and linguistic
# feature) for the ANTI_CURRICULUM training run.
final_results = probing_checkpoints(
    checkpoints=checkpoints,
    training_id="ANTI_CURRICULUM",
    train_dict=train_dict,
    val_dict=val_dict,
    ling_features=ling_features,
)

Inizio probing per il checkpoint 2
Estrazione delle feature di training...


Estrazione features: 100%|██████████| 500/500 [00:26<00:00, 18.64sample/s]


Estrazione delle features di validation...


Estrazione features: 100%|██████████| 250/250 [00:15<00:00, 16.28sample/s]


Addestramento del modello sulla feature linguistica: n_tokens
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_ADP
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: char_per_tok
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Inizio probing per il checkpoint 32
Estrazione delle feature di training...


Estrazione features: 100%|██████████| 500/500 [00:31<00:00, 16.02sample/s]


Estrazione delle features di validation...


Estrazione features: 100%|██████████| 250/250 [00:18<00:00, 13.42sample/s]


Addestramento del modello sulla feature linguistica: n_tokens
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_ADP
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: char_per_tok
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Inizio probing per il modello finale
Estrazione delle feature di training...


Estrazione features: 100%|██████████| 500/500 [00:27<00:00, 17.86sample/s]


Estrazione delle features di validation...


Estrazione features: 100%|██████████| 250/250 [00:13<00:00, 18.34sample/s]


Addestramento del modello sulla feature linguistica: n_tokens
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_ADP
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: char_per_tok
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8


In [139]:
# Display the collected predictions: one row per step, linguistic feature and layer.
final_results

Unnamed: 0,model,checkpoint,ling_feature,layer,preds
0,ANTI_CURRICULUM,2,n_tokens,1,"[4.6450605, 25.247198, 25.023708, 29.093542, 2..."
1,ANTI_CURRICULUM,2,n_tokens,2,"[1.3256454, 29.483768, 15.737802, 26.452576, 2..."
2,ANTI_CURRICULUM,2,n_tokens,3,"[-2.7904987, 28.723927, 17.163132, 28.610245, ..."
3,ANTI_CURRICULUM,2,n_tokens,4,"[0.10903835, 29.50755, 19.56321, 31.398087, 23..."
4,ANTI_CURRICULUM,2,n_tokens,5,"[0.29800224, 33.28931, 21.385345, 27.387943, 2..."
...,...,...,...,...,...
67,ANTI_CURRICULUM,15449,char_per_tok,4,"[3.793093, 4.773731, 4.5966487, 5.3181996, 4.1..."
68,ANTI_CURRICULUM,15449,char_per_tok,5,"[3.9956179, 4.579143, 5.086319, 5.16664, 4.103..."
69,ANTI_CURRICULUM,15449,char_per_tok,6,"[4.133931, 4.550061, 4.861389, 5.503478, 3.952..."
70,ANTI_CURRICULUM,15449,char_per_tok,7,"[4.0975614, 4.2907443, 5.070397, 5.657561, 3.7..."
