In [1]:
import tokenizers
import transformers
from transformers import BertTokenizer, BertForMaskedLM
import sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from transformers.pipelines import pipeline
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm


In [3]:
# Load the probing data: the training split and the held-out split used for evaluation.
# NOTE(review): both paths are absolute and machine-specific; consider a configurable data directory.
train_df = pd.read_csv("C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/probing_data/probing_train.csv")
# NOTE(review): the frame named "val" is read from probing_test.csv — confirm this is the intended split.
val_df = pd.read_csv("C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/probing_data/probing_test.csv")

In [4]:
# Convert each frame into a list with one dict per row (column name -> value).
# Despite the `_dict` suffix these are lists; feature_extraction later adds
# `layer_<n>` entries to the row dicts in place.
train_dict = train_df.to_dict(orient='records')
val_dict = val_df.to_dict(orient='records')

In [5]:
# Display the first training record to check which columns are available.
train_dict[0]

{'id': 'isdt_tut-689',
 'sent': "Il diritto dell'usufruttuario non si estende al tesoro che si scopra durante l'usufrutto, salve le ragioni che gli possono competere come ritrovatore (932).",
 'category': 8,
 'n_tokens': 31,
 'char_per_tok': 4.81481481481482,
 'upos_dist_DET': 16.1290322580645,
 'upos_dist_ADV': 3.2258064516129,
 'upos_dist_PUNCT': 12.9032258064516,
 'upos_dist_NUM': 3.2258064516129,
 'upos_dist_PRON': 16.1290322580645,
 'upos_dist_ADP': 12.9032258064516,
 'upos_dist_PROPN': 0.0,
 'upos_dist_ADJ': 3.2258064516129,
 'upos_dist_VERB': 9.67741935483871,
 'upos_dist_NOUN': 19.3548387096774,
 'upos_dist_CCONJ': 0.0,
 'upos_dist_AUX': 3.2258064516129,
 'avg_links_len': 2.42307692307692,
 'max_links_len': 11,
 'avg_max_depth': 5,
 'dep_dist_obj': 0.0,
 'dep_dist_nsubj': 12.9032258064516,
 'subj_pre': 75.0,
 'subj_post': 25.0,
 'n_prepositional_chains': 1,
 'avg_prepositional_chain_len': 1.0,
 'avg_subordinate_chain_len': 1.0,
 'subordinate_proposition_dist': 66.6666666666667,

In [6]:
# Function that computes the sentence embeddings.
def feature_extraction(samples, model_name,
                       tokenizer_name="dbmdz/bert-base-italian-cased",
                       first_layer=1, last_layer=8, max_length=128):
    """Attach the per-layer first-token ([CLS]) embedding to every sample.

    Each sentence is encoded on its own (batch of one) and, for every layer in
    ``first_layer..last_layer`` (inclusive), the hidden state of the first
    token is stored under ``sample[f'layer_{layer}']['cls_embedding']``.

    Args:
        samples: iterable of dicts, each with the sentence text under ``"sent"``.
            The dicts are modified in place.
        model_name: path or identifier passed to ``BertForMaskedLM.from_pretrained``.
        tokenizer_name: identifier passed to ``BertTokenizer.from_pretrained``.
            NOTE(review): the default is the *cased* Italian BERT tokenizer; the
            original comment asked whether the uncased variant was meant —
            confirm it matches the vocabulary of the probed checkpoints.
        first_layer: first index into ``hidden_states`` to keep. Index 0 is the
            embedding output in Hugging Face models, so the default of 1 skips it.
        last_layer: last index into ``hidden_states`` to keep (inclusive).
        max_length: truncation length, in tokens, given to the tokenizer.

    Returns:
        The same ``samples`` object that was passed in.
    """
    tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
    model = BertForMaskedLM.from_pretrained(model_name)
    model.eval()  # make inference mode explicit (disables dropout)
    layers = range(first_layer, last_layer + 1)
    with torch.no_grad():
        for sample in tqdm(samples, desc="Estrazione features", unit="sample"):
            encoded_sen = tokenizer(sample["sent"], padding=True, truncation=True,
                                    max_length=max_length, return_tensors='pt')
            hidden_states = model(**encoded_sen, output_hidden_states=True).hidden_states
            for layer in layers:
                # [0, 0, :] selects the single sentence in the batch and its first token.
                cls_vector = hidden_states[layer][0, 0, :]
                # .copy() matters: .numpy() returns a view that would keep the whole
                # (1, seq_len, hidden) tensor storage alive for every sample and layer.
                cls_embedding = cls_vector.detach().cpu().numpy().copy()
                sample[f'layer_{layer}'] = {'cls_embedding': cls_embedding}
    return samples

# Function that collects features and labels.
def get_features_lables(samples, feature, layer):
    """Split samples into probe inputs and regression targets.

    Args:
        samples: iterable of sample dicts.
        feature: key of the target value in each sample.
        layer: key (e.g. ``'layer_3'``) of the per-layer entry whose
            ``"cls_embedding"`` is used as the input vector.

    Returns:
        A tuple ``(X, y)`` of two lists in sample order: the embeddings and
        the corresponding target values.
    """
    # Single pass, reading the embedding before the label for each sample.
    pairs = [(record[layer]["cls_embedding"], record[feature]) for record in samples]
    X = [embedding for embedding, _ in pairs]
    y = [target for _, target in pairs]
    return X, y


# Function that trains the probe and predicts on the validation set.
def train_eval(train_set, val_set, feature, layer):
    """Fit a ridge-regression probe on one layer and predict the validation targets.

    Args:
        train_set: samples used to fit the scaler and the regressor.
        val_set: samples to predict on.
        feature: key of the target value in each sample.
        layer: key of the per-layer embedding entry (e.g. ``'layer_3'``).

    Returns:
        The predictions of the fitted ``Ridge(alpha=1.0)`` model for ``val_set``.
    """
    train_embeddings, train_targets = get_features_lables(train_set, feature, layer)
    val_embeddings, _ = get_features_lables(val_set, feature, layer)

    # The scaler is fitted on the training embeddings only and then reused
    # unchanged for the validation embeddings.
    min_max = MinMaxScaler()
    train_matrix = min_max.fit_transform(np.array(train_embeddings))
    val_matrix = min_max.transform(np.array(val_embeddings))

    probe = Ridge(alpha=1.0).fit(train_matrix, train_targets)
    # No metric is computed here (a compute_metrics call was disabled);
    # the raw validation predictions are returned instead.
    return probe.predict(val_matrix)

In [9]:
# TODO: I ALSO NEED TO ITERATE OVER THE CHECKPOINTS WE USE
# Training steps of the checkpoints to probe; probing_checkpoints treats 0 as
# "the final pretrained model" rather than as a step number.
checkpoints = [2, 32, 512, 8192, 0] 
# Sample keys used, one at a time, as regression targets for the probes.
ling_features = ["n_tokens",  "char_per_tok", "upos_dist_DET", "upos_dist_ADV", "upos_dist_PUNCT", "upos_dist_NUM", "upos_dist_PRON", "upos_dist_ADP", "upos_dist_PROPN","upos_dist_ADJ","upos_dist_VERB","upos_dist_NOUN", "upos_dist_CCONJ", "upos_dist_AUX", "avg_links_len", "max_links_len", "avg_max_depth", "dep_dist_obj", "dep_dist_nsubj", "subj_pre", "subj_post", "n_prepositional_chains", "avg_prepositional_chain_len", "avg_subordinate_chain_len", "subordinate_proposition_dist", "avg_verb_edges"]
# Identifier of the training run being probed.
training_id = "ANTI_CURRICULUM"

In [10]:
def probing_checkpoints(checkpoints, training_id, train_dict, val_dict, ling_features,
                        models_root="C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/models",
                        final_step=15449):
    """Probe every checkpoint, linguistic feature and layer, collecting the predictions.

    For each checkpoint the embeddings of both sample lists are (re)extracted
    with ``feature_extraction``; then one probe per (feature, layer) is fitted
    with ``train_eval``.

    Args:
        checkpoints: iterable of training-step numbers. The value 0 selects the
            final pretrained model instead of an intermediate checkpoint.
        training_id: name of the training run; used in the model paths and
            stored in the ``model`` column.
        train_dict: training samples (list of dicts), passed to ``feature_extraction``.
        val_dict: validation samples (list of dicts), passed to ``feature_extraction``.
        ling_features: sample keys to use as regression targets.
        models_root: directory containing one sub-directory per training run.
        final_step: step number recorded for the final model (checkpoint 0).

    Returns:
        A ``pandas.DataFrame`` with columns ``model``, ``step``, ``ling_feature``,
        ``layer`` and ``preds`` — one row per (checkpoint, feature, layer), where
        ``preds`` holds the value returned by ``train_eval``.
    """
    # Must stay within the layer range for which feature_extraction stores embeddings.
    first_layer = 1
    last_layer = 8
    columns = ["model", "step", "ling_feature", "layer", "preds"]
    # Rows are accumulated in a list and turned into a DataFrame once at the end;
    # appending to a DataFrame inside the loop copies the whole frame every time.
    rows = []
    for n_step in checkpoints:
        if n_step == 0:
            model_name = f"{models_root}/{training_id}/final_pretrained_model"
            checkpoint = final_step
            print("Inizio probing per il modello finale")
        else:
            checkpoint = n_step
            model_name = f"{models_root}/{training_id}/checkpoints/checkpoint-step{n_step}"
            print(f"Inizio probing per il checkpoint {n_step}")
        print("Estrazione delle feature di training...")
        # Extract the embeddings produced by this checkpoint.
        train_samples = feature_extraction(train_dict, model_name)
        print("Estrazione delle features di validation...")
        val_samples = feature_extraction(val_dict, model_name)
        for ling_feature in ling_features:
            print(f'Addestramento del modello sulla feature linguistica: {ling_feature}')
            for layer in range(first_layer, last_layer + 1):
                print(f"Training for layer {layer}/{last_layer}")
                # Fit one ridge probe per layer and keep its validation predictions.
                layer_result = train_eval(train_samples, val_samples, ling_feature, f'layer_{layer}')
                rows.append({"model": training_id, "step": checkpoint, "ling_feature": ling_feature,
                             "layer": layer, "preds": layer_result})
    return pd.DataFrame(rows, columns=columns)
        

In [11]:
# Run the full probing sweep. The run identifier comes from the configuration
# cell instead of a repeated string literal, so changing `training_id` there
# is enough to probe a different run.
final_results = probing_checkpoints(checkpoints, training_id, train_dict, val_dict, ling_features)

Inizio probing per il checkpoint 2
Estrazione delle feature di training...


Estrazione features: 100%|██████████| 10000/10000 [09:02<00:00, 18.44sample/s]


Estrazione delle features di validation...


Estrazione features: 100%|██████████| 5000/5000 [04:12<00:00, 19.80sample/s]


Addestramento del modello sulla feature linguistica: n_tokens
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: char_per_tok
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_DET
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_ADV
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Ad

Estrazione features: 100%|██████████| 10000/10000 [10:47<00:00, 15.43sample/s]


Estrazione delle features di validation...


Estrazione features: 100%|██████████| 5000/5000 [05:38<00:00, 14.77sample/s]


Addestramento del modello sulla feature linguistica: n_tokens
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: char_per_tok
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_DET
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_ADV
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Ad

Estrazione features: 100%|██████████| 10000/10000 [10:34<00:00, 15.77sample/s]


Estrazione delle features di validation...


Estrazione features: 100%|██████████| 5000/5000 [04:30<00:00, 18.52sample/s]


Addestramento del modello sulla feature linguistica: n_tokens
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: char_per_tok
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_DET
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_ADV
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Ad

Estrazione features: 100%|██████████| 10000/10000 [08:33<00:00, 19.46sample/s]


Estrazione delle features di validation...


Estrazione features: 100%|██████████| 5000/5000 [04:15<00:00, 19.54sample/s]


Addestramento del modello sulla feature linguistica: n_tokens
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: char_per_tok
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_DET
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_ADV
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Ad

Estrazione features: 100%|██████████| 10000/10000 [08:16<00:00, 20.16sample/s]


Estrazione delle features di validation...


Estrazione features: 100%|██████████| 5000/5000 [04:42<00:00, 17.71sample/s]


Addestramento del modello sulla feature linguistica: n_tokens
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: char_per_tok
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_DET
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Addestramento del modello sulla feature linguistica: upos_dist_ADV
Training for layer 1/8
Training for layer 2/8
Training for layer 3/8
Training for layer 4/8
Training for layer 5/8
Training for layer 6/8
Training for layer 7/8
Training for layer 8/8
Ad

In [12]:
# Preview the first rows of the predictions table.
final_results.head()

Unnamed: 0,model,step,ling_feature,layer,preds
0,ANTI_CURRICULUM,2,n_tokens,1,"[14.261738, 26.558847, 17.390493, 22.84751, 20..."
1,ANTI_CURRICULUM,2,n_tokens,2,"[14.1515, 27.287937, 15.549747, 24.80665, 20.5..."
2,ANTI_CURRICULUM,2,n_tokens,3,"[11.943846, 27.356674, 15.11858, 23.55846, 20...."
3,ANTI_CURRICULUM,2,n_tokens,4,"[13.21221, 26.788387, 17.908596, 24.068243, 20..."
4,ANTI_CURRICULUM,2,n_tokens,5,"[14.430076, 27.610977, 16.715775, 21.83453, 18..."


In [14]:
# Convert every prediction array to a plain Python list before writing.
# to_csv stringifies object cells, and the string form of a NumPy array with
# more than 1000 elements is summarised with "...", which would silently drop
# most of the predictions from the file.
predictions_to_save = final_results.assign(
    preds=final_results["preds"].map(lambda values: np.asarray(values).tolist())
)
# The output folder follows the configured run id rather than a hard-coded name.
predictions_to_save.to_csv(f"C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/models/{training_id}/predictions.csv")