In [35]:
import torch
import torch.nn.functional as F
import tokenizers
import transformers
from transformers import BertTokenizer, BertForMaskedLM
import os
import pandas as pd
from tqdm import tqdm


In [36]:
def mask_all_tokens(sentence, tokenizer):
    """Build one masked copy of the tokenized sentence per inner token position.

    The sentence is tokenized once. For every position except the first and the
    last (presumably the tokenizer's special tokens such as [CLS]/[SEP] -- TODO
    confirm for tokenizers other than BERT's), an independent deep copy of the
    encoding is created and the id at that position is replaced with
    ``tokenizer.mask_token_id``.

    Args:
        sentence: Text to tokenize.
        tokenizer: Callable tokenizer accepting ``return_tensors='pt'``. Its
            result must be a mapping with a batched ``'input_ids'`` tensor (only
            row 0 is used), and the tokenizer must expose ``mask_token_id``.

    Returns:
        list: Encodings in position order; element ``i`` has position ``i + 1``
        masked. Empty when the sentence produces no inner tokens.
    """
    import copy  # local import keeps the cell self-contained

    encoded = tokenizer(sentence, return_tensors='pt')
    sequence_length = len(encoded['input_ids'][0])

    masked_sentences = []
    for token_idx in range(1, sequence_length - 1):
        # Deep copy so each variant owns its tensors; a shallow copy would share
        # 'input_ids' and every mask would accumulate in the same tensor.
        masked_input = copy.deepcopy(encoded)
        masked_input['input_ids'][0][token_idx] = tokenizer.mask_token_id
        masked_sentences.append(masked_input)
    return masked_sentences

# Single-slot cache holding the most recently loaded (tokenizer, model, device),
# keyed by (training_id, checkpoint). Re-running this cell resets it.
_PERPLEXITY_MODEL_CACHE = {}


def _load_tokenizer_and_model(checkpoint, training_id):
    """Return ``(tokenizer, model, device)`` for a checkpoint, reusing the last one loaded."""
    cache_key = (training_id, checkpoint)
    if cache_key not in _PERPLEXITY_MODEL_CACHE:
        # Keep at most one model alive so that sweeping over several checkpoints
        # does not accumulate them in memory.
        _PERPLEXITY_MODEL_CACHE.clear()

        tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")
        models_root = f"C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/models/{training_id}"
        if checkpoint == 0:
            # Checkpoint 0 is the sentinel for the final pretrained model.
            model_path = f"{models_root}/final_pretrained_model"
        else:
            model_path = f"{models_root}/checkpoints/checkpoint-step{checkpoint}"
        model = BertForMaskedLM.from_pretrained(model_path)

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # The inputs are moved to `device` before the forward pass, so the model
        # must live there too; otherwise CUDA runs fail with a device mismatch.
        model.to(device)
        model.eval()

        _PERPLEXITY_MODEL_CACHE[cache_key] = (tokenizer, model, device)
    return _PERPLEXITY_MODEL_CACHE[cache_key]


def compute_perplexity(sentence, checkpoint, training_id):
    """Score a sentence by the mean probability the model gives to each masked token.

    Each inner token is masked in turn (see ``mask_all_tokens``), the model
    predicts the masked position, and the softmax probability of the original
    token is recorded. The result is the arithmetic mean of those probabilities
    over all masked positions.

    Note: despite the function name, the value returned is a mean probability
    (higher means the model predicts the sentence better), not an exponentiated
    perplexity.

    Args:
        sentence: Text to score.
        checkpoint: Training step of the checkpoint to load; ``0`` selects the
            final pretrained model.
        training_id: Name of the training run; selects the model directory.

    Returns:
        float: Mean probability of the correct token across masked positions,
        or ``nan`` when the sentence has no token to mask.
    """
    tokenizer, model, device = _load_tokenizer_and_model(checkpoint, training_id)

    masked_sentences = mask_all_tokens(sentence, tokenizer)
    if not masked_sentences:
        # Nothing to average over; avoid a ZeroDivisionError.
        return float("nan")

    original_tokens = tokenizer(sentence)['input_ids']
    correct_probs = []
    with torch.no_grad():
        # Element i of masked_sentences has position i + 1 masked, hence start=1.
        for masked_idx, masked_sentence in enumerate(masked_sentences, start=1):
            correct_token = original_tokens[masked_idx]
            outputs = model(**masked_sentence.to(device))
            probs = outputs.logits[0, masked_idx].softmax(dim=-1)
            correct_probs.append(probs[correct_token].item())

    return sum(correct_probs) / len(correct_probs)


In [37]:
# Disabled example kept as a bare string literal; the notebook echoes it back as
# the cell output instead of executing it.
# NOTE(review): the call inside passes (frase, tokenizer, model), whereas
# compute_perplexity is defined as (sentence, checkpoint, training_id) -- update
# the call before re-enabling this example.
'''
frase = "Nonostante l'apparente ineluttabilità del destino, intessuto com'è nel tessuto stesso del continuum spazio-temporale, la teoria della relatività generale di Einstein suggerisce che le curvature dello spazio-tempo, influenzate dalla distribuzione della massa-energia, possono dare origine a fenomeni astrofisici tanto enigmatici quanto i buchi neri e le onde gravitazionali."
result = compute_perplexity(frase, tokenizer, model)
'''

'\nfrase = "Nonostante l\'apparente ineluttabilità del destino, intessuto com\'è nel tessuto stesso del continuum spazio-temporale, la teoria della relatività generale di Einstein suggerisce che le curvature dello spazio-tempo, influenzate dalla distribuzione della massa-energia, possono dare origine a fenomeni astrofisici tanto enigmatici quanto i buchi neri e le onde gravitazionali."\nresult = compute_perplexity(frase, tokenizer, model)\n'

In [38]:
# Load the sentence table that the scoring cells extend with new columns.
csv_path = "data/csv/pseudoppl/f10_words.csv"
frasi_df = pd.read_csv(csv_path, encoding="utf-8")

In [39]:

# Summarise the loaded table: column names, non-null counts and dtypes.
frasi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   text                100 non-null    object 
 1   num_words           100 non-null    int64  
 2   dataset             100 non-null    object 
 3   pp_gulp_c2          100 non-null    float64
 4   pp_gulp_c32         100 non-null    float64
 5   pp_gulp_c512        100 non-null    float64
 6   pp_gulp_c8192       100 non-null    float64
 7   pp_gulp_c15449      100 non-null    float64
 8   pp_antigulp_c2      100 non-null    float64
 9   pp_antigulp_c32     100 non-null    float64
 10  pp_antigulp_c512    100 non-null    float64
 11  pp_antigulp_c8192   100 non-null    float64
 12  pp_antigulp_c15449  100 non-null    float64
 13  pp_random5_c2       100 non-null    float64
 14  pp_random5_c32      100 non-null    float64
 15  pp_random5_c512     100 non-null    float64
 16  pp_random

In [40]:
# Training run to evaluate and the checkpoint steps to sweep over
# (0 selects the final pretrained model inside compute_perplexity).
training_id = "ANTI_CURRICULUM"
checkpoints = [2, 32, 512, 8192, 0]

# Registers Series.progress_apply so each sweep shows a progress bar.
tqdm.pandas(desc="Processing sentences")

for checkpoint in checkpoints:
    print(f"Calcolo perplexity del modello allo step: {checkpoint}")
    column_name = f"pp_antireadit_c{checkpoint}"
    sentence_scores = frasi_df["text"].progress_apply(
        lambda sentence: compute_perplexity(sentence, checkpoint, training_id)
    )
    frasi_df[column_name] = sentence_scores

Calcolo perplexity del modello allo step: 2


Processing sentences: 100%|██████████| 100/100 [00:48<00:00,  2.07it/s]


Calcolo perplexity del modello allo step: 32


Processing sentences: 100%|██████████| 100/100 [00:48<00:00,  2.07it/s]


Calcolo perplexity del modello allo step: 512


Processing sentences: 100%|██████████| 100/100 [00:47<00:00,  2.09it/s]


Calcolo perplexity del modello allo step: 8192


Processing sentences: 100%|██████████| 100/100 [00:48<00:00,  2.06it/s]


Calcolo perplexity del modello allo step: 0


Processing sentences: 100%|██████████| 100/100 [00:48<00:00,  2.08it/s]


In [None]:
#frasi_df["readit_cFinal"] = frasi_df["text"].apply(compute_perplexity, args = )

In [34]:
# Preview the first rows of the scored table.
# NOTE(review): the saved output of this cell lists columns named
# pp_antireadit_<step>, while the scoring loop in this notebook writes
# pp_antireadit_c<step>; re-run the notebook top to bottom to refresh it.
frasi_df.head()

Unnamed: 0,text,num_words,dataset,pp_gulp_c2,pp_gulp_c32,pp_gulp_c512,pp_gulp_c8192,pp_gulp_c15449,pp_antigulp_c2,pp_antigulp_c32,...,pp_misto_c2,pp_misto_c32,pp_misto_c512,pp_misto_c8192,pp_misto_c15449,pp_antireadit_2,pp_antireadit_32,pp_antireadit_512,pp_antireadit_8192,pp_antireadit_0
0,Ma l'assassino è stato più veloce di lei.,10,wiki,6.4e-05,0.000399,0.074747,0.320007,0.339214,6.5e-05,0.000379,...,5e-05,0.000338,0.068793,0.323576,0.34509,3.1e-05,5.8e-05,0.000218,0.010143,0.022693
1,"Duca di Marino, II Duca di Miraglia, VIII",10,wiki,7.6e-05,0.001669,0.006879,0.059059,0.062741,8.5e-05,0.001716,...,6.6e-05,0.001251,0.002618,0.025771,0.055518,3.7e-05,5.7e-05,7.6e-05,7.7e-05,0.000606
2,Il singolo vende oltre di copie solo negli Usa.,10,wiki,5.3e-05,0.000302,0.079173,0.146777,0.146458,6.8e-05,0.000285,...,5.3e-05,0.000271,0.090314,0.138327,0.149582,4.3e-05,0.000132,0.139898,0.746319,0.661946
3,I nuovi successi continentali (1924-1928).,10,wiki,3.4e-05,0.000199,0.064485,0.275995,0.305674,3.3e-05,0.00021,...,3.8e-05,0.000212,0.058776,0.266913,0.30378,2.4e-05,7.8e-05,0.013062,0.173031,0.289449
4,"Intanto, Alaric e Jenna iniziano ad uscire ins...",10,wiki,4.9e-05,0.000814,0.004991,0.032791,0.040868,4.2e-05,0.000828,...,4.7e-05,0.000647,0.006396,0.021022,0.026111,4.5e-05,4.2e-05,0.000146,0.004021,0.006865
