In [29]:
import copy
import functools
import os

import pandas as pd
import tokenizers
import torch
import torch.nn.functional as F
import transformers
from tqdm import tqdm
from transformers import BertTokenizer, BertForMaskedLM


In [30]:
def mask_all_tokens(sentence, tokenizer):
    """Build one masked copy of the tokenized sentence per inner token position.

    The sentence is tokenized a single time; every position except the first
    and the last (presumably the special start/end tokens added by the
    tokenizer -- confirm for non-BERT tokenizers) is then replaced, one at a
    time, by ``tokenizer.mask_token_id`` in an independent deep copy of the
    encoding.

    Args:
        sentence: Text to tokenize.
        tokenizer: Callable accepting ``(sentence, return_tensors='pt')`` and
            returning a mapping with an ``'input_ids'`` tensor whose first row
            holds the token ids; must expose ``mask_token_id``.

    Returns:
        A list of encodings (same type as the tokenizer output). Entry ``k``
        has position ``k + 1`` masked. The list is empty when the sentence
        yields no inner tokens.
    """
    tokenized_sentence = tokenizer(sentence, return_tensors='pt')
    num_tokens = len(tokenized_sentence['input_ids'][0])
    masked_sentences = []
    for token_idx in range(1, num_tokens - 1):
        # Deep copy so each masked variant owns its tensors; re-tokenizing the
        # same sentence on every iteration was redundant work.
        masked_input = copy.deepcopy(tokenized_sentence)
        masked_input['input_ids'][0][token_idx] = tokenizer.mask_token_id
        masked_sentences.append(masked_input)
    return masked_sentences

@functools.lru_cache(maxsize=1)
def _load_tokenizer():
    """Load the tokenizer once and reuse it for every sentence."""
    return BertTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")


@functools.lru_cache(maxsize=1)
def _load_masked_lm(checkpoint, training_id, models_dir):
    """Load one saved model, move it to the available device, and cache it.

    Only the most recently requested model is kept (``maxsize=1``) so that
    scoring many sentences with the same checkpoint does not reload it from
    disk, while switching checkpoints releases the previous model.
    """
    if checkpoint == 15449:
        # Step 15449 is stored as the final model rather than as a checkpoint.
        model_path = f"{models_dir}/{training_id}/final_pretrained_model"
    else:
        model_path = f"{models_dir}/{training_id}/checkpoints/checkpoint-step{checkpoint}"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BertForMaskedLM.from_pretrained(model_path)
    # The inputs are moved to `device` before the forward pass, so the model
    # must live there too; otherwise any CUDA machine fails with a device mismatch.
    model.to(device)
    model.eval()
    return model, device


def compute_perplexity(sentence, checkpoint, training_id,
                       models_dir="C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/models"):
    """Score a sentence by masking each token in turn.

    Despite the name, the returned value is not a perplexity: it is the
    arithmetic mean, over all inner token positions, of the probability the
    masked language model assigns to the original token when that position
    is masked.

    Args:
        sentence: Text to score.
        checkpoint: Training step of the model to load; 15449 selects the
            ``final_pretrained_model`` directory instead of a checkpoint.
        training_id: Name of the training run directory under ``models_dir``.
        models_dir: Root directory containing one sub-directory per run.

    Returns:
        Mean probability of the correct token as a float, or NaN when the
        sentence produces no tokens to mask.
    """
    tokenizer = _load_tokenizer()
    model, device = _load_masked_lm(checkpoint, training_id, models_dir)
    masked_sentences = mask_all_tokens(sentence, tokenizer)
    if not masked_sentences:
        # Nothing to average (e.g. empty string); avoid a ZeroDivisionError.
        return float('nan')
    original_tokens = tokenizer(sentence)['input_ids']
    correct_probs = []
    with torch.no_grad():
        # Entry k of masked_sentences has position k + 1 masked.
        for masked_idx, masked_sentence in enumerate(masked_sentences, start=1):
            outputs = model(**masked_sentence.to(device))
            probs = outputs.logits[0, masked_idx].softmax(dim=-1)
            correct_probs.append(probs[original_tokens[masked_idx]].item())
    return sum(correct_probs) / len(correct_probs)


In [3]:
# Disabled smoke test, kept as a bare string so nothing inside it is executed.
# NOTE(review): the call below passes (frase, tokenizer, model), which does not
# match compute_perplexity(sentence, checkpoint, training_id) as defined in
# this notebook -- update the arguments before re-enabling it.
'''
frase = "Nonostante l'apparente ineluttabilità del destino, intessuto com'è nel tessuto stesso del continuum spazio-temporale, la teoria della relatività generale di Einstein suggerisce che le curvature dello spazio-tempo, influenzate dalla distribuzione della massa-energia, possono dare origine a fenomeni astrofisici tanto enigmatici quanto i buchi neri e le onde gravitazionali."
result = compute_perplexity(frase, tokenizer, model)
'''

'\nfrase = "Nonostante l\'apparente ineluttabilità del destino, intessuto com\'è nel tessuto stesso del continuum spazio-temporale, la teoria della relatività generale di Einstein suggerisce che le curvature dello spazio-tempo, influenzate dalla distribuzione della massa-energia, possono dare origine a fenomeni astrofisici tanto enigmatici quanto i buchi neri e le onde gravitazionali."\nresult = compute_perplexity(frase, tokenizer, model)\n'

In [56]:
# Load the sentences to score; later cells read the "text" and "num_words" columns.
frasi_df = pd.read_csv("data/csv/pseudoppl/f30_words.csv", encoding="utf-8")

In [54]:
# Show the distinct values of the "num_words" column (presumably per-sentence
# word counts -- confirm against how the CSV was built).
frasi_df["num_words"].unique()

array([25], dtype=int64)

In [57]:
# Training steps to score, the run directories they belong to, and the short
# label used for each run in the output column names (same order as the runs).
checkpoints = [2, 32, 512, 8192, 15449]
training_ids = ["CURRICULUM", "ANTI_CURRICULUM", "RANDOM_S21", "RANDOM_S37"]
column_names = ["readit", "antireadit", "random21", "random37"]

# Register `progress_apply` on pandas objects so each pass shows a progress bar.
tqdm.pandas(desc="Processing sentences")

# Results frame: the sentence text plus one score column per (run, step) pair.
ppls = pd.DataFrame({"text": frasi_df["text"]})

for training_id, column_name in zip(training_ids, column_names):
    for checkpoint in checkpoints:
        print(f"Calcolo perplexity del modello {training_id} allo step: {checkpoint}")
        score_column = f"pp_{column_name}_c{checkpoint}"
        ppls[score_column] = frasi_df["text"].progress_apply(
            lambda text: compute_perplexity(text, checkpoint, training_id)
        )

# Persist every score column in a single CSV (the row index is written too).
ppls.to_csv(f"C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/perplexity/ricalcolati/ppl_ricalcolati_l30.csv")

Calcolo perplexity del modello CURRICULUM allo step: 2


Processing sentences: 100%|██████████| 100/100 [03:33<00:00,  2.13s/it]


Calcolo perplexity del modello CURRICULUM allo step: 32


Processing sentences: 100%|██████████| 100/100 [03:36<00:00,  2.16s/it]


Calcolo perplexity del modello CURRICULUM allo step: 512


Processing sentences: 100%|██████████| 100/100 [03:35<00:00,  2.16s/it]


Calcolo perplexity del modello CURRICULUM allo step: 8192


Processing sentences: 100%|██████████| 100/100 [03:35<00:00,  2.15s/it]


Calcolo perplexity del modello CURRICULUM allo step: 15449


Processing sentences: 100%|██████████| 100/100 [04:02<00:00,  2.42s/it]


Calcolo perplexity del modello ANTI_CURRICULUM allo step: 2


Processing sentences: 100%|██████████| 100/100 [03:46<00:00,  2.26s/it]


Calcolo perplexity del modello ANTI_CURRICULUM allo step: 32


Processing sentences: 100%|██████████| 100/100 [03:42<00:00,  2.23s/it]


Calcolo perplexity del modello ANTI_CURRICULUM allo step: 512


Processing sentences: 100%|██████████| 100/100 [03:47<00:00,  2.27s/it]


Calcolo perplexity del modello ANTI_CURRICULUM allo step: 8192


Processing sentences: 100%|██████████| 100/100 [03:44<00:00,  2.24s/it]


Calcolo perplexity del modello ANTI_CURRICULUM allo step: 15449


Processing sentences: 100%|██████████| 100/100 [03:44<00:00,  2.24s/it]


Calcolo perplexity del modello RANDOM_S21 allo step: 2


Processing sentences: 100%|██████████| 100/100 [03:44<00:00,  2.24s/it]


Calcolo perplexity del modello RANDOM_S21 allo step: 32


Processing sentences: 100%|██████████| 100/100 [03:41<00:00,  2.21s/it]


Calcolo perplexity del modello RANDOM_S21 allo step: 512


Processing sentences: 100%|██████████| 100/100 [03:46<00:00,  2.26s/it]


Calcolo perplexity del modello RANDOM_S21 allo step: 8192


Processing sentences: 100%|██████████| 100/100 [03:44<00:00,  2.24s/it]


Calcolo perplexity del modello RANDOM_S21 allo step: 15449


Processing sentences: 100%|██████████| 100/100 [03:55<00:00,  2.35s/it]


Calcolo perplexity del modello RANDOM_S37 allo step: 2


Processing sentences: 100%|██████████| 100/100 [03:44<00:00,  2.24s/it]


Calcolo perplexity del modello RANDOM_S37 allo step: 32


Processing sentences: 100%|██████████| 100/100 [03:45<00:00,  2.25s/it]


Calcolo perplexity del modello RANDOM_S37 allo step: 512


Processing sentences: 100%|██████████| 100/100 [03:42<00:00,  2.22s/it]


Calcolo perplexity del modello RANDOM_S37 allo step: 8192


Processing sentences: 100%|██████████| 100/100 [03:44<00:00,  2.24s/it]


Calcolo perplexity del modello RANDOM_S37 allo step: 15449


Processing sentences: 100%|██████████| 100/100 [03:43<00:00,  2.24s/it]


In [24]:
# Print a summary of the results frame: columns, non-null counts and dtypes.
ppls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   text                          100 non-null    object 
 1   pp_pp_random21_c15449_c15449  100 non-null    float64
dtypes: float64(1), object(1)
memory usage: 1.7+ KB


In [39]:
# Preview the first rows of the results frame.
ppls.head()

Unnamed: 0,text,pp_readit_c2,pp_readit_c32,pp_readit_c512,pp_readit_c8192,pp_readit_c15449,pp_antireadit_c2,pp_antireadit_c32,pp_antireadit_c512,pp_antireadit_c8192,...,pp_random21_c2,pp_random21_c32,pp_random21_c512,pp_random21_c8192,pp_random21_c15449,pp_random37_c2,pp_random37_c32,pp_random37_c512,pp_random37_c8192,pp_random37_c15449
0,Ma l'assassino è stato più veloce di lei.,6.5e-05,0.000361,0.084295,0.3251,0.349395,6e-05,0.000449,0.079402,0.339348,...,6.2e-05,0.000427,0.082885,0.334055,0.342141,6.5e-05,0.000413,0.074851,0.312396,0.362874
1,"Duca di Marino, II Duca di Miraglia, VIII",8.3e-05,0.001558,0.00544,0.071943,0.118871,6.9e-05,0.001579,0.007463,0.057442,...,8.3e-05,0.001382,0.004638,0.047391,0.077207,8.9e-05,0.001954,0.009392,0.055464,0.08264
2,Il singolo vende oltre di copie solo negli Usa.,7.2e-05,0.000333,0.098513,0.145633,0.154799,6.3e-05,0.000406,0.095078,0.153397,...,5.8e-05,0.000329,0.090362,0.155258,0.159268,6.7e-05,0.000364,0.084553,0.158911,0.166263
3,I nuovi successi continentali (1924-1928).,3.2e-05,0.000217,0.07575,0.27828,0.345518,4e-05,0.000245,0.070914,0.260305,...,3.8e-05,0.000252,0.074242,0.287079,0.299804,3.5e-05,0.000227,0.063443,0.284489,0.328087
4,"Intanto, Alaric e Jenna iniziano ad uscire ins...",4.1e-05,0.000765,0.004048,0.032137,0.058406,4.4e-05,0.000727,0.005832,0.023416,...,4.8e-05,0.000711,0.004439,0.022138,0.043696,4.3e-05,0.000921,0.006401,0.031996,0.055702
