In [1]:
import functools
import os

import pandas as pd
import tokenizers
import torch
import torch.nn.functional as F
import transformers
from tqdm import tqdm
from transformers import BertTokenizer, BertForMaskedLM


In [2]:
def mask_all_tokens(sentence, tokenizer):
    """Build one masked copy of ``sentence`` per inner token position.

    The sentence is tokenized a single time. For every position except the
    first and the last one (presumably the special tokens the tokenizer adds,
    e.g. [CLS]/[SEP] for BERT - confirm for other tokenizers), an independent
    copy of the encoding is created in which that position of ``input_ids``
    is replaced by ``tokenizer.mask_token_id``.

    Args:
        sentence: Text to encode.
        tokenizer: Callable invoked as ``tokenizer(sentence, return_tensors='pt')``
            that returns a mapping of tensors containing ``'input_ids'``, and
            that exposes a ``mask_token_id`` attribute.

    Returns:
        A list of encodings of the same type the tokenizer returned; element
        ``k`` has position ``k + 1`` masked. The list is empty when the
        encoding has two positions or fewer.
    """
    encoded = tokenizer(sentence, return_tensors='pt')
    num_positions = len(encoded['input_ids'][0])

    masked_sentences = []
    for token_idx in range(1, num_positions - 1):
        # Clone every tensor so each variant owns its storage: a shallow copy
        # would share ``input_ids`` and every variant would end up fully masked.
        masked_input = type(encoded)(
            {name: tensor.clone() for name, tensor in encoded.items()}
        )
        masked_input['input_ids'][0][token_idx] = tokenizer.mask_token_id
        masked_sentences.append(masked_input)
    return masked_sentences

@functools.lru_cache(maxsize=1)
def _load_tokenizer():
    """Load the Italian BERT tokenizer once and reuse it on later calls."""
    return BertTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")


@functools.lru_cache(maxsize=1)
def _load_model(checkpoint, training_id, models_dir, device):
    """Load the masked-LM for one training run / step and place it on ``device``.

    Only the most recently requested model is cached (``maxsize=1``), so
    scoring many sentences with the same model loads it once, while sweeping
    over checkpoints does not accumulate models in memory.
    """
    if checkpoint == 15449:
        # Step 15449 is stored as the final model rather than as a checkpoint.
        model_path = f"{models_dir}/{training_id}/final_pretrained_model"
    else:
        model_path = f"{models_dir}/{training_id}/checkpoints/checkpoint-step{checkpoint}"
    model = BertForMaskedLM.from_pretrained(model_path)
    # The inputs are moved to ``device`` before the forward pass, so the
    # weights must live there too; otherwise CUDA machines raise a device
    # mismatch error.
    model.to(device)
    model.eval()
    return model


def compute_perplexity(sentence, checkpoint, training_id,
                       models_dir="C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/models"):
    """Score ``sentence`` with a masked language model checkpoint.

    Each inner token is masked in turn (via ``mask_all_tokens``) and the
    probability the model assigns to the original token at the masked
    position is recorded.

    NOTE: despite the name, the returned value is the arithmetic mean of
    those probabilities (in [0, 1], higher means the model predicts the
    sentence better); it is not a perplexity.

    Args:
        sentence: Text to score.
        checkpoint: Training step of the model to load; ``15449`` selects the
            ``final_pretrained_model`` directory, any other value selects
            ``checkpoints/checkpoint-step{checkpoint}``.
        training_id: Name of the training-run directory under ``models_dir``.
        models_dir: Root directory that contains one sub-directory per
            training run. Defaults to the original hard-coded location.

    Returns:
        The mean probability of the correct token over all masked positions,
        or ``float('nan')`` when the sentence yields no maskable position.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = _load_tokenizer()
    model = _load_model(checkpoint, training_id, models_dir, device)

    masked_sentences = mask_all_tokens(sentence, tokenizer)
    if not masked_sentences:
        # Nothing to average (e.g. empty text): avoid a ZeroDivisionError.
        return float('nan')

    original_tokens = tokenizer(sentence)['input_ids']
    correct_probs = []
    with torch.no_grad():
        # Variant k of ``masked_sentences`` has position k + 1 masked, hence
        # the enumeration starts at 1.
        for masked_idx, masked_sentence in enumerate(masked_sentences, start=1):
            outputs = model(**masked_sentence.to(device))
            probs = outputs.logits[0, masked_idx].softmax(dim=-1)
            correct_probs.append(probs[original_tokens[masked_idx]].item())
    return sum(correct_probs) / len(correct_probs)


In [3]:
# Disabled single-sentence example: the code below sits inside a string
# literal, so it is not executed; the cell merely evaluates to that string.
# NOTE(review): the call inside passes (frase, tokenizer, model), which does
# not match compute_perplexity(sentence, checkpoint, training_id) as defined
# in this notebook - update the arguments before re-enabling it.
'''
frase = "Nonostante l'apparente ineluttabilità del destino, intessuto com'è nel tessuto stesso del continuum spazio-temporale, la teoria della relatività generale di Einstein suggerisce che le curvature dello spazio-tempo, influenzate dalla distribuzione della massa-energia, possono dare origine a fenomeni astrofisici tanto enigmatici quanto i buchi neri e le onde gravitazionali."
result = compute_perplexity(frase, tokenizer, model)
'''

'\nfrase = "Nonostante l\'apparente ineluttabilità del destino, intessuto com\'è nel tessuto stesso del continuum spazio-temporale, la teoria della relatività generale di Einstein suggerisce che le curvature dello spazio-tempo, influenzate dalla distribuzione della massa-energia, possono dare origine a fenomeni astrofisici tanto enigmatici quanto i buchi neri e le onde gravitazionali."\nresult = compute_perplexity(frase, tokenizer, model)\n'

In [21]:
# Read the input CSV (UTF-8) into frasi_df; the path is relative to the
# notebook's working directory.
frasi_df = pd.read_csv(
    filepath_or_buffer="data/csv/pseudoppl/f30_words.csv",
    encoding="utf-8",
)

In [25]:
# Print the column names, non-null counts and dtypes of frasi_df.
frasi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   text                100 non-null    object 
 1   num_words           100 non-null    int64  
 2   dataset             100 non-null    object 
 3   pp_gulp_c2          100 non-null    float64
 4   pp_gulp_c32         100 non-null    float64
 5   pp_gulp_c512        100 non-null    float64
 6   pp_gulp_c8192       100 non-null    float64
 7   pp_gulp_c15449      100 non-null    float64
 8   pp_antigulp_c2      100 non-null    float64
 9   pp_antigulp_c32     100 non-null    float64
 10  pp_antigulp_c512    100 non-null    float64
 11  pp_antigulp_c8192   100 non-null    float64
 12  pp_antigulp_c15449  100 non-null    float64
 13  pp_random5_c2       100 non-null    float64
 14  pp_random5_c32      100 non-null    float64
 15  pp_random5_c512     100 non-null    float64
 16  pp_random

In [26]:
# Training steps to evaluate, the training-run directories to load them from,
# and the short label used in the result column for each run (same order).
checkpoints = [2, 32, 512, 8192, 15449]
training_ids = ["CURRICULUM", "ANTI_CURRICULUM", "RANDOM_S21", "RANDOM_S37"]
column_names = ["readit", "antireadit", "random21", "random37"]

# Register DataFrame/Series.progress_apply with a labelled progress bar.
tqdm.pandas(desc="Processing sentences")

# Score every sentence with every (training run, checkpoint) pair and store
# the result in a column named pp_<label>_c<checkpoint>.
for training_id, column_label in zip(training_ids, column_names):
    for checkpoint in checkpoints:
        print(f"Calcolo perplexity del modello {training_id} allo step: {checkpoint}")
        target_column = f"pp_{column_label}_c{checkpoint}"
        frasi_df[target_column] = frasi_df["text"].progress_apply(
            lambda text: compute_perplexity(text, checkpoint, training_id)
        )

Calcolo perplexity del modello CURRICULUM allo step: 2


Processing sentences:   0%|          | 0/100 [00:00<?, ?it/s]

Processing sentences: 100%|██████████| 100/100 [01:06<00:00,  1.49it/s]


Calcolo perplexity del modello CURRICULUM allo step: 32


Processing sentences: 100%|██████████| 100/100 [01:04<00:00,  1.55it/s]


Calcolo perplexity del modello CURRICULUM allo step: 512


Processing sentences: 100%|██████████| 100/100 [00:59<00:00,  1.67it/s]


Calcolo perplexity del modello CURRICULUM allo step: 8192


Processing sentences: 100%|██████████| 100/100 [01:00<00:00,  1.64it/s]


Calcolo perplexity del modello CURRICULUM allo step: 15449


Processing sentences: 100%|██████████| 100/100 [00:59<00:00,  1.68it/s]


Calcolo perplexity del modello ANTI_CURRICULUM allo step: 2


Processing sentences: 100%|██████████| 100/100 [01:05<00:00,  1.52it/s]


Calcolo perplexity del modello ANTI_CURRICULUM allo step: 32


Processing sentences: 100%|██████████| 100/100 [01:05<00:00,  1.52it/s]


Calcolo perplexity del modello ANTI_CURRICULUM allo step: 512


Processing sentences: 100%|██████████| 100/100 [01:01<00:00,  1.63it/s]


Calcolo perplexity del modello ANTI_CURRICULUM allo step: 8192


Processing sentences: 100%|██████████| 100/100 [01:05<00:00,  1.53it/s]


Calcolo perplexity del modello ANTI_CURRICULUM allo step: 15449


Processing sentences: 100%|██████████| 100/100 [01:06<00:00,  1.50it/s]


Calcolo perplexity del modello RANDOM_S21 allo step: 2


Processing sentences: 100%|██████████| 100/100 [01:06<00:00,  1.49it/s]


Calcolo perplexity del modello RANDOM_S21 allo step: 32


Processing sentences: 100%|██████████| 100/100 [01:07<00:00,  1.48it/s]


Calcolo perplexity del modello RANDOM_S21 allo step: 512


Processing sentences: 100%|██████████| 100/100 [01:05<00:00,  1.54it/s]


Calcolo perplexity del modello RANDOM_S21 allo step: 8192


Processing sentences: 100%|██████████| 100/100 [01:07<00:00,  1.48it/s]


Calcolo perplexity del modello RANDOM_S21 allo step: 15449


Processing sentences: 100%|██████████| 100/100 [01:05<00:00,  1.54it/s]


Calcolo perplexity del modello RANDOM_S37 allo step: 2


Processing sentences: 100%|██████████| 100/100 [01:02<00:00,  1.61it/s]


Calcolo perplexity del modello RANDOM_S37 allo step: 32


Processing sentences: 100%|██████████| 100/100 [01:04<00:00,  1.56it/s]


Calcolo perplexity del modello RANDOM_S37 allo step: 512


Processing sentences: 100%|██████████| 100/100 [01:04<00:00,  1.54it/s]


Calcolo perplexity del modello RANDOM_S37 allo step: 8192


Processing sentences: 100%|██████████| 100/100 [01:05<00:00,  1.54it/s]


Calcolo perplexity del modello RANDOM_S37 allo step: 15449


Processing sentences: 100%|██████████| 100/100 [01:05<00:00,  1.52it/s]


In [7]:
# Unfinished draft (the args tuple was never filled in): frasi_df["readit_cFinal"] = frasi_df["text"].apply(compute_perplexity, args = )

In [27]:
# Print the frasi_df summary again to check the columns added by the scoring loop.
frasi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 53 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   text                  100 non-null    object 
 1   num_words             100 non-null    int64  
 2   dataset               100 non-null    object 
 3   pp_gulp_c2            100 non-null    float64
 4   pp_gulp_c32           100 non-null    float64
 5   pp_gulp_c512          100 non-null    float64
 6   pp_gulp_c8192         100 non-null    float64
 7   pp_gulp_c15449        100 non-null    float64
 8   pp_antigulp_c2        100 non-null    float64
 9   pp_antigulp_c32       100 non-null    float64
 10  pp_antigulp_c512      100 non-null    float64
 11  pp_antigulp_c8192     100 non-null    float64
 12  pp_antigulp_c15449    100 non-null    float64
 13  pp_random5_c2         100 non-null    float64
 14  pp_random5_c32        100 non-null    float64
 15  pp_random5_c512       10

In [28]:
# Write frasi_df, including the newly computed columns, to a CSV file.
# NOTE(review): with the default arguments the row index is written as an
# unnamed first column - pass index=False if that is not wanted.
frasi_df.to_csv(
    path_or_buf="C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/perplexity/pseudoppl_frasi_len30.csv"
)