In [None]:
import pandas as pd
import numpy as np
from wordfreq import word_frequency
import string
import math

In [None]:
import torch

# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# Bare expression so the notebook shows the selected device as the cell output.
device

In [None]:
# Load the Provo predictability norms and reduce them to one row per word,
# expressed as a single pipeline so every step is visible in order.
dataset = (
  pd.read_csv("data/Provo_Corpus-Predictability_Norms.csv", encoding='latin-1')
  # discard every row that contains a NaN in any column
  .dropna()
  # keep only the first row for each Word_Unique_ID
  .drop_duplicates(subset='Word_Unique_ID')
  # maintain only the identifier / position columns and the word itself
  .loc[:, [
    'Word_Unique_ID',
    'Text_ID',
    'Text',
    'Word_Number',
    'Sentence_Number',
    'Word_In_Sentence_Number',
    'Word',
  ]]
)

In [None]:
# Build list_of_text with one entry per Text_ID: flag the first row seen for
# each id, then read the Text column of exactly those rows.
is_first_row_of_text = ~dataset['Text_ID'].duplicated()
list_of_text = dataset.loc[is_first_row_of_text, 'Text'].tolist()

In [None]:
# TODO (translated from the original note): to be re-checked, WORD_NUMBER is missing.
#
# Rebuild a word-level table from the raw texts. For every word that can be
# matched to a row of `dataset`, one row is emitted holding the word, its
# position counters and the prefix, i.e. the text that precedes the word.

# Map (Text_ID, Sentence_Number, Word_In_Sentence_Number) -> Word_Unique_ID once,
# instead of scanning the whole DataFrame with three boolean masks per word.
# setdefault keeps the first row for a position, like `.values[0]` on a filtered frame.
id_by_position = {}
for text_id, sentence_no, word_no, unique_id in zip(
    dataset['Text_ID'].tolist(),
    dataset['Sentence_Number'].tolist(),
    dataset['Word_In_Sentence_Number'].tolist(),
    dataset['Word_Unique_ID'].tolist(),
):
  id_by_position.setdefault((text_id, sentence_no, word_no), unique_id)

list_of_new_rows = []
# NOTE(review): texts are numbered 1..N in list order, which assumes Text_ID
# values are 1..N in the order the texts first appear in `dataset` - confirm.
for count_text, text in enumerate(list_of_text, start=1):
  prefix = ""      # text seen so far within the current text
  word_number = 1  # running word counter across the whole text
  # Sentences are delimited by "." only; "?" and "!" do not start a new sentence here.
  # NOTE(review): confirm this matches how Sentence_Number was assigned in the corpus.
  for count_sentence, sentence in enumerate(text.split("."), start=1):
    count_word = 1
    for raw_word in sentence.split():
      word = raw_word.strip('"')
      if not word:
        # a token made only of double quotes is skipped and consumes no counter
        continue

      position = (count_text, count_sentence, count_word)
      if position in id_by_position:
        list_of_new_rows.append([
          id_by_position[position], count_text, word_number,
          count_sentence, count_word, prefix, word,
        ])

      # prefix is empty only before the very first word of a text, so that word
      # starts the prefix without a leading space
      prefix = prefix + " " + word if prefix else word
      count_word += 1
      word_number += 1

    # restore the full stop that text.split(".") removed
    prefix = prefix + "."

# Create a new DataFrame with the new rows
df = pd.DataFrame(list_of_new_rows, columns=['Word_Unique_ID','Text_ID', 'Word_Number','Sentence_Number', 'Word_In_Sentence_Number', 'Prefix', 'Word'])

# frequency

In [None]:
# Attach the English frequency reported by wordfreq for every word of the table.
df['Frequency'] = df['Word'].map(lambda token: word_frequency(token, 'en'))

# Show the table with the new column as the cell output.
df

# surprisal

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal language model from the same pretrained
# checkpoint; the tokenizer is created with add_prefix_space=True.
checkpoint = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [None]:
# Add a surprisal column using the GPT-2 model: for every word, the negative
# natural-log probability the model assigns to it as the next token after Prefix.
list_surprisal = []

model = model.to(device)

for sentence_no, word_no, raw_word, context in zip(
    df["Sentence_Number"], df["Word_In_Sentence_Number"], df["Word"], df["Prefix"]
):
  if sentence_no == 1 and word_no == 1:
    # first word of a text: there is no preceding context, so 0 is stored as a placeholder
    list_surprisal.append(0)
    continue

  # Normalise the target word: trim surrounding punctuation, lowercase it and
  # drop any comma left inside the word.
  next_word_gpt = raw_word.strip(string.punctuation).lower().replace(",", "")

  # Only the first sub-token of the word is scored.
  # NOTE(review): a word split into several tokens gets the surprisal of its first piece only.
  target_ids = tokenizer(next_word_gpt).input_ids
  if not target_ids:
    # nothing left to score (e.g. the word was pure punctuation): record a
    # missing value instead of failing on an empty id list
    list_surprisal.append(math.nan)
    continue

  encoded_text = tokenizer(context, return_tensors="pt").to(device)

  with torch.inference_mode():
    next_token_logits = model(**encoded_text).logits[0, -1, :]
    # log_softmax is computed directly: taking softmax and then the log can
    # underflow to log(0) for very unlikely tokens
    next_token_log_probs = torch.log_softmax(next_token_logits, dim=-1)

  surprisal_gpt = -next_token_log_probs[target_ids[0]].item()
  list_surprisal.append(round(surprisal_gpt, 5))

df["Surprisal"] = list_surprisal

In [None]:
# Display the table, which now also carries the Frequency and Surprisal columns.
df

In [None]:
# Save the final table as CSV; index=False keeps the DataFrame index out of the file.
df.to_csv("data/dataset_text_properties.csv", index=False)