In [None]:
### Clone the repo
%cd /content
# Skip the clone when the checkout already exists so the cell can be re-run.
# `git clone` blocks until it finishes, so no sleep is needed afterwards.
![ -d lm-scorer ] || git clone https://github.com/simonepri/lm-scorer

# Absolute path: a relative `content/lm-scorer` would be resolved under /content.
%cd /content/lm-scorer

# `poetry install` was dropped: the recorded run reports
# "poetry: command not found" and the later cells still ran.
# NOTE(review): lm_scorer is presumably imported from this working directory — confirm.
# `%pip` (not `!pip`) installs into the environment of the running kernel.
# The recorded run resolved transformers 4.12.5.
%pip install transformers

Cloning into 'lm-scorer'...
remote: Enumerating objects: 396, done.[K
remote: Counting objects: 100% (142/142), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 396 (delta 75), reused 113 (delta 53), pack-reused 254[K
Receiving objects: 100% (396/396), 4.69 MiB | 31.17 MiB/s, done.
Resolving deltas: 100% (203/203), done.
/content/lm-scorer
/bin/bash: poetry: command not found
Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 42.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 67.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp

In [None]:
# Libraries
import re
import statistics

import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from lm_scorer.models.auto import AutoLMScorer as LMScorer

################################
# First calculate unigram product
################################
## Use lm-scorer to calculate the product of the unigram probabilities
# Available models
# list(LMScorer.supported_model_names())
# => ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "distilgpt2"]

# Checkpoint name; it is passed to the lm-scorer model here and reused for the
# transformers model and tokenizer loaded further down.
gpt2_variant = "gpt2-medium"

# Use the first CUDA device when torch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
batch_size = 1
scorer = LMScorer.from_pretrained(
    gpt2_variant,
    device=device,
    batch_size=batch_size,
)
 

################################
# Calculate the Sentence probability based on GPT-2 LM
# From: samern92/GPT-2-for-Psycholinguistic-Applications repository
################################

def softmax(x):
	'''
	Return the softmax of `x`, normalized over ALL of its elements.

	x: array-like of scores (list or numpy array).
	Returns a numpy array with the same shape as `x` whose entries sum to 1.
	An empty input yields an empty array.
	'''
	logits = np.asarray(x)
	if logits.size == 0:
		# Nothing to normalize; np.max would raise on an empty array.
		return np.exp(logits)
	# Shifting by the maximum leaves the result mathematically unchanged but keeps
	# np.exp from overflowing to inf (and the ratio from becoming nan) on large scores.
	exps = np.exp(logits - np.max(logits))
	return np.divide(exps, np.sum(exps))
	
def cloze_finalword(text):
	'''
	Return the model's probability of the final word of `text` given the words before it.

	This is a version of cloze generator that can handle words that are not in the
	model's dictionary: the final (critical) word may be split into several tokens,
	and the returned value is the product of the conditional probabilities of all of
	those tokens.

	text: string with at least two whitespace-separated words (a stem plus the
		critical word). Runs of whitespace are collapsed to single spaces before
		encoding.
	Returns: a numpy float, P(critical-word tokens | stem).
	Raises: ValueError if there is no stem to condition on.

	Relies on the module-level `tokenizer` and `model`.
	'''
	words = text.split()
	if len(words) < 2:
		raise ValueError(
			"cloze_finalword needs at least two whitespace-separated words "
			"(a stem and a final word); got: %r" % (text,)
		)

	# Encode the whitespace-normalized sentence, not the raw string: the stem below
	# is rebuilt with single spaces, so it is only guaranteed to be a prefix of the
	# sentence when the sentence is built the same way.
	whole_text_encoding = tokenizer.encode(' '.join(words))
	# The stem is the part leading up to but not including the critical word.
	stem_encoding = tokenizer.encode(' '.join(words[:-1]))
	if not stem_encoding:
		raise ValueError("the stem of %r encodes to no tokens" % (text,))

	# cw_encoding is just the difference between whole_text_encoding and stem_encoding.
	# Note: this might not correspond exactly to the word itself, e.g. in
	# 'Joe flicked the grasshopper' the critical word is ' grass', 'ho', 'pper'.
	cw_encoding = whole_text_encoding[len(stem_encoding):]
	n_cw = len(cw_encoding)

	# Run the entire sentence through the model once; outputs[0] is indexed below
	# as [batch][position][vocabulary entry].
	tokens_tensor = torch.tensor([whole_text_encoding])
	with torch.no_grad():
		predictions = model(tokens_tensor)[0]

	# "Go back in time": the scores at position p are the model's prediction for
	# token p + 1, so the rows that predict the critical-word tokens are the n_cw
	# rows ending just before the last position.
	context_logits = predictions[-1][-1 - n_cw:-1]
	# float64 log-softmax in one call: avoids copying every vocabulary entry into
	# Python and avoids log(0) when a probability underflows.
	log_probs = torch.log_softmax(context_logits.double(), dim=-1)
	cw_index = torch.tensor(cw_encoding, dtype=torch.long).unsqueeze(1)
	cw_log_probs = log_probs.gather(1, cw_index)

	# Sum of log-probabilities == log of the product of the conditional probabilities.
	return np.exp(cw_log_probs.sum().item())


# Load the pre-trained weights for the chosen variant - this takes the most time.
model = GPT2LMHeadModel.from_pretrained(
    gpt2_variant,
    output_hidden_states=True,
    output_attentions=True,
)
# Switch the model to evaluation mode before it is used for scoring.
model.eval()
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_variant)



####
# Calculate the Sentence Log Odds Ratio (SLOR)
####

def slor(probability_sentence, probability_unigram, text):
  """Return the per-word log odds ratio of two probabilities.

  Computes (log(probability_sentence) - log(probability_unigram)) divided by the
  number of whitespace-separated words in `text`.
  """
  log_odds = np.log(probability_sentence) - np.log(probability_unigram)
  n_words = len(text.split())
  return log_odds / n_words

def normilize_slor(list_of_slor):
  """Standardize a collection of SLOR scores to zero mean and unit spread.

  NOTE(review): the previous body only computed the mean and returned nothing;
  z-score standardization (population standard deviation) is the assumed
  intent - confirm before relying on it.

  list_of_slor: iterable of numeric scores.
  Returns: a new list of floats in the same order. An empty input gives [],
  and scores that are all identical give all zeros (no spread to divide by).
  """
  scores = list(list_of_slor)
  if not scores:
    return []
  mean_value = statistics.mean(scores)
  spread = statistics.pstdev(scores, mu=mean_value)
  if spread == 0:
    return [0.0 for _ in scores]
  return [(score - mean_value) / spread for score in scores]

# Sample inputs; the trailing numbers are the author's per-sentence annotations
# (presumably an expected ranking - confirm).
sentences = [
    'I love hotdogs',  # 3
    'I like kage',     # 2
    'like jam I',      # 1
]

def sentence_slor_scores(sentences):
  """Return one negated `slor` value per sentence, in input order.

  For each sentence this combines:
    * `cloze_finalword(sentence)` - the probability of the sentence's final word
      given the words before it (not of the whole sentence), and
    * `scorer.sentence_score(sentence, reduce="prod")` - used here as the
      unigram product.
  The two are passed to `slor` and the result is negated.

  NOTE(review): confirm that `reduce="prod"` really yields a unigram product;
  lm-scorer appears to describe it as the product of the model's token
  probabilities, in which case this is not a true SLOR.
  """
  def _negated_slor(sentence):
    # Conditional probability of the final word given its stem.
    final_word_probability = cloze_finalword(sentence)
    # Product of the per-token probabilities reported by lm-scorer.
    product_probability = scorer.sentence_score(sentence, reduce="prod")
    return -slor(final_word_probability, product_probability, sentence)

  return [_negated_slor(sentence) for sentence in sentences]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [None]:
# Score three example sentences and print the resulting list.
sentences = [
    'She is a citizen of France',
    'Jam jam tells no no today',
    'Krik spurs hotdogs every blue',
]
slor_scores = sentence_slor_scores(sentences)
print(slor_scores)

[-4.749636232293952, -9.919067396852965, -13.186498687497002]
