In [19]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import transformers
import torch
import os
import nltk
import pandas as pd
import torch
import numpy as np
from jinja2 import Template
import xmltodict
import pickle
from collections import defaultdict
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import sys
sys.path.append('/scratch/users/bozyurt20/hpc_run/utilities')
sys.path.append("/scratch/users/bozyurt20/hpc_run/blobs/")
from util_research import *

from toy_dataset import contexts

max_len = 512
num_layers = 24
d_model = 4096

In [20]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/T0pp", truncation_side="right")
model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp")#, device_map="auto", load_in_8bit=True)

KeyboardInterrupt: 

In [105]:
def find_index_one(input_ids, entity_str, index):
    
    entity_id = tokenizer.encode(entity_str)
    if len(entity_id) != 2:
        print("Not an appropriate entity!")
        return
    entity_id = entity_id[0]
    
    input_ids_list = input_ids.tolist()

    all_entity_mention_indices = []
    for i, j in enumerate(input_ids_list[0]):
        if j == entity_id:
            all_entity_mention_indices.append(i)
    try:
        entity_ind = all_entity_mention_indices[index]
        return entity_ind
    except:
        print("entity not found in the input!")
        return

In [7]:
for location in ["park", "London", "kitchen", "room", "forest"]:
    loc_tokens = tokenizer.encode(location)
    print(location)
    print(loc_tokens)
    print()

park
[2447, 1]

London
[1524, 1]

kitchen
[1228, 1]

room
[562, 1]

forest
[5827, 1]



In [8]:
for character in ["John", "Mary", "David", "Henry", "Lisa"]:
    char_tokens = tokenizer.encode(character)
    print(character)
    print(char_tokens)
    print()

John
[1079, 1]

Mary
[3790, 1]

David
[1955, 1]

Henry
[7780, 1]

Lisa
[11712, 1]



In [9]:
for character in [" John", " Mary", " David", " Henry", " Lisa"]:
    char_tokens = tokenizer.encode(character)
    print(character)
    print(char_tokens)
    print()

 John
[1079, 1]

 Mary
[3790, 1]

 David
[1955, 1]

 Henry
[7780, 1]

 Lisa
[11712, 1]



In [14]:
tm = Template("""Read the following context and choose the best option to answer the
question.
Context: {{ context }}
Question: {{ question }}
Options:
 - {{ answer_choices | join("\n - ") }}""")

context = contexts[0]
answer_choices = ["park", "London", "kitchen", "room"]
question = "Where was John?"
prompt = tm.render(context=context, question=question, answer_choices=answer_choices)
print(prompt)

Read the following context and choose the best option to answer the
question.
Context: John went to the park. John was very happy.
Question: Where
Options:
 - park
 - London
 - kitchen
 - room


In [17]:
input_ids = tokenizer.encode(prompt, return_tensors="pt")

In [58]:
out = model.generate(input_ids, max_new_tokens=1, return_dict_in_generate=True, output_scores=True)

new code working-utils
new code working-modeling_t5
llama generation happening.
new code working-modeling_t5


In [59]:
out

GreedySearchEncoderDecoderOutput(sequences=tensor([[   0, 2447]]), scores=(tensor([[-58.9907, -13.7392, -17.4525,  ..., -57.0442, -58.7730, -57.9561]]),), encoder_attentions=None, encoder_hidden_states=None, decoder_attentions=None, cross_attentions=None, decoder_hidden_states=None)

In [61]:
len(out.scores)

1

In [62]:
out.scores[0].shape

torch.Size([1, 32128])

In [63]:
 torch.argmax(out.scores[0], dim=-1)

tensor([2447])

In [64]:
next_token_scores = torch.nn.functional.softmax(
                out.scores[0], dim=-1
            )  # (batch_size * num_beams, vocab_size)

In [68]:
 torch.argmax(next_token_scores, dim=-1)

tensor([2447])

In [70]:
next_token_scores

tensor([[3.9759e-26, 1.7862e-06, 4.3577e-08,  ..., 2.7846e-25, 4.9429e-26,
         1.1188e-25]])

In [71]:
next_token_scores[0][2447]

tensor(0.9900)

In [80]:
loc_tokens = []
for location in answer_choices:
    loc_tokens.append(tokenizer.encode(location)[0])

In [82]:
loc_tokens

[2447, 1524, 1228, 562]

In [83]:
probabilities = {}
for loc_token in loc_tokens:
    probabilities[loc_token] = next_token_scores[0][loc_token]

In [84]:
probabilities

{2447: tensor(0.9900),
 1524: tensor(0.0066),
 1228: tensor(0.0001),
 562: tensor(0.0002)}

# PART 1: Regular Prompt

In [85]:
tm = Template("""Read the following context and choose the best option to answer the
question.
Context: {{ context }}
Question: {{ question }}
Options:
 - {{ answer_choices | join("\n - ") }}""")

context = contexts[0]
answer_choices = ["park", "London", "kitchen", "room"]
question = "Where was John?"
prompt = tm.render(context=context, question=question, answer_choices=answer_choices)
print(prompt)

Read the following context and choose the best option to answer the
question.
Context: John went to the park. John was very happy.
Question: Where was John?
Options:
 - park
 - London
 - kitchen
 - room


In [86]:
input_ids = tokenizer.encode(prompt, return_tensors="pt")
out = model.generate(input_ids, max_new_tokens=1, return_dict_in_generate=True, output_scores=True)

new code working-utils
new code working-modeling_t5
llama generation happening.
new code working-modeling_t5


In [95]:
next_token_scores = torch.nn.functional.softmax(
                out.scores[0], dim=-1
            )  # (batch_size * num_beams, vocab_size)

loc_tokens = []
for location in answer_choices:
    loc_tokens.append(tokenizer.encode(location)[0])

probabilities = {}
for loc_token in loc_tokens:
    probabilities[tokenizer.decode(loc_token)] = next_token_scores[0][loc_token].item()

In [96]:
probabilities

{'park': 0.9968858361244202,
 'London': 0.0008090201881714165,
 'kitchen': 3.175687379552983e-05,
 'room': 2.1005409507779405e-05}

# PART 2: Corrupted Run Probabilities

In [101]:
tm = Template("""Read the following context and choose the best option to answer the
question.
Context: {{ context }}
Question: {{ question }}
Options:
 - {{ answer_choices | join("\n - ") }}""")

context = "I have never played Monopoly. I am not very happy."
answer_choices = ["park", "London", "kitchen", "room"]
question = "Where was John?"
prompt = tm.render(context=context, question=question, answer_choices=answer_choices)
print(prompt)

Read the following context and choose the best option to answer the
question.
Context: I have never played Monopoly. I am not very happy.
Question: Where was John?
Options:
 - park
 - London
 - kitchen
 - room


In [102]:
input_ids = tokenizer.encode(prompt, return_tensors="pt")
out = model.generate(input_ids, max_new_tokens=1, return_dict_in_generate=True, output_scores=True)

new code working-utils
new code working-modeling_t5
llama generation happening.
new code working-modeling_t5


In [103]:
next_token_scores = torch.nn.functional.softmax(
                out.scores[0], dim=-1
            )  # (batch_size * num_beams, vocab_size)

loc_tokens = []
for location in answer_choices:
    loc_tokens.append(tokenizer.encode(location)[0])

probabilities = {}
for loc_token in loc_tokens:
    probabilities[tokenizer.decode(loc_token)] = next_token_scores[0][loc_token].item()

In [104]:
probabilities

{'park': 0.18794533610343933,
 'London': 0.7177245020866394,
 'kitchen': 0.027446743100881577,
 'room': 0.041640542447566986}

# PART 3: Move Character Embeddings

In [106]:
tm = Template("""Read the following context and choose the best option to answer the
question.
Context: {{ context }}
Question: {{ question }}
Options:
 - {{ answer_choices | join("\n - ") }}""")

context = contexts[0]
answer_choices = ["park", "London", "kitchen", "room"]
question = "Where was John?"
prompt = tm.render(context=context, question=question, answer_choices=answer_choices)
print(prompt)

Read the following context and choose the best option to answer the
question.
Context: John went to the park. John was very happy.
Question: Where was John?
Options:
 - park
 - London
 - kitchen
 - room


In [108]:
input_ids = tokenizer.encode(prompt, return_tensors="pt")
len_input_ids = len(input_ids[0])
out = model.encoder(input_ids, output_special=True)
special_hidden = out.special_hidden_states # 24 x (1, T, d)
    
special_reformatted = torch.zeros(num_layers, len_input_ids, d_model) # (24, T, d)
for i, hidden in enumerate(special_hidden):
    special_reformatted[i:i+1, :, :] = hidden

new code working-modeling_t5


NameError: name 'len_input_ids' is not defined

In [114]:
entity_ind = find_index_one(input_ids, "John", 1)
entity_hidden_state = special_reformatted[:, entity_ind, :].unsqueeze(0)

In [111]:
tm = Template("""Read the following context and choose the best option to answer the
question.
Context: {{ context }}
Question: {{ question }}
Options:
 - {{ answer_choices | join("\n - ") }}""")

context = "I have never played Monopoly. I am not very happy."
context = "John " + context
answer_choices = ["park", "London", "kitchen", "room"]
question = "Where was John?"
prompt = tm.render(context=context, question=question, answer_choices=answer_choices)
print(prompt)

Read the following context and choose the best option to answer the
question.
Context: John I have never played Monopoly. I am not very happy.
Question: Where was John?
Options:
 - park
 - London
 - kitchen
 - room


In [120]:
input_ids = tokenizer.encode(prompt, return_tensors="pt")
entity_inds = [ find_index_one(input_ids, "John", 0) ]
out = model.generate(input_ids=input_ids, entity_hidden_states=entity_hidden_state, entity_inds=entity_inds, max_new_tokens=1, return_dict_in_generate=True, output_scores=True)    

new code working-utils
new code working-modeling_t5
llama generation happening.
new code working-modeling_t5


In [122]:
tokenizer.decode(out[0])

TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

In [123]:
next_token_scores = torch.nn.functional.softmax(
                out.scores[0], dim=-1
            )  # (batch_size * num_beams, vocab_size)

loc_tokens = []
for location in answer_choices:
    loc_tokens.append(tokenizer.encode(location)[0])

probabilities = {}
for loc_token in loc_tokens:
    probabilities[tokenizer.decode(loc_token)] = next_token_scores[0][loc_token].item()

In [124]:
probabilities

{'park': 0.18760742247104645,
 'London': 0.7060736417770386,
 'kitchen': 0.032871078699827194,
 'room': 0.047161877155303955}