In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import transformers
import torch

import os
import nltk
import pandas as pd
import torch
import numpy as np
from jinja2 import Template
import xmltodict
import pickle
from collections import defaultdict

from fuzzywuzzy import fuzz

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import sys
sys.path.append('/scratch/users/bozyurt20/hpc_run/utilities')
sys.path.append("/scratch/users/bozyurt20/hpc_run/blobs/")
from util_research import *
# Presumably a maximum sequence length in tokens; not referenced in the visible cells.
max_len = 512
# Expected dimensions of the per-layer encoder hidden states; used further down to
# size the (num_layers, T, d_model) buffer. Presumably T0pp's encoder depth and
# hidden size — TODO confirm against model.config.
num_layers = 24
d_model = 4096

In [26]:
# Tokenizer-only load (no model weights). This is the same call as in the
# model-loading cell that follows, so it is only useful for tokenizer experiments.
tokenizer = AutoTokenizer.from_pretrained(
    "bigscience/T0pp",
    truncation_side="right",
    add_prefix_space=True,
)

In [2]:
# Load the T0pp tokenizer; inputs longer than the limit are truncated on the right.
tokenizer = AutoTokenizer.from_pretrained(
    "bigscience/T0pp",
    truncation_side="right",
    add_prefix_space=True,
)
# Load the seq2seq model with default placement/precision.
# Options the author left disabled: device_map="auto", load_in_8bit=True
model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp")

In [3]:
def find_index_one(input_ids, entity_str, index, tok=None):
    """Return the position of the ``index``-th mention of a single-token entity.

    Args:
        input_ids: Batched token ids exposing ``.tolist()`` (e.g. the tensor
            returned by ``tokenizer.encode(..., return_tensors="pt")``). Only
            the first row of the batch is searched.
        entity_str: Entity text. It must encode to exactly two ids (one entity
            token followed by one trailing token, e.g. the end-of-sequence id).
        index: Which occurrence to return (0 = first). Negative values count
            from the last occurrence, as with ordinary list indexing.
        tok: Optional tokenizer-like object with an ``encode(str) -> list``
            method. Defaults to the notebook-level ``tokenizer`` so existing
            three-argument calls keep working.

    Returns:
        The integer position inside ``input_ids[0]``, or ``None`` (after
        printing a message) when the entity is not a single token or has no
        occurrence number ``index``.
    """
    if tok is None:
        # Fall back to the tokenizer loaded at notebook level.
        tok = tokenizer

    entity_ids = tok.encode(entity_str)
    if len(entity_ids) != 2:
        print("Not an appropriate entity!")
        return None
    entity_id = entity_ids[0]

    first_row = input_ids.tolist()[0]
    mention_positions = [pos for pos, tok_id in enumerate(first_row) if tok_id == entity_id]

    try:
        return mention_positions[index]
    except IndexError:
        # Only "no such occurrence" is treated as a soft failure; anything else
        # (e.g. a non-integer index) is a caller bug and should surface.
        print("entity not found in the input!")
        return None

In [4]:
from toy_dataset import contexts

# PART 1: Regular Prompt

In [5]:
# Question/context/answer prompt layout; the model is expected to continue after "Answer:".
tm = Template(
    'Question: "{{question}}"\n'
    'Context: "{{context}}"\n'
    'Answer:'
)

# Clean run: the first toy context together with a question about John's location.
question = "Where was John?"
answer_choices = ["park", "London", "kitchen", "room"]
context = contexts[0]

prompt = tm.render(question=question, context=context)
print(prompt)

Question: "Where was John?"
Context: "John went to the park. John was very happy."
Answer:


In [8]:
# Tokenize the prompt into a (1, T) tensor of ids.
input_ids = tokenizer.encode(prompt, return_tensors="pt")
# Generate a single new token and keep its scores for inspection in the next cell.
out = model.generate(
    input_ids,
    max_new_tokens=1,
    return_dict_in_generate=True,
    output_scores=True,
)

new code working-utils
new code working-modeling_t5
llama generation happening.
new code working-modeling_t5


In [9]:
# Normalise the scores of the single generated step into probabilities.
next_token_scores = torch.softmax(out.scores[0], dim=-1)  # (batch_size * num_beams, vocab_size)

# Probability that the first generated token is the first token of "park".
loc_token = tokenizer.encode("park")[0]
probability = next_token_scores[0, loc_token].item()
probability

0.2379118651151657

In [11]:
# Probability of token id 1079, the id the argmax/decode cells below report as 'John'.
next_token_scores[0, 1079].item()

0.3644513785839081

In [10]:
# Id of the highest-scoring token at the first generation step (one entry per batch row).
out.scores[0].argmax(dim=-1)

tensor([1079])

In [12]:
# Map token id 1079 (the argmax of the first-step scores shown above) back to text.
tokenizer.decode(1079)

'John'

# PART 2: Corrupted Run Probabilities

In [13]:
# Same question/context/answer layout as the regular prompt.
tm = Template(
    'Question: "{{question}}"\n'
    'Context: "{{context}}"\n'
    'Answer:'
)

# Corrupted run: the context never mentions John or any location.
question = "Where was John?"
answer_choices = ["park", "London", "kitchen", "room"]
context = "I have never played Monopoly. I am not very happy."

prompt = tm.render(question=question, context=context)
print(prompt)

Question: "Where was John?"
Context: "I have never played Monopoly. I am not very happy."
Answer:


In [14]:
# Tokenize the corrupted prompt into a (1, T) tensor of ids.
input_ids = tokenizer.encode(prompt, return_tensors="pt")
# Generate a single new token and keep its scores for the probability check below.
out = model.generate(
    input_ids,
    max_new_tokens=1,
    return_dict_in_generate=True,
    output_scores=True,
)

new code working-utils
new code working-modeling_t5
llama generation happening.
new code working-modeling_t5


In [15]:
# Normalise the scores of the single generated step into probabilities.
next_token_scores = torch.softmax(out.scores[0], dim=-1)  # (batch_size * num_beams, vocab_size)

# Probability of "park" as the first generated token for the corrupted prompt.
loc_token = tokenizer.encode("park")[0]
probability = next_token_scores[0, loc_token].item()
probability

0.00013248974573798478

# PART 3: Move Character Embeddings

In [35]:
# Same question/context/answer layout as the regular prompt.
tm = Template(
    'Question: "{{question}}"\n'
    'Context: "{{context}}"\n'
    'Answer:'
)

question = "Where was John?"
answer_choices = ["park", "London", "kitchen", "room"]
context = contexts[0]

# Put a space in front of every "John". In the recorded input_ids outputs below this
# makes the mention right after the opening quote encode to id 1079, like the others.
prompt = tm.render(question=question, context=context).replace("John", " John")
print(prompt)

Question: "Where was  John?"
Context: " John went to the park.  John was very happy."
Answer:


In [36]:
# Encode the clean prompt and run only the encoder to collect per-layer hidden states.
input_ids = tokenizer.encode(prompt, return_tensors="pt")
len_input_ids = len(input_ids[0])

# Inference only: skip autograd bookkeeping so the stored states do not hold a graph.
with torch.no_grad():
    # `output_special` / `special_hidden_states` come from the locally patched T5 code.
    out = model.encoder(input_ids, output_special=True)
special_hidden = out.special_hidden_states # 24 x (1, T, d)

# Join the per-layer (1, T, d) tensors along the first axis -> (num_layers, T, d).
special_reformatted = torch.cat(list(special_hidden), dim=0)
# Fail loudly on an unexpected layer count or size instead of silently keeping
# zero-filled rows, which a preallocated buffer would do.
assert special_reformatted.shape == (num_layers, len_input_ids, d_model), special_reformatted.shape

new code working-modeling_t5


In [29]:
# Display the encoded prompt. The output recorded under this cell has id 18300 at
# position 11, unlike the next display — presumably it was captured before the
# "John" -> " John" replacement was applied (TODO confirm).
input_ids

tensor([[11860,    10,    96, 25217,    47,  1079,  4609,  1193,  6327,    10,
            96, 18300,   877,    12,     8,  2447,     5,  1079,    47,   182,
          1095,   535, 11801,    10,     1]])

In [37]:
# Display the encoded prompt; in the recorded output id 1079 appears at positions 5, 11 and 17.
input_ids

tensor([[11860,    10,    96, 25217,    47,  1079,  4609,  1193,  6327,    10,
            96,  1079,   877,    12,     8,  2447,     5,  1079,    47,   182,
          1095,   535, 11801,    10,     1]])

In [34]:
# Encode a double quote followed by five spaces and "John". The recorded output
# [96, 1079, 1] has no separate ids for the run of spaces.
tokenizer.encode('"     John')

[96, 1079, 1]

In [38]:
# Position of occurrence #2 (zero-based, i.e. the third) of the "John" token in the clean prompt.
entity_ind = find_index_one(input_ids, "John", 2)
# find_index_one reports failure by returning None. Indexing a tensor with None does
# not raise; it inserts a new axis, which would silently produce a wrong-shaped state.
if entity_ind is None:
    raise ValueError('occurrence 2 of "John" was not found in input_ids')
# Take that position from every layer: (num_layers, d_model) -> (1, num_layers, d_model).
entity_hidden_state = special_reformatted[:, entity_ind, :].unsqueeze(0)

In [39]:
# Same question/context/answer layout as the regular prompt.
tm = Template(
    'Question: "{{question}}"\n'
    'Context: "{{context}}"\n'
    'Answer:'
)

question = "Where was John?"
answer_choices = ["park", "London", "kitchen", "room"]
# Corrupted context with a " John " mention prepended, giving the context a John token.
context = " John " + "I have never played Monopoly. I am not very happy."

prompt = tm.render(question=question, context=context)
print(prompt)

Question: "Where was John?"
Context: " John I have never played Monopoly. I am not very happy."
Answer:


In [40]:
# Encode the corrupted prompt and locate the position whose hidden state is replaced.
input_ids = tokenizer.encode(prompt, return_tensors="pt")
# NOTE(review): index 0 selects the first "John" token, which in this prompt looks like
# the one inside the question, not the one prepended to the context — confirm intent.
entity_inds = [find_index_one(input_ids, "John", 0)]
# find_index_one returns None on failure; stop here rather than pass None to generate.
if entity_inds[0] is None:
    raise ValueError('occurrence 0 of "John" was not found in input_ids')

# `entity_hidden_states` / `entity_inds` are arguments of the locally patched generate:
# the clean-run state is supplied for the listed position(s). One token is generated.
out = model.generate(
    input_ids=input_ids,
    entity_hidden_states=entity_hidden_state,
    entity_inds=entity_inds,
    max_new_tokens=1,
    return_dict_in_generate=True,
    output_scores=True,
)

new code working-utils
new code working-modeling_t5
llama generation happening.
new code working-modeling_t5


In [42]:
# Normalise the scores of the single generated step into probabilities.
next_token_scores = torch.softmax(out.scores[0], dim=-1)  # (batch_size * num_beams, vocab_size)

# Probability of "park" as the first generated token after patching in the clean-run state.
loc_token = tokenizer.encode("park")[0]
probability = next_token_scores[0, loc_token].item()
probability

3.90733803214971e-05