## Necessary Imports and Setup

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

import os
import nltk
import pandas as pd
import torch
import numpy as np
from jinja2 import Template
import xmltodict
import pickle

from fuzzywuzzy import fuzz
import Levenshtein as lev
from rouge import Rouge

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [46]:
from numba import cuda
cuda.select_device(0)
cuda.close()

In [2]:
stop_words = set(stopwords.words("english"))

In [3]:
path_andersen = "/kuacc/users/bozyurt20/ChildrenStories/Andersen"
path_fanny = "/kuacc/users/bozyurt20/ChildrenStories/Fanny Fern"
path_annotations = "/kuacc/users/bozyurt20/ChildrenStories/Annotations"

dir_list_andersen = os.listdir(path_andersen)
dir_list_fanny = os.listdir(path_fanny)
dir_list_annotations = os.listdir(path_annotations)

In [4]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [6]:
model = model.cuda()

## Example Pipeline

In [19]:
inputs = tokenizer.encode( "Translate fromEnglish to German: I hate that they cancelled my membership.", return_tensors="pt")

In [20]:
               
inputs = inputs.to("cuda:0")

with torch.no_grad():
    outputs = model.generate(inputs)

out = tokenizer.decode(outputs[0], skip_special_tokens=True)


In [21]:
out

'Ich hasse, dass sie meine Mitgliedschaft storniert.'

## Prompt Creating Function

In [25]:
def create_prompt_clipped(version, context, character, grammatical_number, max_no_tokens=512):
    
    if grammatical_number == 'singular':
        to_be = 'is'
    elif grammatical_number == 'plural':
        to_be = 'are'
    
    if version in [1, 2, 9, 10, 11, 12, 13, 20, 21, 22]:
        question = "Where " + to_be + " " + character + "?"
    elif version in [4, 5, 7, 8, 15, 16, 18, 19]:
        question = "where " + character + " " + to_be + "."
    elif version in [3, 14]:
        question = "where " + character + " " + to_be + "?"
    elif version in [6, 17]:
        question = "where " + to_be + " " + character + "?"
        
    if version == 1 or version == 12:
        intro = "Answer the question depending on the context."
    elif version == 2 or version == 13:
        intro = "What is the answer?"
    elif version == 3 or version == 14:
        intro = "Can you tell me "
    elif version == 4 or version == 15:
        intro = "Please tell me "
    elif version == 5 or version == 16:
        intro = "Tell me "
    elif version == 6 or version == 17:
        intro = "From the passage, "
    elif version == 7 or version == 18:
        intro = "I want to know "
    elif version == 8 or version == 19:
        intro = "I want to ask "
    elif version == 9 or version == 20:
        intro = "What is the answer to: "
    elif version == 10 or version == 21:
        intro = "Find the answer to: "
    elif version == 11 or version == 22:
        intro = "Answer: "     
    
    if version in [1, 2]:
        oo = 0
        tm = Template("""{{ intro }}
Context: {{context}};
Question: {{question}};
Answer: """)        
        prompt = tm.render(intro=intro, context=context, question=question)
        
        while len(tokenizer.encode(prompt)) > max_no_tokens:
            context = tokenizer.encode(context)
            diff = len(tokenizer.encode(prompt)) - max_no_tokens
            context = context[diff:]
            oo += 1
            if oo > 4:
                context = context[1:]
            context = tokenizer.decode(context, skip_special_tokens=True)
            prompt = tm.render(intro=intro, context=context, question=question)
        
    elif version in [3, 4, 5, 6, 7, 8, 9, 10, 11]:
        oo = 0
        tm = Template("{{context}} {{intro}}{{question}}")
        prompt = tm.render(intro=intro, context=context, question=question)
        while len(tokenizer.encode(prompt)) > max_no_tokens:
            context = tokenizer.encode(context)
            diff = len(tokenizer.encode(prompt)) - max_no_tokens
            context = context[diff:]            
            oo += 1
            if oo > 4:
                context = context[1:]
            context = tokenizer.decode(context, skip_special_tokens=True)
            prompt = tm.render(intro=intro, context=context, question=question)
        
        
    elif version in [12, 13]:
        oo = 0
        tm = Template("""{{ intro }}
Context: {{context}};
Question: {{question}};
If you can't find the answer, please respond "unanswerable".
Answer: """)
        prompt = tm.render(intro=intro, context=context, question=question)
        while len(tokenizer.encode(prompt)) > max_no_tokens:
            context = tokenizer.encode(context)
            diff = len(tokenizer.encode(prompt)) - max_no_tokens
            context = context[diff:]
            oo += 1
            if oo > 4:
                context = context[1:]
            context = tokenizer.decode(context, skip_special_tokens=True)
            prompt = tm.render(intro=intro, context=context, question=question)
        
    elif version in [14, 15, 16, 17, 18, 19, 20, 21, 22]:
        oo = 0
        tm = Template('{{context}} {{intro}}{{question}} If you can\'t find the answer, please respond "unanswerable"."')
        prompt = tm.render(intro=intro, context=context, question=question)    
        while len(tokenizer.encode(prompt)) > max_no_tokens:
            context = tokenizer.encode(context)
            diff = len(tokenizer.encode(prompt)) - max_no_tokens
            context = context[diff:]
            oo += 1
            if oo > 4:
                context = context[1:]
            context = tokenizer.decode(context, skip_special_tokens=True)
            prompt = tm.render(intro=intro, context=context, question=question)
            
    elif version == 23:
        oo = 0
        prompt = "Where " + to_be + " " + character + " in the following text: " + context + " Answer: "
        while len(tokenizer.encode(prompt)) > max_no_tokens:
            context = tokenizer.encode(context)
            diff = len(tokenizer.encode(prompt)) - max_no_tokens
            context = context[diff:]
            oo += 1
            if oo > 4:
                context = context[1:]
            context = tokenizer.decode(context, skip_special_tokens=True)
            prompt = "Where " + to_be + " " + character + " in the following text: " + context + " Answer: "
        
    return prompt, context

## Accuracy Calculating Function

In [26]:
def exact_match(predictions):

    matches_exact = {}

    for item in predictions:

        matches_exact[item] = [ [] for _ in range(1,24)]

        f = open(os.path.join(path_annotations, item), 'r')
        annotations = pd.read_csv(f, sep="\t")
        annotations = annotations.values #numpy array
        f.close()

        for k in range(1,24):

            pred_locs = predictions[item][k-1]
            i = 0

            for line in annotations:

                character = line[1]
                gold_locations = line[2].split("/")
                
                pred_tokenized = word_tokenize(pred_locs[i].lower())
                new_pred_tokens = [ token for token in pred_tokenized if token not in stop_words]
                pred_wo_stop_words = " ".join(new_pred_tokens)
                
                char_tokenized = word_tokenize(character.lower())
                new_char_tokens = [ token for token in char_tokenized if token not in stop_words]
                char_wo_stop_words = " ".join(new_char_tokens)
                
                if char_wo_stop_words not in " ".join(gold_locations):
                    pred_wo_stop_words = pred_wo_stop_words.replace(char_wo_stop_words, "")   
                
                else:
                    if pred_wo_stop_words[len(char_wo_stop_words)+1:len(char_wo_stop_words)+3] == "is" or pred_wo_stop_words[len(char_wo_stop_words)+1:len(char_wo_stop_words)+4] == "are":
                        pred_wo_stop_words = pred_wo_stop_words[len(char_wo_stop_words)+1:]

                match = False

                for gold_location in gold_locations:

                    gold_tokenized = word_tokenize(gold_location.lower())
                    new_gold_tokens = [ token for token in gold_tokenized if token not in stop_words]
                    gold_wo_stop_words = " ".join(new_gold_tokens)

                    if gold_wo_stop_words == pred_wo_stop_words:
                        match = True

                if match:
                    matches_exact[item][k-1].append(1)
                else:
                    matches_exact[item][k-1].append(0)

                i += 1
    
    return matches_exact

In [37]:
def fuzzy_match(predictions):
    
    matches_fuzzy = {}

    for item in predictions:

        print(item)
        matches_fuzzy[item] = [ [] for _ in range(1,24)]

        f = open(os.path.join(path_annotations, item), 'r')
        annotations = pd.read_csv(f, sep="\t")
        annotations = annotations.values #numpy array
        f.close()

        for k in range(1,24):

            pred_locs = predictions[item][k-1]        
            i = 0

            for line in annotations:

                gold_locations = line[2].split("/")

                pred_tokenized = word_tokenize(pred_locs[i].lower())
                new_pred_tokens = [ token for token in pred_tokenized if token not in stop_words ]
                pred_wo_stop_words = " ".join(new_pred_tokens)
                
                char_tokenized = word_tokenize(character.lower())
                new_char_tokens = [ token for token in char_tokenized if token not in stop_words]
                char_wo_stop_words = " ".join(new_char_tokens)
                
                if char_wo_stop_words not in " ".join(gold_locations):
                    pred_wo_stop_words = pred_wo_stop_words.replace(char_wo_stop_words, "")   
                
                else:
                    if pred_wo_stop_words[len(char_wo_stop_words)+1:len(char_wo_stop_words)+3] == "is" or pred_wo_stop_words[len(char_wo_stop_words)+1:len(char_wo_stop_words)+4] == "are":
                        pred_wo_stop_words = pred_wo_stop_words[len(char_wo_stop_words)+1:]

                match = False

                for gold_location in gold_locations:

                    gold_tokenized = word_tokenize(gold_location.lower())
                    new_gold_tokens = [ token for token in gold_tokenized if token not in stop_words ]
                    gold_wo_stop_words = " ".join(new_gold_tokens)

                    if fuzz.partial_ratio(gold_wo_stop_words, pred_wo_stop_words) > 90:
                        match = True

                if match: 
                    matches_fuzzy[item][k-1].append(1)
                else:
                    matches_fuzzy[item][k-1].append(0)

                i += 1
                
    return matches_fuzzy

## Making Predictions

In [27]:
def text_clean_ending(example_text):
    example_text = example_text.rstrip(", ;-\n")
    if example_text[-1] != ".":
        example_text += "."
    return example_text

def remove_new_lines(text):
    paragraphs = text.split("\n\n")
    new_paragraphs = []
    for paragraph in paragraphs:
        new_paragraphs.append(paragraph.replace("\n", " "))
    new_text = "\n".join(new_paragraphs)
    return new_text

In [28]:
# prompt has the max number of tokens: 512, and we start at a " " char.

m4_predictions = {}

for item in dir_list_andersen:
    
    if item in dir_list_annotations:
        
        print(item)
        
        f = open(os.path.join(path_andersen, item), 'r') 
        story = f.read()
        f.close()
        
        out_path = "T5_Method4_" + item[:-3] + "xlsx"
        writer = pd.ExcelWriter(out_path, engine='xlsxwriter')
        workbook = writer.book
        format = workbook.add_format({'text_wrap': True})
        
        m4_predictions[item] = [ [] for _ in range(1,24)]
        
        f = open(os.path.join(path_annotations, item), 'r')
        annotations = pd.read_csv(f, sep="\t")
        annotations = annotations.values
        f.close()
        
        i = 0
        
        paragraphs = story.split("\n\n")
        paragraph = paragraphs[0]
        len_title = len(paragraph) + 2        
    
        for line in annotations:
            
            character = line[1]
            gold_answer = line[2]
            grammatical_number = line[3]

            gold_locations = gold_answer.split("/")
            my_dic = {"Prompts": [gold_answer, "-", "-"]}
            
            for k in range(1, 24):
                
                y = line[0]
                x = y - 5120

                if x < len_title:
                    text = story[len_title:y]

                else:
                    x = story[x:y].find(" ") + x
                    text = story[x:y]                
                
                text = text_clean_ending(text)
                text = remove_new_lines(text)                    
                
                prompt, context2 = create_prompt_clipped(k, text, character, grammatical_number, 512)
                inputs = tokenizer.encode(prompt, return_tensors="pt")
                
                inputs = inputs.to("cuda:0")
                
                with torch.no_grad():
                    outputs = model.generate(inputs)
                    
                out = tokenizer.decode(outputs[0], skip_special_tokens=True)
                
                match1 = "No"
                match2 = "No"
                
                pred_tokenized = word_tokenize(out.lower())
                new_pred_tokens = [ token for token in pred_tokenized if token not in stop_words ]
                pred_wo_stop_words = " ".join(new_pred_tokens) 
                
                for gold_location in gold_locations:
                    
                    gold_tokenized = word_tokenize(gold_location.lower())
                    new_gold_tokens = [ token for token in gold_tokenized if token not in stop_words ]
                    gold_wo_stop_words = " ".join(new_gold_tokens)
                    
                    if gold_wo_stop_words == pred_wo_stop_words:
                        match1 = "Yes"
                        
                    if fuzz.partial_ratio(gold_wo_stop_words, pred_wo_stop_words) > 90:
                        match2 = "Yes"
                        
                my_dic[prompt] = [out, match1, match2]
                m4_predictions[item][k-1].append(out)  
                
            df = pd.DataFrame(data=my_dic, index=["output", "exact match?", "fuzzy match?"])
            df = (df.T)
            df.to_excel(writer, sheet_name=str(i+1))
            worksheet = writer.sheets[str(i+1)]
            
            i += 1
            
            for idx, col in enumerate(df):
                max_len = 75
                worksheet.set_column(idx, idx, max_len, format)
        
        writer.save()


Andersen_story2.txt


Token indices sequence length is longer than the specified maximum sequence length for this model (708 > 512). Running this sequence through the model will result in indexing errors


Andersen_story8.txt
Andersen_story11.txt
Andersen_story7.txt
Andersen_story17.txt
Andersen_story15.txt
Andersen_story9.txt
Andersen_story5.txt
Andersen_story1.txt
Andersen_story12.txt
Andersen_story16.txt
Andersen_story18.txt
Andersen_story3.txt
Andersen_story10.txt
Andersen_story13.txt


In [29]:
with open("T5_predictions_1.txt", "wb") as f:
    pickle.dump(m4_predictions, f)

## Calculating the Accuracy

In [None]:
with open("Method4Predictions_distraction.txt", "rb") as f:
    m4_predictions = pickle.load(f)

In [30]:
m4_matches_exact = exact_match(m4_predictions)

In [31]:
m4_accuracy_exact = {}

for item in m4_matches_exact:
    m4_accuracy_exact[item] = []
    for prompt_version in m4_matches_exact[item]:
        m4_accuracy_exact[item].append(np.mean(np.array(prompt_version)))

In [32]:
m4_prompt_accuracies_exact = [ [] for _ in range(23)]

for k in range(23):
    for item in m4_accuracy_exact:
        m4_prompt_accuracies_exact[k].append(m4_accuracy_exact[item][k])

In [33]:
m4_prompt_accuracy_exact = np.mean(np.array(m4_prompt_accuracies_exact), axis=1)

In [34]:
m4_prompt_accuracy_exact.argmax()

0

In [35]:
m4_prompt_accuracy_exact[5]

0.0

In [36]:
m4_prompt_accuracy_exact.mean()

0.0

In [38]:
m4_prompt_accuracy_exact

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.])

In [39]:
m4_matches_fuzzy = fuzzy_match(m4_predictions)

Andersen_story2.txt
Andersen_story8.txt
Andersen_story11.txt
Andersen_story7.txt
Andersen_story17.txt
Andersen_story15.txt
Andersen_story9.txt
Andersen_story5.txt
Andersen_story1.txt
Andersen_story12.txt
Andersen_story16.txt
Andersen_story18.txt
Andersen_story3.txt
Andersen_story10.txt
Andersen_story13.txt


In [40]:
m4_accuracy_fuzzy = {}

for item in m4_matches_fuzzy:
    m4_accuracy_fuzzy[item] = []
    for prompt_version in m4_matches_fuzzy[item]:
        m4_accuracy_fuzzy[item].append(np.mean(np.array(prompt_version)))

In [41]:
m4_prompt_accuracies_fuzzy = [ [] for _ in range(23)]

for k in range(23):
    for item in m4_accuracy_fuzzy:
        m4_prompt_accuracies_fuzzy[k].append(m4_accuracy_fuzzy[item][k])

In [42]:
m4_prompt_accuracy_fuzzy = np.mean(np.array(m4_prompt_accuracies_fuzzy), axis=1)

In [43]:
m4_prompt_accuracy_fuzzy.argmax()

9

In [44]:
m4_prompt_accuracy_fuzzy[12]

0.11850652036135906

In [45]:
m4_prompt_accuracy_fuzzy.mean()

0.10096236434595482