# Necessary Imports and Settings

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import os
import nltk
import pandas as pd
import torch
import numpy as np
from jinja2 import Template
import pickle

from fuzzywuzzy import fuzz

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



In [3]:
stop_words = set(stopwords.words("english"))

num_templates = 23

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

path_andersen = "/kuacc/users/bozyurt20/ChildrenStories/Andersen"
path_fanny = "/kuacc/users/bozyurt20/ChildrenStories/Fanny Fern"
path_annotations = "/kuacc/users/bozyurt20/ChildrenStories/Annotations"

dir_list_andersen = os.listdir(path_andersen)
dir_list_fanny = os.listdir(path_fanny)
dir_list_annotations = os.listdir(path_annotations)

def text_clean_ending(example_text):
    example_text = example_text.rstrip(", ;-\n")
    if example_text[-1] != ".":
        example_text += "."
    return example_text
    
def prompt_clean_ending(prompt, location):
    if prompt[-1] == " ":
        prompt += location
    else:
        prompt += " " + location
    return prompt

def remove_new_lines(text):
    paragraphs = text.split("\n\n")
    new_paragraphs = []
    for paragraph in paragraphs:
        new_paragraphs.append(paragraph.replace("\n", " "))
    new_text = "\n".join(new_paragraphs)
    return new_text

# All Annotations in one Dictionary

In [6]:
all_annotations = {}

for item in sorted(dir_list_andersen):

    if item in dir_list_annotations:

        print(item)

        f = open(os.path.join(path_annotations, item), 'r')
        annotations = pd.read_csv(f, sep="\t")
        annotations = annotations.values
        f.close()

        all_annotations[item] = []

        for line in annotations:

            character = line[1]
            gold_answer = line[2]
            grammatical_number = line[3]

            all_annotations[item].append({
                "char_no": line[0],
                "character": line[1],
                "location": line[2],
                "grammatical_number": line[3]
            })
annotations_dict = {}
for story in all_annotations:
    annotations_dict[story] = {}
    characters = []
    for line in all_annotations[story]:
        characters.append(line["character"])
    characters = list(set(characters))
    for character in characters:
        annotations_dict[story][character] = []
        for line in all_annotations[story]:
            if line["character"] == character:
                annotations_dict[story][character].append( (line["char_no"], line["location"], line["grammatical_number"]) )
            

Andersen_story1.txt
Andersen_story10.txt
Andersen_story11.txt
Andersen_story12.txt
Andersen_story13.txt
Andersen_story15.txt
Andersen_story16.txt
Andersen_story17.txt
Andersen_story18.txt
Andersen_story2.txt
Andersen_story3.txt
Andersen_story5.txt
Andersen_story7.txt
Andersen_story8.txt
Andersen_story9.txt


In [7]:
annotations_dict["Andersen_story17.txt"]

{'the little boy': [(904,
   "in a warm room/in the room/in a room/inside the old poet's house/in the old poet's room/in the poet's room/in the poet's house",
   'singular'),
  (1566,
   "in his lap/in the old poet's lap/in his room/in the room/in the warm room",
   'singular'),
  (1821,
   'round the kind old poet/round the old poet/round the poet/in his room/in the room/in the warm room',
   'singular'),
  (2513,
   "away the old poet's house/away/away the room/away the poet's house/away the poet's room/away the old poet's room",
   'singular')],
 'the child': [(559,
   "at the door/at the poet's door/outside/under the rain/at the old poet's door",
   'singular')],
 'the old poet': [(129, 'in his room/in the room', 'singular'),
  (266,
   'in his chimney-corner/in his chimney corner/in the chimney-corner/in the chimney corner/in his corner/in the corner/in his room/in the room',
   'singular'),
  (692, 'at the door/in his room/in the room', 'singular'),
  (1527,
   'beside his hearth

In [8]:
for story in annotations_dict:
    dct = annotations_dict[story]
    for char in dct:
        dct[char].sort(key=lambda x: x[0], reverse=True)

In [9]:
annotations_dict["Andersen_story17.txt"]

{'the little boy': [(2513,
   "away the old poet's house/away/away the room/away the poet's house/away the poet's room/away the old poet's room",
   'singular'),
  (1821,
   'round the kind old poet/round the old poet/round the poet/in his room/in the room/in the warm room',
   'singular'),
  (1566,
   "in his lap/in the old poet's lap/in his room/in the room/in the warm room",
   'singular'),
  (904,
   "in a warm room/in the room/in a room/inside the old poet's house/in the old poet's room/in the poet's room/in the poet's house",
   'singular')],
 'the child': [(559,
   "at the door/at the poet's door/outside/under the rain/at the old poet's door",
   'singular')],
 'the old poet': [(2739,
   'on the earth/on the floor/in his room/in the room',
   'singular'),
  (1527,
   'beside his hearth/beside the hearth/in his room/in the room',
   'singular'),
  (692, 'at the door/in his room/in the room', 'singular'),
  (266,
   'in his chimney-corner/in his chimney corner/in the chimney-corne

# Preparing the Book

In [8]:
path = "litbank/original/105_persuasion.txt"

In [10]:
with open(path, "r") as f:
    book = f.read()

In [12]:
indices = []
for i in range(1, 100):
    chapter_header = "Chapter " + str(i) + "\n\n"
    indices.append(book.find(chapter_header))

In [15]:
ind1 = indices[0]
ind2 = indices[1]
i = 2
chapters = []
while ind2 != -1:
    chapters.append(book[ind1:ind2])
    ind1 = ind2
    ind2 = indices[i]
    i += 1

In [None]:
for chapter in chapters:
    

In [16]:
len(chapters)

23

# Preparing the Models

In [10]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

In [11]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

In [29]:
from numba import cuda
cuda.select_device(0)
cuda.close()

# Accuracy Calculator

In [13]:
def exact_match(predictions):

    matches_exact = {}

    for item in predictions:

        matches_exact[item] = [ [] for _ in range(1,24)]

        f = open(os.path.join(path_annotations, item), 'r')
        annotations = pd.read_csv(f, sep="\t")
        annotations = annotations.values #numpy array
        f.close()

        for k in range(1,24):

            pred_locs = predictions[item][k-1]
            i = 0

            for line in annotations:

                character = line[1]
                gold_locations = line[2].split("/")
                
                pred_tokenized = word_tokenize(pred_locs[i].lower())
                new_pred_tokens = [ token for token in pred_tokenized if token not in stop_words]
                pred_wo_stop_words = " ".join(new_pred_tokens)
                
                char_tokenized = word_tokenize(character.lower())
                new_char_tokens = [ token for token in char_tokenized if token not in stop_words]
                char_wo_stop_words = " ".join(new_char_tokens)
                
                if char_wo_stop_words not in " ".join(gold_locations):
                    pred_wo_stop_words = pred_wo_stop_words.replace(char_wo_stop_words, "")   
                
                else:
                    if pred_wo_stop_words[len(char_wo_stop_words)+1:len(char_wo_stop_words)+3] == "is" or pred_wo_stop_words[len(char_wo_stop_words)+1:len(char_wo_stop_words)+4] == "are":
                        pred_wo_stop_words = pred_wo_stop_words[len(char_wo_stop_words)+1:]

                match = False

                for gold_location in gold_locations:

                    gold_tokenized = word_tokenize(gold_location.lower())
                    new_gold_tokens = [ token for token in gold_tokenized if token not in stop_words]
                    gold_wo_stop_words = " ".join(new_gold_tokens)

                    if gold_wo_stop_words == pred_wo_stop_words:
                        match = True

                if match:
                    matches_exact[item][k-1].append(1)
                else:
                    matches_exact[item][k-1].append(0)

                i += 1
    
    return matches_exact

In [14]:
def fuzzy_match(predictions):
    
    matches_fuzzy = {}

    for item in predictions:

        print(item)
        matches_fuzzy[item] = [ [] for _ in range(1,24)]

        f = open(os.path.join(path_annotations, item), 'r')
        annotations = pd.read_csv(f, sep="\t")
        annotations = annotations.values #numpy array
        f.close()

        for k in range(1,24):

            pred_locs = predictions[item][k-1]        
            i = 0

            for line in annotations:

                gold_locations = line[2].split("/")

                pred_tokenized = word_tokenize(pred_locs[i].lower())
                new_pred_tokens = [ token for token in pred_tokenized if token not in stop_words ]
                pred_wo_stop_words = " ".join(new_pred_tokens)
                
                char_tokenized = word_tokenize(character.lower())
                new_char_tokens = [ token for token in char_tokenized if token not in stop_words]
                char_wo_stop_words = " ".join(new_char_tokens)
                
                if char_wo_stop_words not in " ".join(gold_locations):
                    pred_wo_stop_words = pred_wo_stop_words.replace(char_wo_stop_words, "")   
                
                else:
                    if pred_wo_stop_words[len(char_wo_stop_words)+1:len(char_wo_stop_words)+3] == "is" or pred_wo_stop_words[len(char_wo_stop_words)+1:len(char_wo_stop_words)+4] == "are":
                        pred_wo_stop_words = pred_wo_stop_words[len(char_wo_stop_words)+1:]

                match = False

                for gold_location in gold_locations:

                    gold_tokenized = word_tokenize(gold_location.lower())
                    new_gold_tokens = [ token for token in gold_tokenized if token not in stop_words ]
                    gold_wo_stop_words = " ".join(new_gold_tokens)

                    if fuzz.partial_ratio(gold_wo_stop_words, pred_wo_stop_words) > 90:
                        match = True

                if match: 
                    matches_fuzzy[item][k-1].append(1)
                else:
                    matches_fuzzy[item][k-1].append(0)

                i += 1
                
    return matches_fuzzy

# Making Predictions

In [19]:
dir_list_annotations

['Andersen_story12.txt',
 'Andersen_story13.txt',
 'Andersen_story15.txt',
 'Andersen_story16.txt',
 'Andersen_story18.txt',
 'Andersen_story1.txt',
 'Andersen_story3.txt',
 'Andersen_story5.txt',
 'Andersen_story9.txt',
 'Andersen_story10.txt']

In [18]:
dir_list_annotations.remove('Andersen_story2.txt')
dir_list_annotations.remove('Andersen_story8.txt')
dir_list_annotations.remove('Andersen_story11.txt')
dir_list_annotations.remove('Andersen_story7.txt')
dir_list_annotations.remove('Andersen_story17.txt')

In [None]:
# prompt has the max number of tokens: 1024, and we start at a " " char.
predictions = {}
matches_exact = {} 
matches_fuzzy = {}
for item in dir_list_andersen:
    
    if item in dir_list_annotations:
        
        matches_exact[item] = [ [] for _ in range(1,24)]
        matches_fuzzy[item] = [ [] for _ in range(1,24)]
        
        f = open(os.path.join(path_andersen, item), 'r') 
        story = f.read()
        f.close()
        
        paragraphs = story.split("\n\n")
        
        out_path = "Memory_" + item[:-3] + "xlsx"
        writer = pd.ExcelWriter(out_path, engine='xlsxwriter')
        workbook = writer.book
        format = workbook.add_format({'text_wrap': True})
        
        predictions[item] = {}
        
        my_dic = {"Prompts": [gold_answer, "-", "-"]}
        story_annotations = annotations_dict[item]
        
        no_paragraphs = len(paragraphs)
        
        i = 0
        
        paragraph = paragraphs[0]
        paragraph = paragraph.replace("\n", " ")
        len_title = len(paragraph) + 2
        char_count = len_title
        
        for paragraph in paragraphs[1:]:
            
            char_count += (len(paragraph) + 2)
            
            y = char_count
            x = y - 5120

            if x < len_title:
                text = story[len_title:y]
            else:
                x = story[x:y].find(" ") + x
                text = story[x:y]

            text = text.rstrip(", ;-\n")    
            text = remove_new_lines(text)
            
            for character in story_annotations:
                
                predictions[item][character] = []
                grammatical_number = story_annotations[character][0][2]
                
                for triple in story_annotations[character]:
                    if triple[0] <= char_count:
                        locations = triple[1].split("/")
                        my_dic = {"Prompts": [triple[1], "-", "-"]}
                        for k in [10, 13, 17, 21]:
                            
                            prompt, _ = create_prompt_clipped(k, text, character, grammatical_number, 1024)
                            
                            inputs = tokenizer.encode(prompt, return_tensors="pt")
                            inputs = inputs.to("cuda:0")

                            with torch.no_grad():
                                outputs = model.generate(inputs)

                            out = tokenizer.decode(outputs[0], skip_special_tokens=True)
                            predictions[item][character].append((out,k))
                            
                            match1 = "No"
                            match2 = "No"

                            pred_tokenized = word_tokenize(out.lower())
                            new_pred_tokens = [ token for token in pred_tokenized if token not in stop_words ]
                            pred_wo_stop_words = " ".join(new_pred_tokens)
                            
                            char_tokenized = word_tokenize(character.lower())
                            new_char_tokens = [ token for token in char_tokenized if token not in stop_words]
                            char_wo_stop_words = " ".join(new_char_tokens)

                            if char_wo_stop_words not in " ".join(locations):
                                pred_wo_stop_words = pred_wo_stop_words.replace(char_wo_stop_words, "")   

                            else:
                                if pred_wo_stop_words[len(char_wo_stop_words)+1:len(char_wo_stop_words)+3] == "is" or pred_wo_stop_words[len(char_wo_stop_words)+1:len(char_wo_stop_words)+4] == "are":
                                    pred_wo_stop_words = pred_wo_stop_words[len(char_wo_stop_words)+1:]

                            for gold_location in locations:

                                gold_tokenized = word_tokenize(gold_location.lower())
                                new_gold_tokens = [ token for token in gold_tokenized if token not in stop_words ]
                                gold_wo_stop_words = " ".join(new_gold_tokens)

                                if gold_wo_stop_words == pred_wo_stop_words:
                                    match1 = "Yes"

                                if fuzz.partial_ratio(gold_wo_stop_words, pred_wo_stop_words) > 90:
                                    match2 = "Yes"

                            my_dic[prompt] = [out, match1, match2]
                            
                            if match1 == "Yes":
                                matches_exact[item][k-1].append(1)
                            else:
                                matches_exact[item][k-1].append(0)
                            if match2 == "Yes":
                                matches_fuzzy[item][k-1].append(1)
                            else:
                                matches_fuzzy[item][k-1].append(0)
                        
                        df = pd.DataFrame(data=my_dic, index=["output", "exact match?", "fuzzy match?"])
                        df = (df.T)
                        sn = character+str(i+1)
                        df.to_excel(writer, sheet_name=sn[-30:])
                        worksheet = writer.sheets[sn[-30:]]
                        for idx, col in enumerate(df):
                            max_len = 75
                            worksheet.set_column(idx, idx, max_len, format)

                        break
                            
            i += 1
            
        writer.save()


  writer.save()
  writer.save()


In [17]:
with open("1-Memory_predictions_stories2_8_11_7_17.txt", "wb") as f:
    pickle.dump(predictions, f)