## Necessary Imports and Settings

In [1]:
import os
import nltk
import pandas as pd
import torch
import numpy as np
from jinja2 import Template
import xmltodict
import pickle
import random

from collections import defaultdict

from fuzzywuzzy import fuzz
from rouge import Rouge

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from transformers import T5Tokenizer, T5ForConditionalGeneration



In [2]:
from numba import cuda
cuda.select_device(0)
cuda.close()

In [2]:
stop_words = set(stopwords.words("english"))

path_andersen = "/kuacc/users/bozyurt20/ChildrenStories/Andersen"
path_fanny = "/kuacc/users/bozyurt20/ChildrenStories/Fanny Fern"
path_annotations = "/kuacc/users/bozyurt20/ChildrenStories/Annotations"

dir_list_andersen = os.listdir(path_andersen)
dir_list_fanny = os.listdir(path_fanny)
dir_list_annotations = os.listdir(path_annotations)

tokenizer = T5Tokenizer.from_pretrained("t5-base")

def text_clean_ending(example_text):
    example_text = example_text.rstrip(", ;-\n")
    if example_text[-1] != ".":
        example_text += "."
    return example_text

def remove_new_lines(text):
    paragraphs = text.split("\n\n")
    new_paragraphs = []
    for paragraph in paragraphs:
        new_paragraphs.append(paragraph.replace("\n", " "))
    new_text = "\n".join(new_paragraphs)
    return new_text

def remove_pronouns(answer):  
    if answer[:6] == "he is ":
        answer = answer[6:]
    elif answer[:7] == "she is ":
        answer = answer[7:]
    elif answer[:6] == "it is ":
        answer = answer[6:]
    elif answer[9:] == "they are ":
        answer = answer[9:]
    return answer

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [11]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [10]:
model.encoder.block

ModuleList(
  (0): T5Block(
    (layer): ModuleList(
      (0): T5LayerSelfAttention(
        (SelfAttention): T5Attention(
          (q): Linear(in_features=768, out_features=768, bias=False)
          (k): Linear(in_features=768, out_features=768, bias=False)
          (v): Linear(in_features=768, out_features=768, bias=False)
          (o): Linear(in_features=768, out_features=768, bias=False)
          (relative_attention_bias): Embedding(32, 12)
        )
        (layer_norm): T5LayerNorm()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): T5LayerFF(
        (DenseReluDense): T5DenseActDense(
          (wi): Linear(in_features=768, out_features=3072, bias=False)
          (wo): Linear(in_features=3072, out_features=768, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
          (act): ReLU()
        )
        (layer_norm): T5LayerNorm()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (1): T5Block(
    (layer): ModuleList(
      

## Creating the Dataset

In [3]:
with open("T5_tokenizer_dataset.txt", "rb") as f:
    dataset = pickle.load(f)

In [4]:
prefix_tuning_dataset = {}
num_templates = 23

for k in range(1, num_templates+1):
    prefix_tuning_dataset[k] = {}
    prefix_tuning_dataset[k]["train"] = []
    prefix_tuning_dataset[k]["validation"] = []
    prefix_tuning_dataset[k]["test"] = []

for k in [6]:#range(1, num_templates+1):
    
    bank = {"train": 199,
       "validation": 0,
       "test": 50 }
    
    source_text = {"train": [],
       "validation": [],
       "test": [] 
    }
    target_text = {"train": [],
       "validation": [],
       "test": [] 
    }
    all_target_text = {"train": [],
       "validation": [],
       "test": [] 
    }
    
    for i in range(len(dataset[k])):

        split = random.choice(list(bank.keys()))

        while bank[split] < 1:
            split = random.choice(list(bank.keys()))

        bank[split] -= 1
        
        source_tokens = tokenizer.encode(dataset[k][i]["prompt"], return_tensors="pt")
        source_text[split].append((i, source_tokens[0]))
        target_text[split].append((i, tokenizer.encode(dataset[k][i]["gold_locations"].split("/")[0], return_tensors="pt")[0]))
        all_target_text[split].append((i, dataset[k][i]["gold_locations"]))
    
    for split in bank.keys():
        
        source_text[split] = sorted(source_text[split], key=lambda x: len(x[1]))

        src_txt = []
        tgt_txt = []
        all_tgt_txt = []

        for tupl in source_text[split]:
            i, tokens = tupl
            for a in target_text[split]:
                if a[0] == i:
                    tgt_txt.append(a[1])
                    break
            for b in all_target_text[split]:
                if b[0] == i:
                    all_tgt_txt.append(b[1])
        for tupl in source_text[split]:
            src_txt.append(tupl[1])
        for i in range(len(src_txt)):
            prompt = tokenizer.decode(src_txt[i], skip_special_tokens=True)
            gold_location = tokenizer.decode(tgt_txt[i], skip_special_tokens=True)
            gold_locations = all_tgt_txt[i]
            data_point = {}
            data_point["text_a"] = prompt
            data_point["tgt_text"] = gold_location
            data_point["guid"] = gold_locations
            data_point["text_b"] = ""
            data_point["meta"] = {}
            data_point["label"] = None

            prefix_tuning_dataset[k][split].append(data_point)

KeyboardInterrupt: 

In [6]:
with open("prefix_tuning_dataset.txt", "wb") as f:
    pickle.dump(prefix_tuning_dataset, f)

In [3]:
with open("prefix_tuning_dataset.txt", "rb") as f:
    prefix_tuning_dataset = pickle.load(f)

## OLD Dataset Creation

In [15]:
dataset = {}
all_locations = {
    "train": 0,
    "validation": 0,
    "test": 0
}
    
for k in range(1, 24):
    dataset[k] = {}
    dataset[k]["train"] = []
    dataset[k]["validation"] = []
    dataset[k]["test"] = []

bank = {"train": 170,
       "validation": 39,
       "test": 40 }

i = {"train": 0,
    "validation": 0,
    "test": 0}

for item in sorted(dir_list_andersen):
    
    if item in dir_list_annotations:
        
        print(item)        
        
        f = open(os.path.join(path_andersen, item), 'r') 
        story = f.read()
        f.close()
                
        f = open(os.path.join(path_annotations, item), 'r')
        annotations = pd.read_csv(f, sep="\t")
        annotations = annotations.values
        f.close()
               
        paragraphs = story.split("\n\n")
        paragraph = paragraphs[0]
        len_title = len(paragraph) + 2        
    
        for line in annotations:
            
            character = line[1]
            gold_answer = line[2]
            grammatical_number = line[3]

            gold_location = gold_answer.split("/")[0]
            
            split = random.choice(list(bank.keys()))
                
            while bank[split] < 1:
                split = random.choice(list(bank.keys()))

            bank[split] -= 1
            
            for k in range(1, 24):
                
                data_point = {}
                
                y = line[0]
                x = y - 5120

                if x < len_title:
                    text = story[len_title:y]

                else:
                    x = story[x:y].find(" ") + x
                    text = story[x:y]
                                    
                text = text_clean_ending(text)
                text = remove_new_lines(text)
                
                prompt, context2 = create_prompt_clipped(k, text, character, grammatical_number, 512)
                
                data_point["text_a"] = prompt
                data_point["tgt_text"] = gold_location
                data_point["guid"] = i[split]
                data_point["text_b"] = ""
                data_point["meta"] = {}
                data_point["label"] = None
                
                dataset[k][split].append(data_point)
                
            all_locations[split][i[split]] = gold_answer.split("/")
          
            
            i[split] += 1
            


Andersen_story1.txt


KeyboardInterrupt: 

In [22]:
i

{'train': 3, 'validation': 4, 'test': 6}

## Prefix Tuning

In [4]:
import argparse
import torch

from openprompt.data_utils import InputExample

from openprompt.data_utils.conditional_generation_dataset import WebNLGProcessor

from openprompt.plms import load_plm

from openprompt.prompts.prefix_tuning_template import PrefixTuningTemplate

from openprompt import PromptDataLoader

from openprompt import PromptForGeneration

from transformers import AdamW

from transformers.optimization import get_linear_schedule_with_warmup

from openprompt.utils.metrics import generation_metric



In [5]:
my_dataset = {}
for k in [6]:
    my_dataset[k] = {}
    my_dataset[k]["train"] = []
    my_dataset[k]["validation"] = []
    my_dataset[k]["test"] = []
for k in [6]:
    for split in prefix_tuning_dataset[k]:
        my_dataset[k][split] = []
        for data_point in prefix_tuning_dataset[k][split]:
            input_example = InputExample(text_a=data_point['text_a'], text_b = data_point['text_b'], tgt_text =data_point['tgt_text'], label=None, guid=data_point['guid'])
            my_dataset[k][split].append(input_example)
print(my_dataset[6]['train'][0])

{
  "guid": "out in the woods/in the woods/woods/the forest",
  "label": null,
  "meta": {},
  "text_a": "Out in the woods stood a nice little Fir Tree. From the passage, where is the fir tree?",
  "text_b": "",
  "tgt_text": "out in the woods"
}



In [6]:
model = "t5"
model_path = "t5-base"
plm_eval_mode = "store_true"
lr = 5e-5
num_epochs = 5

use_cuda = True

In [7]:
# we use WebNLG as an example, as well. Note that the evaluation of generation result should be done
# by using the scripts provided by https://github.com/Yale-LILY/dart/tree/master/evaluation,
# Which we do not include in it.

dataset = {}
dataset[1]['train'] = WebNLGProcessor().get_train_examples("OpenPrompt/datasets/CondGen/webnlg_2017/")
dataset[1]['validation'] = WebNLGProcessor().get_dev_examples("./datasets/CondGen/webnlg_2017/")
dataset[1]['test'] = WebNLGProcessor().get_test_examples("./datasets/CondGen/webnlg_2017/")

FileNotFoundError: [Errno 2] No such file or directory: 'OpenPrompt/datasets/CondGen/webnlg_2017/train.json'

In [7]:
# load a pretrained model, its tokenizer, its config, and its TokenzerWrapper by one function

plm, tokenizer, model_config, WrapperClass = load_plm(model, model_path)
plm = T5ForConditionalGeneration.from_pretrained("T5_prefix_tuned")
# we can use a plain text as the default setting
# i.e.
# mytemplate = PrefixTuningTemplate(model=plm, tokenizer=tokenizer)
# is equal to
# mytemplate = PrefixTuningTemplate(model=plm, tokenizer=tokenizer, text='{"placeholder":"text_a"} {"mask"}')

mytemplate = PrefixTuningTemplate(model=plm,  tokenizer=tokenizer, text=' {"placeholder":"text_a"} {"special": "<eos>"} {"mask"} ', using_decoder_past_key_values=False)

In [8]:
# To better understand how does the template wrap the example, we visualize one instance.
# You may observe that the example doesn't end with <|endoftext|> token. Don't worry, adding specific end-of-text token
# is a language-model-specific token. we will add it for you in the TokenizerWrapper once you pass `predict_eos_token=True`
wrapped_example = mytemplate.wrap_one_example(my_dataset[6]['train'][0])
print(wrapped_example)

[[{'text': ' Out in the woods stood a nice little Fir Tree. From the passage, where is the fir tree?', 'loss_ids': 0, 'shortenable_ids': 1}, {'text': '<eos>', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}], {'guid': 'out in the woods/in the woods/woods/the forest', 'tgt_text': 'out in the woods'}]


In [9]:
# Your can loop over the dataset by yourself by subsequently call mytemplate.wrap_one_example  and WrapperClass().tokenizer()
# but we have provide a PromptDataLoader for you.

dataset = my_dataset[6]
train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=512,
    batch_size=1,shuffle=False, teacher_forcing=True, predict_eos_token=True, # be sure to pass predict_eos_token=True if your template doesn't contain one, or you model may fail to stop generation.
    truncate_method="head")

"""validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=512,
    batch_size=1,shuffle=False, teacher_forcing=False, predict_eos_token=True,
    truncate_method="head")"""

test_dataloader = PromptDataLoader(dataset=dataset["test"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=512,
    batch_size=1,shuffle=False, teacher_forcing=False, predict_eos_token=True,
    truncate_method="head")

tokenizing: 199it [00:00, 312.29it/s]
tokenizing: 50it [00:00, 384.77it/s]


In [10]:
# load the pipeline model PromptForGeneration.

prompt_model = PromptForGeneration(plm=plm,template=mytemplate, freeze_plm=True, tokenizer=tokenizer, plm_eval_mode=plm_eval_mode)
if use_cuda:
    prompt_model = prompt_model.cuda()

In [11]:
# Follow PrefixTuning（https://github.com/XiangLi1999/PrefixTuning), we also fix the language model
# only include the template's parameters in training.

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
    "params": [p for n, p in mytemplate.named_parameters() if (not any(nd in n for nd in no_decay)) and p.requires_grad],
    "weight_decay": 0.0,
},
{
    "params": [p for n, p in mytemplate.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad],
    "weight_decay": 0.0,
},
]

In [12]:
optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-8)
tot_step  = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 0, tot_step)



In [22]:
# Define evaluate function

def evaluate(prompt_model, dataloader, split):
    generated_sentence = []
    groundtruth_sentence = []
    exact_accuracy = []
    fuzzy_accuracy = []
    prompt_model.eval()
    for step, inputs in enumerate(dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        _, output_sentences = prompt_model.generate(inputs, **generation_arguments)
        generated_sentence.extend(output_sentences)
        groundtruth_sentence.extend(inputs['tgt_text'])
        # TODO: Tüm gold location'larla karşılaştırmalısın !!!
        gold_locations = inputs["guid"][0].split("/")
        match1 = "No"
        match2 = "No"
        
        out2 = remove_pronouns(output_sentences[0].lower())

        pred_tokenized = word_tokenize(out2)
        new_pred_tokens = [ token for token in pred_tokenized if token not in stop_words ]
        pred_wo_stop_words = " ".join(new_pred_tokens) 

        for gold_location in gold_locations:

            gold_tokenized = word_tokenize(gold_location[0].lower())
            new_gold_tokens = [ token for token in gold_tokenized if token not in stop_words ]
            gold_wo_stop_words = " ".join(new_gold_tokens)

            if gold_wo_stop_words == pred_wo_stop_words:
                match1 = "Yes"
            if fuzz.partial_ratio(gold_wo_stop_words, pred_wo_stop_words) > 90:
                match2 = "Yes"
                
        if match1 == "Yes": 
            exact_accuracy.append(1)
        else:
            exact_accuracy.append(0)
        if match2 == "Yes": 
            fuzzy_accuracy.append(1)
        else:
            fuzzy_accuracy.append(0)

        
    #score = generation_metric(generated_sentence, groundtruth_sentence, "sentence_bleu")
    #print("test_score", score, flush=True)
    return generated_sentence, exact_accuracy, fuzzy_accuracy

In [14]:
tokenizer.decode([198])

'și'

In [15]:
generation_arguments = {
    "max_length": 512,
    "max_new_tokens": None,
    "min_length": 0,
    "temperature": 1.0,
    "do_sample": False,
    "top_k": 0,
    "top_p": 0.9,
    "repetition_penalty": 1.0,
    "num_beams": 5,
    "bad_words_ids": [[628], [198]]
}

In [20]:
# training and generation.
global_step = 0
tot_loss = 0
log_loss = 0
for epoch in range(num_epochs):
    prompt_model.train()
    for step, inputs in enumerate(train_dataloader):
        global_step += 1
        if use_cuda:
            inputs = inputs.cuda()
        loss = prompt_model(inputs)
        loss.backward()
        tot_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(mytemplate.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        if global_step %10 ==0:
            print("Epoch {}, global_step {} average loss: {} lr: {}".format(epoch, global_step, (tot_loss-log_loss)/500, scheduler.get_last_lr()[0]), flush=True)
            log_loss = tot_loss


Epoch 0, global_step 10 average loss: 0.4252411231994629 lr: 4.949748743718593e-05
Epoch 0, global_step 20 average loss: 0.41144375228881835 lr: 4.899497487437186e-05
Epoch 0, global_step 30 average loss: 0.48745624542236327 lr: 4.849246231155779e-05
Epoch 0, global_step 40 average loss: 0.43117365264892576 lr: 4.7989949748743725e-05
Epoch 0, global_step 50 average loss: 0.4726151542663574 lr: 4.748743718592965e-05
Epoch 0, global_step 60 average loss: 0.5515932235717773 lr: 4.6984924623115577e-05
Epoch 0, global_step 70 average loss: 0.5945601272583008 lr: 4.648241206030151e-05
Epoch 0, global_step 80 average loss: 0.5067963943481445 lr: 4.597989949748744e-05
Epoch 0, global_step 90 average loss: 0.467736457824707 lr: 4.5477386934673374e-05
Epoch 0, global_step 100 average loss: 0.43539627265930175 lr: 4.49748743718593e-05
Epoch 0, global_step 110 average loss: 0.43823619079589843 lr: 4.4472361809045225e-05
Epoch 0, global_step 120 average loss: 0.5028336029052735 lr: 4.39698492462311

In [43]:
prompt_model.plm.save_pretrained("T5_prefix_tuned")

In [23]:
generated_sentence, exact_accuracy, fuzzy_accuracy = evaluate(prompt_model, train_dataloader, "train")

In [24]:
generated_sentence

['in the woods.',
 'the time.',
 'the street.',
 'passage.',
 "boy's mother.",
 'the chimney-corner.',
 'leap-frog.',
 'flea?',
 'leap-frog.',
 'king.',
 'palace.',
 'old man.',
 '.',
 'the passage.',
 'the door.',
 'Emperor is sitting in his wardrobe.',
 'the door.',
 '.',
 'old King asked.',
 'Prince asked.',
 'the way.',
 'the carriage.',
 'the passage.',
 'old King asked.',
 'the hall.',
 'the hall.',
 'old King asked.',
 'burdocks.',
 '<unk> <extra_id_1> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>',
 'the house.',
 'the middle of the room.',
 'old house.',
 'the middle of the village.',
 'the palace.',
 'the passage, where is Tuk?',
 'the middle of the street.',
 'the middle of the village.',
 'of the passage.',
 'the other side of the passage.',
 'of the passage.',
 'the castle.',
 'the castle.',
 'the midst of the snow.',
 'the forest?',
 'mother took?',
 'the house?',
 'heaven?',
 '?',
 '?',
 'the tomb?',
 'the church?',
 'the midst

In [34]:
print(PromptForGeneration.named_buffers)

<function Module.named_buffers at 0x2b6db7c46430>


In [86]:
with open(f"prefix_generations.txt",'w') as f:
    for i in generated_sentence:
        f.write(i+"\n")

13

## TO-DELETE

In [21]:
def create_prompt_clipped(version, context, character, grammatical_number, max_no_tokens=512):
    
    if grammatical_number == 'singular':
        to_be = 'is'
    elif grammatical_number == 'plural':
        to_be = 'are'
    
    if version in [1, 2, 9, 10, 11, 12, 13, 20, 21, 22]:
        question = "Where " + to_be + " " + character + "?"
    elif version in [4, 5, 7, 8, 15, 16, 18, 19]:
        question = "where " + character + " " + to_be + "."
    elif version in [3, 14]:
        question = "where " + character + " " + to_be + "?"
    elif version in [6, 17]:
        question = "where " + to_be + " " + character + "?"
        
    if version == 1 or version == 12:
        intro = "Answer the question depending on the context."
    elif version == 2 or version == 13:
        intro = "What is the answer?"
    elif version == 3 or version == 14:
        intro = "Can you tell me "
    elif version == 4 or version == 15:
        intro = "Please tell me "
    elif version == 5 or version == 16:
        intro = "Tell me "
    elif version == 6 or version == 17:
        intro = "From the passage, "
    elif version == 7 or version == 18:
        intro = "I want to know "
    elif version == 8 or version == 19:
        intro = "I want to ask "
    elif version == 9 or version == 20:
        intro = "What is the answer to: "
    elif version == 10 or version == 21:
        intro = "Find the answer to: "
    elif version == 11 or version == 22:
        intro = "Answer: "     
    
    if version in [1, 2]:
        oo = 0
        tm = Template("""{{ intro }}
Context: {{context}};
Question: {{question}};
Answer: """)        
        prompt = tm.render(intro=intro, context=context, question=question)
        
        while len(tokenizer.encode(prompt)) > max_no_tokens:
            context = tokenizer.encode(context)
            diff = len(tokenizer.encode(prompt)) - max_no_tokens
            context = context[diff:]
            oo += 1
            if oo > 4:
                context = context[1:]
            context = tokenizer.decode(context, skip_special_tokens=True)
            prompt = tm.render(intro=intro, context=context, question=question)
        
    elif version in [3, 4, 5, 6, 7, 8, 9, 10, 11]:
        oo = 0
        tm = Template("{{context}} {{intro}}{{question}} ")
        prompt = tm.render(intro=intro, context=context, question=question)
        while len(tokenizer.encode(prompt)) > max_no_tokens:
            context = tokenizer.encode(context)
            diff = len(tokenizer.encode(prompt)) - max_no_tokens
            context = context[diff:]            
            oo += 1
            if oo > 4:
                context = context[1:]
            context = tokenizer.decode(context, skip_special_tokens=True)
            prompt = tm.render(intro=intro, context=context, question=question)
        
        
    elif version in [12, 13]:
        oo = 0
        tm = Template("""{{ intro }}
Context: {{context}};
Question: {{question}};
If you can't find the answer, please respond "unanswerable".
Answer: """)
        prompt = tm.render(intro=intro, context=context, question=question)
        while len(tokenizer.encode(prompt)) > max_no_tokens:
            context = tokenizer.encode(context)
            diff = len(tokenizer.encode(prompt)) - max_no_tokens
            context = context[diff:]
            oo += 1
            if oo > 4:
                context = context[1:]
            context = tokenizer.decode(context, skip_special_tokens=True)
            prompt = tm.render(intro=intro, context=context, question=question)
        
    elif version in [14, 15, 16, 17, 18, 19, 20, 21, 22]:
        oo = 0
        tm = Template('{{context}} {{intro}}{{question}} If you can\'t find the answer, please respond "unanswerable". ')
        prompt = tm.render(intro=intro, context=context, question=question)    
        while len(tokenizer.encode(prompt)) > max_no_tokens:
            context = tokenizer.encode(context)
            diff = len(tokenizer.encode(prompt)) - max_no_tokens
            context = context[diff:]
            oo += 1
            if oo > 4:
                context = context[1:]
            context = tokenizer.decode(context, skip_special_tokens=True)
            prompt = tm.render(intro=intro, context=context, question=question)
            
    elif version == 23:
        oo = 0
        prompt = "Where " + to_be + " " + character + " in the following text: " + context + " Answer: "
        while len(tokenizer.encode(prompt)) > max_no_tokens:
            context = tokenizer.encode(context)
            diff = len(tokenizer.encode(prompt)) - max_no_tokens
            context = context[diff:]
            oo += 1
            if oo > 4:
                context = context[1:]
            context = tokenizer.decode(context, skip_special_tokens=True)
            prompt = "Where " + to_be + " " + character + " in the following text: " + context + " Answer: "
        
    return prompt, context