In [None]:
!pip install datasets torch openai tiktoken nltk seaborn matplotlib 

In [None]:
!git clone https://github.com/booydar/babilong source
!unzip source/data/tasks_1-20_v1-2.zip -d data/

In [None]:
import openai 
import asyncio
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset
import numpy as np
import tiktoken
import time
import os
import datasets
from source.babilong_utils import TaskDataset, SentenceSampler, NoiseInjectionDataset

In [None]:
# --- Experiment configuration -------------------------------------------------
# Task identifier; the evaluation cell also uses it as the output directory name.
task = "qa1"
# Model name, used both to pick the tiktoken encoding and for the chat completion call.
model = "gpt-4-1106-preview"
# Passed as `sample_size` to the noise-injection dataset (presumably a token count)
# and embedded in the result file name `msg_<message_length>.csv`.
message_length = 4_000
# Upper bound on how many dataset samples are sent to the model.
number_of_samples = 1

In [None]:
class Tokenizer:
    """Thin adapter that exposes a tiktoken encoding through tokenizer-style methods.

    Only what is defined here is supported: calling the object, ``encode``,
    ``decode`` and ``decode_batch``. Every method delegates to the wrapped
    encoding stored in ``self.impl_``.
    """

    def __init__(self, model):
        """Look up the tiktoken encoding for ``model``.

        Args:
            model: Model name accepted by ``tiktoken.encoding_for_model``.
        """
        self.impl_ = tiktoken.encoding_for_model(model)

    def __call__(self, inp):
        """Tokenize a single string or a list of strings.

        Args:
            inp: One string, or a ``list`` of strings (encoded as a batch).
                Any non-list input is handed to the single-string encoder.

        Returns:
            A dict ``{'input_ids': ids}``; ``ids`` is the encoder output for a
            single string, or a list of such outputs for a list input.
        """
        if isinstance(inp, list):
            ids = self.impl_.encode_batch(inp)
        else:
            ids = self.impl_.encode(inp)
        return {'input_ids': ids}

    def encode(self, inp, add_special_tokens=False):
        """Encode one string into token ids.

        Args:
            inp: Text to encode.
            add_special_tokens: Accepted for call-site compatibility only. It is
                not forwarded to the encoder and has no effect, so it now has a
                default and callers may omit it.

        Returns:
            The token ids produced by the wrapped encoding.
        """
        return self.impl_.encode(inp)

    def decode(self, inp):
        """Decode one sequence of token ids back into text."""
        return self.impl_.decode(inp)

    def decode_batch(self, inp):
        """Decode a batch (sequence of token-id sequences) into a list of strings."""
        return self.impl_.decode_batch(inp)

In [None]:
# Evaluate the chat model on task samples hidden inside noise text and record one
# CSV row per sample in `<task>/msg_<message_length>.csv`.
os.makedirs(task, exist_ok=True)

# The API key is read from a local file named `token` (keep it out of version control).
with open('token') as inp:
    key = inp.read().strip()

client = openai.OpenAI(api_key=key)

outfile = f'{task}/msg_{message_length}.csv'
df = pd.DataFrame({
    'answer': [],
    'gpt4answer': [],
    'result': [],
})

# Resolve the test file from `task` instead of hard-coding the qa1 file, so results
# written under `<task>/` always come from that task. The trailing underscore in the
# prefix keeps 'qa1' from also matching 'qa10', 'qa11', ...
data_dir = 'data/tasks_1-20_v1-2/en-10k'
candidates = sorted(
    name for name in os.listdir(data_dir)
    if name.startswith(f'{task}_') and name.endswith('_test.txt')
)
if len(candidates) != 1:
    raise FileNotFoundError(
        f'expected exactly one test file for task {task!r} in {data_dir}, found {candidates}'
    )
test_path = f'{data_dir}/{candidates[0]}'

noise_dataset = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1')
#noise_dataset = datasets.load_from_disk("pg19-data-test")

tokenizer = Tokenizer(model)

# task
task_dataset_test = TaskDataset(test_path)

noise_sampler_test = SentenceSampler(noise_dataset['test'], tokenizer=tokenizer)

dataset_test = NoiseInjectionDataset(task_dataset=task_dataset_test,
                                     noise_sampler=noise_sampler_test,
                                     tokenizer=tokenizer,
                                     sample_size=message_length)

# NOTE: the prompt below is worded for "where is <person>?" questions (as in qa1);
# other tasks will likely need their own prompt. The prompt text is intentionally
# left verbatim, since any edit changes what the model is asked.
# zip(range(n), ...) stops after at most `number_of_samples` samples.
for i, sample in zip(range(number_of_samples), dataset_test):
    question = sample['question']
    true_answer = tokenizer.decode(sample['target_tokens'])
    query = tokenizer.decode(sample['input_tokens'])

    messages = [
        {
            "role": "system",
            "content": "You are a intelligent assistant."
        },
        {
            "role": "user",
            "content":
                "I give you context with the facts about positions of different persons hidden in some random text and a question. "
                "You need to answer the question based only on the information from the facts. "
                "If a person was in different locations use the latest location to answer the question.\n\n"
                "<example>\n"
                "Charlie went to the hallway. Judith come back to the kitchen. Charlie travelled to balcony. Where is Charlie?\n"
                "Assistant: balcony\n"
                "</example>\n\n"
                "<example>\n"
                "Alan moved to the garage. Charlie went to the beach. Alan went to the shop. Rouse travelled to balcony. Where is Alan?\n"
                "Assistant: shop\n"
                "</example>\n\n"
                "<context>\n"
                f"{query}"
                "</context>\n\n"
                f"QUESTION: {question}\n"
                "Your answer should be a single word - the most recent location of ’person’ in the question. "
                "Do not write anything afer that."
        },
    ]

    response = client.chat.completions.create(model=model, messages=messages)
    # The message content can be None; treat that as an empty answer rather than
    # crashing on `.strip()` and losing the run.
    content = response.choices[0].message.content or ''
    gpt_answer = content.strip().lower()

    # Drop a single trailing period so "kitchen." matches "kitchen".
    if gpt_answer.endswith('.'):
        gpt_answer = gpt_answer[:-1]

    print(i, true_answer, gpt_answer)

    # Append the row and rewrite the CSV every iteration so partial results
    # survive an interrupted run.
    df.loc[len(df)] = [true_answer, gpt_answer, true_answer == gpt_answer]
    df.to_csv(outfile)

In [None]:
import seaborn as sns
import matplotlib
import matplotlib.pylab as plt

In [None]:
# Score every available result file of `task` and draw a one-row accuracy heatmap.

# Context lengths in thousands; each maps to the file `<task>/msg_<k * 1000>.csv`.
context_lengths_k = [0, 4, 8, 16, 32, 64, 128]

# NaN means "no result for this length". seaborn leaves NaN cells blank, whereas a
# zero-initialised array would display missing runs as 0.0 accuracy.
results = np.full((1, len(context_lengths_k)), np.nan)

for i, msg in enumerate(context_lengths_k):
    msg = msg * 1000
    fname = f'{task}/msg_{msg}.csv'
    if not os.path.isfile(fname):
        print('not such file', fname)
        continue
    # Read both answer columns as plain strings and disable NA parsing: otherwise an
    # empty answer (or one spelled "null", "nan", "n/a", ...) is loaded as float NaN
    # and the string handling below fails.
    df = pd.read_csv(
        fname,
        index_col=[0],
        dtype={'answer': str, 'gpt4answer': str},
        keep_default_na=False,
    )
    if df.empty:
        # No rows to score; leave the cell as NaN instead of dividing by zero.
        continue
    # The prediction is the last chunk after splitting on spaces, then on newlines.
    last_word = df['gpt4answer'].str.split(' ').str[-1].str.split('\n').str[-1]
    # Fraction of rows whose last word equals the reference answer.
    results[0, i] = (last_word == df['answer']).mean()

font = {
    'size': 30
}

matplotlib.rc('font', **font)

# Explicit figure/axes instead of relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(18.5, 2))

# Accuracy is a fraction in [0, 1]; a fixed colour range keeps colours comparable
# between runs and well-defined when only a few cells hold data.
ax = sns.heatmap(results, annot=True, linewidths=0.5, vmin=0, vmax=1, ax=ax)
ax.set_xlabel('Context length')
# Ticks sit at cell centres (column index + 0.5); labels derive from the same list.
ax.set_xticks(np.arange(results.shape[1]) + 0.5, [f'{k}k' for k in context_lengths_k])
ax.tick_params(axis='y', rotation=90)
ax.set_yticks([0], [''])

# Save before showing: some backends release the figure once it has been shown.
fig.savefig('fig.pdf', dpi=100)
plt.show()