## Run model and collect predictions on BABILong

In [None]:
import os
# os.chdir('..')
# NOTE(review): hard-coded, machine-specific absolute path. Relative paths used
# later in this notebook (e.g. './babilong_evals_1k') resolve against this
# directory; presumably the `babilong` package imported below is also found
# from here -- confirm and adjust when running on another machine.
os.chdir('/home/booydar/rmt/babilong')

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datasets
from tqdm.auto import tqdm
import pandas as pd
import time
import json
import seaborn as sns
import matplotlib
import matplotlib.pylab as plt
from matplotlib.colors import LinearSegmentedColormap

import pandas as pd
import numpy as np

from pathlib import Path

from babilong.prompts import DEFAULT_PROMPTS, DEFAULT_TEMPLATE, get_formatted_input
from babilong.metrics import compare_answers, TASK_LABELS

In [None]:
# Hugging Face model id; also used as a sub-directory name for result files.
model_name = 'NousResearch/Meta-Llama-3-8B-Instruct'

# Placement and numeric precision for the model weights.
# Set device to 'cpu' to run without a GPU.
device = 'cuda:0'
dtype = torch.bfloat16

# NOTE(review): not referenced by the cells visible in this notebook --
# presumably the intended maximum context length in tokens; confirm.
max_length = 128_000

In [None]:
# Tokenizer and model are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the weights onto `device` in `dtype` and switch to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map=device,
    torch_dtype=dtype,
    attn_implementation='flash_attention_2',
).eval()

# Token ids at which generation should stop: the tokenizer's EOS token and
# the "<|eot_id|>" token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

In [None]:
# Keyword arguments forwarded to `model.generate`: single-beam, non-sampling
# decoding that stops at any id in `terminators`.
# The sampling knobs are explicitly None -- presumably to override sampling
# defaults shipped with the checkpoint; confirm.
generate_kwargs = dict(
    num_beams=1,
    do_sample=False,
    temperature=None,
    top_p=None,
    top_k=None,
    eos_token_id=terminators,
)

In [None]:
def format_examples(default_examples):
    """Split a few-shot examples string into parallel input/output lists.

    The expected layout of every example is::

        <example>
        <text and question>
        Answer: <answer>
        </example>

    Args:
        default_examples: String holding zero or more ``<example>`` blocks.
            An empty string or ``None`` means "no examples".

    Returns:
        A tuple ``(inputs, outputs)`` of equally long lists: ``inputs[i]`` is
        the text before the ``Answer`` line of example ``i`` and
        ``outputs[i]`` is the answer text following ``Answer:``.

    Raises:
        ValueError: If a non-blank chunk lacks the closing ``</example>`` tag
            or the ``Answer`` line.
    """
    if not default_examples:
        return [], []

    inputs, outputs = [], []
    for chunk in default_examples.split('<example>\n'):
        # Blank text around the blocks (e.g. a leading newline) is not an example.
        if not chunk.strip():
            continue

        body, closing_tag, _ = chunk.partition("\n</example>")
        if not closing_tag:
            raise ValueError(f"example is missing the closing </example> tag: {chunk!r}")

        text, marker, answer = body.partition("\nAnswer")
        if not marker:
            raise ValueError(f"example is missing the 'Answer' line: {body!r}")

        # Drop the ':' and the padding after it instead of assuming a fixed
        # width, so "Answer:X" and "Answer: X" both yield "X".
        if answer.startswith(":"):
            answer = answer[1:]
        inputs.append(text)
        outputs.append(answer.lstrip(" "))
    return inputs, outputs

def get_formatted_input(context, question, examples, instruction, post_prompt):
    """Build the chat-message list for a single sample.

    NOTE: this definition replaces the `get_formatted_input` imported from
    `babilong.prompts` at the top of the notebook.

    Args:
        context: Text the question has to be answered from.
        question: Question about `context`.
        examples: In-context examples string, parsed by `format_examples`.
        instruction: General instruction; sent as a system message when
            non-empty.
        post_prompt: Additional instructions placed after the question.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts: an optional system
        message, one user/assistant pair per example, and a final user
        message holding context, question and post-prompt.
    """
    shot_inputs, shot_outputs = format_examples(examples)

    messages = [{"role": "system", "content": instruction}] if len(instruction) > 0 else []

    for shot_in, shot_out in zip(shot_inputs, shot_outputs):
        messages.append({"role": "user", "content": shot_in})
        messages.append({"role": "assistant", "content": shot_out})

    final_turn = f"{context} {question}\n {post_prompt}"
    messages.append({"role": "user", "content": final_turn})
    return messages

### Run evaluation on 1k examples on <=32k

In [None]:
# Tasks qa1..qa5 and the context-length splits evaluated in this section.
tasks = [f'qa{n}' for n in range(1, 6)]
split_names = ['0k'] + [f'{2 ** p}k' for p in range(6)]

In [None]:
# zero-shot: empty instruction, no in-context examples, no post-prompt
for task in tqdm(tasks, desc='tasks'):
    prompt_cfg = {
        'instruction': '',
        'examples': '',
        'post_prompt': '',
        'template': None,
    }

    # File-name suffix recording which prompt parts are non-empty,
    # e.g. "instruction_no_examples_no_post_prompt_no".
    prompt_name = [f'{k}_no' if len(prompt_cfg[k]) == 0 else f'{k}_yes' for k in prompt_cfg if k != 'template']
    prompt_name = '_'.join(prompt_name)
    for split_name in tqdm(split_names, desc='lengths'):
        data = datasets.load_dataset("booydar/babilong-1k-samples", split_name)
        task_data = data[task]

        outfile = Path(f'./babilong_evals_1k/{model_name}/{task}_{split_name}_{prompt_name}.csv')
        outfile.parent.mkdir(parents=True, exist_ok=True)
        cfg_file = f'./babilong_evals_1k/{model_name}/{task}_{split_name}_{prompt_name}.json'
        # Store the prompt and generation settings next to the predictions.
        # The context manager closes (and flushes) the file deterministically
        # instead of leaving the handle to the garbage collector.
        with open(cfg_file, 'w') as cfg_fh:
            json.dump({'prompt': prompt_cfg, 'generate_kwargs': generate_kwargs}, cfg_fh, indent=4)

        df = pd.DataFrame({'target': [], 'output': [], 'question': []})

        for sample in tqdm(task_data):
            target = sample['target']
            context = sample['input']
            question = sample['question']

            messages = get_formatted_input(context, question, prompt_cfg['examples'],
                                             prompt_cfg['instruction'], prompt_cfg['post_prompt'])
            input_ids = tokenizer.apply_chat_template(messages,
                                        add_generation_prompt=True,
                                        return_tensors="pt"
                                        ).to(model.device)

            sample_length = input_ids.shape[1]
            with torch.no_grad():
                # max_length covers prompt + generated tokens, so at most 25 new tokens are produced.
                output = model.generate(input_ids=input_ids, max_length=sample_length+25, **generate_kwargs)
            # Keep only the tokens generated after the prompt.
            output = output[0][sample_length:]
            output = tokenizer.decode(output, skip_special_tokens=True).strip()

            df.loc[len(df)] = [target, output, question]
            # The CSV is rewritten after every sample -- presumably so that
            # partial results survive an interrupted run.
            df.to_csv(outfile)

In [None]:
# few-shot: per-task instruction, in-context examples and post-prompt taken
# from DEFAULT_PROMPTS

for task in tqdm(tasks, desc='tasks'):
    prompt_cfg = {
        'instruction': DEFAULT_PROMPTS[task]['instruction'],
        'examples': DEFAULT_PROMPTS[task]['examples'],
        'post_prompt': DEFAULT_PROMPTS[task]['post_prompt'],
        'template': None,
    }

    # File-name suffix recording which prompt parts are non-empty,
    # e.g. "instruction_yes_examples_yes_post_prompt_yes".
    prompt_name = [f'{k}_no' if len(prompt_cfg[k]) == 0 else f'{k}_yes' for k in prompt_cfg if k != 'template']
    prompt_name = '_'.join(prompt_name)
    for split_name in tqdm(split_names, desc='lengths'):
        data = datasets.load_dataset("booydar/babilong-1k-samples", split_name)
        task_data = data[task]

        outfile = Path(f'./babilong_evals_1k/{model_name}/{task}_{split_name}_{prompt_name}.csv')
        outfile.parent.mkdir(parents=True, exist_ok=True)
        cfg_file = f'./babilong_evals_1k/{model_name}/{task}_{split_name}_{prompt_name}.json'
        # Store the prompt and generation settings next to the predictions.
        # The context manager closes (and flushes) the file deterministically
        # instead of leaving the handle to the garbage collector.
        with open(cfg_file, 'w') as cfg_fh:
            json.dump({'prompt': prompt_cfg, 'generate_kwargs': generate_kwargs}, cfg_fh, indent=4)

        df = pd.DataFrame({'target': [], 'output': [], 'question': []})

        for sample in tqdm(task_data):
            target = sample['target']
            context = sample['input']
            question = sample['question']

            messages = get_formatted_input(context, question, prompt_cfg['examples'],
                                             prompt_cfg['instruction'], prompt_cfg['post_prompt'])
            input_ids = tokenizer.apply_chat_template(messages,
                                        add_generation_prompt=True,
                                        return_tensors="pt"
                                        ).to(model.device)

            sample_length = input_ids.shape[1]
            with torch.no_grad():
                # max_length covers prompt + generated tokens, so at most 25 new tokens are produced.
                output = model.generate(input_ids=input_ids, max_length=sample_length+25, **generate_kwargs)
            # Keep only the tokens generated after the prompt.
            output = output[0][sample_length:]
            output = tokenizer.decode(output, skip_special_tokens=True).strip()

            df.loc[len(df)] = [target, output, question]
            # The CSV is rewritten after every sample -- presumably so that
            # partial results survive an interrupted run.
            df.to_csv(outfile)

### Run evaluation on 100 examples on >32k

In [None]:
# Tasks qa1..qa5 and the long-context splits evaluated in this section.
tasks = [f'qa{n}' for n in range(1, 6)]
split_names = [f'{n}k' for n in (64, 128)]

In [None]:
# zero-shot on the long-context splits: empty instruction, no in-context
# examples, no post-prompt
for task in tqdm(tasks, desc='tasks'):
    prompt_cfg = {
        'instruction': '',
        'examples': '',
        'post_prompt': '',
        'template': None,
    }

    # File-name suffix recording which prompt parts are non-empty,
    # e.g. "instruction_no_examples_no_post_prompt_no".
    prompt_name = [f'{k}_no' if len(prompt_cfg[k]) == 0 else f'{k}_yes' for k in prompt_cfg if k != 'template']
    prompt_name = '_'.join(prompt_name)
    for split_name in tqdm(split_names, desc='lengths'):
        data = datasets.load_dataset("booydar/babilong", split_name)
        task_data = data[task]

        # Results share the './babilong_evals_1k' folder with the shorter splits.
        outfile = Path(f'./babilong_evals_1k/{model_name}/{task}_{split_name}_{prompt_name}.csv')
        outfile.parent.mkdir(parents=True, exist_ok=True)
        cfg_file = f'./babilong_evals_1k/{model_name}/{task}_{split_name}_{prompt_name}.json'
        # Store the prompt and generation settings next to the predictions.
        # The context manager closes (and flushes) the file deterministically
        # instead of leaving the handle to the garbage collector.
        with open(cfg_file, 'w') as cfg_fh:
            json.dump({'prompt': prompt_cfg, 'generate_kwargs': generate_kwargs}, cfg_fh, indent=4)

        df = pd.DataFrame({'target': [], 'output': [], 'question': []})

        for sample in tqdm(task_data):
            target = sample['target']
            context = sample['input']
            question = sample['question']

            messages = get_formatted_input(context, question, prompt_cfg['examples'],
                                             prompt_cfg['instruction'], prompt_cfg['post_prompt'])
            input_ids = tokenizer.apply_chat_template(messages,
                                        add_generation_prompt=True,
                                        return_tensors="pt"
                                        ).to(model.device)

            sample_length = input_ids.shape[1]
            with torch.no_grad():
                # max_length covers prompt + generated tokens, so at most 25 new tokens are produced.
                output = model.generate(input_ids=input_ids, max_length=sample_length+25, **generate_kwargs)
            # Keep only the tokens generated after the prompt.
            output = output[0][sample_length:]
            output = tokenizer.decode(output, skip_special_tokens=True).strip()

            df.loc[len(df)] = [target, output, question]
            # The CSV is rewritten after every sample -- presumably so that
            # partial results survive an interrupted run.
            df.to_csv(outfile)

In [None]:
# few-shot on the long-context splits: per-task instruction, in-context
# examples and post-prompt taken from DEFAULT_PROMPTS

for task in tqdm(tasks, desc='tasks'):
    prompt_cfg = {
        'instruction': DEFAULT_PROMPTS[task]['instruction'],
        'examples': DEFAULT_PROMPTS[task]['examples'],
        'post_prompt': DEFAULT_PROMPTS[task]['post_prompt'],
        'template': None,
    }

    # File-name suffix recording which prompt parts are non-empty,
    # e.g. "instruction_yes_examples_yes_post_prompt_yes".
    prompt_name = [f'{k}_no' if len(prompt_cfg[k]) == 0 else f'{k}_yes' for k in prompt_cfg if k != 'template']
    prompt_name = '_'.join(prompt_name)
    for split_name in tqdm(split_names, desc='lengths'):
        data = datasets.load_dataset("booydar/babilong", split_name)
        task_data = data[task]

        # Results share the './babilong_evals_1k' folder with the shorter splits.
        outfile = Path(f'./babilong_evals_1k/{model_name}/{task}_{split_name}_{prompt_name}.csv')
        outfile.parent.mkdir(parents=True, exist_ok=True)
        cfg_file = f'./babilong_evals_1k/{model_name}/{task}_{split_name}_{prompt_name}.json'
        # Store the prompt and generation settings next to the predictions.
        # The context manager closes (and flushes) the file deterministically
        # instead of leaving the handle to the garbage collector.
        with open(cfg_file, 'w') as cfg_fh:
            json.dump({'prompt': prompt_cfg, 'generate_kwargs': generate_kwargs}, cfg_fh, indent=4)

        df = pd.DataFrame({'target': [], 'output': [], 'question': []})

        for sample in tqdm(task_data):
            target = sample['target']
            context = sample['input']
            question = sample['question']

            messages = get_formatted_input(context, question, prompt_cfg['examples'],
                                             prompt_cfg['instruction'], prompt_cfg['post_prompt'])
            input_ids = tokenizer.apply_chat_template(messages,
                                        add_generation_prompt=True,
                                        return_tensors="pt"
                                        ).to(model.device)

            sample_length = input_ids.shape[1]
            with torch.no_grad():
                # max_length covers prompt + generated tokens, so at most 25 new tokens are produced.
                output = model.generate(input_ids=input_ids, max_length=sample_length+25, **generate_kwargs)
            # Keep only the tokens generated after the prompt.
            output = output[0][sample_length:]
            output = tokenizer.decode(output, skip_special_tokens=True).strip()

            df.loc[len(df)] = [target, output, question]
            # The CSV is rewritten after every sample -- presumably so that
            # partial results survive an interrupted run.
            df.to_csv(outfile)

## Evaluate results

In [None]:
import seaborn as sns
import matplotlib
import matplotlib.pylab as plt
from matplotlib.colors import LinearSegmentedColormap

import pandas as pd
import numpy as np

In [None]:
# Root directory holding the per-model prediction CSVs.
results_folder = './babilong_evals_1k'

# Models to score; each name is also a sub-directory of `results_folder`.
model_names = [
    'NousResearch/Meta-Llama-3-8B-Instruct',
]

In [None]:
# Rows (tasks) and columns (context lengths) of the accuracy tables.
tasks = [f'qa{n}' for n in range(1, 6)]
lengths = ['0k'] + [f'{2 ** p}k' for p in range(6)]

In [None]:
def _accuracy_table(model_name, prompt_name):
    """Score the saved predictions of one model for one prompt configuration.

    Args:
        model_name: Model id, used as a sub-directory of `results_folder`.
        prompt_name: Prompt suffix of the result files, e.g.
            'instruction_no_examples_no_post_prompt_no'.

    Returns:
        Float array of shape (len(tasks), len(lengths)) with accuracies in
        percent; cells whose result file does not exist stay at -1.

    Raises:
        KeyError: If a result file lacks an expected column or `TASK_LABELS`
            has no entry for a task.
    """
    accuracy = np.full((len(tasks), len(lengths)), -1.0)
    for j, task in enumerate(tasks):
        for i, ctx_length in enumerate(lengths):
            fname = f'{results_folder}/{model_name}/{task}_{ctx_length}_{prompt_name}.csv'
            if not os.path.isfile(fname):
                # print(f'No such file: {fname}')
                continue

            df = pd.read_csv(fname)
            if len(df) == 0:
                # No predictions saved yet: score 0 instead of calling apply() on an empty frame.
                accuracy[j, i] = 0
                continue

            # Empty outputs are read back as NaN. Fill them BEFORE casting to
            # str, otherwise they turn into the literal string 'nan'.
            df['output'] = df['output'].astype(object).fillna('').astype(str)

            df['correct'] = df.apply(lambda row: compare_answers(target=row['target'], output=row['output'], question=row['question'],
                                                                 task_labels=TASK_LABELS[task]), axis=1)
            score = df['correct'].sum()
            accuracy[j, i] = 100 * score / len(df)
    return accuracy


# Per-model accuracy tables: results[model_name] = {'zero': ..., 'few': ...}
results = {}
for model_name in model_names:
    try:
        accuracy = _accuracy_table(model_name, 'instruction_no_examples_no_post_prompt_no')
        accuracy_fs = _accuracy_table(model_name, 'instruction_yes_examples_yes_post_prompt_yes')

        results[model_name] = {'zero': accuracy, 'few': accuracy_fs}

        # Set large font sizes for better visibility in the PDF
        matplotlib.rc('font', size=14)

        # Create a colormap for the heatmap
        cmap = LinearSegmentedColormap.from_list('ryg', ["red", "yellow", "green"], N=256)

        # Zero-shot heatmap on the left, few-shot heatmap on the right
        fig, ax = plt.subplots(1, 2, figsize=(10, 3.5))  # Adjust the size as necessary

        sns.heatmap(accuracy, cmap=cmap, vmin=0, vmax=100, annot=True, fmt=".0f",
                    linewidths=.5, xticklabels=lengths, yticklabels=tasks, ax=ax[0])

        sns.heatmap(accuracy_fs, cmap=cmap, vmin=0, vmax=100, annot=True, fmt=".0f",
                    linewidths=.5, xticklabels=lengths, yticklabels=tasks, ax=ax[1])

        ax[0].set_title(f'Zero shot \n {model_name}')
        ax[1].set_title(f'Few shot \n {model_name}')
        ax[0].set_xlabel('Context size')
        ax[1].set_xlabel('Context size')
        ax[0].set_ylabel('Tasks')
        ax[1].set_ylabel('Tasks')

        # Save the figure to a PDF
        # plt.savefig('all_tasks_performance.pdf', bbox_inches='tight')
        plt.show()
    except KeyError as e:
        # Skip this model, but report which key was missing.
        print(f"question not found in {model_name} (missing key: {e})")
        continue
        

### Print table for copying to Google Sheets

In [32]:
mn = model_name

# Label the raw score matrices: one row per task, one column per context length.
zero_scores, few_scores = results[mn]['zero'], results[mn]['few']
tab = pd.DataFrame(zero_scores, index=tasks, columns=lengths[:zero_scores.shape[1]])
tab_few = pd.DataFrame(few_scores, index=tasks, columns=lengths[:few_scores.shape[1]])

# Row-wise mean over all context-length columns.
tab['avg'] = tab.mean(axis=1)
tab_few['avg'] = tab_few.mean(axis=1)

# Start from the few-shot rows and switch a task to its zero-shot row
# whenever the zero-shot average is strictly higher.
best_tab = tab_few.copy()
for task_name, zero_row in tab.iterrows():
    if zero_row['avg'] > best_tab.loc[task_name, 'avg']:
        best_tab.loc[task_name] = zero_row

# Display rounded integer scores without the trailing 'avg' column.
best_tab.round().astype(int).iloc[:, :-1]

Unnamed: 0,0k,1k,2k,4k,8k,16k,32k
qa1,98,-1,-1,-1,-1,-1,-1
qa2,-1,-1,-1,-1,-1,-1,-1
qa3,-1,-1,-1,-1,-1,-1,-1
qa4,-1,-1,-1,-1,-1,-1,-1
qa5,-1,-1,-1,-1,-1,-1,-1
