<a href="https://colab.research.google.com/github/antalvdb/mblm/blob/main/timbl_llm_benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Benchmarking MBLM

## A notebook with a collection of repeatable benchmarks for MBLM models

This notebook contains a series of benchmarks that evaluate Memory-Based Language Modeling (MBLM) on autoregressive (decoder) tasks.

MBLM is a CPU-based LLM, so the Colab runtime can be set to CPU.

## Firing up MBLM

We begin by loading an `mblm` model. This requires installing `python3-timbl`.



In [1]:
!pip install python3-timbl

import timbl



In [2]:
# Fetch the MBLM repository; the model code imported later in this notebook lives there.
!git clone https://github.com/antalvdb/mblm
%cd mblm
# Pull only the IGTree instance base from Git LFS.
!git lfs pull -I chatbot-instruction-prompts_tok.l16r0.igtree.ibase
#!git lfs pull -I chatbot-instruction-prompts-100k_tok.l16r0.ibase
# `%cd` without an argument leaves the repository again (the cell output shows /root).
%cd
# Add the mblm directory to the Python path
import sys
sys.path.append('/content/mblm')

Cloning into 'mblm'...
remote: Enumerating objects: 165, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 165 (delta 11), reused 8 (delta 8), pack-reused 147 (from 1)[K
Receiving objects: 100% (165/165), 297.88 KiB | 6.34 MiB/s, done.
Resolving deltas: 100% (90/90), done.
/content/mblm
/root


## Benchmarking

### Hellaswag

Based on code by Andrej Karpathy, part of his [from-scratch reproduction of nanoGPT](https://github.com/karpathy/build-nanogpt/blob/master/hellaswag.py).

In [3]:
import os
import json
import requests
#import tiktoken
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.nn import functional as F
from IPython import get_ipython

# Directory in which the HellaSwag JSONL files are cached. It is anchored at the
# current working directory (os.getcwd()) instead of os.path.dirname(__file__).
DATA_CACHE_DIR = os.path.join(os.getcwd(), "hellaswag")

def download_file(url: str, fname: str, chunk_size=1024, timeout=30):
    """Helper function to download a file from a given url.

    The body is streamed in chunks of `chunk_size` bytes while a tqdm progress
    bar is updated. Data is first written to `fname + ".part"` and only moved
    to `fname` once the transfer has finished, so an interrupted or failed
    download never leaves a truncated file at `fname`.

    Args:
        url: Location to download from.
        fname: Destination path of the downloaded file.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Timeout in seconds handed to `requests.get`.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    tmp_name = fname + ".part"
    with requests.get(url, stream=True, timeout=timeout) as resp:
        # Fail loudly on 4xx/5xx instead of saving the error page as if it were data.
        resp.raise_for_status()
        total = int(resp.headers.get("content-length", 0))
        with open(tmp_name, "wb") as file, tqdm(
            desc=fname,
            total=total,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in resp.iter_content(chunk_size=chunk_size):
                size = file.write(data)
                bar.update(size)
    # Publish the file under its final name only after a complete download.
    os.replace(tmp_name, fname)

# Download URL of each HellaSwag split, keyed by split name.
hellaswags = {
    split: f"https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_{split}.jsonl"
    for split in ("train", "val", "test")
}

#enc = tiktoken.get_encoding("gpt2")

def download(split):
    """Fetch the given HellaSwag split into DATA_CACHE_DIR unless it is already there."""
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
    source_url = hellaswags[split]
    target_path = os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl")
    if os.path.exists(target_path):
        # A file from an earlier run is reused as-is.
        return
    print(f"Downloading {source_url} to {target_path}...")
    download_file(source_url, target_path)

def render_example(example, tokenizer):
    """
    Given the example as a dictionary, render it as torch tensors:
    - tokens (the tokens of context + completion, of size KxN with one row per
      candidate ending; HellaSwag examples have K = 4 candidates)
    - mask (is 1 in the region of the candidate completion, where we evaluate
      likelihoods; 0 over the context and over the padding)
    - label (the index of the correct completion, which we hope has the highest likelihood)

    Rows shorter than the longest one are right-padded with 0 in both tensors.

    Args:
        example: Dictionary with the keys "ctx", "label" and "endings".
        tokenizer: Object providing `encode(text, add_special_tokens=False)`
            that returns a list of token ids.

    Returns:
        A tuple `(data, tokens, mask, label)`, where `data` is a dictionary
        holding the label, the context token ids ("ctx_tokens") and the list of
        per-ending token ids ("ending_tokens").
    """
    ctx = example["ctx"]
    label = example["label"]
    endings = example["endings"]

    # gather up all the tokens
    ctx_tokens = tokenizer.encode(ctx, add_special_tokens=False)
    # note: " " is prepended to every ending, a convention kept from the
    # GPT-2 tokenizer based code this function was derived from
    ending_tokens = [
        tokenizer.encode(" " + end, add_special_tokens=False) for end in endings
    ]

    # the raw token ids, kept so the evaluation can be reproduced elsewhere
    data = {
        "label": label,
        "ctx_tokens": ctx_tokens,
        "ending_tokens": ending_tokens,
    }

    tok_rows = [ctx_tokens + end_tokens for end_tokens in ending_tokens]
    mask_rows = [
        [0] * len(ctx_tokens) + [1] * len(end_tokens) for end_tokens in ending_tokens
    ]

    # have to be careful during the collation because the number of tokens in each row can differ;
    # the row count follows the number of endings instead of a hard-coded 4
    num_rows = len(tok_rows)
    max_len = max((len(row) for row in tok_rows), default=0)
    tokens = torch.zeros((num_rows, max_len), dtype=torch.long)
    mask = torch.zeros((num_rows, max_len), dtype=torch.long)
    for i, (tok_row, mask_row) in enumerate(zip(tok_rows, mask_rows)):
        tokens[i, :len(tok_row)] = torch.tensor(tok_row, dtype=torch.long)
        mask[i, :len(mask_row)] = torch.tensor(mask_row, dtype=torch.long)

    return data, tokens, mask, label

def iterate_examples(split):
    """Yield the examples of a HellaSwag split, one parsed JSON object per line.

    The split is downloaded into DATA_CACHE_DIR first if it is not cached yet.
    The file is read as UTF-8 regardless of the platform's default encoding,
    and blank lines are skipped.

    Args:
        split: Name of the split; used to build the file name
            `hellaswag_{split}.jsonl`.
    """
    # there are 10,042 examples in total in val
    download(split)
    path = os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl")
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # tolerate empty lines, e.g. a trailing newline at the end of the file
                continue
            yield json.loads(line)

@torch.no_grad()
def evaluate_mblm(model, tokenizer, device):
    """
    Evaluate a model on the HellaSwag validation split and log the running accuracy.

    For every example, each candidate row produced by `render_example` is scored
    with `model.sequence_logprob(input_ids, tokenizer)`; the negated score is the
    candidate's loss. The prediction is the candidate with the smallest strictly
    positive loss. If no candidate has a positive loss, a candidate is picked
    at random.

    Args:
        model: Object providing `sequence_logprob(input_ids, tokenizer)`.
        tokenizer: Tokenizer passed on to `render_example` and to the model.
        device: Device the token tensor is moved to before scoring.

    Returns:
        None. Progress and per-example details are reported through `log`.
    """
    # NOTE(review): `log` is not defined in the visible part of this notebook;
    # it has to be in scope when this function is called.
    import random  # local import: only used by the random fallback below

    num_correct_norm = 0
    num_total = 0

    for example in iterate_examples("val"):
        _data, tokens, _mask, label = render_example(example, tokenizer)
        tokens = tokens.to(device)

        # NOTE(review): rows are right-padded with 0 by render_example and passed
        # on unchanged — confirm that sequence_logprob copes with the padding.
        losses = []
        for row in tokens:  # iterate through the candidate endings
            input_ids = row.unsqueeze(0)

            # Get the log probability of the sequence
            logp = model.sequence_logprob(input_ids, tokenizer)
            # float() accepts both a plain number and a one-element tensor
            losses.append(float(-logp))

        # Find the smallest non-zero loss and its index; min() keeps the first
        # candidate on ties
        scored = [(loss, i) for i, loss in enumerate(losses) if loss > 0.0]
        if scored:
            pred_norm = min(scored, key=lambda pair: pair[0])[1]
        else:
            # If no non-zero loss is found, pick randomly among the candidates
            pred_norm = random.randrange(len(losses))

        # accumulate stats
        num_total += 1
        num_correct_norm += int(pred_norm == label)
        log("---", level = 1)
        log(f"{num_total} acc_norm: {num_correct_norm}/{num_total}={num_correct_norm/num_total:.4f}", level = 1)

        # debug: pretty print the example, and the losses in each case
        log("---", level = 1)
        log(f"Context:\n {example['ctx']}", level = 1)
        log("Endings:", level = 1)
        for i, end in enumerate(example["endings"]):
            log(f"{i} (loss: {losses[i]:.4f}) {end}", level = 1)
        log(f"predicted: {pred_norm}, actual: {label}", level = 1)

In [None]:
# Cell purpose: build the tokenizer, the Timbl classifier and the model wrapper,
# then run the HellaSwag evaluation defined above.
device = "cpu" # Set a default device for notebook execution

from transformers import AutoTokenizer, AutoConfig
import timbl
import torch
import mblm.mblm_model
from mblm.mblm_model import TimblHuggingFaceModel #import the model class

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Initialize the Timbl classifier
# The path is the instance base pulled via Git LFS in the setup cell, without its
# ".ibase" suffix — presumably TimblClassifier appends it; TODO confirm.
# '-a1 +D' is passed through as the Timbl option string.
classifier = timbl.TimblClassifier('/content/mblm/chatbot-instruction-prompts_tok.l16r0.igtree', '-a1 +D')
classifier.load()

# Load the model configuration by name and register "_" as the padding token
config = AutoConfig.from_pretrained("antalvdb/mblm-chatbot-instruction-prompts-igtree")
tokenizer.add_special_tokens({'pad_token': '_'})
tokenizer.pad_token = "_"

# Initialize the TimblHuggingFaceModel
model = TimblHuggingFaceModel(config, classifier, tokenizer)

# Run the benchmark on the HellaSwag validation split
evaluate_mblm(model, tokenizer, device)

### MMLU