In [157]:
import os
import yaml
import time
from tqdm import tqdm
from src.utils import batch_generate, tokens_generate, run_inference
from src.mem import check_memory
import importlib
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from dotenv import load_dotenv
import plotly.express as px
import torch
import torch.nn.functional as F
import huggingface
from datetime import datetime
load_dotenv('secrets.env')

with open("config/config.yaml", "r") as f: 
    config = yaml.safe_load(f)

model_str = 'deepseek_r1_qwendistill_1.5'
eval_df = 'big_bench'

ds = config['datasets'][eval_df]
model_path = config['models'][model_str]['path']

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [None]:
# !pip freeze > requirements.txt

In [139]:
dataset = load_dataset(ds['source'], ds['subset']).shuffle(config['seed'])
tokenizer = AutoTokenizer.from_pretrained(model_path) 
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# Experiment
Goal: test model performance on 100 BBH questions w/ chain of thought reasoning =D

In [178]:
df = pd.DataFrame(dataset['train'][0:10])
df['input'] = apply_instruct_format(df['input'], model = model_str, is_math = True)

batches = batch_generate(df, ds['input_column'], ds['target_column'])
tokens = tokens_generate(batches, tokenizer, device = 'mps')
res = run_inference(model, tokens, tokenizer, time_tracking = True)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In [218]:
import re
for result in res: 
    for single_response in result['response']:
        cleaned = re.sub(r'^.*?<think>', '<think>', single_response, flags=re.DOTALL)
        print(cleaned)

<think>
Okay, so I'm trying to figure out whether Kate didn't put oil in the machine causing it
<think>
Okay, so I need to figure out whether Alex caused the plants to dry out. Let me break
<think>
Okay, so I'm trying to figure out whether the motorboat started because Ned changed the motor's
<think>
Okay, so I'm trying to figure out whether the drunk driver caused the injury to Joe's son
<think>
Okay, so I need to figure out if Frank T. was intentionally shooting his neighbor in the body
<think>
Okay, so I'm trying to figure out whether Brown intentionally rolled a six on a die to deton
<think>
Okay, so I need to figure out if Susan intentionally increased the prominence of the Atlantic division. Let
<think>
Okay, so I'm trying to figure out whether the department budget committee causes the approval of Prof.
<think>
Okay, so I'm trying to figure out whether Louie won the $100 bet because
<think>
Okay, so I'm trying to figure out how a typical person would answer this question about c

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [193]:
# [print(result['response'].replace(result['input'], '').strip()) for result in res]

AttributeError: 'list' object has no attribute 'replace'

In [176]:

tokens = tokenizer(test[3], padding = True, truncation = True, return_tensors = 'pt').to(device)
response = model.generate(**tokens, temperature = 0.6, max_tokens = 2000)
full_response = tokenizer.decode(response[0], skip_special_tokens = True)

ValueError: The following `model_kwargs` are not used by the model: ['max_tokens'] (note: typos in the generate arguments will also show up in this list)

In [175]:
full_response.replace(test[3], '').strip() # nice! 

"Okay, so I'm trying to figure out whether the drunk driver caused the injury to Joe's son"