In [1]:
import os
import sys
import torch
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import transformers
import torch.nn.functional as F
import random
from tqdm import tqdm
import subprocess as sp



In [6]:
from google.colab import drive
drive.mount('/content/drive')
#########
#########
wd = "drive/MyDrive/gsm_test" # CHANGE TO YOUR WORKING DIRECTOR
##################

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
##########
file = open("drive/My Drive/gsm_test/HF_token.txt", "r") # REQUIRE HUGGINGFACE TOKEN TO ACCESS CERTAIN MODELS, SETUP TOKEN, SAVE TO TEXT
##########

HF_token = file.readline()
file.close()
os.environ["HF_TOKEN"] = HF_token

#!pip install -U "huggingface_hub[cli]"
!huggingface-cli login --token $HF_TOKEN --add-to-git-credential
!git config --global credential.helper store

Token is valid (permission: read).
The token `MyFirstToken` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [8]:
######### Specify a folder at Google drive to save downloaded pretrained models
###########
model_path = "drive/My Drive/Colab Notebooks/huggingface_models"
###########

def create_folder(dir):
    if not os.path.isdir(dir):
        os.mkdir(dir)

def get_gpu_memory():
    command = "nvidia-smi --query-gpu=memory.free --format=csv"
    memory_free_info = sp.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
    memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
    return memory_free_values

def fix_random_seed(seed, reproduce=False):
    # torch.backends.cudnn.enabled = True
    # torch.backends.cudnn.benchmark = True

    if reproduce:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        ## NOTE: uncomment for CUDA >= 10.2
        # os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
        ## NOTE: uncomment for pytorch >= 1.8
        # torch.use_deterministic_algorithms(True)

    random.seed(seed)
    np.random.seed(seed)
    rng = torch.manual_seed(seed)

    return rng

In [9]:
model_dict = {
    1: "mistral-7b",  # Checked
    2: "llama2-7b",
    3: "llama3-8b",
    4: "llama3-8B-Instruct",  # Checked
    5: "gemma2-9b-it",
    6: "qwen-1.5B",   # Checked
    7: "qwen-3B",  # Checked
    8: "qwen-7B",  # Checked
    ## The following are math reasoning models
    9: "llama3.1-8b",
    10: "qwen2.5-7b",  # Checked, all correct
    11: "mathstral-7b",  # Checked, incorrect on questions with most operands
    12: "deepseek-7b",  # Checked, incorrect on last two questions with most operands
}


pythia 7b


In [10]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import random

random.seed(2026)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch_dtype = torch.float16
output_hidden_states = False
output_attentions = False
skip = False # skip special tokens
trust_remote_code = True if model_name == "qwen" else False

if model_choice == "gpt2-medium":
  llm_name = 'gpt2-medium'
elif model_choice == "mistral-7b":
  llm_name = "mistralai/Mistral-7B-Instruct-v0.3"
elif model_choice == "gemma2-9b":
  llm_name = "google/gemma-2-9b"
elif model_choice == "llama2-7b":
  llm_name = "meta-llama/Llama-2-7b-chat-hf"
elif model_choice == "llama3-8b":
  llm_name = "meta-llama/Meta-Llama-3-8B"
elif model_choice == "gptJ-6b":
  llm_name = "EleutherAI/gpt-j-6B"
elif model_choice == "llama3-8B-Instruct":
  llm_name = "meta-llama/Meta-Llama-3-8B-Instruct"
  torch_dtype = torch.bfloat16
elif model_choice == "gemma2-9b-it":
  llm_name = "google/gemma-2-9b-it"
  torch_dtype = torch.bfloat16
elif model_choice == "qwen-1.5B":
  llm_name = "Qwen/Qwen2.5-1.5B-Instruct"
elif model_choice == "qwen-3B":
  llm_name = "Qwen/Qwen2.5-3B-Instruct"
elif model_choice == "qwen-7B":
  llm_name = "Qwen/Qwen2.5-7B-Instruct"
elif model_choice == "llama3.1-8b":
  llm_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
elif model_choice == "qwen2.5-7b":
  llm_name = "Qwen/Qwen2.5-Math-7B-Instruct"
elif model_choice == "mathstral-7b":
  llm_name = "mistralai/Mathstral-7B-v0.1"
elif model_choice == "deepseek-7b":
  llm_name = "deepseek-ai/deepseek-math-7b-instruct"


# now loading model
if model_choice == "pythia-7b":
  from transformers import GPTNeoXForCausalLM
  model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-6.9b-deduped",
  revision="step3000",
  cache_dir=model_path,
  output_hidden_states=output_hidden_states
)

  tokenizer = AutoTokenizer.from_pretrained(
    "EleutherAI/pythia-6.9b-deduped",
    revision="step3000",
    cache_dir=model_path,
  )
else:
  model = AutoModelForCausalLM.from_pretrained(llm_name, cache_dir=model_path, trust_remote_code=trust_remote_code,
                                               torch_dtype="auto", device_map="auto")
  tokenizer = AutoTokenizer.from_pretrained(llm_name, padding_side='left', trust_remote_code=trust_remote_code)


if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model.generation_config.pad_token_id = tokenizer.pad_token_id

model.eval()
vocab_size = model.config.vocab_size
n_layer = model.config.num_hidden_layers
H = model.config.hidden_size

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
def gen_clauses(k=0):
  names = ["Seattle", "Alice", "Bob", "David", "Zoe", "John", "Rachel"]
  multipliers = ["twice", "5 times", "3 times", "twice", "4 times", "5 times"]
  factors = [2, 5, 3, 2, 4, 5]
  assert k < 7
  clauses = ""
  q_names = "and Seattle" if k==0 else ", ".join(names[:k]) + " and " + names[k]
  last_name = names[k]
  correct = ((2+1)*4)+1
  for j in range(k):
    clauses += f"{names[j]} has {multipliers[j]} as many sheep as {names[j+1]}. "
    correct = correct * factors[j] + 1
  correct *= 20
  return clauses, q_names, last_name, correct

clauses, q_names, last_name, correct = gen_clauses(3)
question = f"Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. {clauses}How many sheep do Toulouse, Charleston, {q_names} have together if {last_name} has 20 sheep?"
print(question)
print(correct)
print('='*50)

def gen_clauses_2(k=0):
  num_customers = [4, 2, 8, 3, 7, 5]
  num_DVDs = [3, 8, 9, 2, 1, 9]
  assert k < 7
  clauses = ""
  correct = 3 + 2*2
  for j in range(k):
    item = "DVDs" if num_DVDs[j] > 1 else "DVD"
    clauses += f"His next {num_customers[j]} customers buy {num_DVDs[j]} {item} each. "
    correct += num_customers[j] * num_DVDs[j]
  return clauses, correct

clauses, correct = gen_clauses_2(3)
question2 = f"Billy sells DVDs. He has 8 customers on Tuesday. His first 3 customers buy one DVD each. His next 2 customers buy 2 DVDs each. {clauses}His last 3 customers don't buy any DVDs. How many DVDs did Billy sell on Tuesday?"
print(question2)
print(correct)
print('='*50)

def gen_clauses_3(k=0):
  num_pairs = [2, 3, 1, 3, 4, 5]
  items = ["suspenders", "socks", "sunglasses", "gloves", "earrings", "slippers"]
  prices = [20.50, 5.00, 30.00, 21.50, 94.50, 20.50]
  assert k < 7
  clauses = ""
  clauses2 = ""
  correct = 3*16.5 + 3*22.5 + 3*42
  for j in range(k):
    clauses += f"{num_pairs[j]} pairs of {items[j]}, "
    clauses2 += f"One pair of {items[j]} costs ${prices[j]}. "
    correct += num_pairs[j] * prices[j]
  return clauses, clauses2, correct

clauses, clauses2, correct = gen_clauses_3(3)
question3 = f"Mishka bought 3 pairs of shorts, {clauses}3 pairs of pants, and 3 pairs of shoes. One pair of shorts costs $16.50. {clauses2}One pair of pants costs $22.50 and one pair of shoes costs $42. How many dollars did Mishka spend on all the clothing items?"
print(question3)
print(correct)
print('='*50)

def gen_questions(k=0, id=0):
  if id == 0:
    clauses, q_names, last_name, correct = gen_clauses(k)
    question = f"Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. {clauses}How many sheep do Toulouse, Charleston, {q_names} have together if {last_name} has 20 sheep?"
  elif id == 1:
    clauses, correct = gen_clauses_2(k)
    question = f"Billy sells DVDs. He has 8 customers on Tuesday. His first 3 customers buy one DVD each. His next 2 customers buy 2 DVDs each. {clauses}His last 3 customers don't buy any DVDs. How many DVDs did Billy sell on Tuesday?"
  elif id == 2:
    clauses, clauses2, correct = gen_clauses_3(k)
    question = f"Mishka bought 3 pairs of shorts, {clauses}3 pairs of pants, and 3 pairs of shoes. One pair of shorts costs $16.50. {clauses2}One pair of pants costs $22.50 and one pair of shoes costs $42. How many dollars did Mishka spend on all the clothing items?"
  return question, correct

Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. Seattle has twice as many sheep as Alice. Alice has 5 times as many sheep as Bob. Bob has 3 times as many sheep as David. How many sheep do Toulouse, Charleston, Seattle, Alice, Bob and David have together if David has 20 sheep?
8180
Billy sells DVDs. He has 8 customers on Tuesday. His first 3 customers buy one DVD each. His next 2 customers buy 2 DVDs each. His next 4 customers buy 3 DVDs each. His next 2 customers buy 8 DVDs each. His next 8 customers buy 9 DVDs each. His last 3 customers don't buy any DVDs. How many DVDs did Billy sell on Tuesday?
107
Mishka bought 3 pairs of shorts, 2 pairs of suspenders, 3 pairs of socks, 1 pairs of sunglasses, 3 pairs of pants, and 3 pairs of shoes. One pair of shorts costs $16.50. One pair of suspenders costs $20.5. One pair of socks costs $5.0. One pair of sunglasses costs $30.0. One pair of pants costs $22.50 and one pair of shoes costs $42. How ma

### Zero-shot prompting

In [None]:
"""
messages = [
    {"role": "system", "content": "You are a helpful assistant. Solve the math problem. End with: #### <final answer>"},
    {"role": "user", "content": question},
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # adds the assistant header for you
)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Important: Llama-3 chat templates use <|eot_id|>; include it as a stopping token.
# Some HF configs don’t stop on eot_id unless you pass it explicitly. :contentReference[oaicite:2]{index=2}
eos_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

out = model.generate(
    **inputs,
    max_new_tokens=256,
    eos_token_id=eos_ids,
    do_sample=False,
)

print(tokenizer.decode(out[0], skip_special_tokens=True))
"""

'\nmessages = [\n    {"role": "system", "content": "You are a helpful assistant. Solve the math problem. End with: #### <final answer>"},\n    {"role": "user", "content": question},\n]\n\nprompt = tokenizer.apply_chat_template(\n    messages,\n    tokenize=False,\n    add_generation_prompt=True,  # adds the assistant header for you\n)\n\ninputs = tokenizer(prompt, return_tensors="pt").to(model.device)\n\n# Important: Llama-3 chat templates use <|eot_id|>; include it as a stopping token.\n# Some HF configs don’t stop on eot_id unless you pass it explicitly. :contentReference[oaicite:2]{index=2}\neos_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]\n\nout = model.generate(\n    **inputs,\n    max_new_tokens=256,\n    eos_token_id=eos_ids,\n    do_sample=False,\n)\n\nprint(tokenizer.decode(out[0], skip_special_tokens=True))\n'

In [13]:
questions = []
ground_truths = []
for id in range(3):
  for k in range(6):
    question, correct = gen_questions(k, id)
    questions.append(question)
    ground_truths.append(correct)

# --- helper: safe prompt builder ---
def safe_build_prompt(tokenizer, system_text: str, user_text: str, model_name=None):
    """
    Returns a tokenizable prompt string (or chat template) that works with or without
    tokenizer.chat_template being set.
    """
    messages = [
        {"role": "system", "content": system_text},
        {"role": "user",   "content": user_text},
    ]

    # If tokenizer supports apply_chat_template and has a chat_template configured, use it.
    try:
        if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None:
            # tokenizer.apply_chat_template returns a string
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # some tokenizers expose apply_chat_template but require a template arg
        if hasattr(tokenizer, "apply_chat_template"):
            # attempt with explicit template if that exists
            try:
                return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            except Exception:
                pass
    except Exception:
        # if anything goes wrong, fallback to manual
        pass

    # Fallback: plain text chat-style prompt. This is generic and usually works.
    # Adjust separator if your model expects special tokens (e.g., "<|assistant|>" etc.)
    prompt = f"SYSTEM: {system_text}\n\nUSER: {user_text}\n\nASSISTANT:"
    return prompt

# --- Use it in your code ---
def build_prompt(q: str) -> str:
    system_text = "Solve the problem. Give the final answer as: #### <number>"
    # make special-case for qwen if you want a different instruction
    if model_name == "qwen":
        system_text = "You are a helpful assistant that solves math word problems."
        user_text = (
            "Solve the problem step by step. Show your reasoning.\n"
            "At the end, give the final answer on its own line in the format:\n"
            "#### <number>\n\n"
            f"{q}"
        )
    else:
        user_text = q

    return safe_build_prompt(tokenizer, system_text, user_text, model_name=model_name)

prompts = [build_prompt(q) for q in questions]

enc = tokenizer(
    prompts,
    return_tensors="pt",
    padding=True,       # <- handles different lengths
).to(model.device)

print(enc['input_ids'].shape)
eos_ids = [x for x in [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
] if x is not None]

max_new_tokens = 1024 if model_name == "qwen" else 512

out = model.generate(
    **enc,
    max_new_tokens=max_new_tokens,
    do_sample=False,
    eos_token_id=eos_ids if len(eos_ids) > 1 else eos_ids[0],
)

decoded = tokenizer.batch_decode(out, skip_special_tokens=True)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


torch.Size([18, 171])


In [14]:
import re

def extract_number(text: str):
    pattern = re.compile(r"(?:####|answer\s+is)\s*(.*)", re.IGNORECASE)
    matches = pattern.findall(text)
    return matches[-1].strip() if matches else None

def extract_last_number(text: str):
    nums = re.findall(r"-?\d+(?:\.\d+)?", text)
    return nums[-1] if nums else None

for i, generated in enumerate(decoded):
  #print(i, ground_truths[i], "|||||", generated[-15:])
  #print(i, ground_truths[i], "|||||", extract_number(generated))
  print(i, ground_truths[i], "|||||", extract_last_number(generated))

0 260 ||||| 4
1 540 ||||| 20
2 2720 ||||| 4
3 8180 ||||| 3
4 16380 ||||| 20
5 65540 ||||| 20
6 7 ||||| 3
7 19 ||||| 3
8 35 ||||| 4
9 107 ||||| 4
10 113 ||||| 4
11 120 ||||| 2
12 243.0 ||||| 1.00
13 284.0 ||||| 1.00
14 299.0 ||||| 1.00
15 329.0 ||||| 1.00
16 393.5 ||||| 20.00
17 771.5 ||||| 000


In [None]:
decoded[4]

"Solve the problem. Give the final answer as: #### <number>\n\nUser: Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. Seattle has twice as many sheep as Alice. Alice has 5 times as many sheep as Bob. Bob has 3 times as many sheep as David. David has twice as many sheep as Zoe. How many sheep do Toulouse, Charleston, Seattle, Alice, Bob, David and Zoe have together if Zoe has 20 sheep?\n\nAssistant: If Zoe has 20 sheep, then David has 2 * 20 = 40 sheep.\nIf David has 40 sheep, then Bob has 40 / 3 = 13.33 sheep.\nSince we can't have a fraction of a sheep, we'll round down to 13 sheep for Bob.\nIf Bob has 13 sheep, then Alice has 5 * 13 = 65 sheep.\nIf Alice has 65 sheep, then Seattle has 65 / 2 = 32.5 sheep.\nAgain, we'll round down to 32 sheep for Seattle.\nIf Seattle has 32 sheep, then Charleston has 4 * 32 = 128 sheep.\nIf Charleston has 128 sheep, then Toulouse has 2 * 128 = 256 sheep.\nIn total, Toulouse, Charleston, Seattle, Alice, Bo