In [None]:
import os
import torch

# Set-up CUDA device.
# Both variables must be in the environment before CUDA is first initialised
# in this process, otherwise they are ignored.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Expose only physical GPUs 5, 6 and 7; inside this process they are
# renumbered as cuda:0, cuda:1 and cuda:2.
os.environ["CUDA_VISIBLE_DEVICES"] = "5,6,7"

# Use GPU for inference, falling back to the CPU when no GPU is visible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print the device being used
print(f"Using device: {device}")

# Report the name of every visible GPU. Iterating over device_count() avoids
# an error when fewer than three of the requested GPUs are actually present.
if device.type == 'cuda':
    gpu_names = [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]
    print("Using GPUs:", *gpu_names)

Using device: cuda
Using GPUs: NVIDIA A100-SXM4-40GB NVIDIA A100-SXM4-40GB NVIDIA A100-SXM4-40GB


In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint identifier to load (swap in another model id if desired)
model_name = "l3lab/L1-Qwen-1.5B-Max"

# Pick the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the half-precision weights, then switch to eval mode
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model.eval()

# Move the weights to the chosen device; as the cell's last expression this
# also displays the module tree.
model.to(device)

Fetching 2 files: 100%|██████████| 2/2 [1:24:29<00:00, 2534.77s/it]  
Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.20s/it]


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536, padding_idx=151643)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)


In [9]:
# Directory that receives the local snapshot of the tokenizer and the model
save_path = "../models/L1-Qwen-1.5B-Max"

# Write the tokenizer first, then the model weights, into the same directory
for artifact in (tokenizer, model):
    artifact.save_pretrained(save_path)

In [10]:
# Reload local copy
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and half-precision weights back from the local snapshot.
tokenizer = AutoTokenizer.from_pretrained(save_path)
# from_pretrained is called without any device placement here, so the model is
# moved explicitly onto `device` (defined in the set-up cells). Without this
# the reloaded weights stay on the CPU while later cells send inputs to the GPU.
model = AutoModelForCausalLM.from_pretrained(save_path, torch_dtype=torch.float16).to(device).eval()

In [12]:
import pandas as pd

# Read the MATH validation split from disk
math_val = pd.read_csv("../MATH/math_val.csv")

# Problem statement of the first row, used as the demo prompt further down
sample_problem = math_val["problem"].iloc[0]

# Display the frame as the cell output
math_val

Unnamed: 0,problem,level,type,split,solution,answer
0,Josh and Mike live 13 miles apart. Yesterday J...,4.0,Algebra,test,Because $\text{(rate)(time)} = \text{(distance...,5
1,Simplify the expression $(9x^2+3x+7)+(3x^2+7x^...,3.0,Algebra,test,"Combining like terms, we find that \begin{ali...",7x^5+12x^2+3x+9
2,"The arithmetic mean (or average) of $A$, $B$ a...",3.0,Algebra,test,"We know that $\frac{A+B+C}{3} = 10$, thus, $A+...",14
3,Eleven pencils cost as much as three pens. If ...,2.0,Algebra,test,"If seven pens costs $\$9.24$, then each pen co...",36
4,"Assuming $x\ne0$, simplify $\frac{12}{x \cdot ...",2.0,Algebra,test,We have \begin{align*}\n\frac{12}{x \cdot x} ...,10
...,...,...,...,...,...,...
4495,Below is the graph of an ellipse. (Assume that...,3.0,Intermediate Algebra,test,We see that the endpoints of the major axis of...,-4
4496,Find the center of the ellipse whose equation ...,1.0,Intermediate Algebra,test,"Completing the square in $x$ and $y,$ we get\n...","(2,1)"
4497,Let a sequence be defined as follows: $a_1 = 3...,5.0,Intermediate Algebra,test,The fact that the equation $a_{n+1}a_{n-1} = a...,224
4498,What is the area of the region in the $xy-$pla...,4.0,Intermediate Algebra,test,Because $\lfloor x \rfloor$ and $\lfloor y \rf...,10


In [None]:
# Sample inference: run one greedy generation on `sample_problem`.

# Single token budget shared by the prompt instruction and the generation cap,
# so the two values cannot drift apart.
think_tokens = 3600

# NOTE(review): the instruction says "boxed{}" without a leading backslash;
# if "\boxed{}" was intended, confirm against the model's documented prompt format.
messages = [
    {
        "role": "user",
        "content": f"{sample_problem} Let's think step by step and output the final answer within boxed{{}}. Think for {think_tokens} tokens."
    }
]

# Render the chat messages into a single prompt string
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True  # adds assistant role if needed
)

# Place the inputs on whatever device the model currently lives on instead of
# a hard-coded "cuda"; this avoids a device mismatch when the model is on the
# CPU (no GPU available, or the model was reloaded without being moved).
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=think_tokens, do_sample=False)  # greedy decoding
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(decoded)