In [2]:
import gc

gc.collect()

0

In [3]:
import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TextStreamer,
    pipeline,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from huggingface_hub import login, notebook_login
from IPython.display import display, Markdown

In [4]:
def print_markdown(text: str):
  return display(Markdown(text))
model_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
notebook_login()

In [5]:

# quantization settings
quantization_configs = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_compute_dtype=torch.bfloat16,
  bnb_4bit_quant_type='nf4',
  bnb_4bit_use_double_quant=True
)

torch.device("mps")

model = AutoModelForCausalLM.from_pretrained(
  model_id,
  attn_implementation="sdpa",
  quantization_config=quantization_configs,
  device_map="auto",
  trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

prompts = [
  "if (5x - 10) = 2, solve for x",
  "if x = [1 0 2] and y = [0 1 0] find x^T y",
  r"""
**Advanced Linear Algebra — Singular Value Decomposition (SVD) Computation**
Let
\[
A =
\begin{bmatrix}
4 & 0 & 2 \\
1 & 3 & 1 \\
0 & 2 & 2
\end{bmatrix}.
\]

Compute the full Singular Value Decomposition of \(A\). That is, determine orthogonal matrices \(U \in \mathbb{R}^{3 \times 3}\) and \(V \in \mathbb{R}^{3 \times 3}\), along with a diagonal matrix \(\Sigma \in \mathbb{R}^{3 \times 3}\) with non-negative entries, such that

\[
A = U \Sigma V^\top.
\]

Your computation must proceed from first principles: derive the singular values via the eigenvalues of \(A^\top A\), construct \(V\) from the corresponding orthonormal eigenvectors, compute \(U\) using \(U = A V \Sigma^{-1}\), and explicitly verify the decomposition by matrix multiplication. Finally, use the SVD to determine the rank and 2-norm condition number of \(A\).
"""
]

Loading weights:   0%|          | 0/339 [00:00<?, ?it/s]

In [6]:
# inputs = [tokenizer(prompt, return_tensors='pt').to('cuda') for prompt in prompts]
# outputs = model.generate(**inputs[-1], max_new_tokens=1500, streamer=streamer, return_dict_in_generate=True)
# response = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

# parse chain-of-thought tokens

def process_response(response):
  reason_start = response.find('<think>') + len('<think>')
  reason_end = response.find('</think>')
  reason = response[reason_start:reason_end].strip()

  # output
  output_start = reason_end + len('</think>')
  output = response[output_start:].strip()

  return reason, output

# reason, output = process_response(response)
# print_markdown(f"**Reasoning**: {reason}")
# print_markdown(f"**Output**: {output}")

### Let's fine-tune Deepseek-R1-Qwen 1.5 b on math & physics problems

Datasets:

<b>Math</b>
```{python}
from datasets import load_dataset

ds = load_dataset("math-ai/StackMathQA", "stackmathqa1600k") # or any valid config_name
```

<b> General 'science' datasets </b>
<li> Scibench </li>
<li> Others </li>

```{python}
from datasets import load_dataset

dataset_names = [('xw27/scibench', 'xw27/scibench'), (name, config)]
for ds_name, config_name in dataset_names:
  ds = load_dataset(ds_name, config_name)
```


In [None]:
from datasets import load_dataset, concatenate_datasets

# Load datasets
print("Loading mathllm/MathCodeInstruct...")
math_dataset = load_dataset("mathllm/MathCodeInstruct", split="train")

print("Loading xw27/scibench...")
scibench_dataset = load_dataset("xw27/scibench", split="train")

# Display dataset sizes
print(f"\nOriginal dataset sizes:")
print(f"  MathCodeInstruct: {len(math_dataset):,} examples")
print(f"  SciBench: {len(scibench_dataset):,} examples")

# Calculate sampling to get ~25k total examples
# Allocate 60% to MathCodeInstruct (for tool-calling) and 40% to SciBench (for science)
max_total = 25000
math_samples = min(len(math_dataset), int(max_total * 0.6))  # 15k for tool-calling & math
scibench_samples = min(len(scibench_dataset), max_total - math_samples)  # ~10k for science

# Sample from datasets
print(f"\nSampling datasets to reach ~{max_total:,} examples...")
print(f"  Target: {math_samples:,} from MathCodeInstruct (tool-calling & math)")
print(f"  Target: {scibench_samples:,} from SciBench (science)")

math_subset = math_dataset.shuffle(seed=42).select(range(math_samples))
scibench_subset = scibench_dataset.shuffle(seed=42).select(range(scibench_samples))

# Combine datasets
print("\nCombining datasets...")
combined_dataset = concatenate_datasets([math_subset, scibench_subset])

# Shuffle the combined dataset
combined_dataset = combined_dataset.shuffle(seed=42)

print(f"\n Combined dataset created:")
print(f"  Total examples: {len(combined_dataset):,}")
print(f"  - From MathCodeInstruct (tool-calling & math): {math_samples:,} ({math_samples/len(combined_dataset)*100:.1f}%)")
print(f"  - From SciBench (science): {scibench_samples:,} ({scibench_samples/len(combined_dataset)*100:.1f}%)")
print(f"\nDataset features: {combined_dataset.features}")

Loading mathllm/MathCodeInstruct...


train_80k.jsonl:   0%|          | 0.00/186M [00:00<?, ?B/s]

# Fine-Tuning Setup

Now we'll fine-tune DeepSeek-R1-Distill-Qwen-1.5B on our mixed math/science dataset using QLoRA.

In [None]:
first_item = combined_dataset[0]

message = first_item["messages"]
for msg in message:
  print(f"{msg['role']}: {msg['content']}")
  print()

system: [{'type': 'text', 'content': 'Below is a math problem. Please solve it step by step.'}]

user: [{'type': 'text', 'content': 'A soccer team has three goalies and ten defenders. The team also has twice as many midfielders as defenders, and the rest of the players are strikers. If the team has 40 players, how many strikers are in the team?'}]

assistant: [{'type': 'code', 'content': '# Given:\ngoalies = 3\ndefenders = 10\nmidfielders = 2 * defenders  # twice as many midfielders as defenders\n\n# Total players in the team is 40\ntotal_players = 40\n\n# The rest of the players are strikers, so:\nstrikers = total_players - (goalies + defenders + midfielders)\nstrikers'}, {'type': 'execution', 'content': '7'}, {'type': 'text', 'content': 'The number of strikers in the team is \\(\\boxed{7}\\).'}, {'type': 'text', 'content': '7'}]



In [None]:
# Install fine-tuning dependencies
!pip install -q -U \
    peft>=0.14.0 \
    trl>=0.12.0 \
    accelerate>=1.2.0 \
    scipy \
    tensorboard

In [None]:
from datasets import Dataset, concatenate_datasets

def format_mathcodeinstruct(example):
    """Convert MathCodeInstruct to DeepSeek-R1 chat format with reasoning."""
    messages = example.get("messages", [])
    if not messages:
        return None

    user_msg = None
    assistant_msg = None

    for msg in messages:
        if msg.get('role') == 'user':
            user_msg = msg.get('content', '')
        elif msg.get('role') == 'assistant':
            assistant_msg = msg.get('content', '')

    if not user_msg or not assistant_msg:
        return None

    formatted_string = ""
    if isinstance(assistant_msg, list):
        for block in assistant_msg:
            if isinstance(block, dict):
                if block.get('type') == 'code':
                    formatted_string += f"<code>\n{block.get('content', '')}\n</code>\n"
                else:
                    formatted_string += f"{block.get('content', '')}\n"
            else:
                formatted_string += str(block) + "\n"
        formatted_string = f"<think>\n{formatted_string.strip()}\n</think>\n\nThe solution is provided above."
    else:
        formatted_string = f"<think>\n{assistant_msg}\n</think>\n\nThe solution is provided above."

    return {
        "messages": [
            {"role": "user", "content": str(user_msg)},
            {"role": "assistant", "content": formatted_string}
        ]
    }


def format_scibench(example):
    """Convert SciBench to DeepSeek-R1 chat format with reasoning."""
    problem = example.get('problem_text', '')
    solution = example.get('solution', '')
    answer = example.get('answer_latex', '')
    unit = example.get('unit', '')

    if not problem or not solution:
        return None

    final_answer = answer
    if unit:
        final_answer = f"{answer} {unit}"

    formatted_response = f"<think>\n{solution}\n</think>\n\nFinal Answer: {final_answer}"

    return {
        "messages": [
            {"role": "user", "content": str(problem)},
            {"role": "assistant", "content": formatted_response}
        ]
    }


print("Formatting datasets to chat format with <think> reasoning tags...")

# Build lists directly — no .map(), no Arrow type issues
print("Formatting MathCodeInstruct...")
math_rows = []
for example in math_subset:
    result = format_mathcodeinstruct(example)
    if result is not None:
        math_rows.append(result)

print("Formatting SciBench...")
sci_rows = []
for example in scibench_subset:
    result = format_scibench(example)
    if result is not None:
        sci_rows.append(result)

# Create datasets from consistent lists
math_formatted = Dataset.from_list(math_rows)
scibench_formatted = Dataset.from_list(sci_rows)

print(f"  MathCodeInstruct: {len(math_formatted):,} valid examples")
print(f"  SciBench: {len(scibench_formatted):,} valid examples")

# Combine and shuffle
formatted_dataset = concatenate_datasets([math_formatted, scibench_formatted])
formatted_dataset = formatted_dataset.shuffle(seed=42)

print(f"\n Formatted dataset: {len(formatted_dataset):,} examples")
print("\nExample:")
print(formatted_dataset[0])

Formatting datasets to chat format with <think> reasoning tags...
Formatting MathCodeInstruct...
Formatting SciBench...
  MathCodeInstruct: 15,000 valid examples
  SciBench: 112 valid examples

✓ Formatted dataset: 15,112 examples

Example:
{'messages': [{'content': '[{\'type\': \'text\', \'content\': "Chloe bought chocolate-dipped strawberries at $50 a dozen. She then sold them for $30 for half a dozen during the Mother\'s Day celebration. How much is Chloe\'s profit if she sold 50 dozens?"}]', 'role': 'user'}, {'content': "<think>\nAlright, let's break the problem down step-by-step:\n\n1. **Determine Chloe's expense for 1 dozen strawberries:**\n   Chloe bought strawberries for $50 a dozen.\n\n2. **Determine Chloe's revenue from selling 1 dozen strawberries:**\n   Chloe sells them for $30 for half a dozen. \n   So, for 1 dozen, she would sell them for \\(2 \\times 30\\).\n\n3. **Determine Chloe's profit for 1 dozen strawberries:**\n   Profit for 1 dozen = Revenue from 1 dozen - Expens

In [None]:
# Create 90/10 train/eval split
print("Creating train/eval split (90/10)...")
split_dataset = formatted_dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

print(f"\n✓ Split complete:")
print(f"  Training examples: {len(train_dataset):,}")
print(f"  Evaluation examples: {len(eval_dataset):,}")

Creating train/eval split (90/10)...

✓ Split complete:
  Training examples: 13,600
  Evaluation examples: 1,512


In [None]:
# Load model for fine-tuning with QLoRA
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)


if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.padding_side = "right"  # Important for training
  model.config.pad_token_id = tokenizer.pad_token

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA version
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CASUAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# tokenize dataset?
def tokenize(examples):
  return tokenizer.apply_chat_template(
      examples["messages"],
      truncation=True
      )

tokenized_training_dataset = train_dataset.map(
    tokenize,
    batched=True,
    remove_columns=train_dataset.column_names
)

tokenized_eval_dataset = test_dataset.map(
    tokenize,
    batched=True,
    remove_columns=test_dataset.column_names
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer, mlm=False
) # next-token prediction


training_args = TrainingArguments(
    output_dir="./deepseek-r1-math-science-lora-fine-tuning",
    overwrite_output_dir=True,
    num_train_epochs=2,
    max_steps=-1, # -1 value ensures that it follows the number of training epochs param

    # batch configs
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,

    # learning rate configs
    learning_rate=2e-4, # generally 5e-5 for full fine-tuning
    lr_scheduler_type="cosine",
    warmup_ratio=0.10,

    # optimization configs
    optim="paged_adamw_8bit", # memory efficient, adamw_torch_fused is fastest
    weight_decay=0.1,
    max_grad_norm=0.1,

    # eval configs
    evaluation_strategy="steps",
    eval_steps=500,

    # saving configs
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,

    # Logging configs
    logging_dir="./logs",
    logging_steps=50,
    logging_first_step=True,

    # performance configs
    fp16=True,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,

    # Misc
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# initialize trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# train
print("TRAINING STARTED")
trainer.train()
print("TRAINING COMPLETE")


# save model
output_path = "./deepseek-r1-math-science-full-fine-tuning"
trainer.save_model(output_path)
tokenizer.save_pretrained(output_path)


Model and tokenizer loaded


# Testing Fine-Tuned Model

Let's test the fine-tuned model on some math and science problems.

# Merge LoRA Adapters with Base Model

For deployment, you can merge the LoRA adapters with the base model to create a standalone model.

In [None]:
# Optional: Merge LoRA adapters with base model for deployment
print("Merging LoRA adapters with base model...")
print("This creates a standalone model (~3GB) that doesn't require PEFT.\n")

# Reload base model in fp16 (not quantized) for merging
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load base model in fp16
merge_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Load LoRA adapters
merge_model = PeftModel.from_pretrained(merge_model, output_path)

# Merge adapters into base model
merged_model = merge_model.merge_and_unload()

# Save merged model
merged_output_path = "./deepseek-r1-math-science-merged"
merged_model.save_pretrained(merged_output_path)
tokenizer.save_pretrained(merged_output_path)

print(f"\n Merged model saved to {merged_output_path}")
print(f"  Size: ~3GB")
print(f"\nYou can now convert this to GGUF for on-device deployment using llama.cpp.")

# Visualise Attention

<p> Use TinyBERT & visualizew the self attention mechanism </p>

In [None]:
from transformers import AutoModel
from bertviz import head_view, model_view

In [None]:
model=AutoModel.from_pretrained("prajjwal1/bert-tiny", output_attentions=True)
tokenizer=AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

prompt = "John's dog ate Nemis's cat"
inputs = tokenizer(prompt, return_tensors='pt')
with torch.no_grad():
  outputs = model(**inputs)



# Stop doing this blindly:
model = AutoModelForCausalLM.from_pretrained("model")

#### Start understanding:
#### - What are attention heads doing?
#### - How does positional encoding work?
#### - What's the difference between encoder-only (BERT), decoder-only (GPT), encoder-decoder (T5)?
#### - Why does model size scale quadratically with context length?

Action items:

Read "Attention Is All You Need" paper (seriously)
Implement a mini-transformer from scratch (100-200 lines)
Use model.generate(output_attentions=True) to visualize what the model "sees"
Study the HuggingFace modeling_*.py source code for your favorite models
Resources:

Andrej Karpathy's "Neural Networks: Zero to Hero" (YouTube)
Jay Alammar's "Illustrated Transformer" blog
HuggingFace Transformers source code

#### Bootcamp level:
model = AutoModelForCausalLM.from_pretrained("model", load_in_4bit=True)

#### Production level:
#### - Batching multiple requests efficiently
#### - KV-cache optimization for faster inference
#### - Flash Attention for 2-4x speedups
#### - ONNX/TensorRT conversion for production serving
#### - Model distillation to create smaller models
#### - Prompt caching to reduce redundant computation

Learn:

vLLM / Text Generation Inference (TGI): Production inference servers
GGML/llama.cpp: CPU-optimized inference
Triton Inference Server: Multi-model deployment
Batch processing: Processing 100 requests > processing 1 request 100 times
Continuous batching: Dynamic batching as requests arrive
Metrics that matter:

Throughput (tokens/second)
Latency (time to first token, time per token)
Cost per 1M tokens
GPU utilization %