# Library Install

In [12]:
!pip install -q transformers datasets matplotlib accelerate

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


# Check Environment

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
import matplotlib.pyplot as plt

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

PyTorch version: 2.4.1+cu124
CUDA available: True
GPU: NVIDIA RTX 4000 Ada Generation


# Load the Teacher Model

In [2]:
print("Loading teacher model (Qwen2.5-0.5B)...")

# Load the tokenizer (converts text to numbers)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")

# Load the pretrained teacher model
teacher_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B",
    torch_dtype=torch.float16,  # Use half precision to save memory
    device_map="cuda"            # Put it on your GPU
)

# Set to evaluation mode (no training, just inference)
teacher_model.eval()

print(f"Teacher model loaded!")
print(f"Number of parameters: {sum(p.numel() for p in teacher_model.parameters()):,}")

Loading teacher model (Qwen2.5-0.5B)...


`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Teacher model loaded!
Number of parameters: 494,032,768


# Eval on Teacher Model

In [4]:
# Cell 3: Test Teacher Model
print("Testing teacher model...")

# Input text
prompt = "The capital of France is"

# Tokenize (convert text to numbers)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate next tokens
with torch.no_grad():  # Don't compute gradients (saves memory)
    outputs = teacher_model.generate(
        **inputs,
        max_new_tokens=10,      # Generate 10 new tokens
        do_sample=False,        # Greedy decoding (pick highest probability)
        pad_token_id=tokenizer.eos_token_id
    )

# Decode back to text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"\nInput: {prompt}")
print(f"Generated: {generated_text}")

Testing teacher model...

Input: The capital of France is
Generated: The capital of France is Paris. It is the largest city in Europe and


In [5]:
# Cell 4: Look at Teacher's Soft Targets
print("Looking at teacher's probability distribution...\n")

# Simple prompt
prompt = "The cat sat on the"

# Tokenize
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Get model's output (logits = raw scores before softmax)
with torch.no_grad():
    outputs = teacher_model(**inputs)
    logits = outputs.logits  # Shape: [batch_size, sequence_length, vocab_size]

# Get the logits for predicting the NEXT token after our prompt
next_token_logits = logits[0, -1, :]  # Last position's predictions

# Convert logits to probabilities using softmax
probs = F.softmax(next_token_logits, dim=-1)

# Get top 10 most likely next tokens
top_probs, top_indices = torch.topk(probs, k=10)

print(f"Prompt: '{prompt}'")
print(f"\nTop 10 predictions for next token:")
print("-" * 50)
for i, (prob, idx) in enumerate(zip(top_probs, top_indices)):
    token = tokenizer.decode([idx])
    print(f"{i+1}. '{token}' → {prob.item()*100:.2f}%")

Looking at teacher's probability distribution...

Prompt: 'The cat sat on the'

Top 10 predictions for next token:
--------------------------------------------------
1. ' mat' → 84.33%
2. ' cat' → 2.35%
3. ' fence' → 1.11%
4. '
' → 0.95%
5. ' window' → 0.47%
6. ' green' → 0.45%
7. ' ' → 0.44%
8. ' moon' → 0.42%
9. ' l' → 0.32%
10. ' ____' → 0.26%


# Setting Up the Student Model

In [8]:
# Cell 5: Create Student Model Configuration
from transformers import AutoConfig

print("Creating student model configuration...\n")

# Load teacher's config
teacher_config = teacher_model.config

print("Teacher configuration:")
print(f"  - Hidden size: {teacher_config.hidden_size}")
print(f"  - Number of layers: {teacher_config.num_hidden_layers}")
print(f"  - Attention heads: {teacher_config.num_attention_heads}")
print(f"  - Vocab size: {teacher_config.vocab_size}")
print(f"  - Total params: 494M")

# Create student config (SMALLER)
student_config = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B")
student_config.hidden_size = 256           # Teacher: 896, Student: 256
student_config.num_hidden_layers = 6       # Teacher: 24, Student: 6  
student_config.num_attention_heads = 8     # Teacher: 14, Student: 8
student_config.intermediate_size = 1024    # Teacher: 4864, Student: 1024

print("\nStudent configuration:")
print(f"  - Hidden size: {student_config.hidden_size}")
print(f"  - Number of layers: {student_config.num_hidden_layers}")
print(f"  - Attention heads: {student_config.num_attention_heads}")
print(f"  - Vocab size: {student_config.vocab_size}")

Creating student model configuration...

Teacher configuration:
  - Hidden size: 896
  - Number of layers: 24
  - Attention heads: 14
  - Vocab size: 151936
  - Total params: 494M

Student configuration:
  - Hidden size: 256
  - Number of layers: 6
  - Attention heads: 8
  - Vocab size: 151936


In [9]:
# Cell 6: Create Student Model
print("Creating student model from config...\n")

# Create model with our custom config
student_model = AutoModelForCausalLM.from_config(student_config)

# Move to GPU
student_model = student_model.to("cuda")

# Count parameters
student_params = sum(p.numel() for p in student_model.parameters())
teacher_params = sum(p.numel() for p in teacher_model.parameters())

print(f"Teacher parameters: {teacher_params:,}")
print(f"Student parameters: {student_params:,}")
print(f"Reduction: {teacher_params / student_params:.1f}x smaller")
print(f"\nStudent model created with RANDOM weights (untrained)")

Creating student model from config...

Teacher parameters: 494,032,768
Student parameters: 44,602,880
Reduction: 11.1x smaller

Student model created with RANDOM weights (untrained)


# Test the Student Model Note:This is an model which hasn't leanred

In [10]:
# Cell 7: Test Untrained Student
print("Testing UNTRAINED student (random weights)...\n")

prompt = "The capital of France is"

# Tokenize
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Get student's predictions (with random weights!)
with torch.no_grad():
    student_outputs = student_model(**inputs)
    student_logits = student_outputs.logits[0, -1, :]
    
# Convert to probabilities
student_probs = F.softmax(student_logits, dim=-1)

# Get top 10 predictions
top_probs, top_indices = torch.topk(student_probs, k=10)

print(f"Prompt: '{prompt}'")
print(f"\nStudent's top 10 predictions (UNTRAINED):")
print("-" * 50)
for i, (prob, idx) in enumerate(zip(top_probs, top_indices)):
    token = tokenizer.decode([idx])
    print(f"{i+1}. '{token}' → {prob.item()*100:.2f}%")

Testing UNTRAINED student (random weights)...

Prompt: 'The capital of France is'

Student's top 10 predictions (UNTRAINED):
--------------------------------------------------
1. '的基础上' → 0.00%
2. ' CUT' → 0.00%
3. 'act' → 0.00%
4. 'Compra' → 0.00%
5. '市中心' → 0.00%
6. ' PowerPoint' → 0.00%
7. ' pdb' → 0.00%
8. ' abort' → 0.00%
9. ' specification' → 0.00%
10. ' kenn' → 0.00%


# Load Trainning Data

In [22]:
# Cell 14: Clean the Dataset
print("Cleaning dataset - removing empty/short examples...\n")

# Reload fresh dataset
from datasets import load_dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

print(f"Original dataset size: {len(dataset)}")

# Filter function - keep only good examples
def is_good_example(example):
    text = example['text'].strip()
    # Keep if: has real content AND is at least 50 characters
    return len(text) > 50 and not text.startswith('=')  # Remove headers

# Filter dataset
dataset = dataset.filter(is_good_example)

print(f"After filtering: {len(dataset)} examples")
print(f"Removed: {36718 - len(dataset)} empty/bad examples")

# Take more examples now (since we have good data)
dataset = dataset.select(range(min(5000, len(dataset))))

print(f"Using: {len(dataset)} examples for training")

# Show example
print(f"\nExample good text:")
print("-" * 60)
print(dataset[0]['text'][:200])

Cleaning dataset - removing empty/short examples...

Original dataset size: 36718


Filter:   0%|          | 0/36718 [00:00<?, ? examples/s]

After filtering: 16018 examples
Removed: 20700 empty/bad examples
Using: 5000 examples for training

Example good text:
------------------------------------------------------------
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ p


# Setup the Loss Function

In [14]:
# Cell 9: Define Distillation Loss Function
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    """
    Compute KL divergence loss between student and teacher
    
    Args:
        student_logits: Raw scores from student [batch, seq_len, vocab_size]
        teacher_logits: Raw scores from teacher [batch, seq_len, vocab_size]
        temperature: Softmax temperature (higher = softer distributions)
    
    Returns:
        loss: KL divergence loss (scalar)
    """
    
    # Apply temperature scaling and softmax to get soft targets
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
    student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
    
    # KL divergence: sum(teacher_probs * log(teacher_probs / student_probs))
    # Equivalent to: sum(teacher_probs * (log(teacher_probs) - log(student_probs)))
    kl_div = F.kl_div(
        student_log_probs, 
        teacher_probs, 
        reduction='batchmean'
    )
    
    # Scale by temperature^2 (standard practice in distillation)
    kl_div = kl_div * (temperature ** 2)
    
    return kl_div

# Test the function
print("Distillation loss function defined!")
print(f"Temperature parameter: Makes distributions 'softer'")
print(f"Higher temp (e.g., 4.0) → more uniform probabilities")
print(f"Lower temp (e.g., 1.0) → sharper probabilities")

Distillation loss function defined!
Temperature parameter: Makes distributions 'softer'
Higher temp (e.g., 4.0) → more uniform probabilities
Lower temp (e.g., 1.0) → sharper probabilities


# Prepare Data for Training

In [23]:
# Cell 10: Prepare Training Data
print("Preparing training data...\n")

# Function to tokenize text
def tokenize_function(examples):
    # Tokenize the text
    return tokenizer(
        examples['text'],
        truncation=True,           # Cut off if too long
        max_length=128,            # Maximum 128 tokens per example
        padding='max_length',      # Pad shorter sequences
        return_tensors='pt'
    )

# Tokenize the dataset
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']  # Remove original text, keep only token IDs
)

# Set format for PyTorch
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

print(f"✓ Dataset tokenized!")
print(f"  Total examples: {len(tokenized_dataset)}")
print(f"  Max sequence length: 128 tokens")
print(f"\nExample tokenized:")
print(f"  Input IDs shape: {tokenized_dataset[0]['input_ids'].shape}")
print(f"  First 10 token IDs: {tokenized_dataset[0]['input_ids'][:10].tolist()}")

Preparing training data...

Tokenizing dataset...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

✓ Dataset tokenized!
  Total examples: 5000
  Max sequence length: 128 tokens

Example tokenized:
  Input IDs shape: torch.Size([128])
  First 10 token IDs: [5363, 73, 55661, 902, 85162, 88, 4204, 220, 18, 549]


In [24]:
# Cell 10.5: Check tokenized data
print("Checking tokenized examples...\n")

# Look at a few examples to find real text
for i in range(10):
    input_ids = tokenized_dataset[i]['input_ids']
    # Count non-padding tokens
    non_padding = (input_ids != 151643).sum().item()
    
    if non_padding > 20:  # Find one with at least 20 real tokens
        print(f"Example {i}:")
        print(f"  Non-padding tokens: {non_padding}")
        print(f"  First 20 token IDs: {input_ids[:20].tolist()}")
        
        # Decode to see the actual text
        decoded = tokenizer.decode(input_ids[:50], skip_special_tokens=True)
        print(f"  Decoded text: {decoded[:150]}")
        break

Checking tokenized examples...

Example 0:
  Non-padding tokens: 128
  First 20 token IDs: [5363, 73, 55661, 902, 85162, 88, 4204, 220, 18, 549, 1230, 8548, 291, 65316, 320, 10769, 549, 49434, 99, 74167]
  Decoded text:  Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chron


# Creating the Data Loader

In [25]:
# Cell 11: Create DataLoader
from torch.utils.data import DataLoader

print("Creating DataLoader...\n")

# Create DataLoader
train_dataloader = DataLoader(
    tokenized_dataset,
    batch_size=8,        # Process 8 examples at once
    shuffle=True,        # Randomize order each epoch
)

print(f"✓ DataLoader created!")
print(f"  Batch size: 8")
print(f"  Total batches: {len(train_dataloader)}")
print(f"  Total examples: {len(tokenized_dataset)}")

# Test the dataloader
print(f"\nTesting dataloader...")
batch = next(iter(train_dataloader))
print(f"  Batch input_ids shape: {batch['input_ids'].shape}")
print(f"  Batch attention_mask shape: {batch['attention_mask'].shape}")

Creating DataLoader...

✓ DataLoader created!
  Batch size: 8
  Total batches: 625
  Total examples: 5000

Testing dataloader...
  Batch input_ids shape: torch.Size([8, 128])
  Batch attention_mask shape: torch.Size([8, 128])


# Train the smaller Model

In [26]:
# Cell 12: Training Loop
from torch.optim import AdamW
from tqdm import tqdm

print("Setting up training...\n")

# Optimizer (updates student weights)
optimizer = AdamW(student_model.parameters(), lr=5e-5)

# Training settings
num_epochs = 3
temperature = 2.0

print(f"Training configuration:")
print(f"  Epochs: {num_epochs}")
print(f"  Batch size: 8")
print(f"  Learning rate: 5e-5")
print(f"  Temperature: {temperature}")
print(f"  Total training steps: {len(train_dataloader) * num_epochs}")
print("\n" + "="*60)

# Put student in training mode
student_model.train()

# Track losses
losses = []

# Training loop
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print("-" * 60)
    
    epoch_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Training")
    
    for batch_idx, batch in enumerate(progress_bar):
        # Move batch to GPU
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        
        # Get teacher predictions (no gradients needed)
        with torch.no_grad():
            teacher_outputs = teacher_model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            teacher_logits = teacher_outputs.logits
        
        # Get student predictions (we WILL compute gradients)
        student_outputs = student_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        student_logits = student_outputs.logits
        
        # Compute distillation loss
        loss = distillation_loss(student_logits, teacher_logits, temperature)
        
        # Backpropagation
        optimizer.zero_grad()  # Clear old gradients
        loss.backward()        # Compute new gradients
        optimizer.step()       # Update weights
        
        # Track loss
        epoch_loss += loss.item()
        losses.append(loss.item())
        
        # Update progress bar
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_loss = epoch_loss / len(train_dataloader)
    print(f"Average loss: {avg_loss:.4f}")

print("\n" + "="*60)
print("✓ Training complete!")

Setting up training...

Training configuration:
  Epochs: 3
  Batch size: 8
  Learning rate: 5e-5
  Temperature: 2.0
  Total training steps: 1875


Epoch 1/3
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:54<00:00, 11.44it/s, loss=772.7190] 


Average loss: 859.0203

Epoch 2/3
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:54<00:00, 11.43it/s, loss=751.7917]


Average loss: 791.3633

Epoch 3/3
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:55<00:00, 11.33it/s, loss=785.4382]

Average loss: 773.9486

✓ Training complete!





# Test the student Model

In [27]:
# Cell 13: Test Trained Student
print("Testing TRAINED student vs UNTRAINED...\n")

prompt = "The capital of France is"

# Tokenize
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Set student to eval mode
student_model.eval()

# Get trained student's predictions
with torch.no_grad():
    student_outputs = student_model(**inputs)
    student_logits = student_outputs.logits[0, -1, :]
    student_probs = F.softmax(student_logits, dim=-1)

# Get teacher's predictions (for comparison)
with torch.no_grad():
    teacher_outputs = teacher_model(**inputs)
    teacher_logits = teacher_outputs.logits[0, -1, :]
    teacher_probs = F.softmax(teacher_logits, dim=-1)

# Get top 10 for both
student_top_probs, student_top_indices = torch.topk(student_probs, k=10)
teacher_top_probs, teacher_top_indices = torch.topk(teacher_probs, k=10)

print(f"Prompt: '{prompt}'")
print("\n" + "="*60)
print("TEACHER predictions:")
print("-"*60)
for i, (prob, idx) in enumerate(zip(teacher_top_probs, teacher_top_indices)):
    token = tokenizer.decode([idx])
    print(f"{i+1}. '{token}' → {prob.item()*100:.2f}%")

print("\n" + "="*60)
print("TRAINED STUDENT predictions:")
print("-"*60)
for i, (prob, idx) in enumerate(zip(student_top_probs, student_top_indices)):
    token = tokenizer.decode([idx])
    print(f"{i+1}. '{token}' → {prob.item()*100:.2f}%")

print("\n" + "="*60)
print("REMEMBER - Untrained student predicted:")
print("'的基础上', 'CUT', 'PowerPoint', etc. (nonsense!)")

Testing TRAINED student vs UNTRAINED...

Prompt: 'The capital of France is'

TEACHER predictions:
------------------------------------------------------------
1. ' Paris' → 31.57%
2. ' ______' → 11.43%
3. ' ____' → 6.41%
4. ' __' → 5.48%
5. ':
' → 5.15%
6. ' located' → 3.57%
7. ' the' → 2.80%
8. '
' → 2.47%
9. ' (' → 1.99%
10. ' .
' → 1.90%

TRAINED STUDENT predictions:
------------------------------------------------------------
1. ' .
' → 0.09%
2. ' .

' → 0.08%
3. '.' → 0.08%
4. ' [' → 0.07%
5. ' B' → 0.07%
6. ' in' → 0.07%
7. ' .' → 0.06%
8. ' and' → 0.06%
9. '9' → 0.06%
10. '

' → 0.06%

REMEMBER - Untrained student predicted:
'的基础上', 'CUT', 'PowerPoint', etc. (nonsense!)


In [21]:
# Cell 13.5: Check if student is actually updating
print("Checking student's actual learning...\n")

# Generate text to see behavior
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

student_model.eval()
with torch.no_grad():
    outputs = student_model.generate(
        **inputs,
        max_new_tokens=10,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        temperature=1.0
    )

generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Student generates: {generated}")

# Also check teacher for comparison
with torch.no_grad():
    teacher_gen = teacher_model.generate(
        **inputs,
        max_new_tokens=10,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
teacher_text = tokenizer.decode(teacher_gen[0], skip_special_tokens=True)
print(f"Teacher generates: {teacher_text}")

Checking student's actual learning...

Student generates: The capital of France isHumanHumanHumanHumanHumanHumanHumanHumanHumanHuman
Teacher generates: The capital of France is Paris. It is the largest city in Europe and


# Better Training Loop

In [28]:
# Cell 15: RESTART Training Properly
print("Restarting with fresh student model and better config...\n")

# 1. CREATE FRESH STUDENT (important!)
print("Creating NEW student model...")
student_config = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B")
student_config.hidden_size = 256
student_config.num_hidden_layers = 6
student_config.num_attention_heads = 8
student_config.intermediate_size = 1024

student_model = AutoModelForCausalLM.from_config(student_config)
student_model = student_model.to("cuda")
print("✓ Fresh student created with random weights")

# 2. BETTER OPTIMIZER CONFIG
from torch.optim import AdamW
optimizer = AdamW(student_model.parameters(), lr=1e-4, weight_decay=0.01)

# 3. MORE EPOCHS
num_epochs = 10  # Increased from 3 to 10!
temperature = 3.0  # Increased temperature for softer targets

print(f"\nNew training configuration:")
print(f"  Epochs: {num_epochs} (was 3)")
print(f"  Learning rate: 1e-4 (was 5e-5)")
print(f"  Temperature: {temperature} (was 2.0)")
print(f"  Weight decay: 0.01 (regularization)")
print(f"  Total steps: {len(train_dataloader) * num_epochs}")
print("\n" + "="*60)

# 4. TRAIN
student_model.train()
losses = []

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print("-" * 60)
    
    epoch_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Training")
    
    for batch in progress_bar:
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        
        # Teacher predictions (frozen)
        with torch.no_grad():
            teacher_outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)
            teacher_logits = teacher_outputs.logits
        
        # Student predictions
        student_outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
        student_logits = student_outputs.logits
        
        # Loss
        loss = distillation_loss(student_logits, teacher_logits, temperature)
        
        # Update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        losses.append(loss.item())
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} avg loss: {avg_loss:.4f}")

print("\n" + "="*60)
print("✓ Training complete!")

Restarting with fresh student model and better config...

Creating NEW student model...
✓ Fresh student created with random weights

New training configuration:
  Epochs: 10 (was 3)
  Learning rate: 1e-4 (was 5e-5)
  Temperature: 3.0 (was 2.0)
  Weight decay: 0.01 (regularization)
  Total steps: 6250


Epoch 1/10
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:54<00:00, 11.45it/s, loss=435.6461]


Epoch 1 avg loss: 514.9640

Epoch 2/10
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:54<00:00, 11.49it/s, loss=381.9105]


Epoch 2 avg loss: 391.7198

Epoch 3/10
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:54<00:00, 11.48it/s, loss=374.4852]


Epoch 3 avg loss: 374.8575

Epoch 4/10
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:54<00:00, 11.38it/s, loss=385.5041]


Epoch 4 avg loss: 365.0590

Epoch 5/10
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:55<00:00, 11.34it/s, loss=350.7498]


Epoch 5 avg loss: 358.5952

Epoch 6/10
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:54<00:00, 11.43it/s, loss=414.4724]


Epoch 6 avg loss: 354.7307

Epoch 7/10
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:54<00:00, 11.42it/s, loss=357.2596]


Epoch 7 avg loss: 351.9713

Epoch 8/10
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:54<00:00, 11.43it/s, loss=320.4314]


Epoch 8 avg loss: 349.9068

Epoch 9/10
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:54<00:00, 11.44it/s, loss=324.0681]


Epoch 9 avg loss: 348.2993

Epoch 10/10
------------------------------------------------------------


Training: 100%|██████████| 625/625 [00:54<00:00, 11.44it/s, loss=400.7427]

Epoch 10 avg loss: 347.0600

✓ Training complete!





In [29]:
# Cell 16: Test the PROPERLY Trained Student
print("Testing the PROPERLY trained student...\n")

prompt = "The capital of France is"

# Tokenize
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Set to eval mode
student_model.eval()

# Get predictions
with torch.no_grad():
    # Teacher
    teacher_outputs = teacher_model(**inputs)
    teacher_logits = teacher_outputs.logits[0, -1, :]
    teacher_probs = F.softmax(teacher_logits, dim=-1)
    
    # Student
    student_outputs = student_model(**inputs)
    student_logits = student_outputs.logits[0, -1, :]
    student_probs = F.softmax(student_logits, dim=-1)

# Top 10 for both
teacher_top_probs, teacher_top_indices = torch.topk(teacher_probs, k=10)
student_top_probs, student_top_indices = torch.topk(student_probs, k=10)

print(f"Prompt: '{prompt}'")
print("\n" + "="*70)
print("TEACHER predictions:")
print("-"*70)
for i, (prob, idx) in enumerate(zip(teacher_top_probs, teacher_top_indices)):
    token = tokenizer.decode([idx])
    print(f"{i+1:2d}. '{token:15s}' → {prob.item()*100:6.2f}%")

print("\n" + "="*70)
print("TRAINED STUDENT predictions (10 epochs, proper config):")
print("-"*70)
for i, (prob, idx) in enumerate(zip(student_top_probs, student_top_indices)):
    token = tokenizer.decode([idx])
    is_match = idx in teacher_top_indices
    marker = " ✓ (matches teacher!)" if is_match else ""
    print(f"{i+1:2d}. '{token:15s}' → {prob.item()*100:6.2f}%{marker}")

print("\n" + "="*70)
print("Let's also try generation:")
print("-"*70)

# Generate text
with torch.no_grad():
    student_gen = student_model.generate(
        **inputs,
        max_new_tokens=15,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
    teacher_gen = teacher_model.generate(
        **inputs,
        max_new_tokens=15,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

student_text = tokenizer.decode(student_gen[0], skip_special_tokens=True)
teacher_text = tokenizer.decode(teacher_gen[0], skip_special_tokens=True)

print(f"\nTeacher: {teacher_text}")
print(f"Student: {student_text}")

Testing the PROPERLY trained student...

Prompt: 'The capital of France is'

TEACHER predictions:
----------------------------------------------------------------------
 1. ' Paris         ' →  31.57%
 2. ' ______        ' →  11.43%
 3. ' ____          ' →   6.41%
 4. ' __            ' →   5.48%
 5. ':
             ' →   5.15%
 6. ' located       ' →   3.57%
 7. ' the           ' →   2.80%
 8. '
              ' →   2.47%
 9. ' (             ' →   1.99%
10. ' .
            ' →   1.90%

TRAINED STUDENT predictions (10 epochs, proper config):
----------------------------------------------------------------------
 1. ',              ' →   0.57%
 2. ' .
            ' →   0.35% ✓ (matches teacher!)
 3. ' -             ' →   0.28%
 4. ' .

           ' →   0.25%
 5. ' that          ' →   0.20%
 6. ' [             ' →   0.19%
 7. ' as            ' →   0.19%
 8. ' in            ' →   0.19%
 9. ' more          ' →   0.19%
10. ' only          ' →   0.18%

Let's also try generation:
--------------