# SmolLM2-135M Model Training in Colab

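This notebook sets up and runs SmolLM2-135M model training on Google Colab with a T4 GPU. It expects the project files (`train.py`, `model.py`, `generate.py`, `app.py`, `smoll2.yaml`) and the training text `input.txt` to already be present in the working directory.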

In [1]:
# Install required packages
!pip install -q torch>=2.0.0 transformers>=4.30.0 pytorch-lightning>=2.0.0 tensorboard>=2.12.0 pyyaml>=6.0 tqdm>=4.65.0


In [2]:
# Verify T4 GPU is available
!nvidia-smi
import torch
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU type:", torch.cuda.get_device_name(0))

Tue Apr 22 22:55:03 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
# Create output directories
!mkdir -p output/checkpoints
!mkdir -p output/logs

# Run the training with optimized parameters for T4 GPU
# Run the training with 512 sequence length
!python train.py --input_file "input.txt" --output_dir "output" --max_steps 5000 --continue_steps 50 --save_every 500 --num_workers 2 --batch_size 4 --seq_length 512 --val_check_interval 360

Seed set to 42
tokenizer_config.json: 100% 2.28k/2.28k [00:00<00:00, 16.8MB/s]
tokenizer.model: 100% 500k/500k [00:00<00:00, 14.4MB/s]
tokenizer.json: 100% 1.84M/1.84M [00:00<00:00, 6.37MB/s]
special_tokens_map.json: 100% 411/411 [00:00<00:00, 2.16MB/s]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
Using default Llama tokenizer
GPU available. Optimizing memory usage.
Using text data from input.txt
Batch size: 4, Sequence length: 512
Using 2 data loader workers
Detected NVIDIA T4 GP

## Model Evaluation

After training is complete, we can evaluate the model by generating text from different checkpoints to compare performance.

In [None]:
# Compare text generation from different checkpoints to see improvements
import os
import torch
from transformers import AutoTokenizer
from model import SmolLM2LightningModule

# Define test prompts
# Four sentence openers (a story, an explanation, a recipe, a future scenario);
# each one is fed to every checkpoint so the generated continuations can be
# compared side by side.
test_prompts = [
    "Once upon a time in a land far away,",
    "The key benefits of deep learning include",
    "To make a delicious chocolate cake, you need",
    "In the year 2050, technology has evolved to",
]

# Function to generate text from a checkpoint
def generate_from_checkpoint(checkpoint_path, prompt, max_length=100):
    """Generate a sampled continuation of ``prompt`` from a saved checkpoint.

    The tokenizer and the checkpoint are both loaded on every call.

    Args:
        checkpoint_path: Path handed to
            ``SmolLM2LightningModule.load_from_checkpoint``.
        prompt: Text that is tokenized and used as the generation prefix.
        max_length: Forwarded unchanged as ``max_length`` to
            ``model.model.generate``.

    Returns:
        The first generated sequence decoded to text, with special tokens
        skipped.
    """
    # Load tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
    except Exception as exc:
        # Report the fallback instead of switching silently.
        # NOTE(review): the checkpoint was presumably trained with the Llama
        # tokenizer; gpt2 uses a different vocabulary, so output produced
        # after this fallback may be meaningless - confirm against train.py.
        print(
            "Warning: could not load the 'huggyllama/llama-7b' tokenizer "
            f"({exc}); falling back to 'gpt2'."
        )
        tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # Load model from checkpoint and switch to evaluation mode
    model = SmolLM2LightningModule.load_from_checkpoint(checkpoint_path)
    model.eval()

    # Move to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Tokenize prompt and place it on the same device as the model
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    input_ids = input_ids.to(device)

    # Generate text; no_grad avoids building an autograd graph
    with torch.no_grad():
        output_ids = model.model.generate(
            input_ids=input_ids,
            max_length=max_length,
            temperature=0.8,
            top_p=0.9,
            top_k=40,
            do_sample=True,
        )

    # Decode the generated text (first sequence of the returned batch)
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_text

In [None]:
# Compare generation from different checkpoints
# The two final checkpoints are always listed, whether or not they exist on disk
final_checkpoints = [
    "output/checkpoints/final_model.ckpt",  # 5000 steps
    "output/checkpoints/final_continued_model.ckpt"  # 5050 steps
]

# Intermediary checkpoints (every 500 steps) are optional: keep only those found
intermediate_checkpoints = [
    candidate
    for candidate in (
        f"output/checkpoints/smollm2-135-{step:06d}.ckpt"
        for step in range(500, 5000, 500)
    )
    if os.path.exists(candidate)
]

# Intermediary checkpoints in ascending step order, then the final checkpoints
checkpoints = intermediate_checkpoints + final_checkpoints

for prompt in test_prompts:
    print(f"\n{'=' * 50}")
    print(f"Prompt: {prompt}")
    print(f"{'-' * 50}")

    for checkpoint in checkpoints:
        # A failure on one checkpoint is reported and the comparison continues
        try:
            name = checkpoint.rsplit('/', 1)[-1]
            print(f"\nCheckpoint: {name}")
            generated = generate_from_checkpoint(checkpoint, prompt, max_length=150)
            print(f"Generated:\n{generated}\n")
        except Exception as e:
            print(f"Error with checkpoint {checkpoint}: {str(e)}")

    print(f"{'=' * 50}\n")

In [None]:
# Analyze training logs - load and plot loss values
import pandas as pd
import matplotlib.pyplot as plt
import re

# Function to extract losses from training_loss.txt
def extract_losses(log_file):
    """Parse a training log into parallel lists of steps and losses.

    The first two lines of the file are skipped as header and separator.
    Every later line containing
    ``YYYY-MM-DD HH:MM:SS | <step> | <train loss> | <val loss>`` yields one
    entry; lines that do not match are ignored.

    Args:
        log_file: Path of the log file to read.

    Returns:
        A tuple ``(steps, train_losses, val_losses)`` of equal-length lists.
        ``val_losses`` holds ``None`` where the log contains ``nan``.
        If the file cannot be read, an error is printed and whatever was
        parsed so far (possibly three empty lists) is returned.
    """
    steps = []
    train_losses = []
    val_losses = []
    skipped = 0

    # Compiled once instead of on every line
    row_pattern = re.compile(
        r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \| (\d+) \| ([\d\.]+) \| ([\d\.nan]+)'
    )

    try:
        with open(log_file, 'r') as f:
            # Skip header lines; the default keeps an empty or one-line file
            # from raising StopIteration
            next(f, None)  # Header
            next(f, None)  # Separator

            for line in f:
                match = row_pattern.search(line)
                if not match:
                    continue

                # The character classes accept malformed tokens such as
                # "1.2.3"; skip only that row instead of aborting the parse
                try:
                    step = int(match.group(1))
                    train_loss = float(match.group(2))
                    raw_val = match.group(3)
                    val_loss = float(raw_val) if raw_val != 'nan' else None
                except ValueError:
                    skipped += 1
                    continue

                steps.append(step)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
    except Exception as e:
        # Best-effort: report the problem and return what was collected
        print(f"Error parsing log file: {e}")

    if skipped:
        print(f"Skipped {skipped} malformed line(s) in {log_file}")

    return steps, train_losses, val_losses

# Parse the training log into parallel lists
steps, train_losses, val_losses = extract_losses('output/logs/training_loss.txt')

# Draw the training-loss curve through the explicit Figure/Axes interface
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(steps, train_losses, label='Training Loss')
ax.set_xlabel('Training Steps')
ax.set_ylabel('Loss')
ax.set_title('SmolLM2-135M Training Loss')
ax.legend()
ax.grid(True)
plt.show()

# Write step, training loss and validation loss to CSV for further analysis
loss_columns = {
    'Step': steps,
    'Training Loss': train_losses,
    'Validation Loss': val_losses
}
loss_df = pd.DataFrame(loss_columns)
loss_df.to_csv('output/training_loss_analysis.csv', index=False)
print(f"Saved loss values to 'output/training_loss_analysis.csv'")

## Save Model to Drive

After training and evaluation, you can save the model to Google Drive for later use.

In [None]:
# Mount Google Drive to save model checkpoints
from google.colab import drive
drive.mount('/content/drive')

# Create directory for model
!mkdir -p /content/drive/MyDrive/SmolLM2-135M

# Copy final model checkpoints to Drive
!cp output/checkpoints/final_model.ckpt /content/drive/MyDrive/SmolLM2-135M/
!cp output/checkpoints/final_continued_model.ckpt /content/drive/MyDrive/SmolLM2-135M/
!cp smoll2.yaml /content/drive/MyDrive/SmolLM2-135M/

# Copy necessary files for inference
!cp model.py /content/drive/MyDrive/SmolLM2-135M/
!cp generate.py /content/drive/MyDrive/SmolLM2-135M/
!cp app.py /content/drive/MyDrive/SmolLM2-135M/

print("Model and necessary files saved to Google Drive at /content/drive/MyDrive/SmolLM2-135M/")