# SFT Training for Tweet Generation

This notebook implements Supervised Fine-Tuning (SFT) for tweet generation using GPT-2.

## Setup and Installation


In [None]:
# Install required packages
%pip install -q transformers datasets trl wandb accelerate

# Import libraries
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset
import torch
import wandb
import os
import json

print("✅ Packages installed and imported successfully!")


## GPU Setup and Device Detection


In [None]:
# Resolve the compute backend in priority order: CUDA, then Apple MPS, then CPU
if torch.cuda.is_available():
    backend_name = "cuda"
elif torch.backends.mps.is_available():
    backend_name = "mps"
else:
    backend_name = "cpu"

device = torch.device(backend_name)

# Report the choice; only the CUDA branch has extra hardware details to show
if backend_name == "cuda":
    print(f"🚀 Using GPU: {torch.cuda.get_device_name(0)}")
    gpu_properties = torch.cuda.get_device_properties(0)
    print(f"📊 GPU Memory: {gpu_properties.total_memory / 1e9:.1f} GB")
    print(f"🔧 CUDA Version: {torch.version.cuda}")
elif backend_name == "mps":
    print("🍎 Using Apple Silicon GPU (MPS)")
else:
    print("💻 Using CPU (training will be slower)")

print(f"\n🎯 Selected device: {device}")


## Data Setup


In [None]:
# Mount Google Drive (optional - if you want to store data there).
# Requires `from google.colab import drive` before uncommenting.
# drive.mount('/content/drive')

# The real training set and the fallback sample live in DIFFERENT files.
# Writing the 5-row sample to the real dataset's path would make every later
# run report "Found dataset" and silently train on the sample.
dataset_path = "tweet_sft_dataset_10k.jsonl"
sample_dataset_path = "tweet_sft_sample.jsonl"

if os.path.exists(dataset_path):
    print(f"✅ Found dataset: {dataset_path}")
    data_path = dataset_path
else:
    print("📝 Creating sample dataset for testing...")
    # Small instruction/response sample so the rest of the notebook can run
    sample_data = [
        {"instruction": "Write a personal_story tweet about coding", "response": "Spent 2 hours debugging a typo. It was a missing semicolon 😅"},
        {"instruction": "Write a classic tweet about wisdom", "response": "The most dangerous phrase in programming: 'Just a small change'"},
        {"instruction": "Write a funny tweet about technology", "response": "My computer is so slow, it's still processing my thoughts from yesterday"},
        {"instruction": "Write a motivational tweet about learning", "response": "Every expert was once a beginner. Keep coding! 💪"},
        {"instruction": "Write a relatable tweet about work", "response": "Me: I'll just fix this one small bug. Also me: 3 hours later..."}
    ]

    # One JSON object per line (JSONL); explicit encoding keeps the file
    # independent of the platform's default locale
    with open(sample_dataset_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item) + "\n" for item in sample_data)

    data_path = sample_dataset_path
    print(f"✅ Created sample dataset: {sample_dataset_path}")

# Load whichever file was selected above; the "json" loader puts the rows
# under the "train" split
dataset = load_dataset("json", data_files=data_path)
print(f"📊 Dataset loaded: {len(dataset['train'])} examples")


## Model Setup


In [None]:
# Pretrained checkpoint to fine-tune
model_name = "gpt2"
print(f"🤖 Loading model: {model_name}")

# Tokenizer first; reuse EOS as the padding token when none is defined
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the causal LM and place it on the selected device in one expression
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

print(f"✅ Model loaded and moved to {device}")
parameter_count = model.num_parameters()
print(f"📏 Model parameters: {parameter_count:,}")


## Data Preprocessing


In [None]:
def format_dataset(examples):
    """Build one training string per instruction/response pair.

    Args:
        examples: Batch mapping with parallel "instruction" and "response"
            sequences of strings.

    Returns:
        A dict with a single "text" key holding the combined strings, each
        laid out as the instruction, a newline, "Response: ", then the response.
    """
    separator = "\nResponse: "
    formatted = []
    for instruction, response in zip(examples["instruction"], examples["response"]):
        formatted.append(instruction + separator + response)
    return {"text": formatted}

# Replace the instruction/response columns with a single "text" column
print("🔄 Formatting dataset...")
train_split = dataset["train"]
formatted_dataset = train_split.map(
    format_dataset,
    batched=True,
    remove_columns=train_split.column_names,
)

# Quick sanity summary of the result
formatted_columns = formatted_dataset.column_names
print(f"✅ Formatted dataset columns: {formatted_columns}")
first_example = formatted_dataset[0]
print(f"📝 First example: {first_example}")
example_count = len(formatted_dataset)
print(f"📊 Total examples: {example_count}")


## Weights & Biases Setup


In [None]:
# Probe accelerator availability once so every logged field is consistent
has_cuda = torch.cuda.is_available()
has_mps = torch.backends.mps.is_available()

# Human-readable accelerator label for the run config
if has_cuda:
    accelerator_label = torch.cuda.get_device_name(0)
elif has_mps:
    accelerator_label = "Apple Silicon (MPS)"
else:
    accelerator_label = "CPU"

# Hyperparameters and environment details recorded with the run
run_config = {
    "model_name": model_name,
    "dataset_size": len(formatted_dataset),
    "num_epochs": 1,
    "batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 5e-5,
    "warmup_steps": 100,
    "max_length": 512,
    "device": str(device),
    "cuda_available": has_cuda,
    "mps_available": has_mps,
    "gpu_name": accelerator_label,
    "cuda_version": torch.version.cuda if has_cuda else None,
}

# Start the W&B run
wandb.init(
    project="rlhf-learning-sft",
    name="tweet-generation-sft-colab",
    config=run_config,
)

print("✅ W&B initialized successfully!")


## Training Configuration


In [None]:
# Hyperparameters and bookkeeping options for the training run
training_args = TrainingArguments(
    # Optimisation schedule
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    warmup_steps=100,
    # Checkpointing
    output_dir="./sft_results",
    save_strategy="steps",
    save_steps=2000,
    load_best_model_at_end=False,
    # Logging / experiment tracking
    report_to="wandb",
    run_name="sft_tweet_generation",
    logging_dir="./logs",
    logging_steps=10,
    # Colab-oriented runtime settings
    dataloader_pin_memory=False,  # skip pinned host memory to reduce memory usage
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present
)

print("✅ Training arguments configured!")
effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
print(f"📊 Effective batch size: {effective_batch_size}")
print(f"💾 Output directory: {training_args.output_dir}")


## Trainer Setup


In [None]:
# SFT Trainer: fine-tunes `model` on the "text" column of the formatted dataset
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
)

print("✅ SFT Trainer configured!")
print(f"🎯 Training dataset size: {len(formatted_dataset)}")

# Estimate the number of optimizer steps (single-device view).
# Ceiling division with a floor of 1: a dataset smaller than one effective
# batch (e.g. the 5-row sample) still yields an optimizer step, whereas plain
# floor division would report 0. The exact count can differ slightly depending
# on the installed library version.
effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
steps_per_epoch = max(1, -(-len(formatted_dataset) // effective_batch_size))
print(f"🔄 Total training steps: {steps_per_epoch * training_args.num_train_epochs}")


## Training


In [None]:
# Start training
print("🚀 Starting SFT training...")
print("📊 Check your W&B dashboard for real-time metrics!")

trainer.train()

# Persist the final weights explicitly. With save_steps=2000 and roughly 625
# optimizer steps for one epoch over 10k examples, step-based checkpointing
# alone may never write anything, so the messages below would otherwise point
# at an empty folder.
trainer.save_model(training_args.output_dir)
# Store the tokenizer next to the weights so the folder can be reloaded on its own
tokenizer.save_pretrained(training_args.output_dir)

print("✅ Training completed successfully!")
print("📁 Check the sft_results folder for saved models")


## Cleanup and Finalization


In [None]:
# Close the W&B run so buffered metrics are flushed
wandb.finish()

# Closing status messages
for closing_message in (
    "🎉 Training session completed!",
    "📊 Check your W&B dashboard for detailed results",
    "💾 Model checkpoints saved in ./sft_results/",
):
    print(closing_message)

# Short recap of what was trained and where
print("\n📈 Final model info:")
final_model_info = {
    "Device": device,
    "Parameters": f"{model.num_parameters():,}",
    "Training examples": len(formatted_dataset),
}
for label, value in final_model_info.items():
    print(f"   {label}: {value}")


## Test the Trained Model (Optional)

Test your fine-tuned model with some sample prompts!


In [None]:
# Test the trained model
def generate_tweet(prompt, max_length=100):
    """Generate a tweet from the fine-tuned model for one instruction prompt.

    The prompt is followed by a newline and "Response:", matching the layout
    of the training text, so the model continues with the response part.

    Args:
        prompt: Instruction text, e.g. "Write a funny tweet about programming".
        max_length: Forwarded to ``model.generate`` as ``max_length``; in the
            Hugging Face API this limit includes the prompt tokens.

    Returns:
        The decoded output sequence as a string with special tokens removed.
        It contains the prompt followed by the sampled continuation.
    """
    input_text = f"{prompt}\nResponse:"
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    # After trainer.train() the model may still be in training mode, which
    # keeps dropout active and adds noise to sampling. Switch to eval mode for
    # generation and restore the previous mode afterwards so calling this
    # helper has no lasting side effect on the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                # Pad with EOS so generate() has an explicit padding id
                pad_token_id=tokenizer.eos_token_id
            )
    finally:
        if was_training:
            model.train()

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Instructions used to spot-check the fine-tuned model
test_prompts = [
    "Write a funny tweet about programming",
    "Write a motivational tweet about learning",
    "Write a personal story tweet about coding",
]

header_rule = "=" * 50
item_rule = "-" * 30

print("🧪 Testing the trained model...")
print(header_rule)

# Generate one sample per prompt and show it beside the prompt
for prompt in test_prompts:
    print(f"\n📝 Prompt: {prompt}")
    response = generate_tweet(prompt)
    print(f"🤖 Generated: {response}")
    print(item_rule)
