In [2]:
# environment setup

import torch
import os
from google.colab import drive

# Check GPU
print("🖥️  GPU Check:")
print(f"   CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"   GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"   GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Mount Google Drive
print("\n📁 Mounting Google Drive...")
drive.mount('/content/drive', force_remount=True)

# Create working directory
DRIVE_PATH = "/content/drive/MyDrive/lyrics_model_training"
os.makedirs(DRIVE_PATH, exist_ok=True)
print(f"✅ Working directory: {DRIVE_PATH}")

🖥️  GPU Check:
   CUDA Available: True
   GPU Name: NVIDIA L4
   GPU Memory: 23.80 GB

📁 Mounting Google Drive...
Mounted at /content/drive
✅ Working directory: /content/drive/MyDrive/lyrics_model_training


In [None]:
# install packages restart runtime and run cell 3

print("📦 Installing packages...")
!pip install -q datasets transformers

print("✅ Packages installed!")
print("⚠️  Go to: Runtime → Restart runtime")
print("   Then continue from Cell 3")

📦 Installing packages...
✅ Packages installed!
⚠️  Go to: Runtime → Restart runtime
   Then continue from Cell 3


In [3]:
# verify after restart

import transformers
import datasets
import torch

print("✅ Verification:")
print(f"   Transformers: {transformers.__version__}")
print(f"   Datasets: {datasets.__version__}")
print(f"   Torch: {torch.__version__}")
print(f"   CUDA: {torch.cuda.is_available()}")

from transformers import AutoModelForCausalLM, AutoTokenizer
print("\n✅ Ready to train!")

✅ Verification:
   Transformers: 4.57.3
   Datasets: 4.0.0
   Torch: 2.9.0+cu126
   CUDA: True

✅ Ready to train!


In [None]:
# upload training data

from google.colab import files
import shutil
import json
import os

DRIVE_PATH = "/content/drive/MyDrive/lyrics_model_training"

print("📤 Upload your training data file")
uploaded = files.upload()

for filename in uploaded.keys():
    dest = os.path.join(DRIVE_PATH, filename)
    shutil.move(filename, dest)
    TRAINING_FILE = dest
    print(f"✅ Saved: {dest}")

# Check file
with open(TRAINING_FILE, 'r') as f:
    sample_count = sum(1 for _ in f)
    print(f"   Total samples: {sample_count}")

📤 Upload your training data file


Saving training_data_augmented.jsonl to training_data_augmented.jsonl
✅ Saved: /content/drive/MyDrive/lyrics_model_training/training_data_augmented.jsonl
   Total samples: 110954


In [4]:
# training configuration

import os

DRIVE_PATH = "/content/drive/MyDrive/lyrics_model_training"
TRAINING_FILE = "/content/drive/MyDrive/lyrics_model_training/training/training_data_augmented.jsonl"  # UPDATE THIS

CONFIG = {
    "model_name": "gpt2",
    "epochs": 3,
    "batch_size": 4,
    "learning_rate": 5e-5,
    "max_length": 512,
    "validation_split": 0.05,
    "save_every": 500,  # Save every N steps
    "output_dir": os.path.join(DRIVE_PATH, "checkpoints"),
    "final_model_dir": os.path.join(DRIVE_PATH, "final_model"),
}

os.makedirs(CONFIG["output_dir"], exist_ok=True)
os.makedirs(CONFIG["final_model_dir"], exist_ok=True)

print("⚙️ Configuration:")
for k, v in CONFIG.items():
    print(f"   {k}: {v}")

⚙️ Configuration:
   model_name: gpt2
   epochs: 3
   batch_size: 4
   learning_rate: 5e-05
   max_length: 512
   validation_split: 0.05
   save_every: 500
   output_dir: /content/drive/MyDrive/lyrics_model_training/checkpoints
   final_model_dir: /content/drive/MyDrive/lyrics_model_training/final_model


In [5]:
# prepare and load dataset

from datasets import Dataset
import json

print("📊 Loading dataset...")

# Load data
with open(TRAINING_FILE, 'r') as f:
    data = [json.loads(line) for line in f]

dataset = Dataset.from_list(data)
print(f"   ✅ Loaded {len(dataset)} samples")

# Split train/validation
split = dataset.train_test_split(test_size=CONFIG['validation_split'], seed=42)
train_data = split['train']
val_data = split['test']

print(f"   Training: {len(train_data)}")
print(f"   Validation: {len(val_data)}")

📊 Loading dataset...
   ✅ Loaded 110954 samples
   Training: 105406
   Validation: 5548


In [None]:
# load model

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print(f"🤖 Loading: {CONFIG['model_name']}...")

tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(CONFIG['model_name'])

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

print(f"✅ Model on {device}")
print(f"   Parameters: {sum(p.numel() for p in model.parameters()):,}")

🤖 Loading: gpt2...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ Model on cuda
   Parameters: 124,439,808


In [None]:
# tokenize data

print("🔤 Tokenizing...")

def tokenize_batch(examples):
    texts = []
    for inst, out in zip(examples['instruction'], examples['output']):
        text = f"### Instruction:\n{inst}\n\n### Response:\n{out}{tokenizer.eos_token}"
        texts.append(text)
    return tokenizer(texts, truncation=True, max_length=CONFIG['max_length'], padding=False)

train_dataset = train_data.map(tokenize_batch, batched=True, remove_columns=train_data.column_names)
val_dataset = val_data.map(tokenize_batch, batched=True, remove_columns=val_data.column_names)

print(f"✅ Tokenized!")
print(f"   Train: {len(train_dataset)}")
print(f"   Val: {len(val_dataset)}")


🔤 Tokenizing...


Map:   0%|          | 0/105406 [00:00<?, ? examples/s]

Map:   0%|          | 0/5548 [00:00<?, ? examples/s]

✅ Tokenized!
   Train: 105406
   Val: 5548


In [6]:
# rerun for epoch 3

# Load from checkpoint
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint_path = os.path.join(CONFIG["output_dir"], "checkpoint-epoch-2")

print(f"📥 Loading from: {checkpoint_path}")
model = AutoModelForCausalLM.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.pad_token = tokenizer.eos_token

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

print(f"✅ Model loaded on {device}")

📥 Loading from: /content/drive/MyDrive/lyrics_model_training/checkpoints/checkpoint-epoch-2
✅ Model loaded on cuda


In [7]:
# rerun for epoch 3

# Tokenize
def tokenize_batch(examples):
    texts = []
    for inst, out in zip(examples['instruction'], examples['output']):
        text = f"### Instruction:\n{inst}\n\n### Response:\n{out}{tokenizer.eos_token}"
        texts.append(text)
    return tokenizer(texts, truncation=True, max_length=512, padding=False)

train_dataset = train_data.map(tokenize_batch, batched=True, remove_columns=train_data.column_names)
val_dataset = val_data.map(tokenize_batch, batched=True, remove_columns=val_data.column_names)

print(f"✅ Tokenized")

Map:   0%|          | 0/105406 [00:00<?, ? examples/s]

Map:   0%|          | 0/5548 [00:00<?, ? examples/s]

✅ Tokenized


In [8]:
# training setup

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

# Collate function for batching
def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)

    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    labels = input_ids.clone()
    labels[labels == tokenizer.pad_token_id] = -100

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], collate_fn=collate_fn)

# Optimizer
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'])

print("✅ Training setup complete!")
print(f"   Train batches: {len(train_loader)}")
print(f"   Val batches: {len(val_loader)}")

✅ Training setup complete!
   Train batches: 26352
   Val batches: 1387


In [None]:
# training function

from tqdm.auto import tqdm
import os

def train_epoch(model, loader, optimizer, device, epoch):
    model.train()
    total_loss = 0
    progress = tqdm(loader, desc=f"Epoch {epoch}")

    for batch in progress:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress.set_postfix({'loss': f'{loss.item():.4f}'})

    return total_loss / len(loader)

def validate(model, loader, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()

    return total_loss / len(loader)

print("✅ Training functions ready!")

✅ Training functions ready!


In [None]:
# start training

print("=" * 70)
print("🚀 STARTING TRAINING")
print("=" * 70)

print(f"\n📊 Setup:")
print(f"   Model: {CONFIG['model_name']}")
print(f"   Samples: {len(train_dataset)}")
print(f"   Epochs: {CONFIG['epochs']}")
print(f"   Device: {device}")

print("\n" + "=" * 70)
input("Press Enter to start...")
print("=" * 70)

# Training loop
best_val_loss = float('inf')
global_step = 0

for epoch in range(1, CONFIG['epochs'] + 1):
    print(f"\n📊 Epoch {epoch}/{CONFIG['epochs']}")

    # Train
    train_loss = train_epoch(model, train_loader, optimizer, device, epoch)
    print(f"   Train Loss: {train_loss:.4f}")

    # Validate
    val_loss = validate(model, val_loader, device)
    print(f"   Val Loss: {val_loss:.4f}")

    # Save checkpoint
    checkpoint_dir = os.path.join(CONFIG['output_dir'], f"checkpoint-epoch-{epoch}")
    os.makedirs(checkpoint_dir, exist_ok=True)
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)
    print(f"   💾 Saved: {checkpoint_dir}")

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        model.save_pretrained(CONFIG['final_model_dir'])
        tokenizer.save_pretrained(CONFIG['final_model_dir'])
        print(f"   ⭐ New best! Saved to: {CONFIG['final_model_dir']}")

print("\n" + "=" * 70)
print("✅ TRAINING COMPLETE!")
print("=" * 70)


🚀 STARTING TRAINING

📊 Setup:
   Model: gpt2
   Samples: 105406
   Epochs: 3
   Device: cuda


📊 Epoch 1/3


Epoch 1:   0%|          | 0/26352 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


   Train Loss: 1.9243


Validating:   0%|          | 0/1387 [00:00<?, ?it/s]

   Val Loss: 1.7944
   💾 Saved: /content/drive/MyDrive/lyrics_model_training/checkpoints/checkpoint-epoch-1
   ⭐ New best! Saved to: /content/drive/MyDrive/lyrics_model_training/final_model

📊 Epoch 2/3


Epoch 2:   0%|          | 0/26352 [00:00<?, ?it/s]

   Train Loss: 1.7902


Validating:   0%|          | 0/1387 [00:00<?, ?it/s]

   Val Loss: 1.7295
   💾 Saved: /content/drive/MyDrive/lyrics_model_training/checkpoints/checkpoint-epoch-2
   ⭐ New best! Saved to: /content/drive/MyDrive/lyrics_model_training/final_model

📊 Epoch 3/3


Epoch 3:   0%|          | 0/26352 [00:00<?, ?it/s]

In [10]:
# resuming for epoch 3

# RESUME TRAINING
from tqdm.auto import tqdm

print("🚀 RESUMING EPOCH 3")

for epoch in [3]:  # Just epoch 3
    print(f"\n📊 Epoch {epoch}/3")

    # Train
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"   Train Loss: {total_loss/len(train_loader):.4f}")

    # Validate
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device)
            )
            val_loss += outputs.loss.item()

    print(f"   Val Loss: {val_loss/len(val_loader):.4f}")

    # Save
    checkpoint_dir = os.path.join(CONFIG['output_dir'], f"checkpoint-epoch-{epoch}")
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)

    model.save_pretrained(CONFIG['final_model_dir'])
    tokenizer.save_pretrained(CONFIG['final_model_dir'])
    print(f"   💾 Saved!")

print("\n✅ TRAINING COMPLETE!")

🚀 RESUMING EPOCH 3

📊 Epoch 3/3


Training:   0%|          | 0/26352 [00:00<?, ?it/s]

   Train Loss: 1.6461


Validating:   0%|          | 0/1387 [00:00<?, ?it/s]

   Val Loss: 1.6441
   💾 Saved!

✅ TRAINING COMPLETE!


In [11]:
# test generation

print("🧪 Testing model...")

model.eval()

test_prompts = [
    "Write lyrics for a pop song about summer love",
    "Write lyrics for a rock song about freedom",
    "Write lyrics for a sad ballad about heartbreak"
]

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n{'='*70}")
    print(f"Test {i}: {prompt}")
    print('='*70)

    text = f"### Instruction:\n{prompt}\n\n### Response:\n"
    inputs = tokenizer(text, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_length=300,
            temperature=0.8,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "### Response:" in result:
        lyrics = result.split("### Response:")[1].strip()
    else:
        lyrics = result

    print(lyrics[:500])
    print('-'*70)

print("\n✅ Testing complete!")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


🧪 Testing model...

Test 1: Write lyrics for a pop song about summer love
In the summertime 
I'd give you everything 
I'd give you anything 
You need 
 
I'd give you every kiss 
That I'd ever hold 
You'd give me everything 
You'd give me anything 
 
Just as long as you 
Just as long as you 
Just as long as you 
 
And you 
You'd give me everything 
And you'd give me everything 
I would give you everything 
You need 
 
I'd give you everything 
I would give you anything 
You need 
 
I'd give you everything 
I would give you everything 
I would g
----------------------------------------------------------------------

Test 2: Write lyrics for a rock song about freedom
They say that freedom is just a word 
But they don't understand 
It's not a law that says 
You gotta live by the sword 
Or be cut down by a knife 
 
It's a hard word to say 
But you know that it's true 
We all have to give a little more 
If we just want it to be 
We'll give it everything 
 
We gotta make a stand 
'Cause we're 

In [12]:
# download model

from google.colab import files
import shutil

print("📦 Creating zip...")

zip_path = "/content/lyrics_model_final.zip"
shutil.make_archive(
    zip_path.replace('.zip', ''),
    'zip',
    CONFIG["final_model_dir"]
)

size = os.path.getsize(zip_path) / (1024**2)
print(f"✅ Zip created: {size:.2f} MB")

download = input("\nDownload now? [y/n]: ").strip().lower()
if download == 'y':
    files.download(zip_path)
    print("✅ Download started!")

📦 Creating zip...
✅ Zip created: 442.56 MB

Download now? [y/n]: y


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Download started!


In [13]:
# summary

print("=" * 70)
print("🎉 TRAINING COMPLETE!")
print("=" * 70)

print(f"\n📊 Summary:")
print(f"   Model: {CONFIG['model_name']}")
print(f"   Samples: {len(train_dataset)}")
print(f"   Epochs: {CONFIG['epochs']}")
print(f"   Final model: {CONFIG['final_model_dir']}")

print("\n🚀 Next Steps:")
print("   1. Download model (Cell 13)")
print("   2. Upload to HuggingFace")
print("   3. Deploy your app!")

print("\n" + "=" * 70)
print("🎵 Happy generating! ✨")
print("=" * 70)

🎉 TRAINING COMPLETE!

📊 Summary:
   Model: gpt2
   Samples: 105406
   Epochs: 3
   Final model: /content/drive/MyDrive/lyrics_model_training/final_model

🚀 Next Steps:
   1. Download model (Cell 13)
   2. Upload to HuggingFace
   3. Deploy your app!

🎵 Happy generating! ✨
