# Sports Domain LLM - SageMaker Training

This notebook guides you through training the Sports Domain LLM on AWS SageMaker.

## Prerequisites
- AWS account with SageMaker access
- S3 bucket for data storage
- SageMaker execution role

## 1. Setup and Configuration

In [None]:
# Install required packages
!pip install sagemaker boto3 tokenizers torch --quiet

In [None]:
import sagemaker
import boto3
from sagemaker.pytorch import PyTorch
from sagemaker.inputs import TrainingInput

# Resolve the SageMaker session, then the region it targets and the execution
# role; both values are echoed so the reader can confirm the environment.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()

print(f"Region: {region}", f"Role: {role}", sep="\n")

In [None]:
# Configuration: everything a reader might want to tune lives in this cell.

# --- Storage ---
S3_PREFIX = "sports-llm"
S3_BUCKET = sagemaker_session.default_bucket()  # Or use your own bucket

# --- Model ---
MODEL_SIZE = "small"  # small, medium, or large
MAX_SEQ_LENGTH = 2048

# --- Training infrastructure ---
INSTANCE_COUNT = 1
INSTANCE_TYPE = "ml.g5.2xlarge"  # Single A10G GPU

print(f"S3 Bucket: {S3_BUCKET}", f"S3 Prefix: {S3_PREFIX}", sep="\n")

## 2. Prepare and Upload Training Data

In [None]:
# Build a tiny sample corpus (for testing): one sentence per line, with a
# trailing newline after the last sentence.
sample_sentences = (
    "Lionel Messi led Argentina to victory in the 2022 FIFA World Cup.",
    "LeBron James has established himself as one of the greatest basketball players in NBA history.",
    "The New York Yankees hold the record for the most World Series championships with 27 titles.",
    "Tom Brady retired after winning seven Super Bowl championships.",
    "Manchester City won the UEFA Champions League for the first time in 2023.",
    "Novak Djokovic has won 24 Grand Slam singles titles.",
    "The Golden State Warriors revolutionized basketball with their three-point shooting philosophy.",
    "Michael Jordan is widely considered the greatest basketball player of all time.",
)
sample_data = "\n".join(sample_sentences) + "\n"

# Persist the corpus locally; later cells read it from this path.
with open('/tmp/sports_data.txt', 'w') as f:
    f.write(sample_data)

print("Sample data created!")

In [None]:
# Push the local sample corpus to the training prefix in S3.
s3_client = boto3.client('s3')

# The object key is built once and reused for both the upload and the URI.
train_object_key = f"{S3_PREFIX}/data/train/sports_data.txt"
train_s3_path = f"s3://{S3_BUCKET}/{train_object_key}"

s3_client.upload_file('/tmp/sports_data.txt', S3_BUCKET, train_object_key)
print(f"Training data uploaded to: {train_s3_path}")

In [None]:
# Train and upload tokenizer
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
import json
import tarfile
import os

# Tokenizer settings, defined once so the trainer and the saved config cannot
# drift apart.
VOCAB_SIZE = 8000
MIN_FREQUENCY = 1
SPECIAL_TOKENS = ["<pad>", "<s>", "</s>", "<unk>", "<mask>"]

# Create a byte-level BPE tokenizer
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()

# Train tokenizer on the sample corpus
trainer = trainers.BpeTrainer(
    vocab_size=VOCAB_SIZE,
    min_frequency=MIN_FREQUENCY,
    special_tokens=SPECIAL_TOKENS,
)
tokenizer.train(['/tmp/sports_data.txt'], trainer)

# Save tokenizer
os.makedirs('/tmp/tokenizer', exist_ok=True)
tokenizer.save('/tmp/tokenizer/tokenizer.json')

# Save config. The special-token ids are read back from the trained tokenizer
# instead of being hard-coded, so they stay correct if SPECIAL_TOKENS changes.
# NOTE(review): "vocab_size" records the requested target; the vocabulary that
# is actually learned (printed below) can differ — confirm which value the
# training script expects.
config = {
    "vocab_size": VOCAB_SIZE,
    "min_frequency": MIN_FREQUENCY,
    "lowercase": False,
    "pad_token_id": tokenizer.token_to_id("<pad>"),
    "bos_token_id": tokenizer.token_to_id("<s>"),
    "eos_token_id": tokenizer.token_to_id("</s>"),
    "unk_token_id": tokenizer.token_to_id("<unk>"),
}
with open('/tmp/tokenizer/config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f)

print(f"Tokenizer vocab size: {tokenizer.get_vocab_size()}")

In [None]:
# Bundle the tokenizer files into a gzip tarball (stored at the archive root
# under their bare file names) and upload the bundle to S3.
with tarfile.open('/tmp/tokenizer.tar.gz', 'w:gz') as archive:
    for member_name in ('tokenizer.json', 'config.json'):
        archive.add(f'/tmp/tokenizer/{member_name}', arcname=member_name)

tokenizer_object_key = f"{S3_PREFIX}/tokenizer/tokenizer.tar.gz"
tokenizer_s3_path = f"s3://{S3_BUCKET}/{tokenizer_object_key}"
s3_client.upload_file('/tmp/tokenizer.tar.gz', S3_BUCKET, tokenizer_object_key)
print(f"Tokenizer uploaded to: {tokenizer_s3_path}")

## 3. Upload Source Code

In [None]:
# Note: nothing is uploaded by this cell. In a real setup the src/ directory
# has to reach the training container, typically via the estimator's
# source_dir parameter or by uploading it separately.

# Show the directory layout this notebook expects
import os

project_layout = """
build-fresh-llm/
├── sagemaker/
│   └── train.py       # Entry point
├── src/
│   ├── models/
│   ├── tokenizer/
│   ├── data/
│   └── training/
└── configs/
"""

print("Expected project structure:")
print(project_layout)

## 4. Configure and Launch Training Job

In [None]:
# Hyperparameters, grouped by concern and merged in a fixed order.
model_settings = {
    "model-size": MODEL_SIZE,
    "max-seq-length": MAX_SEQ_LENGTH,
}
optimization_settings = {
    "epochs": 1,
    "max-steps": 100,  # Quick test
    "batch-size": 4,
    "gradient-accumulation-steps": 4,
    "learning-rate": 3e-4,
    "warmup-steps": 10,
}
reporting_settings = {
    "save-steps": 50,
    "logging-steps": 5,
}
flag_settings = {
    "use-amp": "",  # Flag to enable AMP
}

hyperparameters = {
    **model_settings,
    **optimization_settings,
    **reporting_settings,
    **flag_settings,
}

print("Hyperparameters:")
print("\n".join(f"  {name}: {value}" for name, value in hyperparameters.items()))

In [None]:
# Create PyTorch Estimator. The constructor arguments are collected in a dict
# first so each group can be read (and tweaked) on its own.
estimator_kwargs = {
    # Code to run
    "entry_point": "train.py",
    "source_dir": "../sagemaker",  # Path to sagemaker directory
    "dependencies": ["../src", "../configs"],  # Include source code
    # Who runs it and where
    "role": role,
    "sagemaker_session": sagemaker_session,
    "instance_type": INSTANCE_TYPE,
    "instance_count": INSTANCE_COUNT,
    "volume_size": 100,  # GB
    "max_run": 3600 * 4,  # 4 hours max
    # Container runtime
    "framework_version": "2.1.0",
    "py_version": "py310",
    "environment": {
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
    },
    # Job inputs and outputs
    "hyperparameters": hyperparameters,
    "output_path": f"s3://{S3_BUCKET}/{S3_PREFIX}/output",
}
estimator = PyTorch(**estimator_kwargs)

print(f"Estimator created with instance type: {INSTANCE_TYPE}")

In [None]:
# Define input channels: the training corpus and the tokenizer bundle, each
# pointing at the S3 prefix the earlier cells uploaded to.
inputs = {
    "train": TrainingInput(
        s3_data=f"s3://{S3_BUCKET}/{S3_PREFIX}/data/train",
        content_type="text/plain",
    ),
    "tokenizer": TrainingInput(
        s3_data=f"s3://{S3_BUCKET}/{S3_PREFIX}/tokenizer",
        content_type="application/x-tar",
    ),
}

print("Input channels:")
for name, channel in inputs.items():
    # TrainingInput keeps its settings in the `config` dict and has no
    # `s3_data` attribute, so the URI is read back from the config.
    channel_uri = channel.config["DataSource"]["S3DataSource"]["S3Uri"]
    print(f"  {name}: {channel_uri}")

In [None]:
# Launch training job! A short summary is printed first, followed by a blank
# line, before the call that starts the job.
print(
    "Launching training job...",
    f"Instance: {INSTANCE_TYPE} x {INSTANCE_COUNT}",
    f"Model size: {MODEL_SIZE}",
    "",
    sep="\n",
)

# Set wait=False to run asynchronously
estimator.fit(inputs, wait=True)

## 5. Monitor Training

In [None]:
# Look up the name of the most recent training job and the location of its
# model artifacts; later cells use both values.
latest_job = estimator.latest_training_job
training_job_name = latest_job.name
model_data = estimator.model_data

print(f"Training job name: {training_job_name}")
print(f"Model artifacts: {model_data}")

In [None]:
# Print a SageMaker console link for this training job.
# The CloudWatch log groups can also be browsed in the AWS Console:
# https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FTrainingJobs

job_console_url = f"https://console.aws.amazon.com/sagemaker/home?region={region}#/jobs/{training_job_name}"
print("View logs at:", job_console_url, sep="\n")

## 6. Download Trained Model

In [None]:
# Download model artifacts
import os
import tarfile
from urllib.parse import urlparse

local_model_path = "./outputs/sagemaker_model"
os.makedirs(local_model_path, exist_ok=True)

# Download from S3 with boto3 instead of shelling out to the AWS CLI: a failed
# download now raises here rather than being ignored, no AWS CLI install is
# needed, and paths containing spaces are handled correctly.
model_uri = urlparse(model_data)  # s3://<bucket>/<key>
archive_path = os.path.join(local_model_path, "model.tar.gz")
boto3.client("s3").download_file(
    model_uri.netloc,            # bucket
    model_uri.path.lstrip("/"),  # object key
    archive_path,
)

# Unpack next to the archive. The archive is read from this notebook's own
# training job output; do not reuse this on archives from untrusted sources.
with tarfile.open(archive_path, "r:gz") as archive:
    archive.extractall(path=local_model_path)

print(f"Model downloaded to: {local_model_path}")

## 7. Test the Trained Model

In [None]:
import torch
import sys
sys.path.insert(0, '..')

from src.models.transformer import SportsLLM
from src.tokenizer.tokenizer import SportsTokenizer

# Load model.
# map_location="cpu" lets checkpoints written on the GPU training instance be
# loaded on a host without CUDA (such as a CPU-only notebook instance).
# NOTE(review): torch.load unpickles the file, so only load artifacts produced
# by your own training job. Recent PyTorch releases default to
# weights_only=True; if config.pt holds a custom config object, loading it may
# need that class allow-listed — confirm against how train.py saves it.
config = torch.load(f"{local_model_path}/config.pt", map_location="cpu")
model = SportsLLM(config)
model.load_state_dict(torch.load(f"{local_model_path}/model.pt", map_location="cpu"))
model.eval()  # inference mode

# Load tokenizer
tokenizer = SportsTokenizer.load(f"{local_model_path}/tokenizer")

print("Model loaded successfully!")

In [None]:
# Generate text: encode the prompt, sample a continuation from the model, and
# decode the result back to a string.
prompt = "The NBA"
prompt_token_ids = tokenizer.encode(prompt)
input_ids = torch.tensor([prompt_token_ids])  # wrapped in a list: batch of one

sampling_options = {
    "max_new_tokens": 50,
    "temperature": 0.8,
    "do_sample": True,
}

# Gradients are not needed for generation
with torch.no_grad():
    generated = model.generate(input_ids, **sampling_options)

first_sequence_ids = generated[0].tolist()
generated_text = tokenizer.decode(first_sequence_ids)

print(f"Prompt: {prompt}", f"Generated: {generated_text}", sep="\n")

## 8. Cost Estimation

In [None]:
# Approximate costs per hour (USD, varies by region)
COSTS = {
    "ml.g5.xlarge": 1.41,
    "ml.g5.2xlarge": 1.69,
    "ml.g5.4xlarge": 2.27,
    "ml.g5.12xlarge": 7.09,
    "ml.p4d.24xlarge": 37.69,
}

hours = 1  # Estimated training time
hourly_rate = COSTS.get(INSTANCE_TYPE)  # None when the instance type is not listed

print(f"Estimated cost for {hours} hour(s):")
print(f"  Instance: {INSTANCE_TYPE} x {INSTANCE_COUNT}")
if hourly_rate is None:
    # An unlisted instance type used to be priced at 0 and reported as a
    # $0.00 estimate; say explicitly that the price is unknown instead.
    cost = None
    print(f"  Cost: unknown (no hourly rate listed for {INSTANCE_TYPE}; add it to COSTS)")
else:
    cost = hourly_rate * INSTANCE_COUNT * hours
    print(f"  Cost: ${cost:.2f} USD")
    # 0.35 = fraction of the on-demand price left after the ~65% savings
    print(f"  With spot instances (~65% savings): ${cost * 0.35:.2f} USD")