In [None]:
# Install required packages
!pip install --upgrade datasets transformers
!pip install -U datasets==3.0.1 transformers==4.45.2
!pip install -q transformers sentence-transformers datasets

# AthenAI v1 - Workout Generation Model Training

This notebook documents the training process for the AthenAI workout generation model using T5 and LoRA adaptation.

## 1. Setup and Memory Management
First, we'll set up memory management and GPU monitoring utilities.

In [None]:
# Clean up memory periodically
import gc
import torch

def cleanup_memory() -> None:
    """Run Python garbage collection, then ask PyTorch to empty its CUDA cache.

    The collection runs first so that the cache is emptied only after
    unreachable Python objects have been collected.
    """
    gc.collect()
    torch.cuda.empty_cache()


# Call this function between major steps
cleanup_memory()

# Monitor GPU memory
def print_gpu_utilization() -> None:
    """Print the allocated and reserved CUDA memory in GB.

    Prints nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_gb = 1024**3
    allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
    print(f"GPU memory allocated: {allocated_gb:.2f} GB")
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
    print(f"GPU memory cached: {reserved_gb:.2f} GB")


print_gpu_utilization()

## 2. Package Installation and Imports
Install required packages and import necessary libraries.

In [None]:
# Install required packages
!pip install transformers datasets torch accelerate peft bitsandbytes kaggle requests tqdm
!pip install --upgrade huggingface_hub

import os
import json
import pandas as pd
import requests
import zipfile
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
import torch
from tqdm import tqdm

## 3. Data Collection
Download exercise data from GitHub and optionally from Kaggle.

In [None]:
# Download GitHub exercise database
def download_github_exercises():
    """Download the free-exercise-db archive from GitHub and load its exercises.

    Side effects:
        Writes ``exercise_db.zip`` to the working directory and extracts it
        into ``./`` (creating ``./free-exercise-db-main``).

    Returns:
        list: one parsed JSON object per ``*.json`` file found directly in
        ``./free-exercise-db-main/exercises``, in sorted file-name order.

    Raises:
        requests.RequestException: on connection failure, timeout or a
            non-success HTTP status.
        zipfile.BadZipFile: if the downloaded payload is not a zip archive.
    """
    url = "https://github.com/yuhonas/free-exercise-db/archive/main.zip"
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail here with a clear HTTP error instead of writing an error page to disk
    # and failing later while opening it as a zip.
    response.raise_for_status()

    with open("exercise_db.zip", "wb") as f:
        f.write(response.content)

    with zipfile.ZipFile("exercise_db.zip", 'r') as zip_ref:
        zip_ref.extractall("./")

    # Load exercises
    exercises_path = "./free-exercise-db-main/exercises"
    exercises = []

    # os.listdir order is arbitrary; sorting makes the exercise order (and so the
    # "first N exercises" slices taken downstream) reproducible between runs.
    for file_name in sorted(os.listdir(exercises_path)):
        if not file_name.endswith('.json'):
            continue
        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        with open(os.path.join(exercises_path, file_name), 'r', encoding='utf-8') as f:
            exercises.append(json.load(f))

    return exercises

github_exercises = download_github_exercises()
print(f"Loaded {len(github_exercises)} exercises from GitHub")

# Setup Kaggle API and download dataset
from google.colab import files
import os

def setup_kaggle_api():
    """Ask for a kaggle.json upload and install it where the Kaggle CLI looks.

    Returns:
        bool: True when a file named exactly ``kaggle.json`` was uploaded and
        moved to ``/root/.kaggle/kaggle.json`` with mode 0o600; False when no
        uploaded file has that name.
    """
    credentials_name = 'kaggle.json'
    credentials_target = '/root/.kaggle/kaggle.json'

    print("Please upload your kaggle.json file:")
    uploaded = files.upload()

    os.makedirs('/root/.kaggle', exist_ok=True)

    # Guard clause: nothing to install unless the expected file name was uploaded.
    if credentials_name not in uploaded.keys():
        print("❌ kaggle.json not found in uploaded files")
        return False

    os.rename(credentials_name, credentials_target)
    # Owner-only read/write permissions on the credentials file.
    os.chmod(credentials_target, 0o600)
    print("✅ Kaggle API configured successfully!")
    return True

kaggle_configured = setup_kaggle_api()

In [None]:
# Download Kaggle dataset if API is configured
import subprocess
import zipfile

kaggle_exercises = None
if kaggle_configured:
    try:
        # Run the CLI through subprocess and check its exit status: an IPython
        # `!kaggle ...` shell escape never raises, so a failed download could not
        # reach the manual-upload fallback in the except branch below.
        download = subprocess.run(
            ["kaggle", "datasets", "download", "-d", "niharika41298/gym-exercise-data"],
            capture_output=True,
            text=True,
        )
        print(download.stdout, end="")
        if download.returncode != 0:
            raise RuntimeError(
                f"kaggle CLI exited with status {download.returncode}: {download.stderr.strip()}"
            )

        if os.path.exists('gym-exercise-data.zip'):
            # Extract in Python (overwriting existing files, as `unzip -o` did) so a
            # corrupt archive raises instead of being silently ignored.
            with zipfile.ZipFile('gym-exercise-data.zip', 'r') as zip_ref:
                zip_ref.extractall('.')
            print("✅ Dataset downloaded and extracted!")

        # Sorted so the chosen CSV does not depend on directory listing order.
        csv_files = sorted(f for f in os.listdir('.') if f.endswith('.csv'))
        if csv_files:
            csv_file = csv_files[0]
            kaggle_exercises = pd.read_csv(csv_file)
            print(f"✅ Loaded Kaggle dataset: {csv_file} with {len(kaggle_exercises)} exercises")
            print("Dataset columns:", list(kaggle_exercises.columns))
        else:
            print("❌ No CSV file found after extraction")

    except Exception as e:
        # Best-effort fallback: report the failure and let the user upload the CSV.
        print(f"❌ Error downloading Kaggle dataset: {e}")
        print("You can manually upload the CSV file instead")
        print("\nAlternative: Upload CSV file manually:")
        uploaded_csv = files.upload()
        for filename in uploaded_csv.keys():
            if filename.endswith('.csv'):
                kaggle_exercises = pd.read_csv(filename)
                print(f"✅ Manually uploaded CSV loaded: {len(kaggle_exercises)} exercises")
                break

## 4. Data Standardization
Standardize exercise data from both sources into a common format.

In [None]:
def _value_or_default(value, default):
    """Return ``default`` when ``value`` is None or a scalar NaN/NA, else ``value``."""
    if value is None:
        return default
    # Containers are returned as-is; pd.isna on them would be element-wise.
    if isinstance(value, (list, tuple, dict, set)):
        return value
    return default if pd.isna(value) else value


def standardize_exercise_data(github_exercises, kaggle_df=None):
    """
    Standardize both datasets into a common format

    Args:
        github_exercises: iterable of dicts read from the GitHub exercise JSON
            files (keys used: name, level, category, primaryMuscles,
            secondaryMuscles, equipment, instructions, force, mechanic).
        kaggle_df: optional DataFrame; the columns Title, BodyPart, Equipment
            and Desc are read when present. Kaggle rows always get
            difficulty 'intermediate' and type 'strength'.

    Returns:
        list[dict]: GitHub records first, then Kaggle records, each with the keys
        name, difficulty_level, exercise_type, primary_muscles,
        secondary_muscles, equipment, instructions, force_type, mechanic, source.

    Missing values are normalised: ``dict.get(key, default)`` only applies the
    default when the key is absent, so JSON ``null`` and CSV NaN used to leak
    through as ``None`` / ``nan``. They now fall back to the same defaults.
    """
    standardized_exercises = []

    # Process GitHub exercises
    for ex in github_exercises:
        standardized_exercises.append({
            'name': _value_or_default(ex.get('name'), ''),
            'difficulty_level': _value_or_default(ex.get('level'), 'beginner'),
            'exercise_type': _value_or_default(ex.get('category'), 'strength'),
            'primary_muscles': _value_or_default(ex.get('primaryMuscles'), []),
            'secondary_muscles': _value_or_default(ex.get('secondaryMuscles'), []),
            'equipment': _value_or_default(ex.get('equipment'), ''),
            'instructions': ' '.join(_value_or_default(ex.get('instructions'), [])),
            'force_type': _value_or_default(ex.get('force'), ''),
            'mechanic': _value_or_default(ex.get('mechanic'), ''),
            'source': 'github'
        })

    # Process Kaggle exercises (if available)
    if kaggle_df is not None:
        for _, row in kaggle_df.iterrows():
            body_part = _value_or_default(row.get('BodyPart'), None)
            standardized_exercises.append({
                'name': _value_or_default(row.get('Title'), ''),
                'difficulty_level': 'intermediate',
                'exercise_type': 'strength',
                'primary_muscles': [body_part] if body_part is not None else [],
                'secondary_muscles': [],
                'equipment': _value_or_default(row.get('Equipment'), ''),
                'instructions': _value_or_default(row.get('Desc'), ''),
                'force_type': '',
                'mechanic': '',
                'source': 'kaggle'
            })

    return standardized_exercises

# Process the data
# kaggle_exercises is still None when the Kaggle step was skipped or loaded no
# CSV; standardize_exercise_data then uses the GitHub exercises only.
standardized_exercises = standardize_exercise_data(github_exercises, kaggle_exercises)
print(f"Standardized {len(standardized_exercises)} exercises")
# Release memory before the next stage.
cleanup_memory()

## 5. Training Data Preparation
Create training prompts and responses for the model.

In [None]:
def create_training_prompts(exercises):
    """Create training prompts for workout generation

    Builds one (prompt, response) pair per built-in user profile. A profile is
    skipped when fewer than 5 exercises pass ``filter_exercises_for_profile``.

    Args:
        exercises: list of standardized exercise dicts (must provide the keys
            read by the filter/format/response helpers: name, exercise_type,
            difficulty_level).

    Returns:
        list[dict]: items with the keys 'prompt' and 'response'; at most one
        item per profile (four profiles are defined here).
    """
    training_data = []
    # Each profile carries an explicit 'difficulty'. generate_workout_response
    # reads profile.get('difficulty', 'intermediate'); without the key every
    # target said "intermediate", contradicting the beginner/advanced
    # descriptions below.
    user_profiles = [
        {
            'training_phase': 'weight_loss',
            'motivation': 'self_improvement',
            'special_situation': 'none',
            'description': 'Beginner looking to lose weight',
            'difficulty': 'beginner'
        },
        {
            'training_phase': 'muscle_gain',
            'motivation': 'self_improvement',
            'special_situation': 'none',
            'description': 'Intermediate wanting to build muscle',
            'difficulty': 'intermediate'
        },
        {
            'training_phase': 'cardio_improve',
            'motivation': 'wellbeing',
            'special_situation': 'none',
            'description': 'Advanced athlete improving cardio',
            'difficulty': 'advanced'
        },
        {
            'training_phase': 'maintenance',
            'motivation': 'wellbeing',
            'special_situation': 'elderly_population',
            'description': 'Senior maintaining fitness',
            # The description states no level; beginner matches the
            # beginner-only exercise rule used for the elderly profile.
            'difficulty': 'beginner'
        }
    ]

    for profile in user_profiles:
        suitable_exercises = filter_exercises_for_profile(exercises, profile)
        # Too few candidates to fill a workout: skip this profile.
        if len(suitable_exercises) < 5:
            continue

        # format_exercises_for_prompt lists at most 5 entries, so only the first
        # five of this slice appear in the prompt.
        prompt = f"""Generate a workout routine for:
User Profile: {profile['description']}
Training Phase: {profile['training_phase']}
Motivation: {profile['motivation']}
Special Situation: {profile['special_situation']}

Available Exercises:
{format_exercises_for_prompt(suitable_exercises[:10])}

Generate a workout with the following structure:
- Workout Name
- Description
- Difficulty Level
- Estimated Duration
- Target Audience
- Blocks (Warmup, Main, Core, Cooldown)

Workout:"""

        response = generate_workout_response(suitable_exercises, profile)
        training_data.append({
            'prompt': prompt,
            'response': response
        })

    return training_data

def filter_exercises_for_profile(exercises, profile):
    """Filter exercises suitable for user profile

    Rules, in the same precedence as the original if/elif chain:
      * training_phase 'weight_loss'  -> exercise_type 'cardio' or 'strength'
      * training_phase 'muscle_gain'  -> exercise_type 'strength'
      * special_situation 'elderly_population' -> difficulty_level 'beginner'
      * any other profile             -> every exercise is accepted

    Previously a trailing ``else: suitable.append(ex)`` also ran for exercises
    that failed a rule, so every exercise was kept and the filter did nothing.

    Args:
        exercises: iterable of dicts with 'exercise_type' and 'difficulty_level'.
        profile: dict with 'training_phase' and 'special_situation'.

    Returns:
        list: the first 20 matching exercises, in input order.
    """
    phase = profile['training_phase']
    special = profile['special_situation']

    # Pick the single rule that applies to this profile once, outside the loop.
    if phase == 'weight_loss':
        def is_suitable(ex):
            return ex['exercise_type'] in ['cardio', 'strength']
    elif phase == 'muscle_gain':
        def is_suitable(ex):
            return ex['exercise_type'] == 'strength'
    elif special == 'elderly_population':
        def is_suitable(ex):
            return ex['difficulty_level'] == 'beginner'
    else:
        def is_suitable(ex):
            return True

    suitable = [ex for ex in exercises if is_suitable(ex)]
    return suitable[:20]

def format_exercises_for_prompt(exercises):
    """Render up to the first five exercises as a newline-separated bullet list.

    Each line has the form ``- <name>: <exercise_type> (<difficulty_level>)``.
    An empty input yields an empty string.
    """
    bullet_lines = [
        f"- {item['name']}: {item['exercise_type']} ({item['difficulty_level']})"
        for item in exercises[:5]
    ]
    return '\n'.join(bullet_lines)

def generate_workout_response(exercises, profile):
    """Generate a simple workout response template

    Args:
        exercises: sequence of dicts with a 'name' key. Up to the first three
            are listed in the main block; fewer than three no longer raises
            IndexError, the missing lines are simply omitted.
        profile: dict with 'training_phase' and 'description'; an optional
            'difficulty' overrides the default 'intermediate'.

    Returns:
        str: the workout text (name, description, difficulty, duration, target
        audience and the four blocks).
    """
    # Rep scheme for the first, second and third main-block exercise.
    rep_schemes = ("10-12", "10-12", "8-10")
    # zip stops at the shorter input, so short exercise lists are handled safely.
    main_block = '\n'.join(
        f"   - {ex['name']}: 3 sets x {reps} reps"
        for ex, reps in zip(exercises, rep_schemes)
    )

    return f"""Workout Name: {profile['training_phase'].title()} Focused Routine
Description: A targeted workout designed for {profile['description'].lower()}
Difficulty Level: {profile.get('difficulty', 'intermediate')}
Estimated Duration: 45-60 minutes
Target Audience: {profile['training_phase']}

Blocks:
1. Warmup Block (5-10 minutes)
   - Light cardio and dynamic stretches

2. Main Block (30-40 minutes)
{main_block}

3. Core Block (5-10 minutes)
   - Core strengthening exercises

4. Cooldown Block (5-10 minutes)
   - Static stretching and relaxation"""

# Generate training data
# create_training_prompts emits at most one example per built-in user profile,
# so the printed count is small.
training_data = create_training_prompts(standardized_exercises)
print(f"Generated {len(training_data)} training examples")
# Release memory before loading the model.
cleanup_memory()

## 6. Model Configuration
Load the pre-trained model and configure LoRA adaptation.

In [None]:
# Load pre-trained model and tokenizer
model_name = "a-albiol/AthenAI"
tokenizer = AutoTokenizer.from_pretrained(model_name)

from transformers import AutoModelForSeq2SeqLM

# NOTE(review): trust_remote_code=True lets code shipped in the hub repo run
# locally - confirm the checkpoint actually needs it.
# NOTE(review): the weights are loaded in float16 and the trainer below also sets
# fp16=True - confirm this combination trains stably for this checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Configure LoRA
# target_modules are presumably the attention and feed-forward projection names
# of this checkpoint - verify against model.named_modules().
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "v", "o", "wi_0", "wi_1", "wo"],
    bias="none",
    use_rslora=False,
)

# Wrap the base model with the adapters and put it in training mode.
model = get_peft_model(model, lora_config)
model.train()
model.enable_input_require_grads()

# Print parameter analysis
print("=== Model Parameter Analysis ===")
total_params = 0
trainable_params = 0
for name, param in model.named_parameters():
    param_count = param.numel()
    total_params += param_count
    if not param.requires_grad:
        continue
    # Only parameters that will receive gradients are listed individually.
    trainable_params += param_count
    print(f"✅ Trainable: {name} - {param_count:,} params")

print(f"\n📊 Total parameters: {total_params:,}")
print(f"📊 Trainable parameters: {trainable_params:,}")
print(f"📊 Trainable %: {100 * trainable_params / total_params:.2f}%")

cleanup_memory()
print("✅ Memory cleaned after model setup")
print_gpu_utilization()

## 7. Training Setup
Prepare the dataset, training arguments, and trainer.

In [None]:
def tokenize_function_t5(examples):
    """Tokenize a batch of prompt/response pairs for seq2seq training.

    Args:
        examples: batch mapping with 'prompt' and 'response' lists.

    Returns:
        The tokenizer output for the prompts, with the token ids of the
        responses stored under "labels". Both sides are truncated to 512
        tokens and left unpadded.
    """
    # Identical tokenizer settings for both the source and the target text.
    encode_kwargs = dict(
        truncation=True,
        padding=False,
        max_length=512,
        return_tensors=None,
    )

    model_inputs = tokenizer(list(examples['prompt']), **encode_kwargs)
    target_encoding = tokenizer(list(examples['response']), **encode_kwargs)

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

# Create dataset
# Build the dataset from the prompt/response records and tokenize it in one
# chained step; the raw text columns are dropped once tokenized.
train_dataset = Dataset.from_list(training_data).map(
    tokenize_function_t5,
    batched=True,
    remove_columns=['prompt', 'response'],
)

print(f"Dataset size: {len(train_dataset)}")
cleanup_memory()

# Training arguments
training_args = TrainingArguments(
    # --- Output and checkpointing ---
    output_dir="./workout-model-finetuned",
    overwrite_output_dir=True,
    save_steps=100,
    save_total_limit=2,
    # --- Optimisation ---
    # Batch size 1 with 8 accumulation steps: 8 examples per optimizer update.
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=3e-4,
    weight_decay=0.01,
    # --- Memory and precision ---
    fp16=True,
    gradient_checkpointing=True,
    dataloader_pin_memory=False,
    # --- Logging and reporting ---
    logging_steps=10,
    report_to=[],
    disable_tqdm=False,
    prediction_loss_only=True,
    # --- Data handling ---
    # The text columns were already removed when the dataset was tokenized.
    remove_unused_columns=False,
)

# Data collator
from transformers import DataCollatorForSeq2Seq

# The tokenizer output was left unpadded, so padding happens here per batch.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Disable wandb
import os

# Both switches are set so Weights & Biases stays off.
os.environ.update({
    "WANDB_DISABLED": "true",
    "WANDB_MODE": "disabled",
})

# If wandb is installed, also initialise it in disabled mode; a missing
# package is fine and is ignored.
try:
    import wandb
    wandb.init(mode="disabled")
except ImportError:
    pass

## 8. Model Training
Train the model and save checkpoints.

In [None]:
# Verify model is ready
print("=== Pre-training Verification ===")
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# Guard: stop here if the adapter setup left nothing to train.
if trainable_params == 0:
    raise ValueError("❌ No trainable parameters! Check LoRA configuration.")
print(f"✅ Ready to train with {trainable_params:,} parameters")

# Test forward pass
# Best-effort smoke test on one batch: a failure is reported, not raised, and
# training is still attempted afterwards.
print("🧪 Testing forward pass...")
try:
    sample_batch = next(iter(trainer.get_train_dataloader()))
    with torch.no_grad():
        outputs = model(**sample_batch)
    print("✅ Forward pass successful!")
except Exception as e:
    print(f"❌ Forward pass failed: {e}")
    print("🔧 This might indicate a data formatting issue")

# Start training
print("Starting training...")
print_gpu_utilization()

trainer.train()

# Release memory held by the training loop, then report what is left.
cleanup_memory()
print("✅ Memory cleaned after training")
print_gpu_utilization()

# Save the model
# Model and tokenizer go to the same directory so it can be reloaded as a unit.
final_model_dir = "./workout-model-final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("Training completed!")

## 9. Inference and Testing
Implement and test workout generation functions.

In [None]:
def generate_workout_t5(
    user_profile,
    available_exercises=None,
    training_phase="muscle_gain",
    motivation="self_improvement",
    special_situation="none",
):
    """Generate a workout using T5 model

    Args:
        user_profile: free-text description of the user, inserted in the prompt.
        available_exercises: standardized exercise dicts to list in the prompt;
            defaults to the first five entries of ``standardized_exercises``.
        training_phase, motivation, special_situation: prompt fields that used
            to be hard-coded; the defaults reproduce the previous prompt exactly.

    Returns:
        str: the decoded generation with special tokens removed. Sampling is
        enabled, so repeated calls can return different text.

    Note:
        This prompt template differs from the one built in
        ``create_training_prompts``.
        NOTE(review): consider aligning the two so inference matches the
        fine-tuning data.
    """
    if available_exercises is None:
        available_exercises = standardized_exercises[:5]

    prompt = f"""Generate workout routine:
User: {user_profile}
Phase: {training_phase}
Motivation: {motivation}
Special: {special_situation}
Exercises: {format_exercises_for_prompt(available_exercises)}
Format: name, description, difficulty, duration, blocks"""

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True
    )

    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    model.eval()
    with torch.no_grad():
        # early_stopping was removed: it only affects beam search, and this call
        # samples with a single beam.
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=300,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def generate_workout_simple(user_profile):
    """Generate a workout from a one-line prompt (fallback generation path).

    Args:
        user_profile: free-text description of the user.

    Returns:
        str: the first generated sequence, decoded without special tokens.
        Sampling is enabled, so output varies between calls.
    """
    encoded = tokenizer(
        f"Generate a workout for: {user_profile}",
        return_tensors="pt",
        max_length=256,
        truncation=True,
    )

    # Move the encoded tensors to the GPU when one is available.
    if torch.cuda.is_available():
        encoded = {key: tensor.cuda() for key, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_length=512,
            do_sample=True,
            temperature=0.8,
            num_return_sequences=1,
        )

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Test generation
TEST_PROFILE = "Intermediate athlete wanting to build muscle"


def _print_generated_workout(success_message, workout_text):
    """Print the success banner followed by the generated workout text."""
    print(success_message)
    print("Generated Workout:")
    print(workout_text)


print("=== Testing T5 Generation ===")
try:
    print("🧪 Testing main generation function...")
    test_response = generate_workout_t5(TEST_PROFILE)
    _print_generated_workout("✅ Main function works!", test_response)
except Exception as main_error:
    # The main path failed: report it and fall back to the simpler prompt.
    print(f"❌ Main function failed: {main_error}")
    print("🔄 Trying simple function...")

    try:
        test_response = generate_workout_simple(TEST_PROFILE)
        _print_generated_workout("✅ Simple function works!", test_response)
    except Exception as simple_error:
        print(f"❌ Simple function also failed: {simple_error}")
        print("🔧 Try running the model loading section again")

## 10. Model Deployment
Save and upload the model to Hugging Face Hub.

In [None]:
# Login to Hugging Face
import os
from getpass import getpass

from huggingface_hub import login

# Read the token from the HF_TOKEN environment variable, or prompt for it without
# echoing. The previous `login(ACCESS_TOKEN)` used an undefined name and invited
# pasting a secret into the notebook, where it would be saved and shared.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(hf_token)

# Push model to hub
# NOTE(review): this is the same repo id the base model was loaded from above -
# confirm that overwriting it with the fine-tuned (LoRA-wrapped) model is intended.
model.push_to_hub("a-albiol/AthenAI")
tokenizer.push_to_hub("a-albiol/AthenAI")

print("✅ Model successfully pushed to Hugging Face Hub")