# Enhanced Workout Generation Model Training V2
Builds on the successful V1 run with more training power and better-structured training data.

## Setup and Installation (Same as V1)

In [None]:
# Install required packages
!pip install transformers datasets torch accelerate peft bitsandbytes kaggle requests tqdm
!pip install --upgrade huggingface_hub

import os
import json
import pandas as pd
import requests
import zipfile
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,  # T5 model type
    TrainingArguments, 
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model, TaskType
import torch
from tqdm import tqdm
import random

## Enhanced Memory Management

In [None]:
def cleanup_memory():
    """Run garbage collection and release cached CUDA memory.

    Collects unreachable Python objects, then, when CUDA is available,
    empties the CUDA cache and synchronizes. Prints a confirmation line
    when finished.
    """
    from gc import collect as run_gc

    run_gc()
    cuda = torch.cuda
    if cuda.is_available():
        cuda.empty_cache()
        cuda.synchronize()
    print("🧹 Memory cleanup completed")
    
def print_gpu_utilization():
    """Print allocated, reserved and free GPU memory in GB.

    "Free" is device 0's total memory minus the memory currently reserved.
    When CUDA is unavailable a notice is printed instead.
    """
    if not torch.cuda.is_available():
        print("📊 No GPU available")
        return

    bytes_per_gb = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    total = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    free = total - reserved
    print(f"📊 GPU - Used: {allocated:.2f}GB, Reserved: {reserved:.2f}GB, Free: {free:.2f}GB")

# Report GPU memory once before anything is loaded, as a baseline for the
# readings printed later in the notebook.
print("Initial GPU state:")
print_gpu_utilization()

## Data Loading (Enhanced - from your successful V1)

In [None]:
# Use your existing data loading functions but enhanced
def download_and_load_data():
    """Download the free-exercise-db archive and load the exercise datasets.

    The GitHub archive is saved as ``exercise_db.zip`` and extracted into the
    working directory; every ``*.json`` file in the extracted ``exercises``
    folder is parsed into one record.

    Returns:
        A ``(github_exercises, kaggle_exercises)`` tuple. ``github_exercises``
        is a list of parsed JSON objects. ``kaggle_exercises`` is a pandas
        DataFrame read from ``gym_exercise_data.csv`` when that file exists in
        the working directory, otherwise ``None``.

    Raises:
        requests.HTTPError: If the archive download returns an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """

    # GitHub exercises (from V1)
    def download_github_exercises():
        """Fetch, extract and parse the free-exercise-db JSON files."""
        url = "https://github.com/yuhonas/free-exercise-db/archive/main.zip"
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status() stops an error page from being saved as the zip.
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        with open("exercise_db.zip", "wb") as f:
            f.write(response.content)

        with zipfile.ZipFile("exercise_db.zip", 'r') as zip_ref:
            zip_ref.extractall("./")

        exercises_path = "./free-exercise-db-main/exercises"
        exercises = []

        # Sorted so the record order does not depend on the filesystem.
        for file in sorted(os.listdir(exercises_path)):
            if file.endswith('.json'):
                # Read as UTF-8 explicitly instead of the platform default.
                with open(os.path.join(exercises_path, file), 'r', encoding='utf-8') as f:
                    exercises.append(json.load(f))

        return exercises

    github_exercises = download_github_exercises()
    print(f"✅ Loaded {len(github_exercises)} GitHub exercises")

    # Kaggle dataset (optional)
    kaggle_exercises = None
    if os.path.exists('gym_exercise_data.csv'):  # If you uploaded it
        kaggle_exercises = pd.read_csv('gym_exercise_data.csv')
        print(f"✅ Loaded {len(kaggle_exercises)} Kaggle exercises")

    return github_exercises, kaggle_exercises

# Load both sources at notebook level (kaggle_exercises is None when the CSV
# was not uploaded), then free any memory held by the download.
github_exercises, kaggle_exercises = download_and_load_data()
cleanup_memory()

## Enhanced Training Data Generation

In [None]:
# Column order for the rows in _PROFILE_ROWS.
_PROFILE_FIELDS = (
    'training_phase',
    'motivation',
    'special_situation',
    'description',
    'duration',
    'difficulty',
)

# User profiles the synthetic examples are generated for.
_PROFILE_ROWS = (
    ('weight_loss', 'self_improvement', 'none',
     'Beginner focused on losing weight', 30, 'beginner'),
    ('muscle_gain', 'self_improvement', 'none',
     'Intermediate athlete building muscle', 45, 'intermediate'),
    ('muscle_gain', 'competition', 'none',
     'Advanced lifter preparing for competition', 60, 'advanced'),
    ('cardio_improve', 'wellbeing', 'none',
     'Runner improving cardiovascular fitness', 40, 'intermediate'),
    ('maintenance', 'medical_recommendation', 'elderly_population',
     'Senior maintaining health and mobility', 25, 'beginner'),
    ('weight_loss', 'medical_recommendation', 'post_partum',
     'New mother returning to fitness', 20, 'beginner'),
    ('muscle_gain', 'rehabilitation', 'injury_recovery',
     'Athlete recovering from injury', 35, 'beginner'),
)

_FOCUS_VARIATIONS = ('upper_body', 'lower_body', 'full_body', 'push_muscles', 'pull_muscles')


def _title_label(identifier):
    """Turn a snake_case identifier into a Title Case label."""
    return identifier.replace('_', ' ').title()


def _routine_example(profile):
    """Build the prompt/response pair for a full block-structured routine."""
    phase = profile['training_phase']
    difficulty = profile['difficulty']
    duration = profile['duration']

    prompt = "\n".join([
        "Create workout routine:",
        f"USER_PROFILE: {profile['description']}",
        f"TRAINING_PHASE: {phase}",
        f"MOTIVATION: {profile['motivation']}",
        f"SPECIAL_SITUATION: {profile['special_situation']}",
        f"DURATION: {duration} minutes",
        f"DIFFICULTY: {difficulty}",
        f"TARGET_AUDIENCE: {phase}",
        "",
        "Generate structured workout with blocks: warmup, main, core, cooldown.",
    ])

    workout_name = f"{_title_label(phase)} {difficulty.title()} Routine"

    # NOTE(review): warmup (5) + core (8) + cooldown (5) = 18 minutes, while the
    # main block is duration - 15, so the blocks add up to duration + 3 rather
    # than the stated total. Kept as in V1 data; confirm the intended split.
    main_minutes = duration - 15

    # Trailing spaces on "Block 2: " and "Core Strengthening  " are kept so the
    # generated text stays identical to the original templates.
    response = "\n".join([
        "WORKOUT_TEMPLATE:",
        f"name: {workout_name}",
        f"description: {profile['description']} focused routine",
        f"difficulty_level: {difficulty}",
        f"estimated_duration_minutes: {duration}",
        f"target_audience: {phase}",
        "",
        "TEMPLATE_BLOCKS:",
        "Block 1:",
        "block_name: Warmup",
        "block_type: warmup",
        "block_order: 1",
        "exercise_count: 2",
        "estimated_duration_minutes: 5",
        "instructions: Light movement to prepare body for workout",
        "",
        "Block 2: ",
        "block_name: Main Training",
        "block_type: main",
        "block_order: 2",
        "exercise_count: 4",
        f"estimated_duration_minutes: {main_minutes}",
        f"instructions: Primary exercises targeting {phase}",
        "",
        "Block 3:",
        "block_name: Core Strengthening  ",
        "block_type: core",
        "block_order: 3",
        "exercise_count: 3",
        "estimated_duration_minutes: 8",
        "instructions: Core stability and strength exercises",
        "",
        "Block 4:",
        "block_name: Cooldown",
        "block_type: cooldown",
        "block_order: 4",
        "exercise_count: 2",
        "estimated_duration_minutes: 5",
        "instructions: Stretching and mobility for recovery",
        "",
        "EXERCISE_SUGGESTIONS:",
        f"- Focus on {phase} appropriate movements",
        f"- Match {difficulty} difficulty level",
        f"- Consider {profile['special_situation']} requirements",
        "- Ensure proper progression and safety",
    ])

    return {'prompt': prompt, 'response': response}


def _focus_example(profile, focus):
    """Build the prompt/response pair for a focus-area workout."""
    phase = profile['training_phase']
    difficulty = profile['difficulty']
    duration = profile['duration']

    # The two trailing spaces after TRAINING_PHASE are kept from the original
    # template so the generated prompt text is unchanged.
    prompt = "\n".join([
        f"Create {focus} workout:",
        f"USER_PROFILE: {profile['description']}",
        f"FOCUS_AREA: {focus}",
        f"TRAINING_PHASE: {phase}  ",
        f"DURATION: {duration} minutes",
        f"DIFFICULTY: {difficulty}",
    ])

    response = "\n".join([
        "WORKOUT_TEMPLATE:",
        # Both parts go through _title_label so the name reads
        # "Upper Body Weight Loss" instead of "Upper Body Weight_Loss".
        f"name: {_title_label(focus)} {_title_label(phase)}",
        f"description: {focus} focused {phase} routine",
        f"difficulty_level: {difficulty}",
        f"estimated_duration_minutes: {duration}",
        f"target_audience: {phase}",
        "",
        f"FOCUS: {focus} specialization",
        f"STRUCTURE: Warm-up → {focus} main exercises → Core → Cool-down",
        f"PROGRESSION: Appropriate for {difficulty} level",
    ])

    return {'prompt': prompt, 'response': response}


def create_enhanced_training_data(github_exercises, kaggle_exercises=None, num_samples=200):
    """Create template-based prompt/response training examples.

    Two kinds of examples are produced:

    * one block-structured routine per user profile, repeated verbatim
      ``num_samples // 7`` times (there are 7 profiles);
    * one focus-area workout for each of 5 focus areas combined with the
      first 3 profiles (15 examples, independent of ``num_samples``).

    Args:
        github_exercises: Accepted for interface compatibility. The examples
            are template-driven and do not currently draw on these records.
        kaggle_exercises: Accepted for interface compatibility; unused.
        num_samples: Target number of routine examples; rounded down to a
            multiple of the number of profiles.

    Returns:
        A list of dicts, each with a ``'prompt'`` and a ``'response'`` string.
    """
    user_profiles = [dict(zip(_PROFILE_FIELDS, row)) for row in _PROFILE_ROWS]
    copies_per_profile = num_samples // len(user_profiles)

    training_data = []

    for profile in user_profiles:
        # The text depends only on the profile, so build it once and append
        # independent copies rather than re-rendering it on every iteration.
        example = _routine_example(profile)
        training_data.extend(dict(example) for _ in range(copies_per_profile))

    # Add variations with different focus areas
    for focus in _FOCUS_VARIATIONS:
        for profile in user_profiles[:3]:  # Use first 3 profiles
            training_data.append(_focus_example(profile, focus))

    print(f"✅ Generated {len(training_data)} enhanced training examples")
    return training_data

# Generate enhanced training data
# 300 // 7 profiles = 42 copies of each routine (294 examples) plus the
# 15 focus-area variations, i.e. 309 examples in total.
enhanced_training_data = create_enhanced_training_data(github_exercises, kaggle_exercises, 300)
cleanup_memory()

## Enhanced Model Configuration (More Power!)

In [None]:
# Load your successful model but with more training power
model_name = "a-albiol/AthenAI"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): trust_remote_code=True lets code shipped in the model repo run
# locally — confirm this checkpoint actually needs it.
# NOTE(review): the weights are loaded in float16 and the training arguments
# also set fp16=True — verify the training loss stays finite with this setup.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Enhanced LoRA configuration - More power!
# lora_alpha / r stays at 2, the same ratio as the previous 16-rank setup
# mentioned in the comments below.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=32,  # Increased from 16 - more learning capacity
    lora_alpha=64,  # Increased proportionally
    lora_dropout=0.1,
    # presumably the attention (q, v, o) and feed-forward (wi_0, wi_1, wo)
    # projection names of this architecture — TODO confirm against the model
    target_modules=["q", "v", "o", "wi_0", "wi_1", "wo"],
    bias="none",
    use_rslora=False,
)

# Wrap the base model with the LoRA adapters and switch to training mode.
model = get_peft_model(model, lora_config)
model.train()
# presumably required because training uses gradient_checkpointing=True — confirm
model.enable_input_require_grads()

# Check parameters
# Counts every parameter versus only those with requires_grad=True.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"📊 Total parameters: {total_params:,}")
print(f"📊 Trainable parameters: {trainable_params:,}")
print(f"📊 Trainable %: {100 * trainable_params / total_params:.2f}%")

cleanup_memory()

## Enhanced Dataset Preparation

In [None]:
def tokenize_function_enhanced(examples):
    """Tokenize a batch of prompt/response pairs for seq2seq training.

    Args:
        examples: Batch mapping with ``'prompt'`` and ``'response'`` columns.

    Returns:
        The tokenizer output for the prompts, with the response token ids
        stored under ``"labels"``.
    """
    # Prompts and responses share the same settings: truncate at 768 tokens,
    # no padding at this stage, plain Python lists rather than tensors.
    encode_options = {
        "truncation": True,
        "padding": False,
        "max_length": 768,
        "return_tensors": None,
    }

    model_inputs = tokenizer(list(examples['prompt']), **encode_options)
    target_encoding = tokenizer(list(examples['response']), **encode_options)

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

# Create enhanced dataset
# Build a Dataset from the list of prompt/response dicts and tokenize it in
# batches; the raw text columns are dropped so only the tokenizer outputs and
# "labels" remain.
train_dataset = Dataset.from_list(enhanced_training_data)
train_dataset = train_dataset.map(
    tokenize_function_enhanced,
    batched=True,
    remove_columns=['prompt', 'response']
)

print(f"📊 Enhanced dataset size: {len(train_dataset)}")
cleanup_memory()

## Enhanced Training Configuration (More Power!)

In [None]:
# Disable Weights & Biases reporting
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"

# Enhanced training arguments - More power!
# Effective batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 2 * 8 = 16.
# NOTE(review): with the 309 examples generated earlier, the estimate printed
# at the end of this cell is about 77 optimizer steps. warmup_steps=50 then
# covers most of training and save_steps=150 is never reached — confirm these
# values are intended for a dataset of this size.
training_args = TrainingArguments(
    output_dir="./workout-model-v2-enhanced",
    overwrite_output_dir=True,
    num_train_epochs=4,  # More epochs!
    per_device_train_batch_size=2,  # Slightly larger batch
    gradient_accumulation_steps=8,
    learning_rate=5e-4,  # Higher learning rate
    weight_decay=0.01,
    warmup_steps=50,  # Add warmup
    logging_steps=25,
    save_steps=150,
    save_total_limit=3,
    prediction_loss_only=True,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    fp16=True,
    report_to=[],
    gradient_checkpointing=True,
    dataloader_num_workers=2,  # Faster data loading
    lr_scheduler_type="cosine",  # Better learning rate schedule
)

# The dataset was tokenized with padding=False, so batches are padded here,
# to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
)

# Training only: no eval dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

print("🚀 Enhanced trainer configured!")
print(f"📊 Training {trainable_params:,} parameters for {training_args.num_train_epochs} epochs")
# Rough estimate: examples * epochs // effective batch size.
print(f"📊 Total training steps: ~{len(train_dataset) * training_args.num_train_epochs // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)}")

## Enhanced Training with Monitoring

In [None]:
import time

print("🚀 Starting Enhanced Training...")
print_gpu_utilization()

# Verify model before training
# Recount the parameters with requires_grad=True right before training starts.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"✅ Ready to train {trainable_params:,} parameters")

# Start enhanced training
# Wall-clock timing around the whole training run, reported in minutes below.
start_time = time.time()
trainer.train()
end_time = time.time()

print(f"✅ Enhanced training completed in {(end_time - start_time) / 60:.1f} minutes")
cleanup_memory()

# Save enhanced model
# Model and tokenizer go into the same directory.
trainer.save_model("./workout-model-v2-final")
tokenizer.save_pretrained("./workout-model-v2-final")
print("💾 Enhanced model saved!")

## Enhanced Testing and Evaluation

In [None]:
def test_enhanced_model():
    """Generate and print a workout for a few representative prompts.

    Uses the notebook-level ``model`` and ``tokenizer``. Each prompt is run
    independently: a failure is printed and the remaining prompts still run.
    Generation samples (``do_sample=True``), so the text varies between runs.
    Responses longer than 500 characters are printed truncated.
    """

    def generate_workout_enhanced(prompt):
        """Return the decoded generation for a single prompt string."""
        inputs = tokenizer(prompt, return_tensors="pt", max_length=768, truncation=True, padding=True)

        # Follow the model's own device instead of assuming CUDA: the inputs
        # must live where the model does, which is not guaranteed to be the
        # GPU just because one is available.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=600,  # Longer outputs
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    # NOTE(review): these prompts are not identical to the training prompts
    # built by create_enhanced_training_data (e.g. the first omits the closing
    # "Generate structured workout with blocks..." line) — confirm intended.
    test_cases = [
        """Create workout routine:
USER_PROFILE: Beginner looking to lose weight
TRAINING_PHASE: weight_loss
MOTIVATION: self_improvement
SPECIAL_SITUATION: none
DURATION: 30 minutes
DIFFICULTY: beginner
TARGET_AUDIENCE: weight_loss""",

        """Create upper_body workout:
USER_PROFILE: Advanced athlete building muscle
FOCUS_AREA: upper_body
TRAINING_PHASE: muscle_gain
DURATION: 45 minutes
DIFFICULTY: advanced""",

        """Create workout routine:
USER_PROFILE: Senior maintaining health
TRAINING_PHASE: maintenance
MOTIVATION: medical_recommendation
SPECIAL_SITUATION: elderly_population
DURATION: 25 minutes
DIFFICULTY: beginner"""
    ]

    # Switch to inference mode once rather than on every prompt.
    model.eval()

    print("🧪 Testing Enhanced Model:")
    for i, prompt in enumerate(test_cases, 1):
        print(f"\n=== Test Case {i} ===")
        # Deliberately broad: this is a smoke test, so one failing prompt is
        # reported and the others still run.
        try:
            response = generate_workout_enhanced(prompt)
            print("✅ Generated Response:")
            preview = response[:500] + "..." if len(response) > 500 else response
            print(preview)
        except Exception as e:
            print(f"❌ Error: {e}")

# Run enhanced testing
# Smoke test: prints one sampled generation per built-in prompt.
test_enhanced_model()

## Summary and Next Steps

In [None]:
# Final summary: what changed relative to V1 and where the model was saved.
summary_lines = (
    "🎉 Enhanced Training Complete!",
    "📊 Model improvements:",
    "   - Increased LoRA rank: 16 → 32 (more learning capacity)",
    "   - More training epochs: 2 → 4",
    "   - Better training data: ~300 structured examples",
    "   - Longer context: 512 → 768 tokens",
    "   - Enhanced prompting aligned with your database schema",
    "   - Cosine learning rate schedule",
    "\n🚀 Expected improvements:",
    "   - Better structured outputs matching your database",
    "   - More diverse workout generation",
    "   - Better understanding of user profiles",
    "   - More professional workout formatting",
    "\n💾 Your enhanced model is saved at: './workout-model-v2-final'",
    "🔄 Ready to integrate into your Go application!",
)

for summary_line in summary_lines:
    print(summary_line)

## Push Enhanced Model to HuggingFace Hub

In [None]:
# Push enhanced model to HuggingFace Hub
from huggingface_hub import login

# Login to HuggingFace without hard-coding the token in the notebook.
# Prefer an ACCESS_TOKEN defined in an earlier cell, then the HF_TOKEN
# environment variable. With neither set, login() falls back to its
# interactive prompt instead of this cell failing with a NameError.
hf_token = globals().get("ACCESS_TOKEN") or os.environ.get("HF_TOKEN")
login(token=hf_token)

# Push model and tokenizer to Hub with v2 tag
# NOTE(review): "a-albiol/AthenAI" is the same repo the base model was loaded
# from, and `model` here is the PEFT-wrapped model — presumably this uploads
# the adapter next to the base checkpoint. Confirm that is intended, or push
# to a separate repo.
model.push_to_hub("a-albiol/AthenAI", tags=["v2"])
tokenizer.push_to_hub("a-albiol/AthenAI", tags=["v2"])

print("✅ Enhanced model (v2) successfully pushed to Hugging Face Hub")