In [1]:
# Cell 1: Imports
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVR, SVC
# Silence every warning category for the rest of the notebook.
warnings.simplefilter('ignore')

# Shared plotting defaults: seaborn grid theme and a 12x8 inch default figure.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Fitness AI: Intelligent Workout & Calorie Prediction System

## Problem Definition

**Context**: Modern fitness apps lack personalization. Users need data-driven workout recommendations and accurate calorie burn predictions based on their physiology and goals.

**Problem Type**: 
- **Regression**: Predict calories burned and future weight
- **Classification**: Recommend workout types based on user profile
- **Multi-task ML system** with 3 interconnected models

**Objectives**:
1. Predict calories burned during a workout (±100 cal accuracy)
2. Forecast weight change over time based on activity
3. Recommend optimal workout types for user goals (lose weight, gain muscle, maintain)

**Constraints**:
- Limited to 13 features from gym tracking data
- Must handle missing values and inconsistent data formats
- Real-time predictions required (< 500 ms)
- Models must be interpretable for user trust

**Expected Impact**: 80%+ accuracy in workout recommendations, R² > 0.7 for calorie prediction

In [2]:
# Load dataset: try each known location in order and keep the first one that exists.
csv_path = '../datasets/gym_members_exercise_tracking_synthetic_data.csv'

# Candidate locations, checked top to bottom.
possible_paths = [
    '../upload/gym_members_exercise_tracking_synthetic_data.csv',
    '../datasets/gym_members_exercise_tracking_synthetic_data.csv',
    'gym_members_exercise_tracking_synthetic_data.csv'
]

df = None
for path in possible_paths:
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        # Only a missing file means "try the next location". Parse or
        # permission errors propagate instead of being reported as "not found".
        continue
    print(f"âœ… Found dataset at: {path}")
    break

if df is None:
    raise FileNotFoundError(
        "Could not find dataset! Tried: " + ", ".join(possible_paths)
    )

print(f"Dataset shape: {df.shape}")
print("\nColumn info:")
# DataFrame.info() prints its own report and returns None, so it is not wrapped in print().
df.info()
print("\nFirst 5 rows:")
df.head()

FileNotFoundError: Could not find dataset!

In [None]:
def clean_fitness_data(df):
    """
    Clean the raw gym-tracking frame and return a new DataFrame.

    Steps, in order:
      1. Each known numeric column that is present is stripped of tab,
         newline and carriage-return characters and surrounding whitespace,
         then converted with ``pd.to_numeric``; unparseable values become NaN.
      2. ``Workout_Type`` and ``Gender`` (when present) get the same whitespace
         clean-up; the strings ``'nan'`` and ``''`` are turned back into NaN.
      3. Rows missing ``Workout_Type``, ``Calories_Burned`` or ``Weight (kg)``
         are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with the original index labels of surviving rows.

    Raises
    ------
    KeyError
        If any of the three critical columns is absent from ``df``.
    """
    # Work on a copy so repeated runs of the calling cell are idempotent.
    df = df.copy()

    # Numeric columns from train_calories_model_gradient_boosting.py
    numeric_cols = ['Age', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM',
                    'Session_Duration (hours)', 'Calories_Burned', 'Fat_Percentage',
                    'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'Experience_Level', 'BMI']

    for col in numeric_cols:
        if col in df.columns:
            as_text = df[col].astype(str).str.replace(r'\t|\n|\r', '', regex=True).str.strip()
            df[col] = pd.to_numeric(as_text, errors='coerce')

    # Categorical columns: assign the result back instead of calling
    # `df[col].replace(..., inplace=True)`, which acts on a temporary and is a
    # silent no-op under pandas Copy-on-Write.
    if 'Workout_Type' in df.columns:
        workout = df['Workout_Type'].astype(str).str.replace(r'\t|\n|\r', '', regex=True).str.strip()
        df['Workout_Type'] = workout.replace(['nan', ''], np.nan)

    if 'Gender' in df.columns:
        gender = df['Gender'].astype(str).str.strip()
        # astype(str) turns missing values into the literal 'nan'; restore them.
        df['Gender'] = gender.replace(['nan', ''], np.nan)

    # Drop rows with critical missing values
    critical_cols = ['Workout_Type', 'Calories_Burned', 'Weight (kg)']
    df = df.dropna(subset=critical_cols)

    return df

# Clean a copy so the raw frame stays available for the before/after row count.
df_cleaned = clean_fitness_data(df.copy())

rows_before = df.shape[0]
rows_after = df_cleaned.shape[0]
print(f"After cleaning: {df_cleaned.shape}")
print(f"Removed {rows_before - rows_after} rows with missing critical data")

In [None]:
# Calorie prediction: ten numeric inputs plus the categorical workout type.
calorie_feature_cols = [
    'Age', 'Weight (kg)', 'Height (m)', 'Avg_BPM', 'Resting_BPM',
    'Session_Duration (hours)', 'Fat_Percentage', 'Workout_Frequency (days/week)',
    'Experience_Level', 'BMI', 'Workout_Type',
]
X_calories = df_cleaned[calorie_feature_cols]
y_calories = df_cleaned['Calories_Burned']

# 80/20 hold-out split; the fixed random_state keeps it reproducible.
calorie_split = train_test_split(X_calories, y_calories, test_size=0.2, random_state=42)
X_train_cal, X_test_cal, y_train_cal, y_test_cal = calorie_split

print(f"Training set: {X_train_cal.shape}, Test set: {X_test_cal.shape}")

In [None]:
# Figure 1: 2x2 exploratory panel (for poster), saved to poster_visualizations.png
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Calories by workout type
sns.boxplot(data=df_cleaned, x='Workout_Type', y='Calories_Burned', ax=axes[0,0])
axes[0,0].set_title('Calorie Burn Distribution by Workout Type', fontsize=14, fontweight='bold')
axes[0,0].tick_params(axis='x', rotation=45)

# 2. Correlation heatmap (most important for poster)
numeric_features = ['Age', 'Weight (kg)', 'Height (m)', 'Avg_BPM', 'BMI', 'Fat_Percentage', 'Calories_Burned']
corr_matrix = df_cleaned[numeric_features].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=axes[0,1])
axes[0,1].set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')

# 3. BMI vs Calories with goal coloring.
# The BMI-derived goal is built on a plotting-only frame so this figure cell
# does not add a column to the shared df_cleaned (avoids hidden state and
# SettingWithCopy issues). Rows with missing BMI fall through to 'maintain'.
bmi = df_cleaned['BMI']
goal_labels = np.select(
    [bmi < 18.5, bmi > 25],
    ['gain_muscle', 'lose_weight'],
    default='maintain',
)
df_goal_plot = df_cleaned.assign(goal=goal_labels)

sns.scatterplot(data=df_goal_plot, x='BMI', y='Calories_Burned', hue='goal', ax=axes[1,0])
axes[1,0].set_title('BMI vs Calorie Burn (Colored by Goal)', fontsize=14, fontweight='bold')

# 4. Feature importance (placeholder - will be filled after modeling)
axes[1,1].text(0.5, 0.5, 'Feature importance will be shown after model training',
               ha='center', va='center', fontsize=12)
axes[1,1].set_title('Model Feature Importance', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('poster_visualizations.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Create pipeline template
def create_calories_pipeline(model):
    """
    Build a preprocessing + model pipeline for calorie prediction.

    The preprocessor one-hot encodes ``Workout_Type`` (categories unseen at fit
    time are encoded as all zeros) and standardises the ten numeric features.
    Scaling matters for the scale-sensitive estimators used in this notebook
    (the RBF-kernel SVR in particular). Columns not listed here are dropped.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn regressor placed as the final ``'regressor'`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'preprocessor'`` and ``'regressor'``.
    """
    numeric_features = ['Age', 'Weight (kg)', 'Height (m)', 'Avg_BPM', 'Resting_BPM',
                       'Session_Duration (hours)', 'Fat_Percentage', 'Workout_Frequency (days/week)',
                       'Experience_Level', 'BMI']
    categorical_features = ['Workout_Type']

    # Categorical block first, numeric block second: this keeps the output
    # column order (one-hot columns, then numerics) the same as before.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('num', StandardScaler(), numeric_features),
        ]
    )

    return Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

# Three candidate regressors, each wrapped in the shared preprocessing pipeline:
# gradient boosting, a linear baseline, and an RBF-kernel SVM.
gb_pipeline = create_calories_pipeline(
    GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
)
lr_pipeline = create_calories_pipeline(LinearRegression())
svm_pipeline = create_calories_pipeline(
    SVR(kernel='rbf', C=1000, gamma=0.1, epsilon=0.1)
)

# Fit each candidate on the calorie training split, announcing it first.
training_queue = (
    ("Gradient Boosting", gb_pipeline),
    ("Linear Regression", lr_pipeline),
    ("SVM", svm_pipeline),
)
for label, candidate in training_queue:
    print(f"Training {label}...")
    candidate.fit(X_train_cal, y_train_cal)

print("âœ… All models trained!")

In [None]:
def evaluate_regression_model(pipeline, X_test, y_test, model_name):
    """
    Score a fitted regression pipeline on held-out data and print a report.

    Returns a dict with the keys 'Model', 'MSE', 'RMSE', 'MAE', 'R2' and
    'pipeline' (the pipeline object that was passed in).
    """
    predicted = pipeline.predict(X_test)

    mse = mean_squared_error(y_test, predicted)
    mae = mean_absolute_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)
    rmse = np.sqrt(mse)  # RMSE is derived from MSE rather than recomputed

    report_lines = [
        f"\n{'='*50}",
        f"{model_name} Results:",
        f"{'='*50}",
        f"MSE:  {mse:.2f}",
        f"RMSE: {rmse:.2f}",
        f"MAE:  {mae:.2f}",
        f"RÂ²:   {r2:.4f}",
    ]
    print("\n".join(report_lines))

    summary = {'Model': model_name}
    summary.update(MSE=mse, RMSE=rmse, MAE=mae, R2=r2)
    summary['pipeline'] = pipeline
    return summary

# Score every fitted calorie pipeline on the same test split.
evaluation_plan = [
    (gb_pipeline, "Gradient Boosting"),
    (lr_pipeline, "Linear Regression"),
    (svm_pipeline, "SVM"),
]
results = [
    evaluate_regression_model(fitted, X_test_cal, y_test_cal, label)
    for fitted, label in evaluation_plan
]

# Comparison table for the report, best R2 first.
results_df = (
    pd.DataFrame(results)[['Model', 'R2', 'RMSE', 'MAE']]
    .sort_values('R2', ascending=False)
)
print("\nðŸ“Š Model Comparison Table:")
print(results_df.to_string(index=False))

In [None]:
# Figure 2: model comparison bar chart for the poster, then persist the winner.
fig, ax = plt.subplots(figsize=(12, 8))

# One horizontal bar per model showing its test-set R² score.
bar_colors = ['#2ecc71', '#3498db', '#e74c3c']
bars = ax.barh(results_df['Model'], results_df['R2'], color=bar_colors)
ax.set_xlabel('RÂ² Score (Higher is Better)', fontsize=12)
ax.set_title('Calorie Prediction Model Comparison', fontsize=16, fontweight='bold')

# Write each score just past the end of its bar, vertically centred.
for bar in bars:
    score = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height()/2
    ax.text(score + 0.001, bar_mid_y,
            f'{score:.4f}', ha='left', va='center', fontsize=11)

ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('model_comparison_poster.png', dpi=300, bbox_inches='tight')
plt.show()

# Save the pipeline with the highest R² among the evaluated results.
top_result = max(results, key=lambda x: x['R2'])
best_model = top_result['pipeline']
joblib.dump(best_model, 'best_calories_model.pkl')
print(f"\nðŸ’¾ Best model saved: {top_result['Model']}")

In [None]:
# Build a synthetic weight-forecast dataset on top of the cleaned gym data.
# The global NumPy seed is fixed so the three random draws below are reproducible;
# their order (days, steps, noise) is part of that reproducibility.
np.random.seed(42)

df_weight = df_cleaned.copy()
n_rows = len(df_weight)

# Random forecast horizon (1-365 days) and step count (2000-20000).
df_weight['days_future'] = np.random.uniform(1, 365, n_rows)
df_weight['steps'] = np.random.uniform(2000, 20000, n_rows)

# Component effects of the hand-written weight formula.
df_weight['weight_loss'] = df_weight['Calories_Burned'] * df_weight['days_future'] / 7700
df_weight['step_effect'] = df_weight['steps'] * df_weight['days_future'] / 1e6
workout_effect_by_type = {'Strength': 0.3, 'Cardio': -1.2, 'HIIT': -1.0, 'Yoga': -0.5}
# Workout types missing from the table contribute 0.
df_weight['workout_effect'] = df_weight['Workout_Type'].map(workout_effect_by_type).fillna(0)

# Target = current weight - losses + workout effect scaled by days/30 + N(0, 1.5) noise.
weight_after_losses = df_weight['Weight (kg)'] - df_weight['weight_loss'] - df_weight['step_effect']
monthly_workout_shift = df_weight['workout_effect'] * df_weight['days_future'] / 30
noise = np.random.normal(0, 1.5, n_rows)
df_weight['predicted_weight'] = weight_after_losses + monthly_workout_shift + noise

# Discard rows where the target could not be computed.
df_weight = df_weight.dropna(subset=['predicted_weight'])
print(f"Weight prediction dataset: {df_weight.shape}")

# Features and target.
# NOTE(review): the target is derived from 'Weight (kg)', but that column is
# not among the model features below - confirm this is intended.
weight_feature_cols = ['days_future', 'steps', 'Calories_Burned', 'Workout_Type']
X_weight = df_weight[weight_feature_cols]
y_weight = df_weight['predicted_weight']

weight_split = train_test_split(X_weight, y_weight, test_size=0.2, random_state=42)
X_train_w, X_test_w, y_train_w, y_test_w = weight_split

In [None]:
# Train two weight-forecast models that share one preprocessing recipe.
def create_weight_pipeline(model):
    """
    Build the weight-forecast pipeline around ``model``.

    ``Workout_Type`` is one-hot encoded and every other column is passed
    through unchanged. ``handle_unknown='ignore'`` matches the calorie pipeline:
    a workout type that was not in the training split is encoded as all zeros
    instead of raising at predict time.

    Returns an unfitted Pipeline with steps 'preprocessor' and 'regressor'.
    """
    preprocessor = ColumnTransformer(
        [('cat', OneHotEncoder(handle_unknown='ignore'), ['Workout_Type'])],
        remainder='passthrough',
    )
    return Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

weight_gb = create_weight_pipeline(GradientBoostingRegressor(n_estimators=200, random_state=42))
weight_lr = create_weight_pipeline(LinearRegression())

print("Training Gradient Boosting for weight...")
weight_gb.fit(X_train_w, y_train_w)

print("Training Linear Regression for weight...")
weight_lr.fit(X_train_w, y_train_w)

# Evaluate
def evaluate_weight_model(pipeline, X_test, y_test, name):
    """
    Print R² and RMSE (in kg) for a fitted weight pipeline.

    Returns the tuple ``(pipeline, r2)`` so the caller can pick the better model.
    """
    y_pred = pipeline.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    # mean_squared_error's `squared=False` flag was deprecated in scikit-learn
    # 1.4 and removed in 1.6; taking the square root works on every version and
    # matches evaluate_regression_model.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{name} - RÂ²: {r2:.4f}, RMSE: {rmse:.2f} kg")
    return pipeline, r2

gb_w, r2_gb_w = evaluate_weight_model(weight_gb, X_test_w, y_test_w, "Weight GB")
lr_w, r2_lr_w = evaluate_weight_model(weight_lr, X_test_w, y_test_w, "Weight LR")

# Keep gradient boosting only when its R² is strictly higher; ties go to linear regression.
if r2_gb_w > r2_lr_w:
    best_weight_model = gb_w
else:
    best_weight_model = lr_w
joblib.dump(best_weight_model, 'best_weight_model.pkl')

In [None]:
# Build the workout-recommendation dataset.
# The goal is inferred from BMI: under 18.5 -> gain_muscle, over 25 -> lose_weight,
# anything else (including missing BMI) -> maintain.
df_rec = df_cleaned.copy()
df_rec['goal'] = 'maintain'
df_rec.loc[df_rec['BMI'] < 18.5, 'goal'] = 'gain_muscle'
df_rec.loc[df_rec['BMI'] > 25, 'goal'] = 'lose_weight'

# Encode gender BEFORE dropping missing rows: any value other than
# 'Male'/'Female' maps to NaN and is then removed by dropna below, instead of
# reaching the classifiers as NaN.
df_rec['Gender'] = df_rec['Gender'].map({'Male': 0, 'Female': 1})

# Features for classification
features = ['BMI', 'Fat_Percentage', 'Age', 'Session_Duration (hours)',
            'Calories_Burned', 'goal', 'Gender']
df_rec = df_rec.dropna(subset=features + ['Workout_Type'])

# Encode.
# The goal encoder is fitted on the full label set so its integer codes stay
# stable even when one goal is absent from the data.
le_goal = LabelEncoder()
le_goal.fit(['gain_muscle', 'lose_weight', 'maintain'])
le_workout = LabelEncoder()

X_rec = df_rec[features].copy()
X_rec['goal'] = le_goal.transform(df_rec['goal'])
X_rec['Gender'] = df_rec['Gender'].astype(int)
y_rec = le_workout.fit_transform(df_rec['Workout_Type'])

# Stratify on the target so each workout type keeps its share in both splits.
X_train_rec, X_test_rec, y_train_rec, y_test_rec = train_test_split(
    X_rec, y_rec, test_size=0.2, random_state=42, stratify=y_rec
)
print(f"Recommendation dataset: {X_train_rec.shape}")
print(f"Workout types: {le_workout.classes_}")

In [None]:
# Train multiple classifiers (Professor requires ≥2) and keep the one with the
# best weighted F1 on the test split.
# RandomForestClassifier comes from the notebook's top import cell.
models_class = {
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42
    ),
    # Logistic regression is sensitive to feature scale (BMI, age and calories
    # are in very different units), so it gets its own StandardScaler step.
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(
            max_iter=1000, class_weight='balanced', random_state=42
        )),
    ]),
    'Random Forest': RandomForestClassifier(
        n_estimators=200, max_depth=10, random_state=42, class_weight='balanced'
    )
}

# name -> {'Accuracy', 'F1-Score', 'model'} for every fitted classifier
results_class = {}

for name, model in models_class.items():
    model.fit(X_train_rec, y_train_rec)
    y_pred = model.predict(X_test_rec)

    acc = accuracy_score(y_test_rec, y_pred)
    f1 = f1_score(y_test_rec, y_pred, average='weighted')

    results_class[name] = {
        'Accuracy': acc,
        'F1-Score': f1,
        'model': model
    }

    print(f"{name} - Accuracy: {acc:.4f}, F1: {f1:.4f}")

# Save the best model with both encoders, which inference needs to encode
# the goal and decode the predicted workout type.
best_rec_model = max(results_class.items(), key=lambda x: x[1]['F1-Score'])[1]['model']
joblib.dump({
    'model': best_rec_model,
    'workout_encoder': le_workout,
    'goal_encoder': le_goal
}, 'best_recommendation_model.pkl')

In [None]:
# Simulate your Flask API routes in the notebook
def predict_calories(user_data):
    """
    Simulates /api/predict from calories.py.

    user_data: dict mapping feature name -> value for one workout session.
    Loads 'best_calories_model.pkl' on every call and returns the first
    (only) prediction for the single-row frame built from ``user_data``.
    """
    calorie_model = joblib.load('best_calories_model.pkl')
    single_row = pd.DataFrame([user_data])
    predictions = calorie_model.predict(single_row)
    return predictions[0]

def predict_weight(days_ahead, steps, calories, workout_type):
    """
    Simulates /api/predict-weight from weight.py.

    Loads 'best_weight_model.pkl' and returns its prediction for one row built
    from the arguments.

    Parameters
    ----------
    days_ahead : forecast horizon, fed to the model as 'days_future'
    steps : step count, fed as 'steps'
    calories : calories burned, fed as 'Calories_Burned'
    workout_type : workout label, fed as 'Workout_Type'
    """
    model = joblib.load('best_weight_model.pkl')
    # Column names must match the training frame exactly ('Calories_Burned',
    # 'Workout_Type'): the pipeline's ColumnTransformer selects by name, so the
    # previous lowercase names made every call fail.
    df = pd.DataFrame([{
        'days_future': days_ahead,
        'steps': steps,
        'Calories_Burned': calories,
        'Workout_Type': workout_type
    }])
    return model.predict(df)[0]

def recommend_workout_ml(user_profile):
    """
    Simulates /api/recommend-ml from recommend_ml.py.

    Parameters
    ----------
    user_profile : dict
        Must contain 'current_weight', 'height', 'age', 'fat_percentage',
        'avg_duration', 'avg_calories', 'goal' and 'gender'.

    Returns
    -------
    The workout-type label decoded from the classifier's prediction.

    Raises
    ------
    KeyError
        If a required key is missing or the goal is not one the saved goal
        encoder knows.
    """
    saved = joblib.load('best_recommendation_model.pkl')
    model, enc = saved['model'], saved['workout_encoder']
    goal_enc = saved['goal_encoder']

    # BMI = weight / height^2, computed from the profile
    bmi = user_profile['current_weight'] / (user_profile['height'] ** 2)

    # Encode the goal with the SAME encoder used at training time. A hand-written
    # map (lose_weight=0, gain_muscle=1, ...) disagrees with LabelEncoder's
    # alphabetical codes and silently feeds the model the wrong goal.
    goal = user_profile['goal']
    if goal not in goal_enc.classes_:
        raise KeyError(goal)
    goal_code = goal_enc.transform([goal])[0]

    row = pd.DataFrame([[
        bmi, user_profile['fat_percentage'], user_profile['age'],
        user_profile['avg_duration'], user_profile['avg_calories'],
        goal_code,
        0 if user_profile['gender'] == 'Male' else 1  # same 0/1 coding as training
    ]], columns=['BMI', 'Fat_Percentage', 'Age', 'Session_Duration (hours)',
                 'Calories_Burned', 'goal', 'Gender'])

    pred = model.predict(row)[0]
    return enc.inverse_transform([pred])[0]

# Test examples
print("ðŸ”¥ Testing API functions:")
# The calorie model was trained on eleven named columns, so the request must
# supply all of them. They are listed here in training order.
print("Calories:", predict_calories({
    'Age': 30, 'Weight (kg)': 75, 'Height (m)': 1.75,
    'Avg_BPM': 140, 'Resting_BPM': 60, 'Session_Duration (hours)': 1.5,
    'Fat_Percentage': 20, 'Workout_Frequency (days/week)': 4,
    'Experience_Level': 2, 'BMI': 24.5, 'Workout_Type': 'HIIT'
}))

print("Workout:", recommend_workout_ml({
    'current_weight': 75, 'height': 1.75, 'age': 30, 'fat_percentage': 20,
    'avg_duration': 1.0, 'avg_calories': 500, 'goal': 'lose_weight', 'gender': 'Male'
}))

In [None]:
## Discussion & Perspectives

### Model Limitations
1. **Synthetic Data**: Models trained on synthetic data may not generalize to real gym data
2. **Feature Engineering**: BMI-based goal inference is simplistic; real goals require user input
3. **Temporal Dynamics**: No actual time-series modeling; weight prediction uses simplified physics
4. **Class Imbalance**: Workout types likely not equally represented (need to check `value_counts()`)

### Improvements & Future Work
1. **Deep Learning**: LSTM for time-series weight prediction
2. **Feature Engineering**: Add sleep, nutrition, stress metrics
3. **Ensemble Methods**: Combine rule-based + ML recommendations
4. **A/B Testing**: Deploy both models and track user engagement
5. **Explainable AI**: SHAP values for recommendation transparency
6. **Real Data Collection**: Partner with gyms for authentic datasets

### Ethical Considerations
- Model bias: Ensure recommendations work for all body types and ages
- Overtraining risk: Add safety caps on intensity
- Privacy: Local model deployment option

In [None]:
# Generate LaTeX tables for your report
print("ðŸ“„ LaTeX Table for Report:")
print(results_df.to_latex(index=False, caption="Model Performance Comparison", label="tab:model_comparison"))

# Feature importance for best model (only tree-based regressors expose it)
best_model = joblib.load('best_calories_model.pkl')
best_regressor = best_model.named_steps['regressor']
if hasattr(best_regressor, 'feature_importances_'):
    # The regressor sees the preprocessor's OUTPUT columns (one column per
    # workout type plus the numeric features), so the names must come from the
    # fitted preprocessor. X_train_cal.columns has a different length.
    transformed_names = best_model.named_steps['preprocessor'].get_feature_names_out()
    importance = pd.DataFrame({
        'feature': transformed_names,
        'importance': best_regressor.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nðŸ“Š Feature Importance Table:")
    print(importance.head().to_latex(index=False, caption="Top Features", label="tab:features"))

In [None]:
# Create deployment package: dump the three chosen models and a minimal Flask
# app into ./deployment.
import os

os.makedirs('deployment', exist_ok=True)

# Target path -> object to pickle. The recommendation bundle carries both encoders.
deployment_artifacts = {
    'deployment/calories_model.pkl': best_model,
    'deployment/weight_model.pkl': best_weight_model,
    'deployment/recommendation_model.pkl': {
        'model': best_rec_model,
        'encoders': {'workout': le_workout, 'goal': le_goal}
    },
}
for artifact_path, artifact in deployment_artifacts.items():
    joblib.dump(artifact, artifact_path)

# Source of the simple API file (like your app.py but simpler). It loads the
# pickles by bare filename and serves on port 5000.
api_source = '''
from flask import Flask, request, jsonify
import joblib
import pandas as pd

app = Flask(__name__)

calories_model = joblib.load('calories_model.pkl')
weight_model = joblib.load('weight_model.pkl')
rec_data = joblib.load('recommendation_model.pkl')

@app.route('/predict', methods=['POST'])
def predict_calories():
    data = request.json
    df = pd.DataFrame([data])
    return jsonify({'prediction': calories_model.predict(df)[0]})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
'''
with open('deployment/api.py', 'w') as api_file:
    api_file.write(api_source)

print("ðŸš€ Ready for deployment!")
print("Files created:")
for created_path in [*deployment_artifacts, 'deployment/api.py']:
    print(f"- {created_path}")