In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report


In [13]:
df = pd.read_csv("../data/workout_fitness_tracker_data.csv")
df.head()

Unnamed: 0,User ID,Age,Gender,Height (cm),Weight (kg),Workout Type,Workout Duration (mins),Calories Burned,Heart Rate (bpm),Steps Taken,Distance (km),Workout Intensity,Sleep Hours,Water Intake (liters),Daily Calories Intake,Resting Heart Rate (bpm),VO2 Max,Body Fat (%),Mood Before Workout,Mood After Workout
0,1,39,Male,175,99,Cycling,79,384,112,8850,14.44,High,8.2,1.9,3195,61,38.4,28.5,Tired,Fatigued
1,2,36,Other,157,112,Cardio,73,612,168,2821,1.1,High,8.6,1.9,2541,73,38.4,28.5,Happy,Energized
2,3,25,Female,180,66,HIIT,27,540,133,18898,7.28,High,9.8,1.9,3362,80,38.4,28.5,Happy,Fatigued
3,4,56,Male,154,89,Cycling,39,672,118,14102,6.55,Medium,5.8,1.9,2071,65,38.4,28.5,Neutral,Neutral
4,5,53,Other,194,59,Strength,56,410,170,16518,3.17,Medium,7.3,1.9,3298,59,38.4,28.5,Stressed,Energized


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   User ID                   10000 non-null  int64  
 1   Age                       10000 non-null  int64  
 2   Gender                    10000 non-null  object 
 3   Height (cm)               10000 non-null  int64  
 4   Weight (kg)               10000 non-null  int64  
 5   Workout Type              10000 non-null  object 
 6   Workout Duration (mins)   10000 non-null  int64  
 7   Calories Burned           10000 non-null  int64  
 8   Heart Rate (bpm)          10000 non-null  int64  
 9   Steps Taken               10000 non-null  int64  
 10  Distance (km)             10000 non-null  float64
 11  Workout Intensity         10000 non-null  object 
 12  Sleep Hours               10000 non-null  float64
 13  Water Intake (liters)     10000 non-null  float64
 14  Daily C

In [15]:
df.duplicated().sum() 

0

In [16]:
# select the columns to use in the model
df = df[[
    'Age', 'Gender', 'Height (cm)', 'Weight (kg)', 
    'Workout Duration (mins)', 'Workout Intensity', 'Workout Type'
]]

In [17]:
# add workout days to the dataset randomly
np.random.seed(42)
df['Workout Days'] = np.random.choice([2, 3, 4, 5, 6], size=len(df))


In [18]:
# use BMI and Duration Per Day as features
df['Height (m)'] = df['Height (cm)'] / 100
df['BMI'] = df['Weight (kg)'] / (df['Height (m)'] ** 2)
df['Duration Per Day'] = df['Workout Duration (mins)'] / df['Workout Days']


In [19]:
# drop the original columns that are not needed anymore
df.drop(columns=['Height (cm)', 'Weight (kg)', 'Height (m)', 'Workout Duration (mins)'], inplace=True)

In [20]:
# change other in gender column to I prefer not to say
if 'other' in df['Gender'].unique():
    df['Gender'] = df['Gender'].replace('other', 'I prefer not to say')

In [21]:
df['Workout Type'] = LabelEncoder().fit_transform(df['Workout Type'])
workout_type_decoder = {0: 'Cardio', 1: 'Cycling', 2: 'HIIT', 3: 'Running', 4: 'Strength', 5: 'Yoga'}
df['Workout Type'] = df['Workout Type'].map(workout_type_decoder)

merge_map = {
    'HIIT': 'High Effort',
    'Strength': 'High Effort',
    'Cardio': 'Endurance',
    'Running': 'Endurance',
    'Cycling': 'Endurance',
    'Yoga': 'Flexibility'
}
df['Workout Category'] = df['Workout Type'].map(merge_map)


label_encoders = {}
for col in ['Gender', 'Workout Intensity', 'Workout Category']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [22]:
df.to_csv("../data/cleaned_workout_fitness_tracker_data.csv", index=False)

In [23]:
# RF model 
X = df.drop(columns=['Workout Type', 'Workout Category'])
y = df['Workout Category']
scaler_features = X.columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)


y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')
report = classification_report(y_test, y_pred, target_names=label_encoders['Workout Category'].classes_)

print("\n Accuracy:", round(accuracy * 100, 2), "%")
print(" F1 Score:", round(f1 * 100, 2), "%")
print("\n Classification Report:\n", report)

# Recommendation  
def detailed_recommendation(user_input: dict, model, encoders, scaler_features):
    input_df = pd.DataFrame([user_input])
    for col in ['Gender', 'Workout Intensity']:
        if col in encoders:
            input_df[col] = encoders[col].transform(input_df[col])
    input_df['Height (m)'] = input_df['Height (cm)'] / 100
    input_df['BMI'] = input_df['Weight (kg)'] / (input_df['Height (m)'] ** 2)
    input_df['Duration Per Day'] = input_df['Workout Duration (mins)'] / input_df['Workout Days']
    input_df.drop(columns=['Height (cm)', 'Weight (kg)', 'Height (m)', 'Workout Duration (mins)'], inplace=True)
    input_df = input_df[scaler_features]
    prediction_code = model.predict(input_df)[0]
    category = encoders['Workout Category'].inverse_transform([prediction_code])[0]

    suggestions = {
        "Endurance": ["Cardio", "Running", "Cycling"],
        "High Effort": ["HIIT", "Strength"],
        "Flexibility": ["Yoga"]
    }

    recommendation_text = f" {category}  Training is a Good Fit for You!\n"
    recommendation_text += f"Based on your data, Fit Vision recommends you focus on {category.upper()}-BASED exercises.\n"
    recommendation_text += "Here are some great starting points:\n"
    for item in suggestions.get(category, []):
        recommendation_text += f"- {item}\n"

    return recommendation_text.strip()


example_user = {
    'Age': 25,
    'Gender': 'Female',
    'Height (cm)': 165,
    'Weight (kg)': 60,
    'Workout Duration (mins)': 45,
    'Workout Intensity': 'Medium',
    'Workout Days': 3
}

print("\n", detailed_recommendation(example_user, rf_model, label_encoders, scaler_features))



 Accuracy: 45.65 %
 F1 Score: 32.98 %

 Classification Report:
               precision    recall  f1-score   support

   Endurance       0.49      0.72      0.58       984
 Flexibility       0.29      0.06      0.09       320
 High Effort       0.38      0.26      0.31       696

    accuracy                           0.46      2000
   macro avg       0.38      0.35      0.33      2000
weighted avg       0.42      0.46      0.41      2000


 Endurance  Training is a Good Fit for You!
Based on your data, Fit Vision recommends you focus on ENDURANCE-BASED exercises.
Here are some great starting points:
- Cardio
- Running
- Cycling
