In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the two raw CSV files (workout programs, gym recommendations) in order.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/fitness_and_workout_dataset.csv",
        "../data/GYM.csv",
    )
)

# Cell output: the (rows, columns) shape of each frame.
(df1.shape, df2.shape)

((2598, 10), (80000, 5))

In [5]:
# Preview the first five rows of dataset 1.
df1.head(n=5)

Unnamed: 0,title,description,level,goal,equipment,program_length,time_per_workout,total_exercises,created,last_edit
0,(MASS MONSTER) High Intensity 4 Day Upper Lowe...,Build tones of muscular with this high intensi...,['Intermediate'],"['Muscle & Sculpting', 'Bodyweight Fitness']",Full Gym,12.0,90.0,384,2024-01-20 10:23:00,2025-06-29 12:39:00
1,(NOT MY PROGRAM)SHJ Jotaro,Build strength and size,"['Advanced', 'Intermediate']",['Bodybuilding'],Full Gym,8.0,60.0,224,2024-07-08 02:28:00,2025-06-18 09:15:00
2,1 PowerLift Per Day Powerbuilding 5 Day Bro Split,Based off of Andy Baker's KCS (Kingwood Streng...,"['Beginner', 'Novice', 'Intermediate']","['Athletics', 'Powerlifting', 'Powerbuilding']",Full Gym,6.0,90.0,237,2025-04-23 09:21:00,2025-06-18 11:55:00
3,10 Week Mass Building Program,This workout is designed to increase your musc...,"['Intermediate', 'Advanced']",['Powerbuilding'],Garage Gym,10.0,70.0,280,2024-09-07 03:44:00,2025-06-18 08:01:00
4,10 week deadlift focus,Increase deadlift,"['Intermediate', 'Advanced']","['Powerbuilding', 'Powerlifting', 'Bodybuildin...",Full Gym,10.0,80.0,356,2024-12-23 03:13:00,2025-06-18 12:19:00


# Dataset 1 (fitness_and_workout_dataset.csv) has more features (10 columns) that are important to the project goal. Some features, like 'created' and 'last_edit', need to be dropped. Other features need explicit units, e.g. 'program_length' and 'total_exercises'. Finally, the 'level' column needs to be more specific.

In [6]:
# Preview the first five rows of dataset 2.
df2.head(n=5)

Unnamed: 0,Gender,Goal,BMI Category,Exercise Schedule,Meal Plan
0,Female,muscle_gain,Normal weight,"Moderate cardio, Strength training, and 5000 s...",Balanced diet with moderate protein and carboh...
1,Male,fat_burn,Underweight,"Light weightlifting, Yoga, and 2000 steps walking","High-calorie, protein-rich diet: Whole milk, p..."
2,Male,muscle_gain,Normal weight,"Moderate cardio, Strength training, and 5000 s...",Balanced diet with moderate protein and carboh...
3,Male,muscle_gain,Overweight,"High-intensity interval training (HIIT), Cardi...","Low-carb, high-fiber diet: Avocado, grilled fi..."
4,Female,muscle_gain,Normal weight,"Moderate cardio, Strength training, and 5000 s...",Balanced diet with moderate protein and carboh...


# Dataset 2 has fewer yet relevant features (5 columns). However, the 'BMI Category' column needs to be more specific, or we can include other features such as body height and weight.

In [7]:
# Inspect the last five rows of dataset 1.
df1.tail(n=5)

Unnamed: 0,title,description,level,goal,equipment,program_length,time_per_workout,total_exercises,created,last_edit
2593,🎧,Lihaskasvu,"['Intermediate', 'Advanced']",['Bodybuilding'],Garage Gym,12.0,90.0,228,2024-10-10 04:20:00,2025-06-18 11:32:00
2594,👾Reza's Routine👾,This is a beginner friendly routine made for m...,"['Beginner', 'Intermediate']",['Muscle & Sculpting'],Dumbbell Only,1.0,60.0,60,2024-09-15 08:45:00,2025-06-18 07:48:00
2595,"🔥 ""Upper Body Dominance: 3-Day Growth System"" 🔥","""Upper Body Dominance: A science-based 3-day w...","['Intermediate', 'Novice']",['Muscle & Sculpting'],Full Gym,6.0,60.0,96,2025-02-15 08:18:00,2025-06-18 07:48:00
2596,🙈🙉🙊🐵,Muscle Memory Training,['Intermediate'],['Bodybuilding'],Full Gym,8.0,90.0,211,2024-12-08 01:04:00,2025-06-18 11:35:00
2597,🥷🥷🥷,To become stronger without becoming “bulky”,"['Intermediate', 'Novice']","['Bodybuilding', 'Powerbuilding']",Garage Gym,9.0,100.0,216,2025-05-15 10:44:00,2025-06-18 12:08:00


In [8]:
# Inspect the last five rows of dataset 2.
df2.tail(n=5)

Unnamed: 0,Gender,Goal,BMI Category,Exercise Schedule,Meal Plan
79995,Male,fat_burn,Normal weight,"Moderate cardio, Strength training, and 5000 s...",Balanced diet with moderate protein and carboh...
79996,Female,fat_burn,Underweight,"Light weightlifting, Yoga, and 2000 steps walking","High-calorie, protein-rich diet: Whole milk, p..."
79997,Female,muscle_gain,Obesity,"Low-impact cardio, Swimming, and 10000 steps w...","Low-calorie, nutrient-dense diet with portion ..."
79998,Male,fat_burn,Normal weight,"Moderate cardio, Strength training, and 5000 s...",Balanced diet with moderate protein and carboh...
79999,Male,fat_burn,Overweight,"High-intensity interval training (HIIT), Cardi...","Low-carb, high-fiber diet: Avocado, grilled fi..."


In [None]:
# Dataset 1 contains special characters such as emojis in the 'title' column, so we might need to remove these characters or drop the column.

In [9]:
# List the column labels of both datasets side by side.
(df1.columns, df2.columns)

(Index(['title', 'description', 'level', 'goal', 'equipment', 'program_length',
        'time_per_workout', 'total_exercises', 'created', 'last_edit'],
       dtype='object'),
 Index(['Gender', 'Goal', 'BMI Category', 'Exercise Schedule', 'Meal Plan'], dtype='object'))

In [11]:
# Print column names, non-null counts and dtypes for dataset 1.
df1.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2598 entries, 0 to 2597
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             2598 non-null   object 
 1   description       2594 non-null   object 
 2   level             2598 non-null   object 
 3   goal              2598 non-null   object 
 4   equipment         2597 non-null   object 
 5   program_length    2597 non-null   float64
 6   time_per_workout  2598 non-null   float64
 7   total_exercises   2598 non-null   int64  
 8   created           2597 non-null   object 
 9   last_edit         2596 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 203.1+ KB


In [12]:
# Print column names, non-null counts and dtypes for dataset 2.
df2.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Gender             80000 non-null  object
 1   Goal               80000 non-null  object
 2   BMI Category       80000 non-null  object
 3   Exercise Schedule  80000 non-null  object
 4   Meal Plan          80000 non-null  object
dtypes: object(5)
memory usage: 3.1+ MB


In [13]:
# Number of missing values in each column of dataset 1.
df1.isna().sum()

title               0
description         4
level               0
goal                0
equipment           1
program_length      1
time_per_workout    0
total_exercises     0
created             1
last_edit           2
dtype: int64

In [14]:
# Number of missing values in each column of dataset 2.
df2.isna().sum()

Gender               0
Goal                 0
BMI Category         0
Exercise Schedule    0
Meal Plan            0
dtype: int64

# Dataset 1 has missing values in 5 columns (description, equipment, program_length, created, last_edit), so we need to find a way to fill them, e.g. with the mean, median or mode, or any other method we choose to use. Dataset 2 has no missing values, so we can work with it as is.

In [15]:
# Show the dtype of each column in dataset 1.
df1.dtypes

title                object
description          object
level                object
goal                 object
equipment            object
program_length      float64
time_per_workout    float64
total_exercises       int64
created              object
last_edit            object
dtype: object

In [16]:
# Show the dtype of each column in dataset 2.
df2.dtypes

Gender               object
Goal                 object
BMI Category         object
Exercise Schedule    object
Meal Plan            object
dtype: object

# Dataset 1 has some float and integer columns, while Dataset 2 has only object columns. We might need to apply techniques such as one-hot encoding to the object columns of Dataset 1.

In [17]:
# Count how many columns of dataset 1 share each dtype.
df1.dtypes.value_counts()

object     7
float64    2
int64      1
Name: count, dtype: int64

In [18]:
# Count how many columns of dataset 2 share each dtype.
df2.dtypes.value_counts()

object    5
Name: count, dtype: int64