In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw lifestyle CSV (path is relative to the notebook's folder).
raw_csv_path = "../data/raw/life_style.csv"
df = pd.read_csv(raw_csv_path)

# Preview the top of the frame as a quick sanity check on the load.
df.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,...,Workout,BMI_calc,cal_from_macros,pct_carbs,protein_per_kg,pct_HRR,pct_maxHR,cal_balance,lean_mass_kg,expected_burn
0,34.91,Male,65.27,1.62,188.58,157.65,69.05,1.0,1080.9,Strength,...,Dumbbell flyes,24.870447,2139.59,0.500432,1.624789,0.741237,0.835985,725.1,47.777394,685.16
1,23.37,Female,56.41,1.55,179.43,131.75,73.18,1.37,1809.91,HIIT,...,Lateral raises,23.479709,1711.65,0.50085,1.514093,0.551247,0.73427,-232.91,40.809803,978.6184
2,33.2,Female,58.98,1.67,175.04,123.95,54.96,0.91,802.26,Cardio,...,Standing calf raises,21.148123,1965.92,0.50061,1.663445,0.574534,0.708124,805.74,44.63558,654.5266
3,38.69,Female,93.78,1.7,191.21,155.1,50.07,1.1,1450.79,HIIT,...,Incline dumbbell flyes,32.449827,1627.28,0.499533,0.862017,0.744155,0.81115,1206.21,63.007432,773.63
4,45.09,Male,52.42,1.88,193.58,152.88,70.84,1.08,1166.4,Strength,...,Military press,14.831372,2659.23,0.500581,2.538153,0.668405,0.789751,303.6,43.347504,711.4176


In [3]:
# list the column labels exactly as they were read from the CSV
df.columns

Index(['Age', 'Gender', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM',
       'Resting_BPM', 'Session_Duration (hours)', 'Calories_Burned',
       'Workout_Type', 'Fat_Percentage', 'Water_Intake (liters)',
       'Workout_Frequency (days/week)', 'Experience_Level', 'BMI',
       'Daily meals frequency', 'Physical exercise', 'Carbs', 'Proteins',
       'Fats', 'Calories', 'meal_name', 'meal_type', 'diet_type', 'sugar_g',
       'sodium_mg', 'cholesterol_mg', 'serving_size_g', 'cooking_method',
       'prep_time_min', 'cook_time_min', 'rating', 'is_healthy',
       'Name of Exercise', 'Sets', 'Reps', 'Benefit',
       'Burns Calories (per 30 min)', 'Target Muscle Group',
       'Equipment Needed', 'Difficulty Level', 'Body Part', 'Type of Muscle',
       'Workout', 'BMI_calc', 'cal_from_macros', 'pct_carbs', 'protein_per_kg',
       'pct_HRR', 'pct_maxHR', 'cal_balance', 'lean_mass_kg', 'expected_burn'],
      dtype='object')

In [None]:
# Normalize the column labels in two steps: spaces -> underscores, then lower case.
# Other punctuation is left as-is, e.g. 'Weight (kg)' becomes 'weight_(kg)'.
underscored_labels = df.columns.str.replace(' ', '_')
df.columns = underscored_labels.str.lower()

In [8]:
# display the column labels again to confirm the renaming
# (underscores instead of spaces, all lower case) was applied
df.columns


Index(['age', 'gender', 'weight_(kg)', 'height_(m)', 'max_bpm', 'avg_bpm',
       'resting_bpm', 'session_duration_(hours)', 'calories_burned',
       'workout_type', 'fat_percentage', 'water_intake_(liters)',
       'workout_frequency_(days/week)', 'experience_level', 'bmi',
       'daily_meals_frequency', 'physical_exercise', 'carbs', 'proteins',
       'fats', 'calories', 'meal_name', 'meal_type', 'diet_type', 'sugar_g',
       'sodium_mg', 'cholesterol_mg', 'serving_size_g', 'cooking_method',
       'prep_time_min', 'cook_time_min', 'rating', 'is_healthy',
       'name_of_exercise', 'sets', 'reps', 'benefit',
       'burns_calories_(per_30_min)', 'target_muscle_group',
       'equipment_needed', 'difficulty_level', 'body_part', 'type_of_muscle',
       'workout', 'bmi_calc', 'cal_from_macros', 'pct_carbs', 'protein_per_kg',
       'pct_hrr', 'pct_maxhr', 'cal_balance', 'lean_mass_kg', 'expected_burn'],
      dtype='object')

In [10]:
# Persist the frame with normalized column names to the processed-data folder.
# The folder is created on first run; exist_ok keeps re-runs from failing.
processed_dir = "../data/processed/"
os.makedirs(processed_dir, exist_ok=True)

# index=False: the row index is not written to the CSV.
df.to_csv("../data/processed/life_style_cleaned.csv", index=False)

In [11]:
# Hold out 20% of the rows as a test partition; the fixed random_state
# makes the split reproducible across runs.
split_parts = train_test_split(df, test_size=0.2, random_state=42)
train_set, test_set = split_parts

# Write both partitions to the interim folder (created if it does not exist yet).
os.makedirs('../data/interim', exist_ok=True)
for partition, csv_path in (
    (train_set, '../data/interim/train_set.csv'),
    (test_set, '../data/interim/test_set.csv'),
):
    partition.to_csv(csv_path, index=False)

In [13]:
# sanity-check the split: show the (rows, columns) shape of each partition
train_set.shape, test_set.shape

((16000, 53), (4000, 53))

In [14]:
# list the column labels carried by the training partition
train_set.columns

Index(['age', 'gender', 'weight_(kg)', 'height_(m)', 'max_bpm', 'avg_bpm',
       'resting_bpm', 'session_duration_(hours)', 'calories_burned',
       'workout_type', 'fat_percentage', 'water_intake_(liters)',
       'workout_frequency_(days/week)', 'experience_level', 'bmi',
       'daily_meals_frequency', 'physical_exercise', 'carbs', 'proteins',
       'fats', 'calories', 'meal_name', 'meal_type', 'diet_type', 'sugar_g',
       'sodium_mg', 'cholesterol_mg', 'serving_size_g', 'cooking_method',
       'prep_time_min', 'cook_time_min', 'rating', 'is_healthy',
       'name_of_exercise', 'sets', 'reps', 'benefit',
       'burns_calories_(per_30_min)', 'target_muscle_group',
       'equipment_needed', 'difficulty_level', 'body_part', 'type_of_muscle',
       'workout', 'bmi_calc', 'cal_from_macros', 'pct_carbs', 'protein_per_kg',
       'pct_hrr', 'pct_maxhr', 'cal_balance', 'lean_mass_kg', 'expected_burn'],
      dtype='object')

In [None]:
# Further preprocessing works on the training partition only.
# Feature names use the normalized column spelling (lower case, underscores).
attribute_list = [
    'calories',
    'weight_(kg)',
    'lean_mass_kg',
    'bmi',
    'fat_percentage',
    'protein_per_kg',
    'benefit',
    'body_part',
    'equipment_needed',
]
# NOTE(review): the list previously ended with an empty-string label (''),
# which is not a column of train_set and made the selection below raise
# KeyError. It has been dropped; if another feature was intended, add its
# column name to the list above.

# Select the chosen attributes. .copy() gives `data` its own storage so later
# in-place preprocessing cannot affect train_set or trigger pandas'
# chained-assignment warnings.
data = train_set[attribute_list].copy()