In [2]:
#Import packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
import time
import itertools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

Original Model

In [3]:
# Load the calories dataset, encode Gender numerically (male -> 0, female -> 1)
# and drop the User_ID column, all in one chain so no frame is mutated in place.
df = (
    pd.read_csv("calories.csv")
    .assign(Gender=lambda frame: frame['Gender'].map({'male': 0, 'female': 1}))
    .drop(columns=['User_ID'])
)
df

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,68,190.0,94.0,29.0,105.0,40.8,231.0
1,1,20,166.0,60.0,14.0,94.0,40.3,66.0
2,0,69,179.0,79.0,5.0,88.0,38.7,26.0
3,1,34,179.0,71.0,13.0,100.0,40.5,71.0
4,1,27,154.0,58.0,10.0,81.0,39.8,35.0
...,...,...,...,...,...,...,...,...
14995,1,20,193.0,86.0,11.0,92.0,40.4,45.0
14996,1,27,165.0,65.0,6.0,85.0,39.2,23.0
14997,1,43,159.0,58.0,16.0,90.0,40.1,75.0
14998,0,78,193.0,97.0,2.0,84.0,38.3,11.0


In [4]:
# Calories is the target; every remaining column is a feature.
# Hold out 20% of the rows as a test set.
y = df['Calories']
X = df.drop(columns=['Calories'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

In [21]:
# Fit the bagging model (a 500-tree RandomForestRegressor) on the calories training split.
#
# A grid search (GridSearchCV) over the following values was tried beforehand:
#   n_estimators:      300, 500, 1000
#   max_depth:         None, 10, 20, 30
#   min_samples_split: 2, 5, 10
#   min_samples_leaf:  1, 2, 4, 8, 10
#   max_features:      'auto', 'sqrt', 'log2'
# None of the candidates outperformed the baseline model, so the baseline
# hyperparameters below are kept.
bagging_model = RandomForestRegressor(n_estimators=500, random_state=42, oob_score=True)
bagging_model.fit(X_train, y_train)

# Test-set error
y_pred_bagging = bagging_model.predict(X_test)
test_mse_bagging = mean_squared_error(y_test, y_pred_bagging)
print(f"Test MSE for Bagging: {test_mse_bagging}")

# oob_score=True makes the fit compute an out-of-bag score (R^2 by default);
# report it instead of paying for it and never looking at it.
print(f"Out-of-bag R-squared: {bagging_model.oob_score_}")

# Feature importances, largest first
importances = bagging_model.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns).sort_values(ascending=False)
print("\nFeature Importances:")
feature_importances

Test MSE for Bagging: 6.916138568

Feature Importances:


Duration      0.914048
Heart_Rate    0.048105
Age           0.026350
Gender        0.006776
Weight        0.002985
Height        0.001248
Body_Temp     0.000489
dtype: float64

In [6]:
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import accuracy_score

In [22]:
# Full range of metrics for model performance (training split vs. test split)

# Predictions on the training data; the test-set predictions are reused from y_pred_bagging
y_train_pred = bagging_model.predict(X_train)

# Mean squared error
train_mse, test_mse = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_pred_bagging),
)

# Mean absolute error
train_mae, test_mae = (
    mean_absolute_error(y_train, y_train_pred),
    mean_absolute_error(y_test, y_pred_bagging),
)

# Coefficient of determination (R^2)
train_r2, test_r2 = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_pred_bagging),
)

# One line per metric, training value first
print("\n".join([
    f"Training MSE: {train_mse}",
    f"Test MSE: {test_mse}",
    f"Training MAE: {train_mae}",
    f"Test MAE: {test_mae}",
    f"Training R-squared: {train_r2}",
    f"Test R-squared: {test_r2}",
]))

Training MSE: 1.0929336293333332
Test MSE: 6.916138568
Training MAE: 0.637364
Test MAE: 1.6695346666666664
Training R-squared: 0.9997172838314758
Test R-squared: 0.9982862993276923


Imputed: evaluating the original model on the imputed version of the second dataset

In [23]:
# Second dataset, imputed version: read it and separate the Calories target from the features
imputed = pd.read_csv("gym_imputed.csv")
X_imp, y_imp = imputed.drop(columns=['Calories']), imputed['Calories']

In [24]:
# Evaluate the bagging model trained on the original (calories) data against the
# imputed second dataset.
# NOTE: run this before the later cell that refits `bagging_model` on the gym data;
# that cell rebinds the name, and this evaluation would then score a different model.

# Select exactly the columns the model was fitted on, in the fitted order.
# Warnings are silenced at the top of the notebook, so on scikit-learn versions that
# only warn about mismatched feature names a reordered frame could be scored without
# any visible complaint. A missing training feature raises KeyError here instead.
X_imp_aligned = X_imp[list(bagging_model.feature_names_in_)]

imp_y_pred_bagging = bagging_model.predict(X_imp_aligned)
imp_mse_bagging = mean_squared_error(y_imp, imp_y_pred_bagging)
imp_mae = mean_absolute_error(y_imp, imp_y_pred_bagging)

In [25]:
# Report the original model's error on the imputed dataset
for label, value in (
    ("Imputed Test MSE:", imp_mse_bagging),
    ("Imputed Test MAE:", imp_mae),
):
    print(label, value)

Imputed Test MSE: 506288.8212896731
Imputed Test MAE: 657.9307358684481


Not Imputed: training a new model on the non-imputed version of the second dataset

In [26]:
# Load the second dataset in its unmodified (non-imputed) form from gym_nonimputed.csv.
# The following cells split this frame and train a separate model on it.
gym = pd.read_csv("gym_nonimputed.csv")

In [27]:
# Build features/target from the gym data and make an 80/20 train/test split.
# Note: this rebinds X, y and the train/test variables previously holding the calories data.
y = gym['Calories']
X = gym.drop(columns=['Calories'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # same seed as the calories split
)

In [28]:
# Train a new bagging model (500-tree RandomForestRegressor) on the non-imputed 2nd dataset.
# Note: this rebinds `bagging_model`, `y_pred_bagging` and `test_mse_bagging`, replacing
# the model and results obtained from the calories data.
bagging_model = RandomForestRegressor(n_estimators=500, random_state=42, oob_score=True)
bagging_model.fit(X_train, y_train)

# Test-set error
y_pred_bagging = bagging_model.predict(X_test)
test_mse_bagging = mean_squared_error(y_test, y_pred_bagging)
print(f"Test MSE for Bagging: {test_mse_bagging}")

# oob_score=True makes the fit compute an out-of-bag score (R^2 by default);
# report it instead of paying for it and never looking at it.
print(f"Out-of-bag R-squared: {bagging_model.oob_score_}")

# Feature importances, largest first
importances = bagging_model.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns).sort_values(ascending=False)
print("\nFeature Importances:")
feature_importances

Test MSE for Bagging: 1753.1470931282051

Feature Importances:


Duration      0.842144
Heart_Rate    0.104846
Age           0.023423
Gender        0.014306
Weight        0.007960
Height        0.007320
dtype: float64

In [32]:
# Full model metrics for the gym-trained model (training split vs. test split)

# Predictions on the training data; the test-set predictions are reused from y_pred_bagging
y_train_pred = bagging_model.predict(X_train)

# Mean squared error
train_mse, test_mse = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_pred_bagging),
)

# Mean absolute error
train_mae, test_mae = (
    mean_absolute_error(y_train, y_train_pred),
    mean_absolute_error(y_test, y_pred_bagging),
)

# Coefficient of determination (R^2)
train_r2, test_r2 = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_pred_bagging),
)

# One line per metric, training value first
print("\n".join([
    f"Training MSE: {train_mse}",
    f"Test MSE: {test_mse}",
    f"Training MAE: {train_mae}",
    f"Test MAE: {test_mae}",
    f"Training R-squared: {train_r2}",
    f"Test R-squared: {test_r2}",
]))

Training MSE: 207.9101010025707
Test MSE: 1753.1470931282051
Training MAE: 10.569282776349615
Test MAE: 30.83189743589744
Training R-squared: 0.9971103269157254
Test R-squared: 0.9789853574553046
