In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
import time
import itertools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor as GBR
import sklearn.model_selection as skm

In [21]:
# Load the dataset
df = pd.read_csv('/Users/Vicki/Library/CloudStorage/OneDrive-Emory/24-25/Spring 2025/QTM 347/calories.csv')
df['Gender'] = df['Gender'].map({'male': 0, 'female': 1})
df.drop(['User_ID'], axis=1, inplace=True)
df

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,68,190.0,94.0,29.0,105.0,40.8,231.0
1,1,20,166.0,60.0,14.0,94.0,40.3,66.0
2,0,69,179.0,79.0,5.0,88.0,38.7,26.0
3,1,34,179.0,71.0,13.0,100.0,40.5,71.0
4,1,27,154.0,58.0,10.0,81.0,39.8,35.0
...,...,...,...,...,...,...,...,...
14995,1,20,193.0,86.0,11.0,92.0,40.4,45.0
14996,1,27,165.0,65.0,6.0,85.0,39.2,23.0
14997,1,43,159.0,58.0,16.0,90.0,40.1,75.0
14998,0,78,193.0,97.0,2.0,84.0,38.3,11.0


In [22]:
# Split df into training and test set
X = df.drop(['Calories'], axis=1)
y = df['Calories']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Perform bagging
bagging_model = RandomForestRegressor(n_estimators=500, random_state=42, oob_score=True)
bagging_model.fit(X_train, y_train)

# Predicted y-value and test MSE
y_pred_bagging = bagging_model.predict(X_test)
test_mse_bagging = mean_squared_error(y_test, y_pred_bagging)
print(f"Test MSE for Bagging: {test_mse_bagging}")

# Feature Importance
importances = bagging_model.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("\nFeature Importances:")
feature_importances

Test MSE for Bagging: 6.916138568

Feature Importances:


Duration      0.914048
Heart_Rate    0.048105
Age           0.026350
Gender        0.006776
Weight        0.002985
Height        0.001248
Body_Temp     0.000489
dtype: float64

In [24]:
# Bagging accuracy
y_train_pred = bagging_model.predict(X_train)
y_pred_bagging = bagging_model.predict(X_test)

# MSE
train_mse_bag = mean_squared_error(y_train, y_train_pred)
test_mse_bag = mean_squared_error(y_test, y_pred_bagging)

# MAE
train_mae_bag = mean_absolute_error(y_train, y_train_pred)
test_mae_bag = mean_absolute_error(y_test, y_pred_bagging)

# R^2
train_r2_bag = r2_score(y_train, y_train_pred)
test_r2_bag = r2_score(y_test, y_pred_bagging)

# Print results
print(f"Training MSE: {train_mse_bag}")
print(f"Test MSE: {test_mse_bag}")
print(f"Training MAE: {train_mae_bag}")
print(f"Test MAE: {test_mae_bag}")
print(f"Training R-squared: {train_r2_bag}")
print(f"Test R-squared: {test_r2_bag}")

Training MSE: 1.0929336293333332
Test MSE: 6.916138568
Training MAE: 0.637364
Test MAE: 1.6695346666666664
Training R-squared: 0.9997172838314758
Test R-squared: 0.9982862993276923


In [25]:
# Perform bagging with CV
bagging_model1 = RandomForestRegressor(random_state=42, oob_score=True)
param_grid = {
    'n_estimators': [500, 600, 700, 800, 900, 1000],
}
grid_search = GridSearchCV(estimator=bagging_model1, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_) # CV does not improve model, baseline of 500 estimators is better

# Fit model with the best hyperparameters
bagging_model1 = RandomForestRegressor(**grid_search.best_params_, max_features=X_train.shape[1], random_state=42, oob_score=True)
bagging_model1.fit(X_train, y_train)

# Bagging accuracy
y_pred_bagging1 = bagging_model1.predict(X_test)
y_train_pred1 = bagging_model1.predict(X_train)

# MSE
train_mse_bagging1 = mean_squared_error(y_train, y_train_pred1)
print(f"Training MSE: {train_mse_bagging1}")
test_mse_bagging1 = mean_squared_error(y_test, y_pred_bagging1)
print(f"Test MSE for Bagging: {test_mse_bagging1}")

# MAE
train_mse_bagging1 = mean_absolute_error(y_train, y_train_pred1)
print(f"Training MSE: {train_mse_bagging1}")
test_mae_bagging1 = mean_absolute_error(y_test, y_pred_bagging1)
print(f"Test MAE: {test_mae_bagging1}")

# R^2
train_mse_bagging1 = r2_score(y_train, y_train_pred1)
print(f"Training MSE: {train_mse_bagging1}")
test_r2_bagging1 = r2_score(y_test, y_pred_bagging1)
print(f"Test R-squared: {test_r2_bagging1}")

# Feature Importance
importances = bagging_model1.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("\nFeature Importances:")
feature_importances

Best hyperparameters: {'n_estimators': 600}
Training MSE: 1.0931519418981481
Test MSE for Bagging: 6.923419925000002
Training MSE: 0.63684375
Test MAE: 1.6670194444444444
Training MSE: 0.9997172273591611
Test R-squared: 0.9982844951321482

Feature Importances:


Duration      0.914043
Heart_Rate    0.048128
Age           0.026348
Gender        0.006770
Weight        0.002975
Height        0.001248
Body_Temp     0.000489
dtype: float64

In [26]:
from xgboost import XGBRegressor

In [27]:
# Perform boosting
boost_model = XGBRegressor(n_estimators=500, random_state=42)
boost_model.fit(X_train,y_train)

In [28]:
# Boosting accuracy
y_train_pred_boost = boost_model.predict(X_train)
y_pred_boost = boost_model.predict(X_test)

# MSE
train_mse_boost = mean_squared_error(y_train, y_train_pred)
test_mse_boost = mean_squared_error(y_test, y_pred_boost)

# MAE
train_mae_boost = mean_absolute_error(y_train, y_train_pred)
test_mae_boost = mean_absolute_error(y_test, y_pred_boost)

# R^2
train_r2_boost = r2_score(y_train, y_train_pred)
test_r2_boost = r2_score(y_test, y_pred_boost)

print(f"Training MSE: {train_mse_boost}")
print(f"Test MSE: {test_mse_boost}")
print(f"Training MAE: {train_mae_boost}")
print(f"Test MAE: {test_mae_boost}")
print(f"Training R-squared: {train_r2_boost}")
print(f"Test R-squared: {test_r2_boost}")

Training MSE: 1.0929336293333332
Test MSE: 3.4537828599802753
Training MAE: 0.637364
Test MAE: 1.253790535847346
Training R-squared: 0.9997172838314758
Test R-squared: 0.9991442117662971


In [29]:
# Boosting with CV
boost_model1 = XGBRegressor()
param_grid = {
    'n_estimators': [500, 600, 700, 800, 900, 1000], 
    'max_depth': [1, 2, 3, 4, 5, 6, 7]
}
grid_search = GridSearchCV(estimator=boost_model1, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_)

# Fit model with the best hyperparameters
boost_model1 = XGBRegressor(**grid_search.best_params_, random_state=42, oob_score=True)
boost_model1.fit(X_train, y_train)

# Boosting accuracy: MSE, MAE, and R^2
y_pred_boost1 = boost_model1.predict(X_test)
test_mse_boost1 = mean_squared_error(y_test, y_pred_boost1)
print(f"Test MSE for Boosting: {test_mse_boost1}") # MSE decreases significantly with CV, continues decreasing with more estimators

test_mae_boost1 = mean_absolute_error(y_test, y_pred_boost1)
print(f"Test MAE: {test_mae_boost1}")

test_r2_boost1 = r2_score(y_test, y_pred_boost1)
print(f"Test R-squared: {test_r2_boost1}")

# Feature Importance
importances = boost_model1.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("\nFeature Importances:")
feature_importances

Best hyperparameters: {'max_depth': 3, 'n_estimators': 1000}
Test MSE for Boosting: 1.3425181402585544
Test MAE: 0.8544543500145276
Test R-squared: 0.9996673470005082

Feature Importances:


Duration      0.830674
Heart_Rate    0.113266
Age           0.027884
Gender        0.019615
Weight        0.008056
Body_Temp     0.000316
Height        0.000190
dtype: float32

In [30]:
# Boosting feature importance
importances = boost_model.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("\nFeature Importances:")
feature_importances


Feature Importances:


Duration      0.891348
Heart_Rate    0.048222
Gender        0.037311
Age           0.018500
Weight        0.004270
Height        0.000192
Body_Temp     0.000157
dtype: float32

In [31]:
# Perform random forest with CV
rforest_model = RandomForestRegressor(random_state=42)
param_grid = {
    'max_features': [4, 5, 6],
    'n_estimators': [1100, 1150, 1200, 1250, 1300],
}
grid_search = GridSearchCV(estimator=rforest_model, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_)

# Fit model with the best hyperparameters
rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42, oob_score=True)
rf_model.fit(X_train, y_train)

# Random forest accuracy: MSE, MAE, and R^2
y_pred_rf = rf_model.predict(X_test)
test_mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Test MSE for Random Forest: {test_mse_rf}")

test_mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"Test MAE: {test_mae_rf}")

test_r2_rf = r2_score(y_test, y_pred_rf)
print(f"Test R-squared: {test_r2_rf}")

# Feature Importance
importances = rf_model.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("\nFeature Importances:")
feature_importances


Best hyperparameters: {'max_features': 6, 'n_estimators': 1200}
Test MSE for Random Forest: 6.54698112199074
Test MAE: 1.6144886111111112
Test R-squared: 0.9983777702196059

Feature Importances:


Duration      0.802452
Heart_Rate    0.145167
Age           0.026435
Body_Temp     0.014741
Gender        0.006219
Weight        0.003572
Height        0.001414
dtype: float64

In [32]:
# Load new nonimputed dataset, missing Body_Temp variable
df2 = pd.read_csv('/Users/Vicki/Library/CloudStorage/OneDrive-Emory/24-25/Spring 2025/QTM 347/gym_nonimputed.csv')
df2

Unnamed: 0,Age,Gender,Weight,Height,Heart_Rate,Duration,Calories
0,56,0,88.3,171,157,101.4,1313
1,46,1,74.9,153,151,78.0,883
2,32,1,68.1,166,122,66.6,677
3,25,0,53.2,170,164,35.4,532
4,38,0,46.1,179,158,38.4,556
...,...,...,...,...,...,...,...
968,24,0,87.1,174,158,94.2,1364
969,25,0,66.6,161,166,82.8,1260
970,59,1,60.4,176,120,103.2,929
971,32,0,126.4,183,146,66.0,883


In [33]:
# Define X and y of nonimputed data
X2 = df2.drop(['Calories'], axis=1)
y2 = df2['Calories']
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Run random forest on nonimputed dataset
rforest_model = RandomForestRegressor(random_state=42)
param_grid = {
    'max_features': [1,2,3,4,5],
    'n_estimators': [1000, 1100, 1200, 1300, 1400],
}
grid_search = GridSearchCV(estimator=rforest_model, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X2_train, y2_train)
print("Best hyperparameters:", grid_search.best_params_)

# Fit model with the best hyperparameters
rf_model2 = RandomForestRegressor(**grid_search.best_params_, random_state=42, oob_score=True)
rf_model2.fit(X2_train, y2_train)

# Random forest accuracy: MSE, MAE, and R^2
y2_pred_rf = rf_model2.predict(X2_test)
test_mse_rf2 = mean_squared_error(y2_test, y2_pred_rf)
print(f"Test MSE for Random Forest: {test_mse_rf2}") # High MSE compared to first dataset, calories is difficult to predict

test_mae_rf2 = mean_absolute_error(y2_test, y2_pred_rf)
print(f"Test MAE: {test_mae_rf2}")

test_r2_rf2 = r2_score(y2_test, y2_pred_rf)
print(f"Test R-squared: {test_r2_rf2}")

# Feature Importance
importances = rf_model2.feature_importances_
feature_importances = pd.Series(importances, index=X2_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("\nFeature Importances:")
feature_importances

Best hyperparameters: {'max_features': 4, 'n_estimators': 1300}
Test MSE for Random Forest: 1702.4371065179787
Test MAE: 31.123783037475345
Test R-squared: 0.9795932084714785

Feature Importances:


Duration      0.803100
Heart_Rate    0.111502
Weight        0.036471
Age           0.026146
Gender        0.011652
Height        0.011130
dtype: float64

In [34]:
# Load dataset with imputed Body_Temp values, can now test with original dataset
imputed = pd.read_csv('/Users/Vicki/Library/CloudStorage/OneDrive-Emory/24-25/Spring 2025/QTM 347/gym_imputed.csv')
imputed

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,56,171,88.3,101.4,157,40.90,1313
1,1,46,153,74.9,78.0,151,40.90,883
2,1,32,166,68.1,66.6,122,40.80,677
3,0,25,170,53.2,35.4,164,40.70,532
4,0,38,179,46.1,38.4,158,40.75,556
...,...,...,...,...,...,...,...,...
968,0,24,174,87.1,94.2,158,40.90,1364
969,0,25,161,66.6,82.8,166,40.70,1260
970,1,59,176,60.4,103.2,120,40.90,929
971,0,32,183,126.4,66.0,146,40.70,883


In [35]:
# Use imputed data as a test set for the model built on df
# Define X and y to be used as test set
X3 = imputed.drop(['Calories'], axis=1)
y3 = imputed['Calories']

# Random forest accuracy: MSE, MAE, and R^2
y_pred_rf = rf_model.predict(X3) # rf_model as defined with original df
test_mse = mean_squared_error(y3, y_pred_rf)
print(f"Test MSE for Random Forest: {test_mse}") # MSE very high, original model does not generalize well to this data

test_mae = mean_absolute_error(y3, y_pred_rf)
print(f"Test MAE: {test_mae}")

test_r2 = r2_score(y3, y_pred_rf)
print(f"Test R-squared: {test_r2}")

Test MSE for Random Forest: 509937.8940365843
Test MAE: 660.5586921891057
Test R-squared: -5.867202974439566


In [36]:
# Define X and y of nonimputed data
X2 = df2.drop(['Calories'], axis=1)
y2 = df2['Calories']
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Run boosting on nonimputed dataset
boosting_model = XGBRegressor(n_estimators=500)
boosting_model.fit(X2_train,y2_train)

# Boosting accuracy
y_pred_boosting = boosting_model.predict(X2_test)

# MSE
test_mse_boosting = mean_squared_error(y2_test, y_pred_boosting) # High MSE compared to first dataset, calories is difficult to predict

# MAE
test_mae_boosting = mean_absolute_error(y2_test, y_pred_boosting)

# R^2
test_r2_boosting = r2_score(y2_test, y_pred_boosting)


print(f"Test MSE: {test_mse_boosting}")
print(f"Test MAE: {test_mae_boosting}")
print(f"Test R-squared: {test_r2_boosting}")

Test MSE: 963.7697959407591
Test MAE: 22.89215870392628
Test R-squared: 0.9884474737821739


In [37]:
# Use imputed as a test set for the model built on df
# Boosting accuracy
y_pred_boost = boost_model.predict(X3) # y_pred_boost as defined with original df

#MSE
test_mse_boost = mean_squared_error(y3, y_pred_boost) # MSE very high, original model does not generalize well to this data

#MAE
test_mae_boost = mean_absolute_error(y3, y_pred_boost)

# R^2
test_r2_boost = r2_score(y3, y_pred_boost)


print(f"Test MSE: {test_mse_boost}")
print(f"Test MAE: {test_mae_boost}")
print(f"Test R-squared: {test_r2_boost}")

Test MSE: 500587.26377882814
Test MAE: 652.928061098136
Test R-squared: -5.741280432361646
