In [None]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

Start by loading the data and separating out the three subsets of predictor variables we'll be regressing on: physical, fitness, and BIA.

Big question (for our mentor): for ridge regression, do we actually want to split this up three ways...?

In [None]:
# Load the (split) training data from train_imp.csv.
# (Ridge) regression can't be fit on missing values and imputation is done later,
# so each predictor subset built below drops the rows that contain NaNs.
train = pd.read_csv('train_imp.csv')

# 'Physical' predictors: float64/int64 columns whose name starts with 'Physical'.
# Based on previous exploration, waist circumference is removed (high NaN rate)
# before the incomplete rows are dropped.
physical_vars = [
    name
    for name in train.columns
    if name.startswith('Physical') and train[name].dtype in ['float64', 'int64']
]
train_physical = (
    train[physical_vars]
    .drop(columns=['Physical-Waist_Circumference'])
    .dropna()
)

# 'Fitness' predictors: the five FGC raw measures plus the total of their zone columns.
# The Fitness_Endurance and grip strength variables are left out because they have
# too many missing values.
train['FGC_Zone_Total'] = (
    train['FGC-FGC_CU_Zone']
    + train['FGC-FGC_PU_Zone']
    + train['FGC-FGC_SRL_Zone']
    + train['FGC-FGC_SRR_Zone']
    + train['FGC-FGC_TL_Zone']
)
fitness_vars = [
    'FGC-FGC_CU',
    'FGC-FGC_PU',
    'FGC-FGC_SRL',
    'FGC-FGC_SRR',
    'FGC-FGC_TL',
    'FGC_Zone_Total',
]
train_fitness = train[fitness_vars].dropna()

# 'BIA' predictors: every column starting with 'BIA-BIA_' except the activity-level
# and frame columns, again keeping only complete rows.
train_bia = (
    train[[name for name in train.columns if name.startswith('BIA-BIA_')]]
    .drop(columns=['BIA-BIA_Activity_Level_num', 'BIA-BIA_Frame_num'])
    .dropna()
)


Next we'll do some hyperparameter tuning for the ridge regression.

Now that the hyperparameter is tuned, we'll compare the performance of ordinary least squares (OLS), ridge regression, and principal component regression (PCA followed by a linear regression).

Note that in previous explorations we've identified n=3 as the "ideal" number of PCA components for each set of predictor variables

In [None]:


# Compare three regressors on the same train/test split: plain OLS, ridge
# regression, and principal component regression (PCA followed by OLS).
#
# NOTE(review): X_train, X_test, y_train and y_test are not created in this cell;
# it assumes an earlier cell produced them (presumably with train_test_split) -- confirm.
# NOTE(review): Ridge() is built with its default alpha; if hyperparameter tuning
# selected a different value, pass it explicitly as Ridge(alpha=...).

# From previous exploration, we've been using 3 components for the PCA
N_PCA_COMPONENTS = 3

# Instantiate the models. The OLS baseline is created here so that the cell does
# not depend on an `lr` object left over from another cell.
lr = LinearRegression()
ridge_pipe = Pipeline([('scale', StandardScaler()), ('ridge', Ridge())])
pca_pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=N_PCA_COMPONENTS)),
    ('reg', LinearRegression()),
])

# Fit the models to the training data
lr.fit(X_train, y_train)
ridge_pipe.fit(X_train, y_train)
pca_pipe.fit(X_train, y_train)

# Find the model predictions on the training set
lr_train_preds = lr.predict(X_train)
ridge_train_preds = ridge_pipe.predict(X_train)
pca_train_preds = pca_pipe.predict(X_train)

# Find the model predictions on the test set
lr_test_preds = lr.predict(X_test)
ridge_test_preds = ridge_pipe.predict(X_test)
pca_test_preds = pca_pipe.predict(X_test)

# Find the mse on the training set
lr_train_mse = mse(y_train, lr_train_preds)
ridge_train_mse = mse(y_train, ridge_train_preds)
pca_train_mse = mse(y_train, pca_train_preds)

# Find the mse on the test set
lr_test_mse = mse(y_test, lr_test_preds)
ridge_test_mse = mse(y_test, ridge_test_preds)
pca_test_mse = mse(y_test, pca_test_preds)

# Results
print(f"OLS Training MSE: {lr_train_mse}")
print(f"Ridge Training MSE: {ridge_train_mse}")
print(f"PCA Training MSE: {pca_train_mse}")
print(f"OLS Test MSE: {lr_test_mse}")
print(f"Ridge Test MSE: {ridge_test_mse}")
print(f"PCA Test MSE: {pca_test_mse}")