In [1]:
import pandas as pd
import numpy as np

Start by loading the data and separating out the three subsets of variables we'll be regressing: physical, fitness, and BIA

Big question (for our mentor): for ridge regression, do we actually want to split this up three ways...?

In [2]:
# Import the data, including the outcome variable (in this case, PCIAT-PCIAT_Total)
# Note that (ridge) regression can't handle missing values. Since we'll do imputation later, we'll need to drop NaNs

# Load the (split) data set train_imp.csv
train = pd.read_csv('train_cleaned.csv')

# Create a list of the 'physical' variables. Based on previous exploration, we can remove some variables and observations due to high rates of NaNs
physical_vars = [col for col in train.columns if col.startswith('Physical') and train[col].dtype in ['float64', 'int64']]
train_physical = train[physical_vars]
train_physical['PCIAT-PCIAT_Total'] = train['PCIAT-PCIAT_Total']
train_physical = train_physical.drop(columns=['Physical-Waist_Circumference'])
train_physical = train_physical.dropna()
train_physical.name = 'train_physical'

# Create a list of the 'fitness' variables. Based on previous exploration, we can remove some variables and observations due to high rates of NaNs
# Note that the Fitness_Endurance variables and the grip strength variables have too many missing values to include in the list
train['FGC_Zone_Total'] = train['FGC-FGC_CU_Zone'] + train['FGC-FGC_PU_Zone'] + train['FGC-FGC_SRL_Zone'] + train['FGC-FGC_SRR_Zone'] + train['FGC-FGC_TL_Zone']
fitness_vars = ['FGC-FGC_CU','FGC-FGC_PU','FGC-FGC_SRL','FGC-FGC_SRR','FGC-FGC_TL', 'FGC_Zone_Total', 'PCIAT-PCIAT_Total']
train_fitness = train[fitness_vars]
train_fitness = train_fitness.dropna()
train_fitness.name = 'train_fitness'

# Create a new data set from train called train_bia that includes all variables that start with BIA-BIA_
train_bia = train[[col for col in train.columns if col.startswith('BIA-BIA_')]]
train_bia = train_bia.drop(columns=['BIA-BIA_Activity_Level_num','BIA-BIA_Frame_num'])
train_bia['PCIAT-PCIAT_Total'] = train['PCIAT-PCIAT_Total']
train_bia = train_bia.dropna()
train_bia.name = 'train_bia'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_physical['PCIAT-PCIAT_Total'] = train['PCIAT-PCIAT_Total']


Next we'll do some hyperparameter tuning for the ridge regression

We'll make a Ridge regression model and compute the MSE for each of several values of alpha

We'll do this on a k-fold split of the training data (because that's always a good thing to do?)

In [3]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Set up the kfold split
num_splits = 5
kfold = KFold(num_splits, shuffle=True)

# Define a range of alpha values
alphas = [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000]
numalphas = len(alphas)

# Create an empty array with num_splits rows and numalphas columns
rmses = np.zeros((num_splits, numalphas))

# Iterate over the three data sets
listofdatasets = [train_physical, train_fitness, train_bia]

# A data frame to store the optimal alpha values
bestalphas = pd.DataFrame(index=range(0,len(listofdatasets)))
bestalphas['dfname'] = ''
bestalphas['best_alpha'] = 0

k=0
for df in listofdatasets:
    i = 0
    for train_index, test_index in kfold.split(df):
        tt_X = df.iloc[train_index].drop(columns=['PCIAT-PCIAT_Total'])
        tt_y = df.iloc[train_index]['PCIAT-PCIAT_Total']
        ho_X = df.iloc[test_index].drop(columns=['PCIAT-PCIAT_Total'])
        ho_y = df.iloc[test_index]['PCIAT-PCIAT_Total']

        # Iterate over alpha values with counter j
        j = 0
        for alpha in alphas:
            ridge_pipe = Pipeline([('scale', StandardScaler()),('ridge', Ridge(alpha=alpha, max_iter=5000000) )])
            ridge_pipe.fit(tt_X, tt_y)
            y_pred = ridge_pipe.predict(ho_X)
            rmses[i, j] = root_mean_squared_error(ho_y, y_pred)
            
            j=j+1

        i=i+1

    # Compute the mean of each column of rmses
    mean_rmses_within_alphas = np.mean(rmses, axis=0)

    # Compute the mean and standard deviation of each row of rmses
    mean_rmses = np.mean(mean_rmses_within_alphas, axis=0)
    std_rmses = np.std(mean_rmses_within_alphas, axis=0)

    # Identify the column of min_rmse that contains the minimum value
    best_alpha_index = np.argmin(mean_rmses_within_alphas)

    bestalphas.loc[k,'dfname'] = df.name
    bestalphas.loc[k,'best_alpha'] = alphas[best_alpha_index]

    print('The alpha value with the lowest RMSE for the', df.name ,'variables is', alphas[best_alpha_index],'. The mean RMSE was', mean_rmses, ' and the standard deviation was', std_rmses )
    i=0
    j=0
    k=k+1

The alpha value with the lowest RMSE for the train_physical variables is 1 . The mean RMSE was 18.651445261657724  and the standard deviation was 0.06647389746200107
The alpha value with the lowest RMSE for the train_fitness variables is 10 . The mean RMSE was 19.069065125676516  and the standard deviation was 0.12410237832274675
The alpha value with the lowest RMSE for the train_bia variables is 10 . The mean RMSE was 18.680638382918563  and the standard deviation was 0.03897038533887072


In [4]:
bestalphas

Unnamed: 0,dfname,best_alpha
0,train_physical,1
1,train_fitness,10
2,train_bia,10


Now that the hyperparameter is tuned, we'll compare the performance of the ridge regression and a PCA.

Note that in previous explorations we've identified n=3 as the "ideal" number of PCA components for each set of predictor variables

In [11]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression


for df in listofdatasets:
    # Identify the best alpha value we computed earlier
    best_alpha = bestalphas.loc[bestalphas['dfname'] == df.name, 'best_alpha'].values[0]
    
    # Instantiate some models. From previous exploration, we've been using 3 components for the PCA
    ridge_pipe = Pipeline([('scale', StandardScaler()), ('ridge', Ridge(alpha = best_alpha, max_iter=5000000))])
    pca_pipe = Pipeline([('scale', StandardScaler()), ('pca', PCA(n_components=3)), ('reg', LinearRegression())])

    # The training data
    X_train = df.iloc[train_index].drop(columns=['PCIAT-PCIAT_Total'])
    y_train =  df.iloc[train_index]['PCIAT-PCIAT_Total']

    # Fit the models to the training data
    ridge_pipe.fit(X_train, y_train)
    pca_pipe.fit(X_train, y_train)

    # Find the model predictions on the training set
    ridge_train_preds = ridge_pipe.predict(X_train)
    pca_train_preds = pca_pipe.predict(X_train)

    # Find the mse on the training set
    ridge_train_rmse = root_mean_squared_error(y_train, ridge_train_preds)
    pca_train_rmse = root_mean_squared_error(y_train, pca_train_preds)

    # Results
    print(df.name, f"Ridge Training MSE: {ridge_train_rmse}")
    print(df.name, f"PCA Training MSE: {pca_train_rmse}")

train_physical Ridge Training MSE: 18.533899980839752
train_physical PCA Training MSE: 19.235425201158005
train_fitness Ridge Training MSE: 18.695066004261484
train_fitness PCA Training MSE: 19.37497603121806
train_bia Ridge Training MSE: 18.770987125663712
train_bia PCA Training MSE: 19.057632568137585
