In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

Start by loading the data and separating out the three subsets of variables we'll be regressing: physical, fitness, and BIA

Big question (for our mentor): for ridge regression, do we actually want to split this up three ways...?

In [None]:
# Import the data, including the outcome variable (in this case, PCIAT-PCIAT_Total)
# Note that (ridge) regression can't handle missing values. Since we'll do imputation later, we'll need to drop NaNs

# Load the (split) data set train_imp.csv
train = pd.read_csv('train_cleaned.csv')

# Create a list of the 'physical' variables. Based on previous exploration, we can remove some variables and observations due to high rates of NaNs
physical_vars = [col for col in train.columns if col.startswith('Physical') and train[col].dtype in ['float64', 'int64']]
train_physical = train[physical_vars]
train_physical['PCIAT-PCIAT_Total'] = train['PCIAT-PCIAT_Total']
train_physical = train_physical.drop(columns=['Physical-Waist_Circumference'])
train_physical = train_physical.dropna()
train_physical.name = 'Physical'

# Create a list of the 'fitness' variables. Based on previous exploration, we can remove some variables and observations due to high rates of NaNs
# Note that the Fitness_Endurance variables and the grip strength variables have too many missing values to include in the list
train['FGC_Zone_Total'] = train['FGC-FGC_CU_Zone'] + train['FGC-FGC_PU_Zone'] + train['FGC-FGC_SRL_Zone'] + train['FGC-FGC_SRR_Zone'] + train['FGC-FGC_TL_Zone']
fitness_vars = ['FGC-FGC_CU','FGC-FGC_PU','FGC-FGC_SRL','FGC-FGC_SRR','FGC-FGC_TL', 'FGC_Zone_Total', 'PCIAT-PCIAT_Total']
train_fitness = train[fitness_vars]
train_fitness = train_fitness.dropna()
train_physical.name = 'Fitness'

# Create a new data set from train called train_bia that includes all variables that start with BIA-BIA_
train_bia = train[[col for col in train.columns if col.startswith('BIA-BIA_')]]
train_bia = train_bia.drop(columns=['BIA-BIA_Activity_Level_num','BIA-BIA_Frame_num'])
train_bia['PCIAT-PCIAT_Total'] = train['PCIAT-PCIAT_Total']
train_bia = train_bia.dropna()
train_physical.name = 'BIA'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_physical['PCIAT-PCIAT_Total'] = train['PCIAT-PCIAT_Total']


Next we'll do some hyperparameter tuning for the ridge regression

We'll make a Ridge regression model and compute the MSE for each of several values of alpha

We'll do this on a k-fold split of the training data (because that's always a good thing to do?)

In [31]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error

# Set up the kfold split
num_splits = 5
kfold = KFold(num_splits, shuffle=True)

# Define a range of alpha values
alphas = [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000]
numalphas = len(alphas)

# Create an empty array with num_splits rows and numalphas columns
rmses = np.zeros((num_splits, numalphas))

# A counter to loop through the splits
i = 0

# Iterate over the three data sets
listofdatasets = [train_physical, train_fitness, train_bia]

for df in listofdatasets:
    for train_index, test_index in kfold.split(df):
        tt_X = df.iloc[train_index].drop(columns=['PCIAT-PCIAT_Total'])
        tt_y = df.iloc[train_index]['PCIAT-PCIAT_Total']
        ho_X = df.iloc[test_index].drop(columns=['PCIAT-PCIAT_Total'])
        ho_y = df.iloc[test_index]['PCIAT-PCIAT_Total']

        # Iterate over alpha values with counter j
        j = 0
        for alpha in alphas:
            model = Ridge(alpha=alpha)
            model.fit(tt_X, tt_y)
            y_pred = model.predict(ho_X)
            rmses[i, j] = root_mean_squared_error(ho_y, y_pred)
            
            j=j+1

        i=i+1

    # Compute the mean of each column of rmses
    mean_rmses = np.mean(rmses, axis=0)

    # Identify the column of min_rmse that contains the minimum value
    best_alpha_index = np.argmin(mean_rmses)

    print('The alpha value with the lowest RMSE for the', df.name, 'variables is', alphas[best_alpha_index])




AttributeError: 'DataFrame' object has no attribute 'name'

In [22]:
mean_rmses

array([18.64702808, 18.64702808, 18.64702808, 18.64702803, 18.64702759,
       18.64702322, 18.64698245, 18.64684388, 18.65627351])

Now that the hyperparameter is tuned, we'll compare the performance of the ridge regression and a PCA.

Note that in previous explorations we've identified n=3 as the "ideal" number of PCA components for each set of predictor variables

In [None]:


# Instantiate some models. From previous exploration, we've been using 3 components for the PCA
ridge_pipe = Pipeline([('scale', StandardScaler()), ('ridge', Ridge())])
pca_pipe = Pipeline([('scale', StandardScaler()), ('pca', PCA(1)), ('reg', LinearRegression())])

# Fit the models to the training data
ridge_pipe.fit(X_train, y_train)
pca_pipe.fit(X_train, y_train)

# Find the model predictions on the training set
lr_train_preds = lr.predict(X_train)
ridge_train_preds = ridge_pipe.predict(X_train)
pca_train_preds = pca_pipe.predict(X_train)

# Find the model predictions on the test set
lr_test_preds = lr.predict(X_test)
ridge_test_preds = ridge_pipe.predict(X_test)
pca_test_preds = pca_pipe.predict(X_test)

# Find the mse on the training set
lr_train_mse = mse(y_train, lr_train_preds)
ridge_train_mse = mse(y_train, ridge_train_preds)
pca_train_mse = mse(y_train, pca_train_preds)

# Find the mse on the test set
lr_test_mse = mse(y_test, lr_test_preds)
ridge_test_mse = mse(y_test, ridge_test_preds)
pca_test_mse = mse(y_test, pca_test_preds)

# Results
print(f"OLS Training MSE: {lr_train_mse}")
print(f"Ridge Training MSE: {ridge_train_mse}")
print(f"PCA Training MSE: {pca_train_mse}")
print(f"OLS Test MSE: {lr_test_mse}")
print(f"Ridge Test MSE: {ridge_test_mse}")
print(f"PCA Test MSE: {pca_test_mse}")