In [184]:
# general
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

%matplotlib inline

# Default size (width, height), in inches, for every matplotlib figure in this notebook.
from pylab import rcParams
rcParams['figure.figsize'] = 8, 5

# first used in exercise one
import midterm as mt
from sklearn import preprocessing # for scale
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# first used in exercise two
# TODO

In [4]:
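# Uncomment to pick up edits to midterm.py without restarting the kernel.
# The module is imported above as `mt`, so that is the name to reload.
# import importlib
# importlib.reload(mt)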

In [134]:
# Load the raw Hitters data from the working directory and show its
# (rows, columns) shape as a quick check that the file was read as expected.
h = pd.read_csv('Hitters.csv')
h.shape

(322, 20)

In [135]:
h.columns

Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',
       'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',
       'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],
      dtype='object')

In [136]:
# Drop every row that has a missing value in any column.
# NOTE(review): presumably only Salary has gaps -- confirm with h.isnull().sum()
# before relying on that.
h = h.dropna()
h.shape

(263, 20)

In [137]:
# Convert the categorical columns (League, Division, NewLeague) to dummy
# variables: each one is replaced by one indicator column per level
# (e.g. League_A / League_N), so the column count grows.
h = pd.get_dummies(h)
h.shape

(263, 23)

In [138]:
# Peek at the first remaining row to eyeball the new dummy columns.
h.head(1)

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,...,PutOuts,Assists,Errors,Salary,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N
1,315,81,7,24,38,39,14,3449,835,69,...,632,43,10,475.0,0,1,0,1,0,1


In [144]:
# Split into features and the value to predict (X and y), then standardize:
# every feature column, and the Salary target as well, is scaled to mean zero
# and standard deviation one.
# NOTE(review): the scaling statistics are computed over all rows here, i.e.
# before the cross-validation split below, so each held-out fold has already
# influenced the scaling of its training data. Consider scaling inside each fold.
# NOTE(review): because y is standardized too, the MSE values reported below
# are in standardized units, not in the original Salary units.
X_scaled = preprocessing.scale(h.drop('Salary', axis=1))
y_scaled = preprocessing.scale(h['Salary']) 

(X_scaled.shape, y_scaled.shape)

((263, 22), (263,))

In [186]:
def train_and_test_single_fold(X_full, y_full, train_index, test_index, lam, alpha):
    """
    Fit on one fold's training rows and score on its held-out rows.

    The rows of X_full / y_full selected by train_index are handed to
    mt.randcoorddescent together with lam and alpha; the final coefficient
    vector is taken from its result via mt.get_final_coefs. Predictions for
    the rows selected by test_index are the plain dot product of those rows
    with the coefficients (no intercept term is added).

    Returns the mean squared error between the held-out targets and those
    predictions.
    """
    X_train, y_train = X_full[train_index], y_full[train_index]
    coef_history = mt.randcoorddescent(X_train, y_train, lam, alpha)
    fitted_coefs = mt.get_final_coefs(coef_history)

    # Score only on rows the fit never saw.
    y_held_out = y_full[test_index]
    y_predicted = X_full[test_index].dot(fitted_coefs)
    return mean_squared_error(y_held_out, y_predicted)

In [231]:
def train_and_test_for_all_folds(X_full, y_full, train_indices, test_indices, lam, alpha):
    """
    Train and test once per fold and return the mean of the per-fold MSEs.

    Parameters:
        X_full, y_full: the complete feature matrix and target vector; each
            fold selects its rows from these.
        train_indices, test_indices: equal-length sequences; element i holds
            the row indices used to train / test fold i. The number of folds
            is taken from their length.
        lam, alpha: passed through unchanged to train_and_test_single_fold.

    Returns:
        The mean of the MSE values from all folds.

    Raises:
        ValueError: if the two index sequences differ in length or are empty.
    """
    n_folds = len(train_indices)
    if n_folds != len(test_indices):
        raise ValueError(
            "train_indices and test_indices must contain the same number of folds "
            "(got %d and %d)" % (n_folds, len(test_indices)))
    if n_folds == 0:
        raise ValueError("at least one fold is required")

    # Use the data passed in as arguments (not notebook-level globals) so the
    # function evaluates whatever dataset the caller supplies.
    mses = [train_and_test_single_fold(X_full, y_full, fold_train, fold_test, lam, alpha)
            for fold_train, fold_test in zip(train_indices, test_indices)]
    return np.mean(mses)

In [241]:
# Build one (train, test) pair of row-index arrays per fold: 10 shuffled folds,
# with a fixed random_state so the same split is produced on every run.

kf = KFold(n_splits=10, shuffle=True, random_state=42)

train_indices_list, test_indices_list = [], []
for train_index, test_index in kf.split(X_scaled):
    train_indices_list += [train_index]
    test_indices_list += [test_index]

In [242]:
# Candidate regularization strengths: 10 values spaced evenly on a log10 scale
# from 10**-5 up to 10**0.1.
lambdas = np.logspace(start=-5, stop=0.1, num=10)
lambdas

array([  1.00000000e-05,   3.68694506e-05,   1.35935639e-04,
         5.01187234e-04,   1.84784980e-03,   6.81292069e-03,
         2.51188643e-02,   9.26118728e-02,   3.41454887e-01,
         1.25892541e+00])

In [244]:
# And finally, do 10-fold cross validation for each value of lambda, and
# show each lambda next to the mean of its per-fold MSEs.
# The fold indices are the train_indices_list / test_indices_list built from
# kf.split(...) above; the previously used names train_indices / test_indices
# are not defined anywhere in this notebook and fail on a fresh kernel.
# 0.9 is passed through as `alpha` (presumably the elastic-net mixing weight --
# confirm against midterm.randcoorddescent).
mses_by_lambda = [
    train_and_test_for_all_folds(X_scaled, y_scaled,
                                 train_indices_list, test_indices_list, lam, 0.9)
    for lam in lambdas
]
list(zip(lambdas, mses_by_lambda))

[(1.0000000000000001e-05, 0.56424450918337576),
 (3.6869450645195736e-05, 0.57144463480265129),
 (0.00013593563908785255, 0.55591509549347318),
 (0.00050118723362727253, 0.56362600944864671),
 (0.0018478497974222907, 0.56120114551231837),
 (0.0068129206905796083, 0.55970789734015913),
 (0.025118864315095794, 0.5746324830734949),
 (0.092611872812879373, 0.57702369609266979),
 (0.34145488738336005, 0.63596685054189406),
 (1.2589254117941673, 0.99170675337388681)]

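**TODO:** Sanity-check the cross-validation code above. At a minimum, review it line by line; ideally also confirm that each fold's train and test indices are disjoint and that the reported MSEs are reproducible from a fresh kernel.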