In [25]:
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd
%matplotlib inline

In [26]:
# Read in predictors
x_df = pd.read_csv('datasets/predictors_filled.csv')

# read in disease rates
diabetes_df = pd.read_csv('datasets/diabetes_df.csv',index_col = 0)
cardio_df = pd.read_csv('datasets/cardio_df.csv',index_col = 0)
cancer_df= pd.read_csv('datasets/cancer_df.csv',index_col = 0)

In [84]:
def ridge_k_fold_r_squared(x_train, y_train, num_folds, param_val):
    n_train = x_train.shape[0]
    n = int(np.round(n_train * 1. / num_folds)) # points per fold

    # Iterate over folds
    cv_r_squared = 0
    
    for fold in range(1, num_folds + 1):
        # Take k-1 folds for training 
        x_first_half = x_train.iloc[:n * (fold - 1), :]
        x_second_half = x_train.iloc[n * fold + 1:, :]
        x_train_cv = np.concatenate((x_first_half, x_second_half), axis=0)
        
        y_first_half = y_train.iloc[:n * (fold - 1)]
        y_second_half = y_train.iloc[n * fold + 1:]
        y_train_cv = np.concatenate((y_first_half, y_second_half), axis=0)
        
        # Take the middle fold for testing
        x_test_cv = x_train.iloc[1 + n * (fold - 1):n * fold, :]
        y_test_cv = y_train.iloc[1 + n * (fold - 1):n * fold]

        # Fit Decision Tree model with parameter value on CV train set, and evaluate CV test performance
        reg = Ridge(alpha = param_val, normalize=True)
        reg.fit(x_train_cv, y_train_cv)
        r_squared = reg.score(x_test_cv, y_test_cv)
    
        # Cummulative R^2 value across folds
        cv_r_squared += r_squared

    # Return average R^2 value across folds
    return cv_r_squared * 1.0 / num_folds

In [115]:
def lasso_k_fold_r_squared(x_train, y_train, num_folds, param_val):
    n_train = x_train.shape[0]
    n = int(np.round(n_train * 1. / num_folds)) # points per fold

    # Iterate over folds
    cv_r_squared = 0
    
    for fold in range(1, num_folds + 1):
        # Take k-1 folds for training 
        x_first_half = x_train.iloc[:n * (fold - 1), :]
        x_second_half = x_train.iloc[n * fold + 1:, :]
        x_train_cv = np.concatenate((x_first_half, x_second_half), axis=0)
        
        y_first_half = y_train.iloc[:n * (fold - 1)]
        y_second_half = y_train.iloc[n * fold + 1:]
        y_train_cv = np.concatenate((y_first_half, y_second_half), axis=0)
        
        # Take the middle fold for testing
        x_test_cv = x_train.iloc[1 + n * (fold - 1):n * fold, :]
        y_test_cv = y_train.iloc[1 + n * (fold - 1):n * fold]

        # Fit Decision Tree model with parameter value on CV train set, and evaluate CV test performance
        reg = Lasso(alpha = param_val, normalize=True)
        reg.fit(x_train_cv, y_train_cv)
        coefficients = reg.coef_
        #print len([i for i, item in enumerate(coefficients) if abs(item) >0])
        r_squared = reg.score(x_test_cv, y_test_cv)
    
        # Cummulative R^2 value across folds
        cv_r_squared += r_squared

    # Return average R^2 value across folds
    return cv_r_squared * 1.0 / num_folds

In [116]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
cancer_lasso_reg = Lasso(alpha = .01, normalize=True)
cancer_lasso_reg.fit(x_df, cancer_df)
print cancer_lasso_reg.score(x_df, cancer_df)

cancer_lassoCV_reg = LassoCV(alphas = [.01, .1], normalize=True)
cancer_lassoCV_reg.fit(x_df, cancer_df.values.ravel())
print cancer_lassoCV_reg.score(x_df, cancer_df)

print "LASSO"
for alpha in [.001, .01, .1, 1, 3,5,10, 11,12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 100, 200,500,1000]:
    print "cancer",alpha, lasso_k_fold_r_squared(x_df,cancer_df,5, alpha)
    
print "RIDGE"

for alpha in [.001, .01, .1, 1, 3, 5,10, 11,12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 100, 200,500,1000]:
    print "cancer",alpha, ridge_k_fold_r_squared(x_df,cancer_df,5, alpha)

0.795989014205
0.595066579537
LASSO
cancer 0.001 -8.74966599085
cancer 0.01 -2.96325751054
cancer 0.1 -0.190699091902
cancer 1 0.059000481265
cancer 3 -0.0667064097657
cancer 5 -0.0667064097657
cancer 10 -0.0667064097657
cancer 11 -0.0667064097657
cancer 12 -0.0667064097657
cancer 13 -0.0667064097657
cancer 14 -0.0667064097657
cancer 15 -0.0667064097657
cancer 16 -0.0667064097657
cancer 17 -0.0667064097657
cancer 18 -0.0667064097657
cancer 19 -0.0667064097657
cancer 20 -0.0667064097657
cancer 25 -0.0667064097657
cancer 100 -0.0667064097657
cancer 200 -0.0667064097657
cancer 500 -0.0667064097657
cancer 1000 -0.0667064097657
RIDGE
cancer 0.001 -7.5846526988
cancer 0.01 -3.04209158397
cancer 0.1 -0.743910830295
cancer 1 0.078166346437
cancer 3 0.179617238894
cancer 5 0.182894051905
cancer 10 0.157128814375
cancer 11 0.151574001586
cancer 12 0.146178164685
cancer 13 0.140966406052
cancer 14 0.13594946692
cancer 15 0.13112966807
cancer 16 0.126504384414
cancer 17 0.12206812724
cancer 18 0.1

In [28]:
from sklearn.linear_model import Ridge
cancer_ridge = Ridge(alpha = .01, normalize=True)
cancer_ridge.fit(x_df, cancer_df)
cancer_ridge.score(x_df, cancer_df)

0.8042871716620823

# Cardio

In [114]:
cardio_reg = Lasso(alpha=.01, normalize=True)
cardio_reg.fit(x_df, cardio_df)
print cardio_reg.score(x_df, cardio_df)

cardio_lassoCV_reg = LassoCV(alphas = [.001, .01, .1, 1], normalize=True)
cardio_lassoCV_reg.fit(x_df, cardio_df.values.ravel())
print cardio_lassoCV_reg.score(x_df, cardio_df)

print "LASSO"
for alpha in [.001, .01, .1, 1, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 25, 100, 1000]:
    print "cardio",alpha, lasso_k_fold_r_squared(x_df,cardio_df,5, alpha)
    
print "RIDGE"

for alpha in [.001, .01, .1, 1, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 25, 100, 1000]:
    print "cardio",alpha, ridge_k_fold_r_squared(x_df,cardio_df,5, alpha)

0.84267925017
0.577876646041
LASSO
cardio 0.001 102
101
102
99
102
-4.38550193519
cardio 0.01 92
93
91
96
96
-3.01117707597
cardio 0.1 68
68
70
66
68
-0.123533778277
cardio 1 17
20
18
20
18
0.45802637765
cardio 10 0
0
0
0
0
-0.0121108016136
cardio 11 0
0
0
0
0
-0.0121108016136
cardio 13 0
0
0
0
0
-0.0121108016136
cardio 14 0
0
0
0
0
-0.0121108016136
cardio 15 0
0
0
0
0
-0.0121108016136
cardio 16 0
0
0
0
0
-0.0121108016136
cardio 17 0
0
0
0
0
-0.0121108016136
cardio 18 0
0
0
0
0
-0.0121108016136
cardio 19 0
0
0
0
0
-0.0121108016136
cardio 20 0
0
0
0
0
-0.0121108016136
cardio 25 0
0
0
0
0
-0.0121108016136
cardio 100 0
0
0
0
0
-0.0121108016136
cardio 1000 0
0
0
0
0
-0.0121108016136
RIDGE
cardio 0.001 -3.96732782949
cardio 0.01 -2.04659444672
cardio 0.1 -0.0743876496175
cardio 1 0.420913047131
cardio 10 0.285504207166
cardio 11 0.272129846453
cardio 13 0.248595424699
cardio 14 0.238207682288
cardio 15 0.228608950052
cardio 16 0.219716026242
cardio 17 0.211456067161
cardio 18 0.203765303377

# Diabetes

In [98]:
diabetes_reg = Lasso(alpha=.001, normalize=True)
diabetes_reg.fit(x_df, diabetes_df)
print diabetes_reg.score(x_df, diabetes_df)

diabetes_lassoCV_reg = LassoCV(alphas = [.001, .01, .1, 1], normalize=True)
diabetes_lassoCV_reg.fit(x_df, diabetes_df.values.ravel())
print diabetes_lassoCV_reg.score(x_df, diabetes_df)

for alpha in [.001, .01, .1, 1, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 25, 100, 1000]:
    print "diabetes",alpha, ridge_k_fold_r_squared(x_df,diabetes_df,4, alpha)

0.823730474505
0.0
diabetes 0.001 -5.5597290689
diabetes 0.01 -2.50308003386
diabetes 0.1 -0.259201378475
diabetes 1 0.316918281716
diabetes 10 0.234159110602
diabetes 11 0.225550052854
diabetes 13 0.210095567282
diabetes 14 0.203127987416
diabetes 15 0.196598615778
diabetes 16 0.190464650311
diabetes 17 0.184688825621
diabetes 18 0.179238579694
diabetes 19 0.174085345279
diabetes 20 0.169203954677
diabetes 25 0.148175512443
diabetes 100 0.0430740924706
diabetes 1000 -0.0145201142658


In [31]:
def k_fold_r_squared(x_train, y_train, num_folds, param_val):
    n_train = x_train.shape[0]
    n = int(np.round(n_train * 1. / num_folds)) # points per fold

    # Iterate over folds
    cv_r_squared = 0
    
    for fold in range(1, num_folds + 1):
        # Take k-1 folds for training 
        x_first_half = x_train.iloc[:n * (fold - 1), :]
        x_second_half = x_train.iloc[n * fold + 1:, :]
        x_train_cv = np.concatenate((x_first_half, x_second_half), axis=0)
        
        y_first_half = y_train.iloc[:n * (fold - 1)]
        y_second_half = y_train.iloc[n * fold + 1:]
        y_train_cv = np.concatenate((y_first_half, y_second_half), axis=0)
        
        # Take the middle fold for testing
        x_test_cv = x_train.iloc[1 + n * (fold - 1):n * fold, :]
        y_test_cv = y_train.iloc[1 + n * (fold - 1):n * fold]

        # Fit Decision Tree model with parameter value on CV train set, and evaluate CV test performance
        reg = DecisionTreeRegressor(max_depth=param_val)
        reg.fit(x_train_cv, y_train_cv)
        r_squared = reg.score(x_test_cv, y_test_cv)
    
        # Cummulative R^2 value across folds
        cv_r_squared += r_squared

    # Return average R^2 value across folds
    return cv_r_squared * 1.0 / num_folds

In [64]:
# Fit regression model
for depth in [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70, 100]:
    reg = DecisionTreeRegressor(max_depth=depth)
    reg.fit(x_df, cancer_df)
    print depth, reg.score(x_df, cancer_df)

2 0.481615672246
3 0.680470026388
4 0.791007766296
5 0.872312891098
6 0.918227247708
7 0.950040031391
8 0.976641337886
9 0.993858789579
10 0.999167360437
20 1.0
50 1.0
70 1.0
100 1.0


In [79]:
# Fit regression model
for depth in [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70, 100]:
    reg = DecisionTreeRegressor(max_depth=depth)
    reg.fit(x_df, cardio_df)
    print depth, reg.score(x_df, cardio_df)

2 0.612719201546
3 0.754302949283
4 0.871314547554
5 0.940479469018
6 0.974736803109
7 0.991617733605
8 0.996587978435
9 0.999219309243
10 0.999922435105
20 1.0
50 1.0
70 1.0
100 1.0


In [78]:
for depth in [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70, 100]:
    print k_fold_r_squared(x_df,diabetes_df,5, depth)

0.157263311113
0.0383906647747
-0.00942628271465
-0.0593289349265
-0.120257271122
-0.22705646368
-0.243135582395
-0.351739864219
-0.282642169159
-0.431793222486
-0.301257613118
-0.376315252601
-0.260990194789


In [24]:
for depth in [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70, 100]:
    print k_fold_r_squared(x_df,cardio_df,10, depth)

NameError: name 'k_fold_r_squared' is not defined

In [77]:
for depth in [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70, 100]:
    print k_fold_r_squared(x_df,cancer_df,5, depth)

-0.110776258483
-0.385507423298
-0.314279272363
-0.738707373085
-0.717019374631
-0.893857314375
-0.617305657296
-0.830736099529
-0.44166106094
-0.753511958835
-0.378120502515
-0.820893373863
-0.481570507449


In [67]:
gridsearch = GridSearchCV(DecisionTreeRegressor(), {'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70]})

In [69]:
gridsearch.fit(x_df, cancer_df)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [70]:
gridsearch.best_params_

{'max_depth': 2}

In [71]:
gridsearch.best_score_

-0.050020039633497548