In [80]:
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd
import math
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import Lasso as Lasso_Reg
from sklearn.cross_validation import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression as LinReg
from sklearn.cross_validation import train_test_split as sk_split
%matplotlib inline

---
# Decision Tree Regressor
--- 

We will attempt to improve upon our naive Linear regression model by attempting a decision tree regressor technique.

We start by reading in the data from the filled-in predictors dataframe, as well as the desired predicted values for each of the diseases.

In [81]:
# Read in predictors
x_df = pd.read_csv('datasets/predictors_filled.csv')

# read in disease rates
diabetes_df = pd.read_csv('datasets/diabetes_df.csv',index_col = 0)
cardio_df = pd.read_csv('datasets/cardio_df.csv',index_col = 0)
cancer_df= pd.read_csv('datasets/cancer_df.csv',index_col = 0)

# Initial Regression Models

Without training/ testing, we're able to achieve an $R^2$ score of 1.0. However, this model would clearly be severley overfitted.

We will use this function to find the cross-validated $R^2$ value for a given number of folds on a certain parameter value.

In [30]:
def k_fold_r_squared(x_train, y_train, num_folds, param_val):
    n_train = x_train.shape[0]
    n = int(np.round(n_train * 1. / num_folds)) # points per fold

    # Iterate over folds
    cv_r_squared = 0
    
    for fold in range(1, num_folds + 1):
        # Take k-1 folds for training 
        x_first_half = x_train.iloc[:n * (fold - 1), :]
        x_second_half = x_train.iloc[n * fold + 1:, :]
        x_train_cv = np.concatenate((x_first_half, x_second_half), axis=0)
        
        y_first_half = y_train.iloc[:n * (fold - 1)]
        y_second_half = y_train.iloc[n * fold + 1:]
        y_train_cv = np.concatenate((y_first_half, y_second_half), axis=0)
        
        # Take the middle fold for testing
        x_test_cv = x_train.iloc[1 + n * (fold - 1):n * fold, :]
        y_test_cv = y_train.iloc[1 + n * (fold - 1):n * fold]

        # Fit Decision Tree model with parameter value on CV train set, and evaluate CV test performance
        reg = DecisionTreeRegressor(max_depth=param_val)
        reg.fit(x_train_cv, y_train_cv)
        r_squared = reg.score(x_test_cv, y_test_cv)
    
        # Cummulative R^2 value across folds
        cv_r_squared += r_squared

    # Return average R^2 value across folds
    return cv_r_squared * 1.0 / num_folds

In [79]:
# Fit regression model
for depth in [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70, 100]:
    reg = DecisionTreeRegressor(max_depth=depth)
    reg.fit(x_df, cardio_df)
    print depth, reg.score(x_df, cardio_df)

2 0.612719201546
3 0.754302949283
4 0.871314547554
5 0.940479469018
6 0.974736803109
7 0.991617733605
8 0.996587978435
9 0.999219309243
10 0.999922435105
20 1.0
50 1.0
70 1.0
100 1.0


In [78]:
for depth in [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70, 100]:
    print k_fold_r_squared(x_df,diabetes_df,5, depth)

0.157263311113
0.0383906647747
-0.00942628271465
-0.0593289349265
-0.120257271122
-0.22705646368
-0.243135582395
-0.351739864219
-0.282642169159
-0.431793222486
-0.301257613118
-0.376315252601
-0.260990194789


In [96]:
for depth in [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20,25, 30, 40, 50, 60, 63, 65, 70, 75, 80, 85, 90, 95, 100]:
    print depth, k_fold_r_squared(x_df,cardio_df,5, depth)

2 0.440138259697
3 0.359546067951
4 0.255686157778
5 0.185733552088
6 0.167862565285
7 0.145218333196
8 0.292624004319
9 0.235725252374
10 0.2445180363
15 0.230592699691
20 0.228649424618
25 0.201039203613
30 0.168123872115
40 0.141606206668
50 0.198096945624
60 0.206140435852
63 0.154363605106
65 0.14860491201
70 0.246917280243
75 0.23520892355
80 0.148110852764
85 0.242671502156
90 0.169559997625
95 0.173961115686
100 0.115020195838


In [109]:
for depth in [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70, 100]:
    print k_fold_r_squared(x_df,cancer_df,5, depth)

-0.24979964973
-0.53598823759
-0.601677512502
-0.807935847895
-0.726970285135
-0.579468092489
-0.917212855579
-0.646920501575
-0.734774439111
-0.647075970826
-0.622443414197
-0.530582949237
-0.591049842579


In [67]:
gridsearch = GridSearchCV(DecisionTreeRegressor(), {'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70]})

In [69]:
gridsearch.fit(x_df, cancer_df)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 70]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [70]:
gridsearch.best_params_

{'max_depth': 2}

In [71]:
gridsearch.best_score_

-0.050020039633497548