## Test a random forest model on full dataset

In [1]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.

import pandas as pd
# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default')

# This is necessary to show lots of columns in pandas 0.12. 
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)
pd.set_option('display.max_rows', 20)

import numpy as np
import math
# The usual preamble
#%matplotlib inline00
#%pylab inline
import matplotlib.pyplot as plt
plt.rcParams['axes.color_cycle'] = ['r', 'g', 'b', 'c']
plt.rcParams['lines.color'] = 'r'
plt.rcParams['figure.figsize'] = (15, 5)

import process
from modeling import categorize



Load in dataset

In [2]:
test_data,val_data = process.get_split_add_data(frac=.4)
val_data

Unnamed: 0_level_0,urban,rural,institutionalized_all,institutionalized_adult_all,institutionalized_adult_federal_detention_all,institutionalized_adult_federal_prison_all,institutionalized_adult_state_prison_all,institutionalized_adult_local_jail_all,institutionalized_juvenile_all,institutionalized_white,institutionalized_adult_white,institutionalized_adult_federal_detention_white,institutionalized_adult_federal_prison_white,institutionalized_adult_state_prison_white,institutionalized_adult_local_jail_white,institutionalized_juvenile_white,institutionalized_black,institutionalized_adult_black,institutionalized_adult_federal_detention_black,institutionalized_adult_federal_prison_black,institutionalized_adult_state_prison_black,institutionalized_adult_local_jail_black,institutionalized_juvenile_black,population_white,population_black,total_income_estimate_all,total_income_estimate_white,total_income_estimate_black,swnauthemp,swnftemp,...,numothunm_per_capita,numplanes_per_capita,numcopters_per_capita,numboats_per_capita,nummotor_per_capita,numcarcam_per_capita,numfixcam_per_capita,nummobcam_per_capita,diversity_index,white_per_totofficers,black_per_totofficers,hispanic_per_totofficers,asian_per_totofficers,nathaw_per_totofficers,amerind_per_totofficers,multrace_per_totofficers,unkrace_per_totofficers,rpsi,black_over_white_population_disparity,black_over_white_income_disparity,black_over_white_institutionalized_disparity,black_over_white_institutionalized_adult_disparity,black_over_white_institutionalized_adult_federal_detention_disparity,black_over_white_institutionalized_adult_federal_prison_disparity,black_over_white_institutionalized_adult_state_prison_disparity,black_over_white_institutionalized_adult_local_jail_disparity,black_over_white_institutionalized_juvenile_disparity,black_officer_disparity,white_officer_disparity,black_over_white_officer_disparity
surveyid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
1222,26616,31,186,0,0,0,0,0,13,144,0,0,0,0,0,5,37,0,0,0,0,0,6,24649,323,9712,9063,35,65,65,...,0.000150,0,0.000000,0.000075,0.000038,0.000075,0.000525,0.000000,0.089423,0.953846,0.000000,0.046154,0.000000,0,0.000000,0.000000,0,3.067020,0.013104,0.003862,0.256944,1.000000,1,1.000000,1.000000,1.000000,1.200000,0.000000,1.031163,0.000000
184,110372,0,1149,219,0,0,0,0,44,861,83,0,0,0,0,23,209,82,0,0,0,0,13,64836,22146,40950,26944,7822,300,300,...,0.000136,0,0.000000,0.000000,0.000082,0.000000,0.000027,0.000027,0.339331,0.800000,0.070000,0.130000,0.000000,0,0.000000,0.000000,0,1.356672,0.341569,0.290306,0.242741,0.987952,1,1.000000,1.000000,1.000000,0.565217,0.348868,1.361861,0.256170
175,123919,0,2194,1277,0,0,0,1095,119,1027,378,0,0,0,309,46,683,512,0,0,0,436,46,36606,48114,46061,17050,17580,592,417,...,0.000169,0,0.000000,0.000024,0.000105,0.000347,0.000016,0.000000,0.545690,0.625899,0.151079,0.201439,0.021583,0,0.000000,0.000000,0,1.708121,1.314375,1.031085,0.665044,1.354497,1,1.000000,1.000000,1.411003,1.000000,0.389109,2.118801,0.183646
170,78229,2653,1904,1341,0,1339,0,0,25,1162,637,0,637,0,0,17,434,406,0,406,0,0,7,55191,5803,29034,21017,1763,152,148,...,0.000074,0,0.000000,0.000012,0.000000,0.000000,0.000000,0.000000,0.199577,0.891892,0.027027,0.074324,0.006757,0,0.000000,0.000000,0,1.210668,0.105144,0.083884,0.373494,0.637363,1,0.637363,1.000000,1.000000,0.411765,0.376702,1.307061,0.288205
174,59531,1435,427,0,0,0,0,0,82,363,0,0,0,0,0,43,53,0,0,0,0,0,28,41750,12292,23342,16691,4492,107,101,...,0.000066,0,0.000000,0.000000,0.000066,0.000197,0.000000,0.000016,0.133465,0.930693,0.029703,0.019802,0.019802,0,0.000000,0.000000,0,2.385800,0.294419,0.269127,0.146006,1.000000,1,1.000000,1.000000,1.000000,0.651163,0.147321,1.359057,0.108400
179,73240,0,790,0,0,0,0,0,59,673,0,0,0,0,0,34,59,0,0,0,0,0,14,46607,9530,27832,19961,2811,160,172,...,0.000096,0,0.000000,0.000000,0.000055,0.000000,0.000000,0.000000,0.203794,0.889535,0.058140,0.052326,0.000000,0,0.000000,0.000000,0,1.547964,0.204476,0.140825,0.087667,1.000000,1,1.000000,1.000000,1.000000,0.411765,0.446814,1.397849,0.319644
177,60499,413,688,4,0,0,0,0,24,624,1,0,0,0,0,15,45,1,0,0,0,0,8,44770,5876,24060,19356,2243,126,120,...,0.000082,0,0.000000,0.000000,0.000066,0.000821,0.000000,0.000000,0.267927,0.850000,0.041667,0.100000,0.000000,0,0.000000,0.008333,0,2.008135,0.131249,0.115881,0.072115,1.000000,1,1.000000,1.000000,1.000000,0.533333,0.431926,1.156471,0.373487
183,51382,0,291,0,0,0,0,0,8,264,0,0,0,0,0,4,24,0,0,0,0,0,4,39248,7347,20330,16244,2797,111,105,...,0.000019,0,0.000019,0.000019,0.000000,0.000195,0.000000,0.000000,0.257143,0.857143,0.085714,0.057143,0.000000,0,0.000000,0.000000,0,1.531770,0.187194,0.172187,0.090909,1.000000,1,1.000000,1.000000,1.000000,1.000000,0.599452,1.122139,0.534204
173,33799,5020,471,0,0,0,0,0,0,448,0,0,0,0,0,0,17,0,0,0,0,0,0,33341,1042,14572,13072,318,157,156,...,0.000155,0,0.000000,0.000077,0.000155,0.000000,0.000000,0.000000,0.120761,0.935897,0.064103,0.000000,0.000000,0,0.000000,0.000000,0,2.066508,0.031253,0.024327,0.037946,1.000000,1,1.000000,1.000000,1.000000,1.000000,2.388097,1.089667,2.191584
186,55564,0,458,0,0,0,0,0,0,350,0,0,0,0,0,0,75,0,0,0,0,0,0,36498,10917,20463,14420,4202,122,117,...,0.000090,0,0.000000,0.000018,0.000054,0.000000,0.000000,0.000000,0.249779,0.863248,0.068376,0.042735,0.000000,0,0.000000,0.025641,0,1.339448,0.299112,0.291401,0.214286,1.000000,1,1.000000,1.000000,1.000000,1.000000,0.348012,1.314195,0.264810


In [3]:
val_data.to_sql('val_data',process.engine,if_exists='replace')



In [4]:
val_data

Unnamed: 0_level_0,urban,rural,institutionalized_all,institutionalized_adult_all,institutionalized_adult_federal_detention_all,institutionalized_adult_federal_prison_all,institutionalized_adult_state_prison_all,institutionalized_adult_local_jail_all,institutionalized_juvenile_all,institutionalized_white,institutionalized_adult_white,institutionalized_adult_federal_detention_white,institutionalized_adult_federal_prison_white,institutionalized_adult_state_prison_white,institutionalized_adult_local_jail_white,institutionalized_juvenile_white,institutionalized_black,institutionalized_adult_black,institutionalized_adult_federal_detention_black,institutionalized_adult_federal_prison_black,institutionalized_adult_state_prison_black,institutionalized_adult_local_jail_black,institutionalized_juvenile_black,population_white,population_black,total_income_estimate_all,total_income_estimate_white,total_income_estimate_black,swnauthemp,swnftemp,...,numothunm_per_capita,numplanes_per_capita,numcopters_per_capita,numboats_per_capita,nummotor_per_capita,numcarcam_per_capita,numfixcam_per_capita,nummobcam_per_capita,diversity_index,white_per_totofficers,black_per_totofficers,hispanic_per_totofficers,asian_per_totofficers,nathaw_per_totofficers,amerind_per_totofficers,multrace_per_totofficers,unkrace_per_totofficers,rpsi,black_over_white_population_disparity,black_over_white_income_disparity,black_over_white_institutionalized_disparity,black_over_white_institutionalized_adult_disparity,black_over_white_institutionalized_adult_federal_detention_disparity,black_over_white_institutionalized_adult_federal_prison_disparity,black_over_white_institutionalized_adult_state_prison_disparity,black_over_white_institutionalized_adult_local_jail_disparity,black_over_white_institutionalized_juvenile_disparity,black_officer_disparity,white_officer_disparity,black_over_white_officer_disparity
surveyid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
1222,26616,31,186,0,0,0,0,0,13,144,0,0,0,0,0,5,37,0,0,0,0,0,6,24649,323,9712,9063,35,65,65,...,0.000150,0,0.000000,0.000075,0.000038,0.000075,0.000525,0.000000,0.089423,0.953846,0.000000,0.046154,0.000000,0,0.000000,0.000000,0,3.067020,0.013104,0.003862,0.256944,1.000000,1,1.000000,1.000000,1.000000,1.200000,0.000000,1.031163,0.000000
184,110372,0,1149,219,0,0,0,0,44,861,83,0,0,0,0,23,209,82,0,0,0,0,13,64836,22146,40950,26944,7822,300,300,...,0.000136,0,0.000000,0.000000,0.000082,0.000000,0.000027,0.000027,0.339331,0.800000,0.070000,0.130000,0.000000,0,0.000000,0.000000,0,1.356672,0.341569,0.290306,0.242741,0.987952,1,1.000000,1.000000,1.000000,0.565217,0.348868,1.361861,0.256170
175,123919,0,2194,1277,0,0,0,1095,119,1027,378,0,0,0,309,46,683,512,0,0,0,436,46,36606,48114,46061,17050,17580,592,417,...,0.000169,0,0.000000,0.000024,0.000105,0.000347,0.000016,0.000000,0.545690,0.625899,0.151079,0.201439,0.021583,0,0.000000,0.000000,0,1.708121,1.314375,1.031085,0.665044,1.354497,1,1.000000,1.000000,1.411003,1.000000,0.389109,2.118801,0.183646
170,78229,2653,1904,1341,0,1339,0,0,25,1162,637,0,637,0,0,17,434,406,0,406,0,0,7,55191,5803,29034,21017,1763,152,148,...,0.000074,0,0.000000,0.000012,0.000000,0.000000,0.000000,0.000000,0.199577,0.891892,0.027027,0.074324,0.006757,0,0.000000,0.000000,0,1.210668,0.105144,0.083884,0.373494,0.637363,1,0.637363,1.000000,1.000000,0.411765,0.376702,1.307061,0.288205
174,59531,1435,427,0,0,0,0,0,82,363,0,0,0,0,0,43,53,0,0,0,0,0,28,41750,12292,23342,16691,4492,107,101,...,0.000066,0,0.000000,0.000000,0.000066,0.000197,0.000000,0.000016,0.133465,0.930693,0.029703,0.019802,0.019802,0,0.000000,0.000000,0,2.385800,0.294419,0.269127,0.146006,1.000000,1,1.000000,1.000000,1.000000,0.651163,0.147321,1.359057,0.108400
179,73240,0,790,0,0,0,0,0,59,673,0,0,0,0,0,34,59,0,0,0,0,0,14,46607,9530,27832,19961,2811,160,172,...,0.000096,0,0.000000,0.000000,0.000055,0.000000,0.000000,0.000000,0.203794,0.889535,0.058140,0.052326,0.000000,0,0.000000,0.000000,0,1.547964,0.204476,0.140825,0.087667,1.000000,1,1.000000,1.000000,1.000000,0.411765,0.446814,1.397849,0.319644
177,60499,413,688,4,0,0,0,0,24,624,1,0,0,0,0,15,45,1,0,0,0,0,8,44770,5876,24060,19356,2243,126,120,...,0.000082,0,0.000000,0.000000,0.000066,0.000821,0.000000,0.000000,0.267927,0.850000,0.041667,0.100000,0.000000,0,0.000000,0.008333,0,2.008135,0.131249,0.115881,0.072115,1.000000,1,1.000000,1.000000,1.000000,0.533333,0.431926,1.156471,0.373487
183,51382,0,291,0,0,0,0,0,8,264,0,0,0,0,0,4,24,0,0,0,0,0,4,39248,7347,20330,16244,2797,111,105,...,0.000019,0,0.000019,0.000019,0.000000,0.000195,0.000000,0.000000,0.257143,0.857143,0.085714,0.057143,0.000000,0,0.000000,0.000000,0,1.531770,0.187194,0.172187,0.090909,1.000000,1,1.000000,1.000000,1.000000,1.000000,0.599452,1.122139,0.534204
173,33799,5020,471,0,0,0,0,0,0,448,0,0,0,0,0,0,17,0,0,0,0,0,0,33341,1042,14572,13072,318,157,156,...,0.000155,0,0.000000,0.000077,0.000155,0.000000,0.000000,0.000000,0.120761,0.935897,0.064103,0.000000,0.000000,0,0.000000,0.000000,0,2.066508,0.031253,0.024327,0.037946,1.000000,1,1.000000,1.000000,1.000000,1.000000,2.388097,1.089667,2.191584
186,55564,0,458,0,0,0,0,0,0,350,0,0,0,0,0,0,75,0,0,0,0,0,0,36498,10917,20463,14420,4202,122,117,...,0.000090,0,0.000000,0.000018,0.000054,0.000000,0.000000,0.000000,0.249779,0.863248,0.068376,0.042735,0.000000,0,0.000000,0.025641,0,1.339448,0.299112,0.291401,0.214286,1.000000,1,1.000000,1.000000,1.000000,1.000000,0.348012,1.314195,0.264810


# Build Model

In [3]:
processor = process.Processor(val_data,categorize=False)
X_val,y_val = processor.get_scaled_Xy(val_data)
X_test,y_test = processor.get_scaled_Xy(test_data)

In [6]:
len(val_data)

114

In [7]:
val_data_scaled = pd.DataFrame(np.c_[X_val,y_val],index=val_data.index,columns=val_data.drop('rpsi',1).columns.tolist()+['rpsi'])
val_data_scaled.to_sql('val_data_scaled',process.engine,if_exists='replace')#print X_val+y_val

In [8]:
plt.hist(val_data['rpsi'].tolist(),bins=40)

(array([ 15.,  15.,  22.,  14.,  12.,   4.,  10.,  10.,   2.,   2.,   3.,
          1.,   1.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,   1.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   1.]),
 array([  0.94214049,   1.21256609,   1.48299169,   1.75341729,
          2.02384289,   2.29426849,   2.56469409,   2.83511969,
          3.10554528,   3.37597088,   3.64639648,   3.91682208,
          4.18724768,   4.45767328,   4.72809888,   4.99852448,
          5.26895008,   5.53937567,   5.80980127,   6.08022687,
          6.35065247,   6.62107807,   6.89150367,   7.16192927,
          7.43235487,   7.70278047,   7.97320606,   8.24363166,
          8.51405726,   8.78448286,   9.05490846,   9.32533406,
          9.59575966,   9.86618526,  10.13661085,  10.40703645,
         10.67746205,  10.94788765,  11.21831325,  11.48873885,  11.75916445]),
 <a list of 40 Patch objects>)

In [None]:
val_data_scaled['rpsi']

In [9]:
for cat in sort(val_data_scaled['rpsi'].map(categorize).unique().tolist()):
    
    print cat, shape(val_data_scaled[val_data_scaled['rpsi'].map(categorize)==cat])[0]

NameError: name 'sort' is not defined

In [None]:
np.shape(X_val)

In [9]:
from sklearn import linear_model,cross_validation,metrics,grid_search
from sklearn.ensemble import RandomForestRegressor
n_features = np.shape(X_val)[1]
values = range(10,n_features+10,10)
n_iter = len(values)

param_grid = {'n_estimators': values}
clf = RandomForestRegressor(n_jobs=4,max_features=n_features)
clfmodel = grid_search.RandomizedSearchCV(clf,param_grid,
                                          n_iter,
                                          cv=cross_validation.LeaveOneOut(len(y_val)),
                                          verbose=1,random_state=2)
clfmodel.fit(X_val,y_val)

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   15.4s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:  1.0min
[Parallel(n_jobs=1)]: Done 449 tasks       | elapsed:  2.5min
[Parallel(n_jobs=1)]: Done 799 tasks       | elapsed:  5.4min
[Parallel(n_jobs=1)]: Done 1249 tasks       | elapsed: 10.0min
[Parallel(n_jobs=1)]: Done 1799 tasks       | elapsed: 17.3min


Fitting 114 folds for each of 19 candidates, totalling 2166 fits


[Parallel(n_jobs=1)]: Done 2166 out of 2166 | elapsed: 23.2min finished


RandomizedSearchCV(cv=sklearn.cross_validation.LeaveOneOut(n=114),
          error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=183, max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=4, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=19, n_jobs=1,
          param_distributions={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190]},
          pre_dispatch='2*n_jobs', random_state=2, refit=True,
          scoring=None, verbose=1)

In [11]:
print clfmodel.best_params_
print clfmodel.best_score_
est = clfmodel.best_estimator_

features = []
importances = []
for feature,importance in zip(val_data.drop('rpsi',1).columns.tolist(),clfmodel.best_estimator_.feature_importances_):
    #print feature,importance
    features.append(feature)
    importances.append(importance)
indices = [i[0] for i in sorted(enumerate(importances), key=lambda x:x[1])]
best_feature_names = []
for index in reversed(indices):
    print features[index],importances[index]
    if importances[index] < 0.005:
        continue
    best_feature_names.append(features[index])
print len(best_feature_names)

{'n_estimators': 10}
0.0
asian_per_totofficers 0.252615231095
numfixcam_per_capita 0.154447659502
entrymin 0.0877029698725
entrymax 0.0419575533643
numcpo 0.0371003275781
totacad 0.0325965872664
sgtmin 0.0264505869786
total_income_estimate_white_per_capita 0.0262088645622
opbudget_per_capita 0.0217077132901
swnftemp_per_capita 0.0216311067578
numfixcam 0.0188598211041
numumkcars_per_capita 0.0155390709596
totinsrv 0.0147338563416
civftemp 0.0144294065253
rural 0.0118500611127
institutionalized_juvenile_all_per_capita 0.0112561092012
numpatr_per_capita 0.0110980704421
numrespoff_per_capita 0.00977742641728
pterroff_per_capita 0.00888565784427
numcarcam 0.00844889797677
institutionalized_juvenile_white 0.00838190314798
civftemp_per_capita 0.00777995752677
institutionalized_white_per_capita 0.00759299775935
black_over_white_institutionalized_juvenile_disparity 0.00723458443846
hispanic_per_totofficers 0.00717731923991
nummrkcars_per_capita 0.0068496211312
totftemp_per_capita 0.00674563360

In [24]:
clfmodel.grid_scores_

[mean: 0.00000, std: 0.00000, params: {'n_estimators': 10},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 20},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 30},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 40},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 50},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 60},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 70},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 80},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 90},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 100},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 110},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 120},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 130},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 140},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 150},
 mean: 0.00000, std: 0.00000, params: {'n_estimators': 160},
 mean: 0.00000, std: 0.00000, par

In [12]:
import pickle
pickle.dump(best_feature_names,open('best_features_names_reg_diversity_60_40_Train_Test_loo.p','wb'))


In [13]:
import pickle
best_feature_names = pickle.load(open('best_features_names_reg_diversity_60_40_Train_Test_loo.p','rb'))
print best_feature_names

['asian_per_totofficers', 'numfixcam_per_capita', 'entrymin', 'entrymax', 'numcpo', 'totacad', 'sgtmin', 'total_income_estimate_white_per_capita', 'opbudget_per_capita', 'swnftemp_per_capita', 'numfixcam', 'numumkcars_per_capita', 'totinsrv', 'civftemp', 'rural', 'institutionalized_juvenile_all_per_capita', 'numpatr_per_capita', 'numrespoff_per_capita', 'pterroff_per_capita', 'numcarcam', 'institutionalized_juvenile_white', 'civftemp_per_capita', 'institutionalized_white_per_capita', 'black_over_white_institutionalized_juvenile_disparity', 'hispanic_per_totofficers', 'nummrkcars_per_capita', 'totftemp_per_capita', 'numcpo_per_capita', 'drugforf_per_capita', 'sgtmin_per_capita', 'numothunm', 'institutionalized_juvenile_white_per_capita', 'numprocserv_per_capita']


In [14]:
import pickle
pickle.dump(est,open('randomforestclassifier_fixed_reg_diversity_60_40_Train_Test_loo.p','wb'))

In [15]:
est = pickle.load(open('randomforestclassifier_fixed_reg_diversity_60_40_Train_Test_loo.p','rb'))
print est

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=183, max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=4, oob_score=False, random_state=None,
           verbose=0, warm_start=False)


In [16]:
est.score(X_val,y_val)

0.69794857487503292

In [17]:
est.score(X_test,y_test)

0.043758705715531421

In [18]:
def categorize(rpsi):
    if rpsi >=0 and rpsi <=1.6:
        return 0
    elif rpsi < 2.2:
        return 1
    else: return 2


In [19]:
#classification
nmatch = 0
total = 0
y_predict=[]
for y in est.predict(X_test).tolist():
    y_predict.append(categorize(y))
y_actual=[]
for y in y_test.tolist():
    y_actual.append(categorize(y))

from collections import defaultdict
cattotal = defaultdict(int)
for prediction, actual in zip(y_predict,y_actual):
    ismatch = prediction==actual
    cattotal[actual]+=1
    if ismatch:
        nmatch+=1
    total+=1

print "test accuracy =",float(nmatch)/float(total)
print "total",total
for cat in cattotal:
    print cat,float(cattotal[cat])/float(total)

test accuracy = 0.623188405797
total 69
0 0.347826086957
1 0.31884057971
2 0.333333333333


In [None]:

x = est.predict(X_val)
y = y_val


In [None]:
plt.scatter(est.predict(X_val),y_val)
plt.scatter(est.predict(X_test),y_test,color='r')



plt.plot(np.linspace(0, 12, 1000),np.linspace(0, 12, 1000))

x = est.predict(X_val)
y = y_val
plt.plot(x, np.poly1d(np.polyfit(x, y, 1))(x))

x = est.predict(X_test)
y = y_test
plt.plot(x, np.poly1d(np.polyfit(x, y, 1))(x))

plt.ylim([0,12])
plt.xlim([0,12])

In [20]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_actual,y_predict) #clfmodel2.predict(X_test))
#plt.pcolor(matrix.T.T)
#plt.colorbar()
matrix

array([[12,  6,  6],
       [ 1, 14,  7],
       [ 1,  5, 17]])

In [None]:
sql_query = """
    SELECT  * FROM traffic_joined_with_features;
    """
data = add_features(get_data())

descriptors = []
descriptors = ['agency','city','state','zipcode']
test = pd.read_sql_query(sql_query,con).drop('index',axis=1)
test = test[test['surveyid'].isin(data.index.tolist())]
#test = test.merge(data,how='right',right_on='surveyid')
#data = add_features(get_data())[best_feature_names+['agency']]
#test
test = test.sort(columns='surveyid').set_index('surveyid',drop=True)
test = test[descriptors]
test
sorted_data = data.sort_index()
sorted_data = sorted_data[best_feature_names]
output = pd.concat([test,sorted_data],axis=1)
output

In [None]:
import pickle
#clf = pickle.load(open('randomforestclassifier.p','rb'))
best_feature_names = pickle.load(open('best_features_names.p','rb'))

In [None]:
plt.pcolor(np.abs(np.corrcoef((X_val.T))))
plt.colorbar()

In [None]:
plot_data

In [None]:
plot_data = process.add_features(process.get_data())
processor = process.Processor(val_data,categorize=False)
X,y = processor.get_scaled_Xy(plot_data)
Xy = np.c_[y,X]

feature_names = ['rpsi']+plot_data.drop(['rpsi'],1).columns.tolist()
correlations = np.corrcoef(Xy)[0].tolist()
indices = np.argsort(correlations).tolist()
indices.reverse()
for index in indices:
    print feature_names[index],correlations[index]