## Test a regression model on full dataset

In [1]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default')

# This is necessary to show lots of columns in pandas 0.12. 
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)
pd.set_option('display.max_rows', 20)

import numpy as np
import math
# The usual preamble
%matplotlib inline
# NOTE: %pylab populates the interactive namespace from numpy and matplotlib;
# later cells rely on this for the bare names `sort` and `shape`.
%pylab inline
import matplotlib.pyplot as plt
# Default colours and figure size for every plot in the notebook
plt.rcParams['axes.color_cycle'] = ['r', 'g', 'b', 'c']
plt.rcParams['lines.color'] = 'r'
plt.rcParams['figure.figsize'] = (15, 5)

# Project-local helper module; in the visible notebook it is only referenced
# from a commented-out line inside get_data().
import munging

Populating the interactive namespace from numpy and matplotlib




Load in dataset

In [2]:
# Open the raw DB-API connection that get_data() hands to pd.read_sql_query.
# Each setting can be overridden with the standard libpq environment variables
# so credentials do not have to be edited into (and saved with) the notebook.
import os

dbname = os.environ.get('PGDATABASE', 'combined_profiling')
username = os.environ.get('PGUSER', 'along528')
# FIXME: the literal fallback only keeps the notebook runnable exactly as before;
# a real password must come from the environment, never from this file.
pswd = os.environ.get('PGPASSWORD', 'password')
con = psycopg2.connect(database=dbname, user=username, host='localhost', password=pswd)

In [3]:
# Build the SQLAlchemy engine used by the DataFrame.to_sql() calls below.
# Settings fall back to the original literals but can be overridden with the
# standard libpq environment variables.
import os

dbname = os.environ.get('PGDATABASE', 'combined_profiling')
username = os.environ.get('PGUSER', 'along528')
# FIXME: the literal fallback only keeps the notebook runnable exactly as before;
# a real password must come from the environment, never from this file.
pswd = os.environ.get('PGPASSWORD', 'password')
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
# Printing the URL directly wrote the password into the saved cell output;
# repr() of a SQLAlchemy URL renders the password masked.
print(repr(engine.url))

postgresql://along528:password@localhost/combined_profiling


In [44]:
def get_data():
    """Load the joined traffic/feature table and return it as a numeric DataFrame.

    Reads every row of ``traffic_joined_with_features`` through the
    module-level connection ``con``, drops the ``index`` column, indexes the
    frame by ``surveyid`` and keeps only the columns listed below.

    Cleaning, in order: single-space cells are replaced with 0, +/-inf with
    NaN, rows containing any NaN are dropped, and every remaining column is
    passed through ``pd.to_numeric``.

    Returns:
        pandas.DataFrame indexed by ``surveyid``.
    """
    sql_query = """
    SELECT  * FROM traffic_joined_with_features;
    """
    # Earlier variant that post-processed the frame with the project's munging module:
    #data = munging.process_df(pd.read_sql_query(sql_query,con))
    data = pd.read_sql_query(sql_query,con).drop('index',axis=1)
    data = data.set_index('surveyid',drop=True)
    # Column whitelist; the order here is the column order of the returned frame.
    data = data[[ 'stops_total', 'searches_total', 'hits_total', 'stops_white', 'searches_white',
     'hits_white', 'stops_black', 'searches_black', 'hits_black', 
     'total',
     'urban','rural', 
     'institutionalized_all', 'institutionalized_adult_all',
     'institutionalized_adult_federal_detention_all',
     'institutionalized_adult_federal_prison_all',
     'institutionalized_adult_state_prison_all',
     'institutionalized_adult_local_jail_all',
     'institutionalized_juvenile_all',
     'institutionalized_white', 'institutionalized_adult_white',
     'institutionalized_adult_federal_detention_white',
     'institutionalized_adult_federal_prison_white',
     'institutionalized_adult_state_prison_white',
     'institutionalized_adult_local_jail_white',
     'institutionalized_juvenile_white', 'institutionalized_black',
     'institutionalized_adult_black', 'institutionalized_adult_federal_detention_black',
     'institutionalized_adult_federal_prison_black',
     'institutionalized_adult_state_prison_black',
     'institutionalized_adult_local_jail_black', 
     'institutionalized_juvenile_black',
     'population_white', 'population_black', 'total_income_estimate_all',
     'total_income_estimate_white', 'total_income_estimate_black', 'swnauthemp',
     'swnftemp', 
     'swnptemp', 
     'civftemp', 'civptemp', 'totftemp', 'totptemp',
     'ftreserveswn', 'ptreserveswn', 'ftreserveciv', 'ptreserveciv', 'ftgangoff',
     'ptgangoff', 'ftdrugoff', 'ptdrugoff', 'ftterroff', 'pterroff', 'fthumtrfoff',
     'pthumtrfoff', 'numrespoff', 'numcpo', 'numsro', 'numpatr', 'numinvst', 'numjail',
     'numcrtsec', 'numprocserv', 
     'opbudget',
     'drugforf', 'totacad', 'totfield',
     'totinsrv', 
     'white', 'black', #really doesn't like these variables when dividing
     'hispanic', 'asian', 'nathaw', 'amerind', 'multrace',
     'unkrace', 'male', 'female', 'totgender', 'chiefmin', 'chiefmax', 'sgtmin',
     'sgtmax', 'entrymin', 'entrymax', 'nummrkcars', 'numothmrk', 'numumkcars',
     'numothunm', 'numplanes', 'numcopters', 'numboats', 'nummotor', 'numcarcam',
     'numfixcam', 'nummobcam', 'population']]
    # A cell holding a single space is replaced with 0 (presumably a blank survey answer - confirm)
    data = data.replace(' ',0)
    # Turn infinities into NaN so the dropna() below removes those rows as well
    data = data.replace([np.inf, -np.inf], np.nan)
    data = data.dropna()
    # Convert every column to a numeric dtype; to_numeric raises on values it cannot parse
    data = data.apply(lambda x: pd.to_numeric(x))
    return data
def split_data(data, frac=0.2, random_state=20):
    """Split a frame into a random hold-out sample and the remaining rows.

    Args:
        data: DataFrame to split; rows are identified by their index labels.
        frac: fraction of rows drawn into the hold-out sample (default 0.2).
        random_state: seed passed to ``DataFrame.sample`` so the split is
            reproducible (default 20).

    Returns:
        ``(test, val)`` - ``test`` is the sampled fraction, ``val`` is every
        row of ``data`` whose index label is not in ``test``.
    """
    test = data.sample(frac=frac, random_state=random_state)
    # boolean negation instead of comparing the mask against False
    val = data[~data.index.isin(test.index)]
    return test, val

# split_data() returns (20% random sample, remaining rows): `test_data` is the
# sample and `val_data` is the remainder, which later cells use to fit the models.
test_data,val_data = split_data(get_data())
val_data

Unnamed: 0_level_0,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black,total,urban,rural,institutionalized_all,institutionalized_adult_all,institutionalized_adult_federal_detention_all,institutionalized_adult_federal_prison_all,institutionalized_adult_state_prison_all,institutionalized_adult_local_jail_all,institutionalized_juvenile_all,institutionalized_white,institutionalized_adult_white,institutionalized_adult_federal_detention_white,institutionalized_adult_federal_prison_white,institutionalized_adult_state_prison_white,institutionalized_adult_local_jail_white,institutionalized_juvenile_white,institutionalized_black,institutionalized_adult_black,institutionalized_adult_federal_detention_black,institutionalized_adult_federal_prison_black,...,totinsrv,white,black,hispanic,asian,nathaw,amerind,multrace,unkrace,male,female,totgender,chiefmin,chiefmax,sgtmin,sgtmax,entrymin,entrymax,nummrkcars,numothmrk,numumkcars,numothunm,numplanes,numcopters,numboats,nummotor,numcarcam,numfixcam,nummobcam,population
surveyid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
1222,9648,336,128,8463,242,100,992,87,26,26647,26616,31,186,0,0,0,0,0,13,144,0,0,0,0,0,5,37,0,0,0,...,38,62,0,3,0,0,0,0,0,58,7,65,131725.00,131725.00,75256,79016,44519,65454,18,4,12,4,0,0,2,1,2,14,0,26704
184,2673,676,131,1832,417,78,829,256,53,110372,110372,0,1149,219,0,0,0,0,44,861,83,0,0,0,0,23,209,82,0,0,...,35,240,21,39,0,0,0,0,0,276,24,300,109000.00,114000.00,63596,64844,44824,45708,132,21,81,15,0,0,0,9,0,3,3,107241
175,10980,182,64,6526,85,31,4315,96,33,123919,123919,0,2194,1277,0,0,0,1095,119,1027,378,0,0,0,309,46,683,512,0,0,...,20,261,63,84,9,0,0,0,0,371,46,417,119000.00,140000.00,67158,71851,41119,61906,183,29,113,21,0,0,3,13,43,2,0,124558
1214,1764,34,10,1708,30,8,43,3,2,8373,4821,3552,147,0,0,0,0,0,0,145,0,0,0,0,0,0,1,0,0,0,...,20,14,0,3,0,0,1,0,0,18,0,18,64000.00,64000.00,53000,53000,46780,52952,5,2,4,0,0,0,0,0,5,0,0,15538
170,7757,497,34,6992,445,28,597,46,6,80882,78229,2653,1904,1341,0,1339,0,0,25,1162,637,0,637,0,0,17,434,406,0,406,...,20,132,4,11,1,0,0,0,0,138,10,148,106712.00,108712.00,67239,69270,48265,61946,42,9,27,6,0,0,1,0,0,0,0,79893
174,7772,149,20,4796,61,10,2900,88,10,60966,59531,1435,427,0,0,0,0,0,82,363,0,0,0,0,0,43,53,0,0,0,...,80,94,3,2,2,0,0,0,0,97,4,101,92190.15,92190.15,71338,71338,48571,64669,23,6,20,4,0,0,0,4,12,0,1,57982
179,10035,452,231,8020,330,175,1884,120,54,73240,73240,0,790,0,0,0,0,0,59,673,0,0,0,0,0,34,59,0,0,0,...,12,153,10,9,0,0,0,0,0,160,12,172,92000.00,102000.00,64572,79785,48819,65027,30,10,12,7,0,0,0,4,0,0,0,70630
177,4107,285,100,3341,199,65,719,86,35,60912,60499,413,688,4,0,0,0,0,24,624,1,0,0,0,0,15,45,1,0,0,...,30,102,5,12,0,0,0,1,0,111,9,120,105000.00,115000.00,64090,67380,40081,61027,50,7,18,5,0,0,0,4,50,0,0,59607
1219,3896,131,56,3112,98,39,694,32,17,36091,36054,37,417,0,0,0,0,0,0,399,0,0,0,0,0,0,15,0,0,0,...,20,68,1,4,0,0,0,0,0,69,4,73,102410.00,102410.00,62529,67865,42314,59305,17,4,7,4,0,0,0,4,10,0,0,35055
172,8194,198,100,6986,152,82,1111,46,18,55023,53775,1248,845,0,0,0,0,0,0,818,0,0,0,0,0,0,25,0,0,0,...,30,102,2,4,0,0,0,0,0,100,8,108,110000.00,120000.00,63517,72865,50826,62985,15,6,12,4,0,1,2,6,0,0,0,57889


In [45]:
def add_features(data_tmp, max_rpsi=10, min_population=10000):
    """Derive the RPSI label and the engineered features for one data split.

    Steps:
      1. ``rpsi`` = (searches_black * stops_white) / (stops_black * searches_white).
      2. The nine raw traffic columns are dropped.
      3. Every remaining column except ``total`` is divided by ``total`` and
         added as ``<column>_per_capita``.
      4. Only rows with ``rpsi < max_rpsi`` and ``total > min_population`` are kept.
      5. Black-over-white ratio columns are added for population, income and
         each institutionalized count; a NaN ratio (0/0) is filled with 1.
      6. Officer-vs-resident share ratios are added for ``black`` and ``white``,
         plus their quotient.
      7. Rows that still contain +/-inf or NaN are dropped.

    Args:
        data_tmp: frame shaped like the output of ``get_data``; it is not modified.
        max_rpsi: exclusive upper bound on ``rpsi`` (default 10).
        min_population: exclusive lower bound on ``total`` (default 10000).

    Returns:
        A new DataFrame with the columns described above.
    """
    source = pd.DataFrame(data_tmp)

    #create rpsi label
    rpsi = (source['searches_black'] * source['stops_white']).div(
        source['stops_black'] * source['searches_white'])

    #drop remaining traffic features
    features = source.drop(['stops_total', 'searches_total', 'hits_total',
                            'stops_white', 'searches_white', 'hits_white',
                            'stops_black', 'searches_black', 'hits_black'], axis=1)

    #create per_capita features from census population
    population = features['total']
    per_capita = features.drop('total', axis=1).div(population, axis=0)
    per_capita = per_capita.rename(columns=lambda name: name + '_per_capita')
    data = pd.concat([features, per_capita], axis=1)
    data['rpsi'] = rpsi

    # Filter once and take an explicit copy, so the column assignments below
    # write to an independent frame rather than to a slice of `data`.
    keep = (data['rpsi'] < max_rpsi) & (data['total'] > min_population)
    data = data[keep].copy()

    #build comparison features: (label used in the new column name, stem of the source columns)
    comparisons = [
        ('population', 'population'),
        ('income', 'total_income_estimate'),
        ('institutionalized', 'institutionalized'),
        ('institutionalized_adult', 'institutionalized_adult'),
        ('institutionalized_adult_federal_detention', 'institutionalized_adult_federal_detention'),
        ('institutionalized_adult_federal_prison', 'institutionalized_adult_federal_prison'),
        ('institutionalized_adult_state_prison', 'institutionalized_adult_state_prison'),
        ('institutionalized_adult_local_jail', 'institutionalized_adult_local_jail'),
        ('institutionalized_juvenile', 'institutionalized_juvenile'),
    ]
    for label, stem in comparisons:
        ratio = data[stem + '_black'].div(data[stem + '_white'], axis=0)
        # 0/0 gives NaN and is treated as "no disparity" (1); x/0 gives inf and
        # that row is removed by the final cleanup
        data['black_over_white_' + label + '_disparity'] = ratio.fillna(1)

    #compare demographics in police department and in population
    for race in ['black', 'white']:
        officer_share = data[race].div(data['swnftemp'], axis=0)
        resident_share = data['population_' + race].div(data['total'], axis=0)
        data[race + '_officer_disparity'] = officer_share.div(resident_share)
    data['black_over_white_officer_disparity'] = data['black_officer_disparity'].div(
        data['white_officer_disparity'])

    data = data.replace([np.inf, -np.inf], np.nan)
    data = data.dropna()
    return data
# NOTE(review): this cell rebinds val_data/test_data to their own transformed
# versions, so it is not idempotent: add_features() drops the raw traffic
# columns it reads, and re-running the cell on the already-transformed frames
# will not find them. Re-run the split cell before re-running this one.
val_data = add_features(val_data)
test_data = add_features(test_data)
val_data

Unnamed: 0_level_0,total,urban,rural,institutionalized_all,institutionalized_adult_all,institutionalized_adult_federal_detention_all,institutionalized_adult_federal_prison_all,institutionalized_adult_state_prison_all,institutionalized_adult_local_jail_all,institutionalized_juvenile_all,institutionalized_white,institutionalized_adult_white,institutionalized_adult_federal_detention_white,institutionalized_adult_federal_prison_white,institutionalized_adult_state_prison_white,institutionalized_adult_local_jail_white,institutionalized_juvenile_white,institutionalized_black,institutionalized_adult_black,institutionalized_adult_federal_detention_black,institutionalized_adult_federal_prison_black,institutionalized_adult_state_prison_black,institutionalized_adult_local_jail_black,institutionalized_juvenile_black,population_white,population_black,total_income_estimate_all,total_income_estimate_white,total_income_estimate_black,swnauthemp,...,chiefmax_per_capita,sgtmin_per_capita,sgtmax_per_capita,entrymin_per_capita,entrymax_per_capita,nummrkcars_per_capita,numothmrk_per_capita,numumkcars_per_capita,numothunm_per_capita,numplanes_per_capita,numcopters_per_capita,numboats_per_capita,nummotor_per_capita,numcarcam_per_capita,numfixcam_per_capita,nummobcam_per_capita,population_per_capita,rpsi,black_over_white_population_disparity,black_over_white_income_disparity,black_over_white_institutionalized_disparity,black_over_white_institutionalized_adult_disparity,black_over_white_institutionalized_adult_federal_detention_disparity,black_over_white_institutionalized_adult_federal_prison_disparity,black_over_white_institutionalized_adult_state_prison_disparity,black_over_white_institutionalized_adult_local_jail_disparity,black_over_white_institutionalized_juvenile_disparity,black_officer_disparity,white_officer_disparity,black_over_white_officer_disparity
surveyid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
1222,26647,26616,31,186,0,0,0,0,0,13,144,0,0,0,0,0,5,37,0,0,0,0,0,6,24649,323,9712,9063,35,65,...,4.943333,2.824183,2.965287,1.670695,2.456337,0.000675,0.000150,0.000450,0.000150,0,0.000000,0.000075,0.000038,0.000075,0.000525,0.000000,1.002139,3.067020,0.013104,0.003862,0.256944,1.000000,1,1.000000,1.000000,1.000000,1.200000,0.000000,1.031163,0.000000
184,110372,110372,0,1149,219,0,0,0,0,44,861,83,0,0,0,0,23,209,82,0,0,0,0,13,64836,22146,40950,26944,7822,300,...,1.032871,0.576197,0.587504,0.406117,0.414127,0.001196,0.000190,0.000734,0.000136,0,0.000000,0.000000,0.000082,0.000000,0.000027,0.000027,0.971632,1.356672,0.341569,0.290306,0.242741,0.987952,1,1.000000,1.000000,1.000000,0.565217,0.348868,1.361861,0.256170
175,123919,123919,0,2194,1277,0,0,0,1095,119,1027,378,0,0,0,309,46,683,512,0,0,0,436,46,36606,48114,46061,17050,17580,592,...,1.129770,0.541951,0.579822,0.331822,0.499568,0.001477,0.000234,0.000912,0.000169,0,0.000000,0.000024,0.000105,0.000347,0.000016,0.000000,1.005157,1.708121,1.314375,1.031085,0.665044,1.354497,1,1.000000,1.000000,1.411003,1.000000,0.389109,2.118801,0.183646
170,80882,78229,2653,1904,1341,0,1339,0,0,25,1162,637,0,637,0,0,17,434,406,0,406,0,0,7,55191,5803,29034,21017,1763,152,...,1.344082,0.831322,0.856433,0.596734,0.765881,0.000519,0.000111,0.000334,0.000074,0,0.000000,0.000012,0.000000,0.000000,0.000000,0.000000,0.987772,1.210668,0.105144,0.083884,0.373494,0.637363,1,0.637363,1.000000,1.000000,0.411765,0.376702,1.307061,0.288205
174,60966,59531,1435,427,0,0,0,0,0,82,363,0,0,0,0,0,43,53,0,0,0,0,0,28,41750,12292,23342,16691,4492,107,...,1.512157,1.170128,1.170128,0.796690,1.060739,0.000377,0.000098,0.000328,0.000066,0,0.000000,0.000000,0.000066,0.000197,0.000000,0.000016,0.951055,2.385800,0.294419,0.269127,0.146006,1.000000,1,1.000000,1.000000,1.000000,0.651163,0.147321,1.359057,0.108400
179,73240,73240,0,790,0,0,0,0,0,59,673,0,0,0,0,0,34,59,0,0,0,0,0,14,46607,9530,27832,19961,2811,160,...,1.392682,0.881649,1.089364,0.666562,0.887862,0.000410,0.000137,0.000164,0.000096,0,0.000000,0.000000,0.000055,0.000000,0.000000,0.000000,0.964364,1.547964,0.204476,0.140825,0.087667,1.000000,1,1.000000,1.000000,1.000000,0.411765,0.446814,1.397849,0.319644
177,60912,60499,413,688,4,0,0,0,0,24,624,1,0,0,0,0,15,45,1,0,0,0,0,8,44770,5876,24060,19356,2243,126,...,1.887970,1.052174,1.106186,0.658015,1.001888,0.000821,0.000115,0.000296,0.000082,0,0.000000,0.000000,0.000066,0.000821,0.000000,0.000000,0.978576,2.008135,0.131249,0.115881,0.072115,1.000000,1,1.000000,1.000000,1.000000,0.533333,0.431926,1.156471,0.373487
1219,36091,36054,37,417,0,0,0,0,0,0,399,0,0,0,0,0,0,15,0,0,0,0,0,0,32471,1140,12205,11289,240,75,...,2.837550,1.732537,1.880386,1.172425,1.643207,0.000471,0.000111,0.000194,0.000111,0,0.000000,0.000000,0.000111,0.000277,0.000000,0.000000,0.971295,1.464212,0.035108,0.021260,0.037594,1.000000,1,1.000000,1.000000,1.000000,1.000000,0.433682,1.035355,0.418873
172,55023,53775,1248,845,0,0,0,0,0,0,818,0,0,0,0,0,0,25,0,0,0,0,0,0,50326,1034,18594,17260,311,108,...,2.180906,1.154372,1.324264,0.923723,1.144703,0.000273,0.000109,0.000218,0.000073,0,0.000018,0.000036,0.000109,0.000000,0.000000,0.000000,1.052087,1.902956,0.020546,0.018019,0.030562,1.000000,1,1.000000,1.000000,1.000000,1.000000,0.985440,1.032591,0.954337
183,51382,51382,0,291,0,0,0,0,0,8,264,0,0,0,0,0,4,24,0,0,0,0,0,4,39248,7347,20330,16244,2797,111,...,2.101903,1.276614,1.346425,0.922950,1.135047,0.000837,0.000136,0.000331,0.000019,0,0.000019,0.000019,0.000000,0.000195,0.000000,0.000000,0.962205,1.531770,0.187194,0.172187,0.090909,1.000000,1,1.000000,1.000000,1.000000,1.000000,0.599452,1.122139,0.534204


In [None]:
# Persist the engineered frame to table 'val_data' through the SQLAlchemy engine,
# replacing the table if it already exists.
val_data.to_sql('val_data',engine,if_exists='replace')

In [63]:
def categorize_five_class(rpsi):
    """Bin an RPSI value into five ordered classes.

    This was the first of two functions both named ``categorize``; the second
    definition silently replaced it, so it now has its own name.

    Returns:
        0 for 0 <= rpsi <= 1, 1 for (1, 1.5], 2 for (1.5, 2], 3 for (2, 3],
        and 4 for anything else (above 3, negative, or NaN).
    """
    if 0 <= rpsi <= 1:
        return 0
    if 1 < rpsi <= 1.5:
        return 1
    if 1.5 < rpsi <= 2:
        return 2
    if 2 < rpsi <= 3:
        return 3
    return 4


def categorize(rpsi, threshold=2):
    """Binary RPSI label used by the models in this notebook.

    Args:
        rpsi: the RPSI value to classify.
        threshold: inclusive upper bound of class 0 (default 2).

    Returns:
        0 when 0 <= rpsi <= threshold, otherwise 1 (this includes negative
        and NaN inputs).
    """
    if 0 <= rpsi <= threshold:
        return 0
    return 1

# Build Model

In [65]:
# Standardise every feature with the mean/std of val_data and apply the same
# statistics to test_data, so nothing from the held-out split enters the scaling.
X_unscaled = np.array(val_data.drop('rpsi', axis=1))
mean = np.mean(X_unscaled, axis=0)
std = np.std(X_unscaled, axis=0)
# A column that is constant in val_data has std == 0; dividing by it would put
# NaN/inf into the matrices. Such a column carries no information, so leave it
# centred and unscaled.
std[std == 0] = 1.0
X_val = (X_unscaled-mean)/std
y_val = np.array(val_data['rpsi'])
y_val_cat = np.array(val_data['rpsi'].map(categorize))
X_unscaled_test = np.array(test_data.drop('rpsi', axis=1))
X_test = (X_unscaled_test-mean)/std
y_test = np.array(test_data['rpsi'])

In [27]:
# Frame of the standardised features plus the label. Note that the 'rpsi' column
# here holds the category from categorize() (y_val_cat), not the raw ratio.
val_data_scaled = pd.DataFrame(np.c_[X_val,y_val_cat],index=val_data.index,columns=val_data.drop('rpsi',1).columns.tolist()+['rpsi'])
# Persist to table 'val_data_scaled', replacing it if it already exists.
val_data_scaled.to_sql('val_data_scaled',engine,if_exists='replace')#print X_val+y_val

In [48]:
for cat in sort(val_data_scaled['rpsi'].unique().tolist()):
    
    print cat, shape(val_data_scaled[val_data_scaled['rpsi']==cat])[0]

0.0 73
1.0 82


In [None]:
# Write one 100-bin histogram per standardised feature to images/histos/<feature>.png.
for feature in [name for name in val_data_scaled.columns.tolist() if name != 'rpsi']:
    feature_values = val_data_scaled[feature].tolist()
    plt.hist(feature_values, bins=100)
    plt.savefig('images/histos/' + feature + '.png')
    # clear the current figure so the next feature starts from an empty canvas
    plt.clf()


In [54]:
# 2-D histogram of each selected (unscaled) feature against the label category,
# saved to images/hist2d/<feature>.png.
# NOTE(review): `best_feature_names` is only assigned in the feature-importance
# cell further down the notebook, so on a fresh kernel this cell raises
# NameError unless that cell has been run first.
for feature in val_data.drop('rpsi',1).columns.tolist():
    
    #if 'disparity' not in feature: continue
    if feature not in best_feature_names: continue
    plt.hist2d(val_data[feature].tolist(),val_data_scaled['rpsi'].tolist())
    plt.savefig('images/hist2d/'+feature+'.png')
    plt.clf()

<matplotlib.figure.Figure at 0x10d5ebd50>

In [None]:
# Sanity check: True if any label value is +/-infinity.
np.isinf(y_val).any()

In [29]:
from sklearn import linear_model,cross_validation,metrics,grid_search
from sklearn.learning_curve import learning_curve
# NOTE: despite the name, `regr` is a logistic-regression classifier.
regr = linear_model.LogisticRegression()

# 10 log-spaced values of C between 1e-2 and 1e3
n_iter = 10
param_grid = {'C':  np.logspace(-2,3,n_iter)}


# n_iter equals the number of C values, so every candidate is tried (the
# recorded output reports 10 candidates). No random_state is passed.
cvmodel = grid_search.RandomizedSearchCV(regr,param_grid,n_iter,cv=5,verbose=1)
# Fit on the binary labels; only `cvmodel` is fitted in this cell.
cvmodel.fit(X_val,y_val_cat)

print cvmodel.best_params_
print cvmodel.best_score_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'C': 1000.0}
0.535483870968


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.6s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.7s finished


In [50]:
from sklearn.ensemble import RandomForestClassifier
# Same seven values (10, 20, ..., 70) are used for both hyper-parameters
values = range(10,80,10)
n_iter = len(values)
param_grid = {'n_estimators': values,
             'max_features': values}
clf = RandomForestClassifier(n_jobs=2)
# n_iter*n_iter = 49 is the size of the full 7x7 grid, so every combination is tried.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, and the
# next cell rebinds `clf`/`clfmodel` to a regressor.
clfmodel = grid_search.RandomizedSearchCV(clf,param_grid,
                                          n_iter*n_iter,cv=5,verbose=1)
clfmodel.fit(X_val,y_val_cat)

Fitting 5 folds for each of 49 candidates, totalling 245 fits



KeyboardInterrupt


In [51]:
from sklearn.ensemble import RandomForestRegressor
# Same seven values (10, 20, ..., 70) are used for both hyper-parameters
values = range(10,80,10)
n_iter = len(values)
param_grid = {'n_estimators': values,
             'max_features': values}
# NOTE: `clf`/`clfmodel` now name a regressor fitted on the continuous label y_val,
# replacing the classifier objects of the previous cell.
clf = RandomForestRegressor(n_jobs=2)
# n_iter*n_iter = 49 is the size of the full 7x7 grid, so every combination is tried.
clfmodel = grid_search.RandomizedSearchCV(clf,param_grid,
                                          n_iter*n_iter,cv=5,verbose=1)
clfmodel.fit(X_val,y_val)

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   18.0s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:  1.2min


Fitting 5 folds for each of 49 candidates, totalling 245 fits


[Parallel(n_jobs=1)]: Done 245 out of 245 | elapsed:  1.5min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=2, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=49, n_jobs=1,
          param_distributions={'n_estimators': [10, 20, 30, 40, 50, 60, 70], 'max_features': [10, 20, 30, 40, 50, 60, 70]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=1)

In [52]:
print clfmodel.best_params_
print clfmodel.best_score_
features = []
importances = []
for feature,importance in zip(val_data.drop('rpsi',1).columns.tolist(),clfmodel.best_estimator_.feature_importances_):
    #print feature,importance
    features.append(feature)
    importances.append(importance)
indices = [i[0] for i in sorted(enumerate(importances), key=lambda x:x[1])]
best_feature_names = []
for index in reversed(indices):
    print features[index],importances[index]
    if importances[index] < 0.009:
        continue
    best_feature_names.append(features[index])
print len(best_feature_names)

{'n_estimators': 60, 'max_features': 20}
-0.216449776314
totacad 0.0200217501045
entrymin 0.0194527866272
black_over_white_institutionalized_adult_disparity 0.0179625486485
black_over_white_institutionalized_adult_local_jail_disparity 0.0177388921563
sgtmax 0.0176526553239
entrymax 0.0173870703504
ptreserveswn_per_capita 0.0172695255101
opbudget_per_capita 0.0166544350051
white_officer_disparity 0.0161109216049
sgtmin 0.0155390247308
opbudget 0.0151925765948
numcarcam_per_capita 0.0145327133996
totfield 0.0140444581724
urban_per_capita 0.0139861473005
black_per_capita 0.0139260904311
swnauthemp_per_capita 0.0138581270177
sgtmin_per_capita 0.0134698788634
swnptemp_per_capita 0.0134345451314
population_white_per_capita 0.0134337537919
total_income_estimate_all_per_capita 0.0132508938599
black_over_white_institutionalized_disparity 0.0130431539818
black_over_white_institutionalized_juvenile_disparity 0.0126694254301
unkrace_per_capita 0.0121214700726
pterroff_per_capita 0.0116641896158
dr

In [68]:
nmatch = 0
total =0
for prediction,actual in zip(clfmodel.best_estimator_.predict(X_test),y_test):
    prediction = categorize(prediction)
    actual = categorize(actual)
    if prediction == actual:
        nmatch+=1
    total+=1
print "accuracy =",float(nmatch)/float(total)
    

accuracy = 0.5


In [None]:
from sklearn.learning_curve import learning_curve

# Learning curve of the logistic model: 10 training-set sizes, 5-fold CV each.
train_sizes, train_scores, test_scores = \
            learning_curve(regr, X_val,
                           y_val_cat, cv=5,train_sizes=np.linspace(.1, 1.0, 10))
# The score arrays have one row per training size and one column per CV fold,
# so the fold average must be taken along axis=1. Averaging along axis=0 gave
# one value per fold, which does not line up with the 10 entries of train_sizes.
plt.errorbar(train_sizes, np.mean(train_scores, axis=1), np.std(train_scores, axis=1),
             label='training')
plt.errorbar(train_sizes, np.mean(test_scores, axis=1), np.std(test_scores, axis=1),
             label='cross-validation')
plt.xlabel('Training examples')
plt.ylabel('Score')
plt.legend(loc='best')

In [None]:
# Heat map of the absolute pairwise correlation between feature columns.
feature_correlation = np.corrcoef(X_val, rowvar=0)  # rowvar=0: columns are the variables
plt.pcolor(np.abs(feature_correlation))
plt.colorbar()

In [None]:
from scipy.stats import mannwhitneyu
# Number of feature columns
F = X_val.shape[1]
# Disabled: per-feature Mann-Whitney U p-value between the two label classes,
# and a scatter plot of those p-values.
#probabilities = [mannwhitneyu(X_val[y_val_cat==0, i], X_val[y_val_cat==1, i])[1] for i in range(F)] 
#plt.scatter(range(F), probabilities, 100)

In [None]:
# Greedy backward feature elimination for the tuned logistic model: on every
# pass, drop the single feature whose removal gives the best mean 5-fold CV
# score, and stop as soon as no removal matches or beats the current best.
F = X_val.shape[1]
# explicit list so that `del` works whether or not range() returns a list
best_features = list(range(F))
keep_features = best_features[:]
best_score = np.mean(cross_validation.cross_val_score(cvmodel.best_estimator_, X_val, y_val_cat, cv=5))
for i in range(F):
    print("%d of %d" % (i, F))
    if len(best_features) <= 1:
        # never evaluate a model with zero feature columns
        break
    scores = []
    for j in range(len(best_features)):
        # candidate set: the current best features without position j
        keep_features = best_features[:j] + best_features[j + 1:]
        scores.append(np.mean(cross_validation.cross_val_score(
            cvmodel.best_estimator_, X_val[:, keep_features], y_val_cat, cv=5)))
    drop_position = int(np.argmax(scores))
    if scores[drop_position] >= best_score:
        best_score = scores[drop_position]
        del best_features[drop_position]
    else:
        break
print("done")

In [69]:
# Rebuild the design matrices from the columns in `best_feature_names` (filled
# by the feature-importance cell). A disabled snippet that derived the names
# from the backward-elimination indices (`best_features`) used to sit here.
# NOTE: this overwrites X_val / X_test / mean / std from the full-feature cell.
X_unscaled = np.array(val_data[best_feature_names])
mean = np.mean(X_unscaled, axis=0)
std = np.std(X_unscaled, axis=0)
# A column that is constant in val_data has std == 0; dividing by it would put
# NaN/inf into the matrices, so leave such a column centred and unscaled.
std[std == 0] = 1.0
X_val = (X_unscaled-mean)/std
y_val = np.array(val_data['rpsi'])
y_val_cat = np.array(val_data['rpsi'].map(categorize))
X_unscaled_test = np.array(test_data[best_feature_names])
# the test split is scaled with the val_data statistics
X_test = (X_unscaled_test-mean)/std
y_test = np.array(test_data['rpsi'])
y_test_cat = np.array(test_data['rpsi'].map(categorize))

In [35]:
# Number of feature columns after selection.
X_val.shape[1]

37

In [37]:
from sklearn.ensemble import RandomForestClassifier
# Candidate values 10, 15, ... up to (but excluding) the number of feature columns;
# `shape` is the bare name provided by %pylab.
values = range(10,shape(X_val)[1],5)
n_iter = len(values)
param_grid = {'n_estimators': values,
             'max_features': values}
clf2 = RandomForestClassifier()
# n_iter*n_iter is the size of the full grid, so every combination is tried.
clfmodel2 = grid_search.RandomizedSearchCV(clf2,param_grid,
                                          n_iter*n_iter,cv=5,verbose=1)
# Classifier fitted on the binary labels of the reduced feature set
clfmodel2.fit(X_val,y_val_cat)
print clfmodel2.best_params_
print clfmodel2.best_score_

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    3.8s
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:   15.3s finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'n_estimators': 25, 'max_features': 30}
0.625806451613


In [71]:
from sklearn.ensemble import RandomForestRegressor
# Candidate values 10, 15, ... up to (but excluding) the number of feature columns
values = range(10,shape(X_val)[1],5)
n_iter = len(values)
param_grid = {'n_estimators': values,
             'max_features': values}
# NOTE(review): this rebinds `clf2`/`clfmodel2` from the classifier of the
# previous cell to a regressor fitted on the continuous label; any later cell
# that uses `clfmodel2` gets whichever of the two cells ran last.
clf2 = RandomForestRegressor(n_jobs=2)
clfmodel2 = grid_search.RandomizedSearchCV(clf2,param_grid,
                                          n_iter*n_iter,cv=5,verbose=1)
clfmodel2.fit(X_val,y_val)

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   16.6s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:  1.1min


Fitting 5 folds for each of 49 candidates, totalling 245 fits


[Parallel(n_jobs=1)]: Done 245 out of 245 | elapsed:  1.4min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=2, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=49, n_jobs=1,
          param_distributions={'n_estimators': [10, 15, 20, 25, 30, 35, 40], 'max_features': [10, 15, 20, 25, 30, 35, 40]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=1)

In [72]:
# Best mean cross-validated score found by the search above
# (presumably the estimator's default score, as no `scoring` is passed - confirm).
clfmodel2.best_score_

-0.10634158898104189

In [43]:
#classification
nmatch = 0
total = 0
for value in clfmodel2.predict(X_test)==y_test_cat:
    if value:
        nmatch+=1
    total+=1

print "test accuracy =",float(nmatch)/float(total)
        

test accuracy = 0.605263157895


In [75]:
#regression
nmatch = 0
total =0
for prediction,actual in zip(clfmodel2.best_estimator_.predict(X_test),y_test):
    prediction = categorize(prediction)
    actual = categorize(actual)
    if prediction == actual:
        nmatch+=1
    total+=1
print "test accuracy =",float(nmatch)/float(total)
    

accuracy = 0.578947368421


In [17]:
# Same logistic-regression search as the earlier cell, run against whatever
# X_val / y_val_cat currently hold.
from sklearn import linear_model,cross_validation,metrics,grid_search
from sklearn.learning_curve import learning_curve
# NOTE: despite the name, `regr` is a logistic-regression classifier.
regr = linear_model.LogisticRegression()

# 10 log-spaced values of C between 1e-2 and 1e3
n_iter = 10
param_grid = {'C':  np.logspace(-2,3,n_iter)}


# n_iter equals the number of C values, so every candidate is tried.
cvmodel = grid_search.RandomizedSearchCV(regr,param_grid,n_iter,cv=5,verbose=1)
# Only `cvmodel` is fitted in this cell.
cvmodel.fit(X_val,y_val_cat)

print cvmodel.best_params_
print cvmodel.best_score_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'C': 0.035938136638046278}
0.348387096774


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.6s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.6s finished


In [None]:
# NOTE(review): no cell in the visible notebook calls regr.fit(); only `cvmodel`
# is fitted, so regr.predict()/regr.score() presumably fail here - confirm and
# use the fitted estimator instead. `regr` is also a classifier, while y_test
# holds the continuous RPSI values.
# The coefficients
#print('Coefficients: \n', regr.coef_)
# The mean square error (the message below calls it a residual sum of squares,
# but the expression takes the mean of the squared differences)
print("Residual sum of squares: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))


In [None]:
# Predicted classes for the test split from the best logistic model found by the search.
cvmodel.best_estimator_.predict(X_test)

In [None]:
# Scatter of actual vs. predicted values on the test split, saved to images/rsquare.png.
# NOTE(review): this changes the global figure size for every later plot, and it
# uses `regr`, which no cell in the visible notebook fits (only `cvmodel` is fitted).
plt.rcParams['figure.figsize'] = (10, 10)
plt.scatter(y_test,regr.predict(X_test))
#plt.scatter(y_train,regr.predict(X_train))
#plt.xlim([0,10])
#plt.ylim([0,10])
plt.xlabel('Actual RPSI')
plt.ylabel('Predicted RPSI')
plt.savefig('images/rsquare.png',facecolor='white')

In [None]:
# Histogram and summary statistics of the model's predictions.
# NOTE(review): `X_predict` is not defined anywhere in the visible notebook, and
# `regr` is never fitted in it - confirm where both are meant to come from.
predictions = regr.predict(X_predict)
plt.hist(predictions,bins=50)
plt.show()
print "mean =",predictions.mean(),"std =",predictions.std()

In [None]:
import pickle
# Persist the model and the scaler. Each file is opened in a `with` block so the
# handle is flushed and closed; passing a bare open() to pickle.dump left that
# to the garbage collector.
# NOTE(review): `scaler` is not defined anywhere in the visible notebook, and
# `regr` is a LogisticRegression that no visible cell fits, although the file
# name says ridge regression - confirm what should be saved here.
for artifact, target_path in [(regr, "pickle/dumb_ridge_regression.p"),
                              (scaler, "pickle/scaler.p")]:
    with open(target_path, "wb") as handle:
        pickle.dump(artifact, handle)