In [1]:
from data_handler import CHSIDataHandler
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#truncator.Truncator (included) is a simple scikit-learn estimator for discarding extreme values.
from truncator import Truncator

# Load the CHSI data with Health_Status as the dependent variable.
# Note that CHSIDataHandler takes care of choosing predictors and imputing missing values.
handler = CHSIDataHandler(
    './chsi_dataset/',
    dependent='Health_Status',
    exclude_cols=['Unhealthy_Days', 'ALE'],
    threshold=0.9,
)
X, Y = handler.training_data()

# Pipeline stages: standardize, discard predictors > 20 sd from the mean,
# then fit a ridge regression whose penalty is picked from a grid of powers of two.
scaler = StandardScaler()
truncator = Truncator(-20, 20)
rcv = RidgeCV(alphas=[2**k for k in range(-35, 35)])
steps = [('scaler', scaler), ('truncator', truncator), ('rcv', rcv)]
pipe = Pipeline(steps=steps)

# Each sample is weighted by its share of the total population.
# It might be better to weight by the square of the population instead.
population = X.Population_Size
sample_weight = (population / population.sum()).values

# The `rcv__` prefix routes the weights to the final RidgeCV step only.
pipe.fit(X, Y, rcv__sample_weight=sample_weight)

# Pair every predictor with its fitted coefficient, largest magnitude first.
coef = sorted(
    zip(X.columns, pipe.named_steps['rcv'].coef_),
    key=lambda pair: -abs(pair[1]),
)
coef

[('Disabled_Medicare', 0.92375428507346125),
 ('No_Exercise', 0.83979067704618859),
 ('Sev_Work_Disabled', 0.71497272870103923),
 ('Diabetes', 0.71074518753729854),
 ('Poverty', 0.70608298072707187),
 ('Smoker', 0.38510121542298054),
 ('Obesity', 0.38270953016356846),
 ('Hispanic', 0.3012868676727577),
 ('Lung_Cancer', 0.25110852677599049),
 ('Uninsured', 0.25043405988616307),
 ('Age_65_84', 0.24466340117745383),
 ('F_Wh_HeartDis', 0.24057843982517707),
 ('Major_Depression', 0.23408350367274328),
 ('Under_18', 0.231365278133603),
 ('LBW', 0.20142005969164245),
 ('Unmarried', -0.19117724001564831),
 ('High_Blood_Pres', 0.18564106049195933),
 ('Age_85_and_Over', -0.17736719388077737),
 ('Black', -0.16621691734676658),
 ('Dentist_Rate', -0.16321922607925057),
 ('Stroke', 0.16218327901041568),
 ('White', 0.15942048906295614),
 ('Unemployed', 0.15775133642036826),
 ('D_Wh_Cancer', -0.15698232244727892),
 ('Pap_Smear', -0.15594475925989798),
 ('Asian', 0.15239988183711228),
 ('RHI_Lung_Cance

In [2]:
#A utility function which, for each requested indicator, orders the states by their value for it.
def sorted_states(handler, columns):
    """Yield one table per indicator with the states ranked highest value first.

    Parameters
    ----------
    handler : object
        Must provide ``state_us_averages(columns)`` returning a 2-tuple whose
        first element is a DataFrame indexable by the column names used below.
        The second element is ignored here (presumably the US-wide averages,
        going by the method name -- confirm against CHSIDataHandler).
    columns : sequence of str
        Indicator names. It is passed to the handler and then iterated, so it
        must be re-iterable (e.g. a list, not a one-shot iterator).

    Yields
    ------
    DataFrame
        For each entry of ``columns``, in order: the columns
        ``CHSI_State_Name``, ``CHSI_State_Abbr``, ``<column>`` and
        ``<column>_Not_Null``, sorted by ``<column>`` in descending order.

    Notes
    -----
    This is a generator: the handler is not queried until the first item is
    requested.
    """
    state, _ = handler.state_us_averages(columns)
    for column in columns:
        # Show the identifying columns next to the indicator and its companion
        # "<column>_Not_Null" column supplied by the handler.
        disp_cols = ['CHSI_State_Name', 'CHSI_State_Abbr', column, column + '_Not_Null']
        yield state[disp_cols].sort_values(by=column, axis=0, ascending=False)

In [6]:
from IPython.display import display
from collections import Counter

# Abbreviations of every state flagged below, one entry per (indicator, state) hit.
all_states = []

columns = ['Disabled_Medicare', 'Sev_Work_Disabled', 'No_Exercise', 'Diabetes', 'Obesity', 'Poverty', 'Unemployed', 'Smoker', 'Lung_Cancer', 'Age_65_84']
#Identify states that are > 1 std above mean in several of these areas
for column, states in zip(columns, sorted_states(handler, columns)):
    values = states[column]
    mean, std = values.mean(), values.std()
    cutoff = mean + std
    print(column, mean, std, cutoff)
    # Keep only the rows whose indicator value is strictly above mean + std.
    over = states.loc[values > cutoff]
    all_states.extend(over.CHSI_State_Abbr)
    display(over)

#Get the number of drivers for which each state is above mean+std
counter = Counter(all_states)
# (state, count) pairs, highest count first.
counter.most_common()

Disabled_Medicare 2.12103312247 0.63572776339 2.75676088586


Unnamed: 0_level_0,CHSI_State_Name,CHSI_State_Abbr,Disabled_Medicare,Disabled_Medicare_Not_Null
State_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
54,West Virginia,WV,4.196205,100
21,Kentucky,KY,3.576145,100
28,Mississippi,MS,3.46145,100
1,Alabama,AL,3.302487,100
5,Arkansas,AR,3.246204,100
23,Maine,ME,3.187124,100
47,Tennessee,TN,2.838524,100
45,South Carolina,SC,2.832236,100


Sev_Work_Disabled 2.75681908654 1.49707938696 4.2538984735


Unnamed: 0_level_0,CHSI_State_Name,CHSI_State_Abbr,Sev_Work_Disabled,Sev_Work_Disabled_Not_Null
State_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,Florida,FL,11.257382,100
54,West Virginia,WV,5.990568,100
21,Kentucky,KY,5.134441,100


No_Exercise 23.8313260351 3.88738598274 27.7187120178


Unnamed: 0_level_0,CHSI_State_Name,CHSI_State_Abbr,No_Exercise,No_Exercise_Not_Null
State_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
22,Louisiana,LA,32.41887,97.947908
28,Mississippi,MS,31.658459,95.361831
21,Kentucky,KY,31.363717,94.313924
40,Oklahoma,OK,30.591589,98.342646
47,Tennessee,TN,30.483715,87.169809
1,Alabama,AL,28.78254,89.41105
54,West Virginia,WV,28.350663,97.391428
5,Arkansas,AR,28.22424,96.150087


Diabetes 6.93682380837 1.08899786689 8.02582167527


Unnamed: 0_level_0,CHSI_State_Name,CHSI_State_Abbr,Diabetes,Diabetes_Not_Null
State_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
54,West Virginia,WV,9.955979,100.0
28,Mississippi,MS,9.486436,99.730374
1,Alabama,AL,8.75808,99.20679
45,South Carolina,SC,8.739645,99.762449
47,Tennessee,TN,8.594561,97.717308
22,Louisiana,LA,8.040127,100.0


Obesity 22.2663623273 2.43239119668 24.698753524


Unnamed: 0_level_0,CHSI_State_Name,CHSI_State_Abbr,Obesity,Obesity_Not_Null
State_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
28,Mississippi,MS,27.954392,95.016788
54,West Virginia,WV,27.224912,97.57928
1,Alabama,AL,25.978219,89.649015
22,Louisiana,LA,25.852216,98.405638
21,Kentucky,KY,25.363527,94.127026
45,South Carolina,SC,25.220158,99.505885
47,Tennessee,TN,25.001001,85.683299
26,Michigan,MI,24.982953,96.569946
18,Indiana,IN,24.969432,97.777637


Poverty 11.9920682426 2.86483857767 14.8569068202


Unnamed: 0_level_0,CHSI_State_Name,CHSI_State_Abbr,Poverty,Poverty_Not_Null
State_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
28,Mississippi,MS,18.298758,100
22,Louisiana,LA,18.070322,100
35,New Mexico,NM,17.670339,100
11,District of Columbia,DC,17.5,100
54,West Virginia,WV,16.273506,100
48,Texas,TX,16.20014,100
5,Arkansas,AR,16.018307,100
1,Alabama,AL,15.233088,100
21,Kentucky,KY,14.915699,100


Unemployed 2.50917694991 0.490202826496 2.99937977641


Unnamed: 0_level_0,CHSI_State_Name,CHSI_State_Abbr,Unemployed,Unemployed_Not_Null
State_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
28,Mississippi,MS,3.629367,100.0
11,District of Columbia,DC,3.480158,100.0
2,Alaska,AK,3.468036,100.0
26,Michigan,MI,3.395324,100.0
45,South Carolina,SC,3.331968,100.0
22,Louisiana,LA,3.18426,70.833875
41,Oregon,OR,3.139034,100.0
39,Ohio,OH,3.05196,100.0


Smoker 21.8315008737 2.9465442923 24.778045166


Unnamed: 0_level_0,CHSI_State_Name,CHSI_State_Abbr,Smoker,Smoker_Not_Null
State_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
21,Kentucky,KY,29.410825,93.033315
54,West Virginia,WV,27.009856,98.69742
18,Indiana,IN,26.148913,98.336823
40,Oklahoma,OK,25.61733,97.994748
32,Nevada,NV,25.136636,99.47619
28,Mississippi,MS,24.929532,96.441394
29,Missouri,MO,24.899274,92.288688


Lung_Cancer 55.1894140728 9.43263975973 64.6220538325


Unnamed: 0_level_0,CHSI_State_Name,CHSI_State_Abbr,Lung_Cancer,Lung_Cancer_Not_Null
State_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
21,Kentucky,KY,78.829813,100
54,West Virginia,WV,71.284505,100
5,Arkansas,AR,68.54422,100
28,Mississippi,MS,68.228362,100
47,Tennessee,TN,67.879029,100
22,Louisiana,LA,66.921496,100
18,Indiana,IN,64.872697,100


Age_65_84 10.8392516871 1.37976218057 12.2190138677


Unnamed: 0_level_0,CHSI_State_Name,CHSI_State_Abbr,Age_65_84,Age_65_84_Not_Null
State_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,Florida,FL,14.583083,100
54,West Virginia,WV,13.512982,100
42,Pennsylvania,PA,12.761648,100
23,Maine,ME,12.608924,100


[('WV', 9),
 ('MS', 8),
 ('KY', 7),
 ('LA', 6),
 ('TN', 5),
 ('AL', 5),
 ('SC', 4),
 ('AR', 4),
 ('IN', 3),
 ('FL', 2),
 ('OK', 2),
 ('ME', 2),
 ('MI', 2),
 ('DC', 2),
 ('NM', 1),
 ('TX', 1),
 ('NV', 1),
 ('AK', 1),
 ('MO', 1),
 ('OH', 1),
 ('OR', 1),
 ('PA', 1)]