In [56]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import make_pipeline

In [57]:
# Load the dataset; the first CSV column is used as the row index.
data = pd.read_csv(
    'responses.csv',
    index_col=0,
)

In [58]:
# Split features (every column except 'response') and the 'response' target into
# train and test partitions; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='response'),
    data['response'],
    random_state=0,
)

In [59]:
# Baseline: plain linear regression fitted on the training split.
lr = LinearRegression().fit(X_train, y_train)
# Last expression of the cell: the model's score on the held-out split.
lr.score(X_test, y_test)

0.40545271351361967

In [60]:
# Depth-limited regression tree.
# random_state is pinned (0, like the split and the forests in this notebook) so
# that ties between equally good candidate splits are broken the same way on
# every run; without it the score and feature importances can drift between runs.
tree = DecisionTreeRegressor(max_depth=8, random_state=0)
tree.fit(X_train, y_train)
# Last expression of the cell: the tree's score on the held-out split.
tree.score(X_test, y_test)

0.41760806659595096

In [61]:
# Random forest with a single estimator (depth capped at 12, seeded).
rf_1 = RandomForestRegressor(n_estimators=1, max_depth=12, random_state=0).fit(X_train, y_train)
# Last expression of the cell: score on the held-out split.
rf_1.score(X_test, y_test)

0.28975813561055663

In [62]:
# Random forest with five estimators (depth capped at 12, seeded).
rf_5 = RandomForestRegressor(n_estimators=5, max_depth=12, random_state=0).fit(X_train, y_train)
# Last expression of the cell: score on the held-out split.
rf_5.score(X_test, y_test)

0.46565999793360846

In [63]:
# Random forest with ten estimators (depth capped at 12, seeded).
rf_10 = RandomForestRegressor(n_estimators=10, max_depth=12, random_state=0).fit(X_train, y_train)
# Last expression of the cell: score on the held-out split.
rf_10.score(X_test, y_test)

0.4907873297964497

In [64]:
# Side-by-side view of each feature's correlation with 'response' and the
# feature importances reported by the fitted tree and forests.
feature_names = list(data.drop('response',axis=1).columns)

# Pick the correlations by label rather than with a positional [1:] slice, so the
# values stay aligned with feature_names wherever 'response' sits in the frame.
correlations = list(data.corr().loc[feature_names,'response'])
tree_importances = list(tree.feature_importances_)
rf_1_importances = list(rf_1.feature_importances_)
rf_5_importances = list(rf_5.feature_importances_)
rf_10_importances = list(rf_10.feature_importances_)

# One row per feature, one column per measure (column order fixed explicitly).
result_columns = ['correlations','tree_importances','rf_1_importances','rf_5_importances','rf_10_importances']
results = pd.DataFrame(
    {
        'correlations': correlations,
        'tree_importances': tree_importances,
        'rf_1_importances': rf_1_importances,
        'rf_5_importances': rf_5_importances,
        'rf_10_importances': rf_10_importances,
    },
    index=feature_names,
    columns=result_columns,
)

# Rank by absolute correlation, strongest first, using the unrounded values so
# that features which only tie after rounding keep their true order; round for
# display afterwards. sort_values also places NaN correlations at the end.
ranking = results['correlations'].abs().sort_values(ascending=False).index
results = results.loc[ranking].round(2)
results

Unnamed: 0,correlations,tree_importances,rf_1_importances,rf_5_importances,rf_10_importances
below_poverty_level,-0.44,0.38,0.28,0.27,0.28
make_over_75k,0.39,0.09,0.09,0.09,0.09
white,0.36,0.07,0.07,0.06,0.07
no_high_school,-0.34,0.01,0.03,0.02,0.02
black,-0.34,0.03,0.03,0.04,0.04
pop,0.26,0.07,0.06,0.07,0.06
other_race,-0.19,0.03,0.04,0.03,0.03
hispanic,-0.19,0.02,0.03,0.03,0.03
noncitizen,-0.15,0.0,0.01,0.01,0.02
spanish_poor_english,-0.15,0.0,0.01,0.01,0.01


## Running again while holding out the economic, education, population, and racial-majority data, to develop a better idea of which minorities are most at risk

In [65]:
# Columns held out for this second pass.
held_out_columns = ['make_over_75k','white','below_poverty_level','no_high_school','rural','pop','avgfamsize']

# `data` is overwritten here, so a second execution of this cell would hit
# columns that are already gone; errors='ignore' keeps the cell re-runnable.
# NOTE: this also silences misspelled names, so keep the list in sync with the CSV.
data = data.drop(columns=held_out_columns, errors='ignore')

# Re-split the reduced frame with the same seed as the first split.
X_train, X_test, y_train, y_test = train_test_split(data.drop('response',axis=1),data.response,random_state=0)

In [66]:
# Refit every model on the current training split.
lr = LinearRegression()
lr.fit(X_train,y_train)
# random_state is pinned (0, like the forests below) so that ties between equally
# good candidate splits are broken the same way on every run; without it the
# tree's importances can drift between runs.
tree = DecisionTreeRegressor(max_depth=8,random_state=0)
tree.fit(X_train,y_train)
rf_1 = RandomForestRegressor(max_depth=12,n_estimators=1,random_state=0)
rf_1.fit(X_train,y_train)
rf_5 = RandomForestRegressor(max_depth=12,n_estimators=5,random_state=0)
rf_5.fit(X_train,y_train)
rf_10 = RandomForestRegressor(max_depth=12,n_estimators=10,random_state=0)
rf_10.fit(X_train,y_train)

# Side-by-side view of each feature's correlation with 'response' and the
# feature importances reported by the fitted tree and forests.
feature_names = list(data.drop('response',axis=1).columns)

# Pick the correlations by label rather than with a positional [1:] slice, so the
# values stay aligned with feature_names wherever 'response' sits in the frame.
correlations = list(data.corr().loc[feature_names,'response'])
tree_importances = list(tree.feature_importances_)
rf_1_importances = list(rf_1.feature_importances_)
rf_5_importances = list(rf_5.feature_importances_)
rf_10_importances = list(rf_10.feature_importances_)

# One row per feature, one column per measure (column order fixed explicitly).
result_columns = ['correlations','tree_importances','rf_1_importances','rf_5_importances','rf_10_importances']
results = pd.DataFrame(
    {
        'correlations': correlations,
        'tree_importances': tree_importances,
        'rf_1_importances': rf_1_importances,
        'rf_5_importances': rf_5_importances,
        'rf_10_importances': rf_10_importances,
    },
    index=feature_names,
    columns=result_columns,
)

# Rank by absolute correlation, strongest first, using the unrounded values so
# that features which only tie after rounding keep their true order; round for
# display afterwards. sort_values also places NaN correlations at the end.
ranking = results['correlations'].abs().sort_values(ascending=False).index
results = results.loc[ranking].round(2)
results

Unnamed: 0,correlations,tree_importances,rf_1_importances,rf_5_importances,rf_10_importances
black,-0.34,0.31,0.24,0.24,0.24
hispanic,-0.19,0.05,0.05,0.05,0.04
other_race,-0.19,0.08,0.09,0.11,0.11
spanish_poor_english,-0.15,0.0,0.01,0.01,0.01
noncitizen,-0.15,0.04,0.03,0.03,0.04
AIAN,-0.14,0.11,0.11,0.07,0.08
under18,0.12,0.05,0.06,0.07,0.08
multiracial,-0.11,0.03,0.06,0.05,0.05
asian,0.11,0.14,0.13,0.15,0.15
foreign_born,-0.06,0.01,0.02,0.03,0.03
