In [236]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [238]:
# Load the EPA Smart Location Database CSV export into dfWalk.
dfWalk = pd.read_csv(
    filepath_or_buffer='/Users/austincoffelt/Downloads/Rent-Walk-local/EPA_SmartLocationDatabase_V3_Jan_2021_Final.csv'
)
# Show the first five rows as this cell's output.
dfWalk.head(5)

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area
0,1,481130000000.0,481130000000.0,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.184697,0.000476,0.137707,6,14,15,17,14.0,3110.36082,297836.0831
1,2,481130000000.0,481130000000.0,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.323221,0.000801,0.231868,3,10,12,14,10.833333,3519.46911,484945.1466
2,3,481130000000.0,481130000000.0,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.314628,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,106705.9281
3,4,481130000000.0,481130000000.0,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.229821,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,481828.4303
4,5,481130000000.0,481130000000.0,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.164863,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,687684.7752


In [239]:
# Cast the state, county, tract and block-group code columns to strings so
# they can later be zero-padded and concatenated into a GEOID.
#
# Plain column assignment is used instead of `dfWalk.loc[:, cols] = ...`:
# the `.loc` form writes into the existing columns in place, and newer pandas
# versions deprecate storing strings into a numeric column that way.
# Assigning by column name replaces each column outright, so the string
# dtype always takes effect.
fipsCols = ['STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE']
dfWalk[fipsCols] = dfWalk[fipsCols].astype('str')

In [240]:
# Left-pad the code columns with zeros to fixed widths (state = 2, county = 3,
# tract = 6 characters) so they can be concatenated into GEOIDs.
# `.str.zfill` pads a whole column in one vectorised call; padding one cell at
# a time with `.loc[i, col]` in a Python loop over every row is far slower and
# only works when the index is exactly 0..n-1.
for fipsCol, fipsWidth in (('STATEFP', 2), ('COUNTYFP', 3), ('TRACTCE', 6)):
    # astype(str) leaves string columns unchanged and keeps this cell working
    # even if a column has not been converted to strings yet.
    dfWalk[fipsCol] = dfWalk[fipsCol].astype(str).str.zfill(fipsWidth)

dfWalk.head()

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area
0,1,481130000000.0,481130000000.0,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.184697,0.000476,0.137707,6,14,15,17,14.0,3110.36082,297836.0831
1,2,481130000000.0,481130000000.0,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.323221,0.000801,0.231868,3,10,12,14,10.833333,3519.46911,484945.1466
2,3,481130000000.0,481130000000.0,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.314628,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,106705.9281
3,4,481130000000.0,481130000000.0,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.229821,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,481828.4303
4,5,481130000000.0,481130000000.0,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.164863,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,687684.7752


In [243]:
# Build the merge key 'realGEOID': the literal prefix '1500000US' followed by
# the state, county, tract and block-group code columns, in that order.
geoidKey = '1500000US'
for fipsPart in ['STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE']:
    geoidKey = geoidKey + dfWalk[fipsPart]
dfWalk['realGEOID'] = geoidKey
dfWalk.head()

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area,realGEOID
0,1,481130000000.0,481130000000.0,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000476,0.137707,6,14,15,17,14.0,3110.36082,297836.0831,1500000US481130078254
1,2,481130000000.0,481130000000.0,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000801,0.231868,3,10,12,14,10.833333,3519.46911,484945.1466,1500000US481130078252
2,3,481130000000.0,481130000000.0,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,106705.9281,1500000US481130078253
3,4,481130000000.0,481130000000.0,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,481828.4303,1500000US481130078241
4,5,481130000000.0,481130000000.0,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,687684.7752,1500000US481130078242


In [244]:
# Append a column of ones named 'Intercept'
# (presumably the constant term for a later regression - confirm downstream).
dfWalk = dfWalk.assign(Intercept=1)

In [245]:
# Columns kept for the analysis, in the order they should appear.
imptCols = [
    'realGEOID', 'Intercept', 'NatWalkInd', 'STATEFP',
    'CountHU', 'P_WrkAge', 'Pct_AO0', 'Pct_AO2p',
    'R_LowWageWk', 'R_HiWageWk', 'R_PCTLOWWAGE', 'TotEmp',
    'D1A', 'D1B', 'D1D', 'D2A_JPHH',
    'D4E', 'D5AR', 'D5BR', 'D5CRI',
]

# Source column name to descriptive name. Columns not listed here
# ('realGEOID', 'Intercept', 'TotEmp') keep their original names.
# NOTE(review): 'employent_housing_density' is misspelled; it is kept as-is
# because other cells may refer to the column by this exact name.
renamedCols = {
    'NatWalkInd': 'Walk_Index',
    'STATEFP': 'state',
    'CountHU': 'count_housing_units',
    'P_WrkAge': 'percentage_work_age',
    'Pct_AO0': 'percent_no_car',
    'Pct_AO2p': 'percent_two_car',
    'R_LowWageWk': 'count_low_wage_workers',
    'R_HiWageWk': 'count_high_wage_workers',
    'R_PCTLOWWAGE': 'percent_low_wage_workers',
    'D1A': 'housing_density',
    'D1B': 'population_density',
    'D1D': 'employent_housing_density',
    'D2A_JPHH': 'jobs_per_household',
    'D4E': 'transit_frequency',
    'D5AR': 'jobs_within_45_minutes_auto',
    'D5BR': 'jobs_within_45_minutes_transit',
    'D5CRI': 'regional_centrality',
}

# Restrict dfWalk to the selected columns, then apply the new names.
dfWalk = dfWalk[imptCols].rename(columns=renamedCols)
dfWalk.head()

Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,TotEmp,housing_density,population_density,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality
0,1500000US481130078254,1,14.0,48,460.0,0.549,0.163121,0.744681,99,191,0.240291,66,6.250422,16.332625,7.147222,0.156028,0.003602,433601,135362,0.785893
1,1500000US481130078252,1,10.833333,48,409.0,0.466,0.0,0.589242,76,212,0.192405,25,3.430799,5.955666,3.640506,0.061125,0.006099,386504,236885,0.700531
2,1500000US481130078253,1,8.333333,48,365.0,0.811,0.057751,0.507599,136,138,0.293737,0,13.843035,27.951553,13.843035,0.0,0.004071,404573,230587,0.733281
3,1500000US481130078241,1,15.666667,48,384.0,0.638,0.0,0.888021,60,302,0.139211,253,3.225246,7.592767,5.350213,0.658854,0.007378,423099,168433,0.766859
4,1500000US481130078242,1,10.166667,48,343.0,0.506,0.014577,0.790087,91,404,0.157168,32,2.305992,6.373413,2.521128,0.093294,0.007036,335700,120826,0.60845


In [246]:
# Translate the two-character state codes in dfWalk['state'] into state names.
# A single lookup table replaces one assignment statement per state, and the
# previously misspelled values are corrected: 'Deleware' is now 'Delaware'
# and 'South Carolian' is now 'South Carolina'.
stateNamesByFips = {
    '01': 'Alabama',
    '02': 'Alaska',
    '04': 'Arizona',
    '05': 'Arkansas',
    '06': 'California',
    '08': 'Colorado',
    '09': 'Connecticut',
    '10': 'Delaware',
    '11': 'District of Columbia',
    '12': 'Florida',
    '13': 'Georgia',
    '15': 'Hawaii',
    '16': 'Idaho',
    '17': 'Illinois',
    '18': 'Indiana',
    '19': 'Iowa',
    '20': 'Kansas',
    '21': 'Kentucky',
    '22': 'Louisiana',
    '23': 'Maine',
    '24': 'Maryland',
    '25': 'Massachusetts',
    '26': 'Michigan',
    '27': 'Minnesota',
    '28': 'Mississippi',
    '29': 'Missouri',
    '30': 'Montana',
    '31': 'Nebraska',
    '32': 'Nevada',
    '33': 'New Hampshire',
    '34': 'New Jersey',
    '35': 'New Mexico',
    '36': 'New York',
    '37': 'North Carolina',
    '38': 'North Dakota',
    '39': 'Ohio',
    '40': 'Oklahoma',
    '41': 'Oregon',
    '42': 'Pennsylvania',
    '44': 'Rhode Island',
    '45': 'South Carolina',
    '46': 'South Dakota',
    '47': 'Tennessee',
    '48': 'Texas',
    '49': 'Utah',
    '50': 'Vermont',
    '51': 'Virginia',
    '53': 'Washington',
    '54': 'West Virginia',
    '55': 'Wisconsin',
    '56': 'Wyoming',
}

# Values without an entry in the table (any other code, or a name produced by
# an earlier run of this cell) are passed through unchanged, which matches the
# behaviour of the per-state assignments and keeps the cell safe to re-run.
dfWalk['state'] = dfWalk['state'].map(lambda fips: stateNamesByFips.get(fips, fips))
print(dfWalk.shape)
dfWalk.head()

(220740, 20)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,TotEmp,housing_density,population_density,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,0.240291,66,6.250422,16.332625,7.147222,0.156028,0.003602,433601,135362,0.785893
1,1500000US481130078252,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,212,0.192405,25,3.430799,5.955666,3.640506,0.061125,0.006099,386504,236885,0.700531
2,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,0.293737,0,13.843035,27.951553,13.843035,0.0,0.004071,404573,230587,0.733281
3,1500000US481130078241,1,15.666667,Texas,384.0,0.638,0.0,0.888021,60,302,0.139211,253,3.225246,7.592767,5.350213,0.658854,0.007378,423099,168433,0.766859
4,1500000US481130078242,1,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,0.157168,32,2.305992,6.373413,2.521128,0.093294,0.007036,335700,120826,0.60845


In [252]:
# Load the 2018 race table (header=1: column names come from the second line
# of the file), derive the non-white share as 1 - white / total, rename the
# total to 'Population', and keep only the key plus those two columns.
temp = (
    pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Race_2018/Race_2018.csv', header=1)
    .assign(
        percent_non_white=lambda race: 1 - race['Estimate!!Total!!White alone'] / race['Estimate!!Total']
    )
    .rename(columns={'Estimate!!Total': 'Population'})
    .loc[:, ['Geography', 'Population', 'percent_non_white']]
)
temp.head()

Unnamed: 0,Geography,Population,percent_non_white
0,1500000US010010201001,636,0.141509
1,1500000US010010201002,1287,0.174048
2,1500000US010010202001,810,0.676543
3,1500000US010010202002,1218,0.542693
4,1500000US010010203001,2641,0.340401


In [254]:
# Attach Population and percent_non_white to dfWalk by matching realGEOID
# against the race table's Geography key. The inner join keeps only rows
# found in both frames; the redundant key column is dropped afterwards.
dfWalk = pd.merge(
    dfWalk,
    temp,
    how='inner',
    left_on='realGEOID',
    right_on='Geography',
).drop(columns='Geography')
print(dfWalk.shape)
dfWalk.head()

(220333, 22)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,housing_density,population_density,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Population,percent_non_white
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,6.250422,16.332625,7.147222,0.156028,0.003602,433601,135362,0.785893,1202,0.400998
1,1500000US481130078252,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,212,...,3.430799,5.955666,3.640506,0.061125,0.006099,386504,236885,0.700531,710,0.119718
2,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,13.843035,27.951553,13.843035,0.0,0.004071,404573,230587,0.733281,737,0.770692
3,1500000US481130078241,1,15.666667,Texas,384.0,0.638,0.0,0.888021,60,302,...,3.225246,7.592767,5.350213,0.658854,0.007378,423099,168433,0.766859,904,0.095133
4,1500000US481130078242,1,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,2.305992,6.373413,2.521128,0.093294,0.007036,335700,120826,0.60845,948,0.047468


In [256]:
# Covariance matrix of the walk index and the non-white share.
dfWalk.loc[:, ['Walk_Index', 'percent_non_white']].cov()

Unnamed: 0,Walk_Index,percent_non_white
Walk_Index,19.049439,0.377049
percent_non_white,0.377049,0.070722


In [258]:
# Load the 2018 rent table, expose the median contract rent under a short
# name, keep only the key and that column, and discard rows whose value is
# the '-' placeholder (no estimate available).
temp = (
    pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Rent_2018/Rent_2018.csv', header=1)
    .rename(columns={'Estimate!!Median contract rent': 'Median_Contract_Rent'})
    .loc[:, ['Geography', 'Median_Contract_Rent']]
    .loc[lambda rent: rent['Median_Contract_Rent'] != '-']
)
temp.head()

Unnamed: 0,Geography,Median_Contract_Rent
0,1500000US010010201001,607
1,1500000US010010201002,532
2,1500000US010010202001,404
3,1500000US010010202002,646
4,1500000US010010203001,685


In [260]:
# Attach median contract rent to dfWalk (inner join of realGEOID against the
# rent table's Geography key), then drop the redundant key column.
dfWalk = pd.merge(
    dfWalk, temp, how='inner', left_on='realGEOID', right_on='Geography'
).drop(columns='Geography')

# '3,500+' and '100-' are open-ended codes rather than numbers, so those rows
# are excluded before the integer cast below.
codedRent = dfWalk['Median_Contract_Rent'].isin(['3,500+', '100-'])

# Take an explicit copy of the filtered rows: the next line adds a column, and
# assigning onto a boolean-filtered slice triggers pandas'
# SettingWithCopyWarning (currently hidden by the notebook-wide warnings filter).
dfWalk = dfWalk.loc[~codedRent].copy()

# Natural log of rent; 'Median_Contract_Rent' itself is left as read.
dfWalk['log_Median_Contract_Rent'] = np.log(dfWalk['Median_Contract_Rent'].astype('int'))
print(dfWalk.shape)
dfWalk.head()

(176383, 24)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Population,percent_non_white,Median_Contract_Rent,log_Median_Contract_Rent
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,7.147222,0.156028,0.003602,433601,135362,0.785893,1202,0.400998,838,6.731018
1,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,13.843035,0.0,0.004071,404573,230587,0.733281,737,0.770692,707,6.561031
2,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,11.028441,0.006036,0.002246,402287,138562,0.729137,1336,0.561377,859,6.755769
3,1500000US481130093012,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,2.252783,0.002717,0.002596,263813,8873,0.478156,1541,0.143413,581,6.364751
4,1500000US481130011022,1,13.166667,Texas,255.0,0.69,0.145098,0.552941,49,302,...,4.797439,0.203922,0.009142,372503,275466,0.675155,583,0.102916,1375,7.226209


In [262]:
# Load the household-size table, expose the renter-occupied average under a
# short name, keep only the key and that column, and discard rows whose value
# is the '-' placeholder.
# NOTE(review): this file is the 2019 table while the race and rent tables
# are 2018 - confirm the mixed vintage is intended.
temp = (
    pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/HH_Size_2019/HH_Size_2019.csv', header=1)
    .rename(columns={'Estimate!!Average household size --!!Total:!!Renter occupied': 'avg_HH_size_renters'})
    .loc[:, ['Geography', 'avg_HH_size_renters']]
    .loc[lambda hh: hh['avg_HH_size_renters'] != '-']
)
temp.head()

Unnamed: 0,Geography,avg_HH_size_renters
0,1500000US010010201001,1.78
1,1500000US010010201002,3.4
2,1500000US010010202001,2.42
3,1500000US010010202002,2.14
4,1500000US010010203001,2.49


In [264]:
# Attach the renters' average household size to dfWalk (inner join of
# realGEOID against Geography), drop the redundant key column, and cast the
# new column to float.
dfWalk = (
    pd.merge(dfWalk, temp, how='inner', left_on='realGEOID', right_on='Geography')
    .drop(columns='Geography')
    .astype({'avg_HH_size_renters': 'float'})
)
print(dfWalk.shape)
dfWalk.head()

(175330, 25)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Population,percent_non_white,Median_Contract_Rent,log_Median_Contract_Rent,avg_HH_size_renters
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,0.156028,0.003602,433601,135362,0.785893,1202,0.400998,838,6.731018,2.55
1,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.0,0.004071,404573,230587,0.733281,737,0.770692,707,6.561031,2.05
2,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,0.006036,0.002246,402287,138562,0.729137,1336,0.561377,859,6.755769,2.72
3,1500000US481130093012,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,0.002717,0.002596,263813,8873,0.478156,1541,0.143413,581,6.364751,4.81
4,1500000US481130011022,1,13.166667,Texas,255.0,0.69,0.145098,0.552941,49,302,...,0.203922,0.009142,372503,275466,0.675155,583,0.102916,1375,7.226209,1.88


In [266]:
# Covariance matrix of the walk index and renters' average household size.
dfWalk.loc[:, ['Walk_Index', 'avg_HH_size_renters']].cov()

Unnamed: 0,Walk_Index,avg_HH_size_renters
Walk_Index,18.597667,-0.127653
avg_HH_size_renters,-0.127653,0.793796


In [128]:
# read in income inequality data (GINI Index)
#temp = pd.read_csv('GINI_Index_2013/GINI_Index_2013.csv', header = 1)
#temp = temp.rename({'Estimate!!Gini Index': 'GINI_Index'}, axis = 1)
#temp = temp[['Geography', 'GINI_Index']]

#remove observations without numbers
#temp = temp[temp['GINI_Index'] != '-']
#temp.head()

Unnamed: 0,Geography,GINI_Index
0,1500000US010010201001,0.3386
1,1500000US010010201002,0.4121
2,1500000US010010202001,0.4151
3,1500000US010010202002,0.3972
4,1500000US010010203001,0.3981


In [129]:
# merge dfWalk with GINI Index data on GEOID
#dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
#dfWalk = dfWalk.drop('Geography', axis = 1)
#print(dfWalk.shape)
#dfWalk.head()

(130237, 25)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Population,percent_non_white,Median_Contract_Rent,avg_HH_size_renters,GINI_Index
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,0.156028,0.003602,433601,135362,0.785893,1508,0.259284,942,2.42,0.5129
1,1500000US481130078252,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,212,...,0.061125,0.006099,386504,236885,0.700531,1027,0.771178,770,1.72,0.3731
2,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.0,0.004071,404573,230587,0.733281,662,0.853474,806,2.23,0.2593
3,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,0.006036,0.002246,402287,138562,0.729137,1135,0.698678,1075,2.27,0.4482
4,1500000US481130093012,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,0.002717,0.002596,263813,8873,0.478156,1367,0.734455,917,4.1,0.3393


In [130]:
#dfWalk[['Walk_Index', 'GINI_Index']].cov()

Unnamed: 0,Walk_Index,GINI_Index
Walk_Index,18.608666,0.026798
GINI_Index,0.026798,0.005939


In [202]:
# Load the household heating table and compute the share of units heated by
# utility gas (utility-gas count divided by the total count); keep only the
# key and that share.
temp = (
    pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Household_Heating/Household_Heating.csv', header=1)
    .assign(
        percent_gas_energy=lambda heat: heat['Estimate!!Total:!!Utility gas'] / heat['Estimate!!Total:']
    )
    .loc[:, ['Geography', 'percent_gas_energy']]
)
temp.head()

Unnamed: 0,Geography,percent_gas_energy
0,1500000US010010201001,0.337302
1,1500000US010010201002,0.492997
2,1500000US010010202001,0.305147
3,1500000US010010202002,0.716463
4,1500000US010010203001,0.646251


In [203]:
# Attach percent_gas_energy to dfWalk (inner join of realGEOID against
# Geography) and drop the redundant key column.
dfWalk = pd.merge(
    dfWalk,
    temp,
    how='inner',
    left_on='realGEOID',
    right_on='Geography',
).drop(columns='Geography')
print(dfWalk.shape)
dfWalk.head()

(139749, 26)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Population,percent_non_white,Median_Contract_Rent,log_Median_Contract_Rent,avg_HH_size_renters,percent_gas_energy
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,0.003602,433601,135362,0.785893,1202,0.400998,838,6.731018,2.55,0.48533
1,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.004071,404573,230587,0.733281,737,0.770692,707,6.561031,2.05,0.191686
2,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,0.002246,402287,138562,0.729137,1336,0.561377,859,6.755769,2.72,0.0
3,1500000US481130093012,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,0.002596,263813,8873,0.478156,1541,0.143413,581,6.364751,4.81,0.86236
4,1500000US481130011022,1,13.166667,Texas,255.0,0.69,0.145098,0.552941,49,302,...,0.009142,372503,275466,0.675155,583,0.102916,1375,7.226209,1.88,0.619469


In [204]:
# Covariance matrix of the walk index and the utility-gas heating share.
dfWalk.loc[:, ['Walk_Index', 'percent_gas_energy']].cov()

Unnamed: 0,Walk_Index,percent_gas_energy
Walk_Index,18.682062,0.451995
percent_gas_energy,0.451995,0.097799


In [134]:
# get internet access data and find the percent that have no internet access
#temp = pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Internet_Access/Internet_Access.csv', header = 1)
#temp['percent_no_internet'] = temp['Estimate!!Total:!!No Internet access'] / temp['Estimate!!Total:']
#temp = temp[['Geography', 'percent_no_internet']]
#temp.head()

Unnamed: 0,Geography,percent_no_internet
0,1500000US010010201001,0.206349
1,1500000US010010201002,0.10084
2,1500000US010010202001,0.136029
3,1500000US010010202002,0.234756
4,1500000US010010203001,0.089757


In [135]:
# merge dfWalk with internet access data on GEOID
#dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
#dfWalk = dfWalk.drop('Geography', axis = 1)
#print(dfWalk.shape)
#dfWalk.head()

(130237, 30)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,Population,percent_non_white,Median_Contract_Rent,avg_HH_size_renters,GINI_Index,percent_gas_energy,percent_electric_energy,percent_solar_energy,percent_no_heating,percent_no_internet
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,1508,0.259284,942,2.42,0.5129,0.48533,0.51467,0.0,0.0,0.095355
1,1500000US481130078252,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,212,...,1027,0.771178,770,1.72,0.3731,0.07377,0.840164,0.0,0.0,0.114754
2,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,662,0.853474,806,2.23,0.2593,0.191686,0.808314,0.0,0.0,0.189376
3,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,1135,0.698678,1075,2.27,0.4482,0.0,1.0,0.0,0.0,0.098501
4,1500000US481130093012,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,1367,0.734455,917,4.1,0.3393,0.86236,0.02809,0.0,0.0,0.078652


In [136]:
#dfWalk[['Walk_Index', 'percent_no_internet']].cov()

Unnamed: 0,Walk_Index,percent_no_internet
Walk_Index,18.608666,-0.064713
percent_no_internet,-0.064713,0.012442


In [137]:
# read in kitchen data
#temp = pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Kitchen_Tenure/Kitchen_Tenure.csv', header = 1)

# get percentage that are lacking kitchen utilities
#temp['percent_lacking_kitchen_renter'] = temp['Estimate!!Total:!!Renter occupied:!!Lacking complete kitchen facilities'] / temp['Estimate!!Total:!!Renter occupied:']
#temp = temp[['Geography', 'percent_lacking_kitchen_renter']]
#temp.head()

Unnamed: 0,Geography,percent_lacking_kitchen_renter
0,1500000US010010201001,0.0
1,1500000US010010201002,0.0
2,1500000US010010202001,0.0
3,1500000US010010202002,0.0
4,1500000US010010203001,0.0


In [138]:
# merge dfWalk with kitchen data on GEOID
#dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
#dfWalk = dfWalk.drop('Geography', axis = 1)
#print(dfWalk.shape)
#dfWalk.head()

(130237, 31)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,percent_non_white,Median_Contract_Rent,avg_HH_size_renters,GINI_Index,percent_gas_energy,percent_electric_energy,percent_solar_energy,percent_no_heating,percent_no_internet,percent_lacking_kitchen_renter
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,0.259284,942,2.42,0.5129,0.48533,0.51467,0.0,0.0,0.095355,0.0
1,1500000US481130078252,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,212,...,0.771178,770,1.72,0.3731,0.07377,0.840164,0.0,0.0,0.114754,0.0
2,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.853474,806,2.23,0.2593,0.191686,0.808314,0.0,0.0,0.189376,0.0
3,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,0.698678,1075,2.27,0.4482,0.0,1.0,0.0,0.0,0.098501,0.0
4,1500000US481130093012,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,0.734455,917,4.1,0.3393,0.86236,0.02809,0.0,0.0,0.078652,0.0


In [139]:
#dfWalk[['Walk_Index', 'percent_lacking_kitchen_renter']].cov()

Unnamed: 0,Walk_Index,percent_lacking_kitchen_renter
Walk_Index,18.608666,-0.000603
percent_lacking_kitchen_renter,-0.000603,0.003483


In [140]:
# read in living arrangement data
#temp = pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Living_Arrangements/Living_Arrangements.csv', header = 1)

# get percentage that live alone
#temp['percent_lives_alone'] = temp['Estimate!!Total:!!Lives alone'] / temp['Estimate!!Total:']
#temp = temp[['Geography', 'percent_lives_alone']]
#temp.head()

Unnamed: 0,Geography,percent_lives_alone
0,1500000US010010201001,0.132404
1,1500000US010010201002,0.051345
2,1500000US010010202001,0.09736
3,1500000US010010202002,0.114086
4,1500000US010010203001,0.074187


In [141]:
# merge dfWalk with living alone data on GEOID
#dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
#dfWalk = dfWalk.drop('Geography', axis = 1)
#print(dfWalk.shape)
#dfWalk.head()

(130237, 32)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,Median_Contract_Rent,avg_HH_size_renters,GINI_Index,percent_gas_energy,percent_electric_energy,percent_solar_energy,percent_no_heating,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,942,2.42,0.5129,0.48533,0.51467,0.0,0.0,0.095355,0.0,0.238619
1,1500000US481130078252,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,212,...,770,1.72,0.3731,0.07377,0.840164,0.0,0.0,0.114754,0.0,0.42228
2,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,806,2.23,0.2593,0.191686,0.808314,0.0,0.0,0.189376,0.0,0.295714
3,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,1075,2.27,0.4482,0.0,1.0,0.0,0.0,0.098501,0.0,0.328788
4,1500000US481130093012,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,917,4.1,0.3393,0.86236,0.02809,0.0,0.0,0.078652,0.0,0.008937


In [142]:
#dfWalk[['Walk_Index', 'percent_lives_alone']].cov()

Unnamed: 0,Walk_Index,percent_lives_alone
Walk_Index,18.608666,0.076088
percent_lives_alone,0.076088,0.012223


In [205]:
# Load the 2018 household income table, expose the median income under a
# short name, keep only the key and that column, and discard rows whose value
# is the '-' placeholder.
# NOTE(review): this path spells the folder 'Rent-Walk-Local' while earlier
# cells use 'Rent-Walk-local'; that only resolves to the same folder on a
# case-insensitive filesystem - confirm and unify.
temp = (
    pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-Local/HH_Income_2018/HH_Income_2018.csv', header=1)
    .rename(columns={
        'Estimate!!Median household income in the past 12 months (in 2018 inflation-adjusted dollars)': 'Median_Household_Income'
    })
    .loc[:, ['Geography', 'Median_Household_Income']]
    .loc[lambda income: income['Median_Household_Income'] != '-']
)
temp.head()

Unnamed: 0,Geography,Median_Household_Income
0,1500000US010010201001,26579
1,1500000US010010201002,82750
2,1500000US010010202001,27500
3,1500000US010010202002,49276
4,1500000US010010203001,58235


In [206]:
# Attach Median_Household_Income to dfWalk (inner join of realGEOID against
# Geography) and drop the redundant key column.
dfWalk = pd.merge(
    dfWalk,
    temp,
    how='inner',
    left_on='realGEOID',
    right_on='Geography',
).drop(columns='Geography')
print(dfWalk.shape)
dfWalk.head()

(136887, 27)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Population,percent_non_white,Median_Contract_Rent,log_Median_Contract_Rent,avg_HH_size_renters,percent_gas_energy,Median_Household_Income
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,433601,135362,0.785893,1202,0.400998,838,6.731018,2.55,0.48533,54154
1,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,404573,230587,0.733281,737,0.770692,707,6.561031,2.05,0.191686,33996
2,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,402287,138562,0.729137,1336,0.561377,859,6.755769,2.72,0.0,31213
3,1500000US481130093012,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,263813,8873,0.478156,1541,0.143413,581,6.364751,4.81,0.86236,53191
4,1500000US481130011022,1,13.166667,Texas,255.0,0.69,0.145098,0.552941,49,302,...,372503,275466,0.675155,583,0.102916,1375,7.226209,1.88,0.619469,139875


In [216]:
# Keep only rows with a usable median household income: drop missing values
# and the open-ended codes '250,000+' and '2,500-', which cannot be cast to
# an integer.
incomeCol = dfWalk['Median_Household_Income']
usableIncome = incomeCol.notna() & ~incomeCol.isin(['250,000+', '2,500-'])

# Take an explicit copy of the filtered rows: the next line adds a column, and
# assigning onto a boolean-filtered slice triggers pandas'
# SettingWithCopyWarning (currently hidden by the notebook-wide warnings filter).
dfWalk = dfWalk.loc[usableIncome].copy()

# Natural log of income; 'Median_Household_Income' itself is left as read.
dfWalk['log_Median_Household_Income'] = np.log(dfWalk['Median_Household_Income'].astype('int'))

In [218]:
# Covariance matrix of the walk index and log median household income.
dfWalk.loc[:, ['Walk_Index', 'log_Median_Household_Income']].cov()

Unnamed: 0,Walk_Index,log_Median_Household_Income
Walk_Index,18.701706,0.010404
log_Median_Household_Income,0.010404,0.247079


In [220]:
# Load the 2018 median-rooms table, expose the renter-occupied median under a
# short name, keep only the key and that column, and discard rows whose value
# is the '-' placeholder.
# NOTE(review): this path spells the folder 'Rent-Walk-Local' while earlier
# cells use 'Rent-Walk-local'; that only resolves to the same folder on a
# case-insensitive filesystem - confirm and unify.
temp = (
    pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-Local/Median_Rooms_2018/Median_Rooms_2018.csv', header=1)
    .rename(columns={'Estimate!!Median number of rooms --!!Renter occupied': 'Median_Num_Rooms'})
    .loc[:, ['Geography', 'Median_Num_Rooms']]
    .loc[lambda rooms: rooms['Median_Num_Rooms'] != '-']
)
temp.head()

Unnamed: 0,Geography,Median_Num_Rooms
0,1500000US010010201001,5.2
1,1500000US010010201002,5.7
2,1500000US010010202001,4.9
3,1500000US010010202002,5.4
4,1500000US010010203001,5.7


In [222]:
# Attach Median_Num_Rooms to dfWalk (inner join of realGEOID against
# Geography) and drop the redundant key column.
dfWalk = pd.merge(
    dfWalk, temp, how='inner', left_on='realGEOID', right_on='Geography'
).drop(columns='Geography')

# '9.0+' is an open-ended code rather than a number, so those rows are
# excluded before the float cast. The explicit copy avoids overwriting a
# column on a boolean-filtered slice, which triggers pandas'
# SettingWithCopyWarning (currently hidden by the notebook-wide warnings filter).
dfWalk = dfWalk.loc[dfWalk['Median_Num_Rooms'] != '9.0+'].copy()
dfWalk['Median_Num_Rooms'] = dfWalk['Median_Num_Rooms'].astype('float')
print(dfWalk.shape)
dfWalk.head()

(135626, 29)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,regional_centrality,Population,percent_non_white,Median_Contract_Rent,log_Median_Contract_Rent,avg_HH_size_renters,percent_gas_energy,Median_Household_Income,log_Median_Household_Income,Median_Num_Rooms
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,0.785893,1202,0.400998,838,6.731018,2.55,0.48533,54154,10.899587,4.3
1,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.733281,737,0.770692,707,6.561031,2.05,0.191686,33996,10.433998,3.5
2,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,0.729137,1336,0.561377,859,6.755769,2.72,0.0,31213,10.34859,4.2
3,1500000US481130093012,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,0.478156,1541,0.143413,581,6.364751,4.81,0.86236,53191,10.881644,5.0
4,1500000US481130011022,1,13.166667,Texas,255.0,0.69,0.145098,0.552941,49,302,...,0.675155,583,0.102916,1375,7.226209,1.88,0.619469,139875,11.848504,3.1


In [224]:
# Covariance matrix of the walk index and the median number of rooms.
dfWalk.loc[:, ['Walk_Index', 'Median_Num_Rooms']].cov()

Unnamed: 0,Walk_Index,Median_Num_Rooms
Walk_Index,18.710408,-1.317056
Median_Num_Rooms,-1.317056,1.067654


In [226]:
# Load the 2018 median-year-built table, expose the renter-occupied median
# under a short name, keep only the key and that column, and discard rows
# whose value is the '-' placeholder.
# NOTE(review): this path spells the folder 'Rent-Walk-Local' while earlier
# cells use 'Rent-Walk-local'; that only resolves to the same folder on a
# case-insensitive filesystem - confirm and unify.
temp = (
    pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-Local/Median_Structure_2018/Median_Structure_2018.csv', header=1)
    .rename(columns={'Estimate!!Median year structure built --!!Renter occupied': 'Median_Year_Structure_Built'})
    .loc[:, ['Geography', 'Median_Year_Structure_Built']]
    .loc[lambda built: built['Median_Year_Structure_Built'] != '-']
)
temp.head()

Unnamed: 0,Geography,Median_Year_Structure_Built
0,1500000US010010201001,1965
1,1500000US010010201002,1992
2,1500000US010010202001,1976
3,1500000US010010202002,1969
4,1500000US010010203001,1975


In [228]:
# Attach the median-year-built column to dfWalk, matching block groups by GEOID,
# and drop the duplicate key column. Rows labelled '1939-' are excluded.
# The '2014+' label is replaced by 2016 (the author's choice: the middle year of
# the data) before the whole column is cast to integer.
dfWalk = (
    dfWalk
    .merge(temp, how='inner', left_on='realGEOID', right_on='Geography')
    .drop(columns='Geography')
    .loc[lambda frame: frame['Median_Year_Structure_Built'].ne('1939-')]
    .assign(
        Median_Year_Structure_Built=lambda frame: (
            frame['Median_Year_Structure_Built']
            .mask(frame['Median_Year_Structure_Built'].eq('2014+'), 2016)
            .astype('int')
        )
    )
)
print(dfWalk.shape)
dfWalk.head()

(119298, 30)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,Population,percent_non_white,Median_Contract_Rent,log_Median_Contract_Rent,avg_HH_size_renters,percent_gas_energy,Median_Household_Income,log_Median_Household_Income,Median_Num_Rooms,Median_Year_Structure_Built
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,1202,0.400998,838,6.731018,2.55,0.48533,54154,10.899587,4.3,1983
1,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,737,0.770692,707,6.561031,2.05,0.191686,33996,10.433998,3.5,1986
2,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,1336,0.561377,859,6.755769,2.72,0.0,31213,10.34859,4.2,1984
3,1500000US481130093012,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,1541,0.143413,581,6.364751,4.81,0.86236,53191,10.881644,5.0,1957
4,1500000US481130011022,1,13.166667,Texas,255.0,0.69,0.145098,0.552941,49,302,...,583,0.102916,1375,7.226209,1.88,0.619469,139875,11.848504,3.1,1959


In [230]:
# covariance matrix between the walkability index and the median year built
dfWalk.loc[:, ['Walk_Index', 'Median_Year_Structure_Built']].cov()

Unnamed: 0,Walk_Index,Median_Year_Structure_Built
Walk_Index,18.263188,-14.493845
Median_Year_Structure_Built,-14.493845,244.959861


In [239]:
# get plumbing facilities data
#temp = pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Plumbing_Facilities_Tenure/Plumbing_Facilities_Tenure.csv', header = 1)

# get percentage of renter-occupied units lacking plumbing facilities
#temp['percent_lacking_plumbing_renter'] = temp['Estimate!!Total:!!Renter occupied:!!Lacking plumbing facilities'] / temp['Estimate!!Total:!!Renter occupied:']
#temp = temp[['Geography', 'percent_lacking_plumbing_renter']]
#temp.head()

Unnamed: 0,Geography,percent_lacking_plumbing_renter
0,1500000US010010201001,0.0
1,1500000US010010201002,0.0
2,1500000US010010202001,0.0
3,1500000US010010202002,0.0
4,1500000US010010203001,0.0


In [240]:
# merge dfWalk with plumbing data on GEOID
#dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
#dfWalk = dfWalk.drop('Geography', axis = 1)
#print(dfWalk.shape)
#dfWalk.head()

(127650, 22)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,housing_density,population_density,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Median_Year_Structure_Built,percent_lacking_plumbing_renter
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,6.250422,16.332625,7.147222,0.156028,0.003602,433601,135362,0.785893,1977,0.0
1,1500000US481130078252,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,212,...,3.430799,5.955666,3.640506,0.061125,0.006099,386504,236885,0.700531,1977,0.0
2,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,13.843035,27.951553,13.843035,0.0,0.004071,404573,230587,0.733281,1982,0.0
3,1500000US481130078241,1,15.666667,Texas,384.0,0.638,0.0,0.888021,60,302,...,3.225246,7.592767,5.350213,0.658854,0.007378,423099,168433,0.766859,1974,0.0
4,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,10.969254,26.357776,11.028441,0.006036,0.002246,402287,138562,0.729137,1985,0.0


In [243]:
#dfWalk[['Walk_Index', 'percent_lacking_plumbing_renter']].cov()

Unnamed: 0,Walk_Index,percent_lacking_plumbing_renter
Walk_Index,18.728143,-0.00793
percent_lacking_plumbing_renter,-0.00793,0.001293


In [245]:
# get exam score data
#temp = pd.read_csv('Block_Group_Exam_Score.csv')
#temp.head()

Unnamed: 0,AFFGEOID,weight_avg_math_scores,weight_avg_ELA_scores
0,1500000US010010201001,0.236767,0.499152
1,1500000US010010201002,0.236767,0.499152
2,1500000US010010202001,0.236767,0.499152
3,1500000US010010202002,0.236767,0.499152
4,1500000US010010203001,0.236767,0.499152


In [247]:
# merge dfWalk with exam data on GEOID
#dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'AFFGEOID')
#dfWalk = dfWalk.drop('AFFGEOID', axis = 1)
#print(dfWalk.shape)
#dfWalk.head()

(103513, 24)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Median_Year_Structure_Built,percent_lacking_plumbing_renter,weight_avg_math_scores,weight_avg_ELA_scores
0,1500000US481130078254,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,7.147222,0.156028,0.003602,433601,135362,0.785893,1977,0.0,0.403983,0.439323
1,1500000US481130078252,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,212,...,3.640506,0.061125,0.006099,386504,236885,0.700531,1977,0.0,0.403983,0.439323
2,1500000US481130078253,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,13.843035,0.0,0.004071,404573,230587,0.733281,1982,0.0,0.403983,0.439323
3,1500000US481130078241,1,15.666667,Texas,384.0,0.638,0.0,0.888021,60,302,...,5.350213,0.658854,0.007378,423099,168433,0.766859,1974,0.0,0.403983,0.439323
4,1500000US481130078271,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,11.028441,0.006036,0.002246,402287,138562,0.729137,1985,0.0,0.403983,0.439323


In [249]:
#dfWalk[['Walk_Index', 'weight_avg_math_scores', 'weight_avg_ELA_scores']].cov()

Unnamed: 0,Walk_Index,weight_avg_math_scores,weight_avg_ELA_scores
Walk_Index,18.167443,-0.045625,0.017033
weight_avg_math_scores,-0.045625,0.027472,0.023298
weight_avg_ELA_scores,0.017033,0.023298,0.026191


In [232]:
# load the violent-crime tables, one CSV per city
laCrime = pd.read_csv('LA_Violent_Crime_BG.csv')
noCrime = pd.read_csv('NO_Violent_Crime_BG.csv')
auCrime = pd.read_csv('AustinTX_violent_crime_by_BG.csv')
chiCrime = pd.read_csv('Chicago_Violent_Crime_BG.csv')
montCrime = pd.read_csv('MontMD_violent_crime_by_BG.csv')
nyCrime = pd.read_csv('NY_Violent_Crime_BG.csv')
brCrime = pd.read_csv('BR_Violent_Crime_BG.csv')
camCrime = pd.read_csv('Cambridge_Violent_Crime_BG.csv')
caryCrime = pd.read_csv('Cary_Violent_Crime_BG.csv')
huCrime = pd.read_csv('Houston_Violent_Crime_BG.csv')
sfCrime = pd.read_csv('SF_Violent_Crime_BG.csv')

# In the BR and Cary files the column labelled 'AFFGEOID' is relabelled 'count'
# and the unnamed first column becomes 'AFFGEOID', so that both tables use the
# same two column names as the others before stacking.
relabel = {'AFFGEOID': 'count', 'Unnamed: 0': 'AFFGEOID'}
brCrime = brCrime.rename(columns=relabel)
caryCrime = caryCrime.rename(columns=relabel)

# stack every city into a single table
cityTables = [
    laCrime, noCrime, auCrime, chiCrime, montCrime, nyCrime,
    brCrime, camCrime, caryCrime, huCrime, sfCrime,
]
crime = pd.concat(cityTables)
print(crime.shape)
crime.head()

(11892, 2)


Unnamed: 0,AFFGEOID,count
0,1500000US060372077101,284
1,1500000US060372073012,189
2,1500000US060372260021,175
3,1500000US060372063003,163
4,1500000US060372063001,148


In [234]:
# Keep only the block groups that have crime data (inner join on GEOID), express
# the crime count per 100,000 of 'Population', and drop the duplicate key column.
dfWalk = (
    dfWalk
    .merge(crime, how='inner', left_on='realGEOID', right_on='AFFGEOID')
    .assign(violent_crime_rate=lambda frame: (frame['count'] / frame['Population']) * 100000)
    .drop(columns='AFFGEOID')
)
print(dfWalk.shape)
dfWalk.head()

(5928, 32)


Unnamed: 0,realGEOID,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,Median_Contract_Rent,log_Median_Contract_Rent,avg_HH_size_renters,percent_gas_energy,Median_Household_Income,log_Median_Household_Income,Median_Num_Rooms,Median_Year_Structure_Built,count,violent_crime_rate
0,1500000US484530023144,1,11.0,Texas,702.0,0.811,0.234168,0.309278,186,257,...,967,6.874198,3.03,0.272915,28495,10.257484,4.1,1989,16,814.663951
1,1500000US484530002041,1,18.333333,Texas,795.0,0.822,0.231132,0.34434,126,418,...,907,6.810142,1.79,0.401852,31500,10.357743,2.3,1976,1,86.132644
2,1500000US484530015033,1,11.833333,Texas,677.0,0.824,0.013294,0.453471,184,457,...,944,6.850126,1.81,0.452009,47078,10.759561,3.7,1972,9,679.245283
3,1500000US482013408003,1,8.166667,Texas,786.0,0.613,0.015267,0.839695,213,677,...,1825,7.509335,2.56,0.833333,125962,11.743736,6.1,1992,7,304.083406
4,1500000US482013408002,1,10.166667,Texas,558.0,0.645,0.0,0.844086,138,464,...,1732,7.457032,3.91,0.726496,84949,11.349806,6.0,1982,1,61.99628


In [666]:
# build one 0/1 indicator column per state
stateDummies = pd.get_dummies(dfWalk['state'], dtype='int')

# append the indicators, then remove the original 'state' column together with
# the 'California' indicator: leaving one state out avoids multicollinearity
# among the dummies
dfWalk = (
    pd.concat([dfWalk, stateDummies], axis='columns')
    .drop(columns=['state', 'California'])
)
dfWalk.head()

Unnamed: 0,realGEOID,Intercept,Walk_Index,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,...,Median_Year_Structure_Built,count,violent_crime_rate,Illinois,Louisiana,Maryland,Massachusetts,New York,North Carolina,Texas
0,1500000US484530023144,1,11.0,702.0,0.811,0.234168,0.309278,186,257,0.248663,...,1995,16,739.030023,0,0,0,0,0,0,1
1,1500000US484530002041,1,18.333333,795.0,0.822,0.231132,0.34434,126,418,0.171896,...,1970,1,81.566069,0,0,0,0,0,0,1
2,1500000US484530015032,1,18.666667,358.0,0.974,0.162264,0.532075,64,137,0.211221,...,1967,2,361.663653,0,0,0,0,0,0,1
3,1500000US482013340012,1,16.5,846.0,0.553,0.0,0.782016,220,272,0.269278,...,1987,27,1414.353064,0,0,0,0,0,0,1
4,1500000US482014103002,1,18.333333,580.0,0.639,0.023762,0.548515,52,429,0.090909,...,2001,5,422.654269,0,0,0,0,0,0,1


In [696]:
# separate into X and Y: the target is log rent; the constant column, both rent
# columns, the raw (un-logged) household income and the GEOID key are excluded
# from the predictors
y = dfWalk['log_Median_Contract_Rent']
X = dfWalk.drop(['Intercept', 'log_Median_Contract_Rent', 'Median_Contract_Rent', 'Median_Household_Income', 'realGEOID'], axis = 1)

# get continuous columns: every predictor that is not one of the 0/1 state dummies.
# The positions are derived from the column names rather than a hard-coded
# 0..25 list, so they stay correct if a predictor is added or removed upstream.
contCols = [X.columns.get_loc(col) for col in X.columns if col not in stateDummies.columns]
contNames = X.columns[contCols]

# Standardize features (zero mean / unit variance); the state dummies are left
# as 0/1. Integer-valued predictors are cast to float first so that the scaled
# values are written into float columns instead of relying on pandas silently
# upcasting integer columns during the .iloc assignment.
scaler = StandardScaler()
X_scaled = X.astype({col: 'float64' for col in contNames})
X_scaled.iloc[:, contCols] = scaler.fit_transform(X.iloc[:, contCols])

X_scaled.head()

Unnamed: 0,Walk_Index,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,TotEmp,housing_density,...,Median_Year_Structure_Built,count,violent_crime_rate,Illinois,Louisiana,Maryland,Massachusetts,New York,North Carolina,Texas
0,-1.113046,0.274232,1.790679,-0.170961,-0.170916,0.466612,-0.172789,0.263523,-0.163957,-0.279575,...,1.724328,0.431008,0.065741,0,0,0,0,0,0,1
1,1.561278,0.572557,1.905098,-0.182739,-0.023107,-0.366205,0.620114,-1.097063,0.256082,-0.527675,...,0.227243,-0.413895,-0.275325,0,0,0,0,0,0,1
2,1.682839,-0.829247,3.486173,-0.44992,0.768339,-1.226782,-0.763773,-0.400085,-0.000345,-0.597964,...,0.047593,-0.357568,-0.130021,0,0,0,0,0,0,1
3,0.892697,0.736154,-0.892988,-1.079443,1.822026,0.938541,-0.098916,0.628891,-0.120872,-0.588345,...,1.245261,1.050605,0.416071,0,0,0,0,0,0,1
4,1.561278,-0.117118,0.001568,-0.987254,0.837643,-1.393346,0.674288,-2.532451,0.106895,-0.538188,...,2.083628,-0.188588,-0.098382,0,0,0,0,0,0,1


In [698]:
# fit a 5-fold cross-validated lasso (fixed seed) on the standardized predictors
lasso = LassoCV(cv=5, random_state=42).fit(X_scaled, y)

# label each fitted coefficient with its predictor name; the predictors whose
# coefficient is non-zero are the ones the lasso kept
coef = pd.Series(lasso.coef_, index=X.columns)
selected_features = [name for name, weight in coef.items() if weight != 0]

print("Selected features:", selected_features)
print("Features eliminated:", set(X.columns).difference(selected_features))

Selected features: ['Walk_Index', 'percentage_work_age', 'percent_no_car', 'count_high_wage_workers', 'population_density', 'employent_housing_density', 'jobs_per_household', 'transit_frequency', 'jobs_within_45_minutes_auto', 'jobs_within_45_minutes_transit', 'regional_centrality', 'percent_non_white', 'avg_HH_size_renters', 'log_Median_Household_Income', 'Median_Num_Rooms', 'Median_Year_Structure_Built', 'Illinois', 'Texas']
Features eliminated: {'Maryland', 'count', 'count_low_wage_workers', 'percent_gas_energy', 'Louisiana', 'percent_two_car', 'Massachusetts', 'North Carolina', 'percent_low_wage_workers', 'Population', 'TotEmp', 'violent_crime_rate', 'count_housing_units', 'New York', 'housing_density'}


In [700]:
# Build the design matrix from a hand-picked list of columns: the constant term,
# the walkability index, the violent crime rate, the remaining predictors and
# the state dummies.
# .copy() makes X an independent frame: later cells add columns to X, and doing
# that on a bare column subset of dfWalk is the chained-assignment pattern that
# pandas flags with SettingWithCopyWarning.
X = dfWalk[['Intercept', 'Walk_Index', 'violent_crime_rate', 'percentage_work_age', 'percent_no_car', 'count_high_wage_workers', 'population_density', 'employent_housing_density', 
       'jobs_per_household', 'transit_frequency', 'jobs_within_45_minutes_auto', 'jobs_within_45_minutes_transit', 'regional_centrality',
       'percent_non_white', 'avg_HH_size_renters', 'Median_Num_Rooms',
       'Median_Year_Structure_Built', 'log_Median_Household_Income', 'Louisiana', 'North Carolina', 'Illinois', 'Maryland', 'Massachusetts',
       'New York', 'Texas']].copy()
print(X.shape)
X.head()

(5082, 25)


Unnamed: 0,Intercept,Walk_Index,violent_crime_rate,percentage_work_age,percent_no_car,count_high_wage_workers,population_density,employent_housing_density,jobs_per_household,transit_frequency,...,Median_Num_Rooms,Median_Year_Structure_Built,log_Median_Household_Income,Louisiana,North Carolina,Illinois,Maryland,Massachusetts,New York,Texas
0,1,11.0,739.030023,0.811,0.234168,257,36.133598,13.356921,0.035346,0.014425,...,3.1,1995,11.126939,0,0,0,0,0,0,1
1,1,18.333333,81.566069,0.822,0.231132,418,9.144881,23.882238,3.517296,0.023256,...,3.2,1970,10.75524,0,0,0,0,0,0,1
2,1,18.666667,361.663653,0.974,0.162264,137,6.174323,15.209619,3.343396,0.088455,...,4.2,1967,11.049858,0,0,0,0,0,0,1
3,1,16.5,1414.353064,0.553,0.0,272,12.359976,6.010148,0.341962,0.000887,...,3.4,1987,10.349775,0,0,0,0,0,0,1
4,1,18.333333,422.654269,0.639,0.023762,429,9.196937,20.94056,2.873267,0.008969,...,3.9,2001,11.816727,0,0,0,0,0,0,1


In [702]:
# Pairwise covariance matrix of the design-matrix columns. The 'Intercept'
# row/column is all zeros in the output below.
X.cov()

Unnamed: 0,Intercept,Walk_Index,violent_crime_rate,percentage_work_age,percent_no_car,count_high_wage_workers,population_density,employent_housing_density,jobs_per_household,transit_frequency,...,Median_Num_Rooms,Median_Year_Structure_Built,log_Median_Household_Income,Louisiana,North Carolina,Illinois,Maryland,Massachusetts,New York,Texas
Intercept,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Walk_Index,0.0,7.520732,97.14204,0.045685,-0.000565,36.67785,-12.85004,6.710081,0.796554,19689.48,...,-0.612059,-2.519193,0.207992,-0.122306,-0.015067,0.039466,-0.02826,0.005828,-0.012682,-0.148423
violent_crime_rate,0.0,97.142039,3716672.0,-3.212107,-1.303073,-37433.11,-5934.756,-1314.025,915.397849,-581557.8,...,-26.238922,1251.324,-112.979906,20.211448,-3.149605,-43.166441,-15.668214,-1.112723,-59.386709,65.485372
percentage_work_age,0.0,0.045685,-3.212107,0.009244,-0.00089,5.239226,0.001800865,1.310538,0.061196,89.66976,...,-0.020377,0.08367944,0.016023,-0.000609,1.6e-05,0.000136,-0.00054,0.000282,-0.002812,-0.000414
percent_no_car,0.0,-0.000565,-1.303073,-0.00089,0.066452,-2.154629,10.31739,10.24228,0.083478,604.6339,...,-0.07275,-0.7335649,-0.048826,-0.004121,-0.001539,0.003878,-0.008261,8.3e-05,0.082929,-0.026491
count_high_wage_workers,0.0,36.67785,-37433.11,5.239226,-2.154629,41237.91,871.778,5753.098,155.473216,14446.24,...,-21.991623,851.2421,52.468566,-8.493444,2.090769,-1.736954,5.229608,0.455138,1.538804,4.046101
population_density,0.0,-12.850039,-5934.756,0.001801,10.317391,871.778,3457.864,2002.575,-33.807659,134776.5,...,-14.08167,-180.6112,-4.521302,-1.989108,-0.283456,-1.434143,-1.373406,-0.038561,16.320383,-5.368187
employent_housing_density,0.0,6.710081,-1314.025,1.310538,10.242279,5753.098,2002.575,9538.113,376.528028,110219.7,...,-24.248706,69.55307,5.045992,-1.609594,-0.220758,-0.094776,-1.121734,0.003192,13.087194,-4.228471
jobs_per_household,0.0,0.796554,915.3978,0.061196,0.083478,155.4732,-33.80766,376.528,78.432366,1089.062,...,-0.816116,18.93985,0.332805,-0.025782,0.012575,-0.005777,-0.014556,0.004945,-0.029293,0.040077
transit_frequency,0.0,19689.476861,-581557.8,89.66976,604.633911,14446.24,134776.5,110219.7,1089.06208,314189700.0,...,-2176.661473,-31452.91,338.114241,-1624.191732,-77.31826,304.160642,118.340934,10.863235,1095.370632,-475.796941


In [704]:
# remove 'percent_no_car' from the design matrix
# NOTE(review): the reason is not stated here; presumably based on the covariance table above, to be confirmed
X = X.drop(columns='percent_no_car')

In [710]:
# 25th and 75th percentile cut-offs of the violent crime rate
crimeQuartile = np.quantile(X['violent_crime_rate'], [0.25, 0.75])

# Flag block groups at or above the upper cut-off (high_crime) and at or below
# the lower cut-off (low_crime), interact each flag with the walkability index,
# and then drop the raw crime rate from the design matrix.
X = (
    X
    .assign(
        high_crime=lambda frame: frame['violent_crime_rate'].ge(crimeQuartile[1]).astype('int64'),
        low_crime=lambda frame: frame['violent_crime_rate'].le(crimeQuartile[0]).astype('int64'),
        Walk_index_x_high_crime=lambda frame: frame['high_crime'] * frame['Walk_Index'],
        Walk_index_x_low_crime=lambda frame: frame['low_crime'] * frame['Walk_Index'],
    )
    .drop(columns='violent_crime_rate')
)
print(X.shape)
X.head()

(5082, 27)


Unnamed: 0,Intercept,Walk_Index,percentage_work_age,count_high_wage_workers,population_density,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,...,North Carolina,Illinois,Maryland,Massachusetts,New York,Texas,high_crime,low_crime,Walk_index_x_high_crime,Walk_index_x_low_crime
0,1,11.0,0.811,257,36.133598,13.356921,0.035346,0.014425,173647,162213,...,0,0,0,0,0,1,1,0,11.0,0.0
1,1,18.333333,0.822,418,9.144881,23.882238,3.517296,0.023256,236060,274279,...,0,0,0,0,0,1,0,1,0.0,18.333333
2,1,18.666667,0.974,137,6.174323,15.209619,3.343396,0.088455,217234,178387,...,0,0,0,0,0,1,0,0,0.0,0.0
3,1,16.5,0.553,272,12.359976,6.010148,0.341962,0.000887,240683,22785,...,0,0,0,0,0,1,1,0,16.5,0.0
4,1,18.333333,0.639,429,9.196937,20.94056,2.873267,0.008969,472807,440532,...,0,0,0,0,0,1,0,0,0.0,0.0


In [714]:
# list the final set of design-matrix columns
print(f'Columns to use in real analysis: {list(X.columns)}')

Columns to use in real analysis: ['Intercept', 'Walk_Index', 'percentage_work_age', 'count_high_wage_workers', 'population_density', 'employent_housing_density', 'jobs_per_household', 'transit_frequency', 'jobs_within_45_minutes_auto', 'jobs_within_45_minutes_transit', 'regional_centrality', 'percent_non_white', 'avg_HH_size_renters', 'Median_Num_Rooms', 'Median_Year_Structure_Built', 'log_Median_Household_Income', 'Louisiana', 'North Carolina', 'Illinois', 'Maryland', 'Massachusetts', 'New York', 'Texas', 'high_crime', 'low_crime', 'Walk_index_x_high_crime', 'Walk_index_x_low_crime']
