In [683]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedKFold

import warnings
warnings.filterwarnings("ignore")

In [685]:
# read in dataset
dfWalk = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/EPA_SmartLocationDatabase_V3_Jan_2021_Final.csv')
dfWalk.head()

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area
0,1,481130000000.0,481130000000.0,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.184697,0.000476,0.137707,6,14,15,17,14.0,3110.36082,297836.0831
1,2,481130000000.0,481130000000.0,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.323221,0.000801,0.231868,3,10,12,14,10.833333,3519.46911,484945.1466
2,3,481130000000.0,481130000000.0,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.314628,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,106705.9281
3,4,481130000000.0,481130000000.0,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.229821,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,481828.4303
4,5,481130000000.0,481130000000.0,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.164863,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,687684.7752


In [686]:
# change state, county, tract, and blk group to strings
dfWalk.loc[:, ['STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE']] = dfWalk.loc[:, ['STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE']].astype('str')

In [687]:
# add 0s to the front to create the geoIDs
for i in range(len(dfWalk)):
    dfWalk.loc[i, 'STATEFP'] = dfWalk.loc[i, 'STATEFP'].zfill(2)
    dfWalk.loc[i, 'COUNTYFP'] = dfWalk.loc[i, 'COUNTYFP'].zfill(3)
    dfWalk.loc[i, 'TRACTCE'] = dfWalk.loc[i, 'TRACTCE'].zfill(6)
    
dfWalk.head()

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area
0,1,481130000000.0,481130000000.0,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.184697,0.000476,0.137707,6,14,15,17,14.0,3110.36082,297836.0831
1,2,481130000000.0,481130000000.0,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.323221,0.000801,0.231868,3,10,12,14,10.833333,3519.46911,484945.1466
2,3,481130000000.0,481130000000.0,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.314628,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,106705.9281
3,4,481130000000.0,481130000000.0,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.229821,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,481828.4303
4,5,481130000000.0,481130000000.0,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.164863,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,687684.7752


In [688]:
# create realGeoID for merging
dfWalk['realGEOID'] = '1500000US' + dfWalk['STATEFP'] + dfWalk['COUNTYFP'] + dfWalk['TRACTCE'] + dfWalk['BLKGRPCE']
dfWalk['county'] = dfWalk['STATEFP'] + dfWalk['COUNTYFP']
dfWalk.head()

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area,realGEOID,county
0,1,481130000000.0,481130000000.0,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.137707,6,14,15,17,14.0,3110.36082,297836.0831,1500000US481130078254,48113
1,2,481130000000.0,481130000000.0,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.231868,3,10,12,14,10.833333,3519.46911,484945.1466,1500000US481130078252,48113
2,3,481130000000.0,481130000000.0,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.213146,1,1,7,17,8.333333,1697.091802,106705.9281,1500000US481130078253,48113
3,4,481130000000.0,481130000000.0,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.205018,16,10,17,17,15.666667,2922.609204,481828.4303,1500000US481130078241,48113
4,5,481130000000.0,481130000000.0,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.125296,4,7,11,14,10.166667,3731.971773,687684.7752,1500000US481130078242,48113


In [689]:
# select and rename the most important columns of our dataframe
imptCols = ['realGEOID','county', 'NatWalkInd', 'STATEFP', 'CountHU', 'P_WrkAge', 'Pct_AO0',
            'Pct_AO2p', 'R_LowWageWk', 'R_HiWageWk', 'R_PCTLOWWAGE', 'TotEmp', 'D1A', 
            'D1B', 'D1D', 'D2A_JPHH', 'D5AR', 'D5CRI']
dfWalk = dfWalk.loc[:, imptCols]
dfWalk = dfWalk.rename({'NatWalkInd':'Walk_Index',
                        'STATEFP':'state',
                        'CountHU':'count_housing_units',
                        'P_WrkAge':'percentage_work_age',
                        'Pct_AO0':'percent_no_car',
                        'Pct_AO2p':'percent_two_car',
                        'R_LowWageWk':'count_low_wage_workers',
                        'R_HiWageWk':'count_high_wage_workers',
                        'R_PCTLOWWAGE':'percent_low_wage_workers',
                        'D1A':'housing_density',
                        'D1B':'population_density',
                        'D1D':'employent_housing_density',
                        'D2A_JPHH':'jobs_per_household',
                        'D5AR':'jobs_within_45_minutes_auto',
                        'D5CRI':'regional_centrality'}, axis = 1)
dfWalk.head()

Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,TotEmp,housing_density,population_density,employent_housing_density,jobs_per_household,jobs_within_45_minutes_auto,regional_centrality
0,1500000US481130078254,48113,14.0,48,460.0,0.549,0.163121,0.744681,99,191,0.240291,66,6.250422,16.332625,7.147222,0.156028,433601,0.785893
1,1500000US481130078252,48113,10.833333,48,409.0,0.466,0.0,0.589242,76,212,0.192405,25,3.430799,5.955666,3.640506,0.061125,386504,0.700531
2,1500000US481130078253,48113,8.333333,48,365.0,0.811,0.057751,0.507599,136,138,0.293737,0,13.843035,27.951553,13.843035,0.0,404573,0.733281
3,1500000US481130078241,48113,15.666667,48,384.0,0.638,0.0,0.888021,60,302,0.139211,253,3.225246,7.592767,5.350213,0.658854,423099,0.766859
4,1500000US481130078242,48113,10.166667,48,343.0,0.506,0.014577,0.790087,91,404,0.157168,32,2.305992,6.373413,2.521128,0.093294,335700,0.60845


In [690]:
# change numbers to States
dfWalk.loc[dfWalk['state'] == '01', 'state'] = 'Alabama' 
dfWalk.loc[dfWalk['state'] == '02', 'state'] = 'Alaska'
dfWalk.loc[dfWalk['state'] == '04', 'state'] = 'Arizona'
dfWalk.loc[dfWalk['state'] == '05', 'state'] = 'Arkansas'
dfWalk.loc[dfWalk['state'] == '06', 'state'] = 'California'
dfWalk.loc[dfWalk['state'] == '08', 'state'] = 'Colorado'
dfWalk.loc[dfWalk['state'] == '09', 'state'] = 'Connecticut'
dfWalk.loc[dfWalk['state'] == '10', 'state'] = 'Deleware'
dfWalk.loc[dfWalk['state'] == '11', 'state'] = 'District of Columbia'
dfWalk.loc[dfWalk['state'] == '12', 'state'] = 'Florida'
dfWalk.loc[dfWalk['state'] == '13', 'state'] = 'Georgia'
dfWalk.loc[dfWalk['state'] == '15', 'state'] = 'Hawaii'
dfWalk.loc[dfWalk['state'] == '16', 'state'] = 'Idaho'
dfWalk.loc[dfWalk['state'] == '17', 'state'] = 'Illinois'
dfWalk.loc[dfWalk['state'] == '18', 'state'] = 'Indiana'
dfWalk.loc[dfWalk['state'] == '19', 'state'] = 'Iowa'
dfWalk.loc[dfWalk['state'] == '20', 'state'] = 'Kansas'
dfWalk.loc[dfWalk['state'] == '21', 'state'] = 'Kentucky'
dfWalk.loc[dfWalk['state'] == '22', 'state'] = 'Louisiana'
dfWalk.loc[dfWalk['state'] == '23', 'state'] = 'Maine'
dfWalk.loc[dfWalk['state'] == '24', 'state'] = 'Maryland'
dfWalk.loc[dfWalk['state'] == '25', 'state'] = 'Massachusetts'
dfWalk.loc[dfWalk['state'] == '26', 'state'] = 'Michigan'
dfWalk.loc[dfWalk['state'] == '27', 'state'] = 'Minnesota'
dfWalk.loc[dfWalk['state'] == '28', 'state'] = 'Mississippi'
dfWalk.loc[dfWalk['state'] == '29', 'state'] = 'Missouri'
dfWalk.loc[dfWalk['state'] == '30', 'state'] = 'Montana'
dfWalk.loc[dfWalk['state'] == '31', 'state'] = 'Nebraska'
dfWalk.loc[dfWalk['state'] == '32', 'state'] = 'Nevada'
dfWalk.loc[dfWalk['state'] == '33', 'state'] = 'New Hampshire'
dfWalk.loc[dfWalk['state'] == '34', 'state'] = 'New Jersey'
dfWalk.loc[dfWalk['state'] == '35', 'state'] = 'New Mexico'
dfWalk.loc[dfWalk['state'] == '36', 'state'] = 'New York'
dfWalk.loc[dfWalk['state'] == '37', 'state'] = 'North Carolina'
dfWalk.loc[dfWalk['state'] == '38', 'state'] = 'North Dakota'
dfWalk.loc[dfWalk['state'] == '39', 'state'] = 'Ohio'
dfWalk.loc[dfWalk['state'] == '40', 'state'] = 'Oklahoma'
dfWalk.loc[dfWalk['state'] == '41', 'state'] = 'Oregon'
dfWalk.loc[dfWalk['state'] == '42', 'state'] = 'Pennsylvania'
dfWalk.loc[dfWalk['state'] == '44', 'state'] = 'Rhode Island'
dfWalk.loc[dfWalk['state'] == '45', 'state'] = 'South Carolian'
dfWalk.loc[dfWalk['state'] == '46', 'state'] = 'South Dakota'
dfWalk.loc[dfWalk['state'] == '47', 'state'] = 'Tennessee'
dfWalk.loc[dfWalk['state'] == '48', 'state'] = 'Texas'
dfWalk.loc[dfWalk['state'] == '49', 'state'] = 'Utah'
dfWalk.loc[dfWalk['state'] == '50', 'state'] = 'Vermont'
dfWalk.loc[dfWalk['state'] == '51', 'state'] = 'Virginia'
dfWalk.loc[dfWalk['state'] == '53', 'state'] = 'Washington'
dfWalk.loc[dfWalk['state'] == '54', 'state'] = 'West Virginia'
dfWalk.loc[dfWalk['state'] == '55', 'state'] = 'Wisconsin'
dfWalk.loc[dfWalk['state'] == '56', 'state'] = 'Wyoming'
print(dfWalk.shape)
dfWalk.head()

(220740, 18)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,TotEmp,housing_density,population_density,employent_housing_density,jobs_per_household,jobs_within_45_minutes_auto,regional_centrality
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,0.240291,66,6.250422,16.332625,7.147222,0.156028,433601,0.785893
1,1500000US481130078252,48113,10.833333,Texas,409.0,0.466,0.0,0.589242,76,212,0.192405,25,3.430799,5.955666,3.640506,0.061125,386504,0.700531
2,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,0.293737,0,13.843035,27.951553,13.843035,0.0,404573,0.733281
3,1500000US481130078241,48113,15.666667,Texas,384.0,0.638,0.0,0.888021,60,302,0.139211,253,3.225246,7.592767,5.350213,0.658854,423099,0.766859
4,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,0.157168,32,2.305992,6.373413,2.521128,0.093294,335700,0.60845


In [693]:
# get race and population
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/Race_2019/Race_2019.csv', header = 1)
temp['percent_non_white'] = 1 - (temp['Estimate!!Total:!!White alone'] / temp['Estimate!!Total:'])
temp = temp.rename({'Estimate!!Total:': 'Population'}, axis = 1)
temp = temp[['Geography', 'Population', 'percent_non_white']]
temp.head()

Unnamed: 0,Geography,Population,percent_non_white
0,1500000US010010201001,730,0.160274
1,1500000US010010201002,1263,0.151227
2,1500000US010010202001,835,0.669461
3,1500000US010010202002,1124,0.570285
4,1500000US010010203001,2774,0.327325


In [694]:
# merge dfWalk with rent data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(220333, 20)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,TotEmp,housing_density,population_density,employent_housing_density,jobs_per_household,jobs_within_45_minutes_auto,regional_centrality,Population,percent_non_white
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,0.240291,66,6.250422,16.332625,7.147222,0.156028,433601,0.785893,1141,0.423313
1,1500000US481130078252,48113,10.833333,Texas,409.0,0.466,0.0,0.589242,76,212,0.192405,25,3.430799,5.955666,3.640506,0.061125,386504,0.700531,792,0.113636
2,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,0.293737,0,13.843035,27.951553,13.843035,0.0,404573,0.733281,528,0.748106
3,1500000US481130078241,48113,15.666667,Texas,384.0,0.638,0.0,0.888021,60,302,0.139211,253,3.225246,7.592767,5.350213,0.658854,423099,0.766859,884,0.057692
4,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,0.157168,32,2.305992,6.373413,2.521128,0.093294,335700,0.60845,1001,0.046953


In [695]:
# get median rent
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/Rent_2019/Rent_2019.csv', header = 1)
temp = temp.rename({'Estimate!!Median contract rent': 'Median_Contract_Rent'}, axis = 1)
temp = temp[['Geography', 'Median_Contract_Rent']]

# remove observations without numbers
temp = temp[temp['Median_Contract_Rent'] != '-']
temp.head()

Unnamed: 0,Geography,Median_Contract_Rent
0,1500000US010010201001,607
1,1500000US010010201002,494
2,1500000US010010202001,343
3,1500000US010010202002,645
4,1500000US010010203001,653


In [696]:
# merge dfWalk with rent data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk[dfWalk['Median_Contract_Rent'] != '3,500+']
dfWalk = dfWalk[dfWalk['Median_Contract_Rent'] != '100-']
dfWalk['log_Median_Contract_Rent'] = np.log(dfWalk['Median_Contract_Rent'].astype('int'))
dfWalk = dfWalk.drop(['Geography', 'Median_Contract_Rent'], axis = 1)

print(dfWalk.shape)
dfWalk.head()

(174070, 21)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,TotEmp,housing_density,population_density,employent_housing_density,jobs_per_household,jobs_within_45_minutes_auto,regional_centrality,Population,percent_non_white,log_Median_Contract_Rent
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,66,6.250422,16.332625,7.147222,0.156028,433601,0.785893,1141,0.423313,6.708084
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0,13.843035,27.951553,13.843035,0.0,404573,0.733281,528,0.748106,6.60665
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,32,2.305992,6.373413,2.521128,0.093294,335700,0.60845,1001,0.046953,7.26892
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,3,10.969254,26.357776,11.028441,0.006036,402287,0.729137,1090,0.511927,6.717805
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,1,2.247799,7.680394,2.252783,0.002717,263813,0.478156,1286,0.157076,6.57647


In [697]:
# get average household size
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/HH_Size_2019/HH_Size_2019.csv', header = 1)
temp = temp.rename({'Estimate!!Average household size --!!Total:!!Renter occupied': 'avg_HH_size_renters'}, axis = 1)
temp = temp[['Geography', 'avg_HH_size_renters']]

# remove observations without values
temp = temp[temp['avg_HH_size_renters'] != '-']
temp.head()

Unnamed: 0,Geography,avg_HH_size_renters
0,1500000US010010201001,1.78
1,1500000US010010201002,3.4
2,1500000US010010202001,2.42
3,1500000US010010202002,2.14
4,1500000US010010203001,2.49


In [698]:
# merge dfWalk with household data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
dfWalk['avg_HH_size_renters'] = dfWalk['avg_HH_size_renters'].astype('float')
print(dfWalk.shape)
dfWalk.head()

(173448, 22)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,housing_density,population_density,employent_housing_density,jobs_per_household,jobs_within_45_minutes_auto,regional_centrality,Population,percent_non_white,log_Median_Contract_Rent,avg_HH_size_renters
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,6.250422,16.332625,7.147222,0.156028,433601,0.785893,1141,0.423313,6.708084,2.55
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,13.843035,27.951553,13.843035,0.0,404573,0.733281,528,0.748106,6.60665,2.05
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,2.305992,6.373413,2.521128,0.093294,335700,0.60845,1001,0.046953,7.26892,2.16
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,10.969254,26.357776,11.028441,0.006036,402287,0.729137,1090,0.511927,6.717805,2.72
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,2.247799,7.680394,2.252783,0.002717,263813,0.478156,1286,0.157076,6.57647,4.81


In [699]:
# read in household heating data
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/heating_2019/heating_2019.csv', header = 1)

# get percentage of different heating methods
temp['percent_gas_energy'] = temp['Estimate!!Total:!!Utility gas'] / temp['Estimate!!Total:']
temp['percent_elec_energy'] = temp['Estimate!!Total:!!Electricity'] / temp['Estimate!!Total:']
temp['percent_solar_energy'] = temp['Estimate!!Total:!!Solar energy'] / temp['Estimate!!Total:']
temp['percent_no_energy'] = temp['Estimate!!Total:!!No fuel used'] / temp['Estimate!!Total:']
temp = temp[['Geography', 'percent_gas_energy', 'percent_elec_energy', 'percent_solar_energy', 'percent_no_energy']]
temp.head()

Unnamed: 0,Geography,percent_gas_energy,percent_elec_energy,percent_solar_energy,percent_no_energy
0,1500000US010010201001,0.412698,0.539683,0.0,0.0
1,1500000US010010201002,0.563452,0.423858,0.0,0.0
2,1500000US010010202001,0.600629,0.399371,0.0,0.0
3,1500000US010010202002,0.743243,0.235135,0.0,0.0
4,1500000US010010203001,0.661793,0.327485,0.0,0.0


In [700]:
# merge dfWalk with household heating data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(173448, 26)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,jobs_within_45_minutes_auto,regional_centrality,Population,percent_non_white,log_Median_Contract_Rent,avg_HH_size_renters,percent_gas_energy,percent_elec_energy,percent_solar_energy,percent_no_energy
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,433601,0.785893,1141,0.423313,6.708084,2.55,0.22973,0.77027,0.0,0.0
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,404573,0.733281,528,0.748106,6.60665,2.05,0.0,0.94186,0.0,0.0
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,335700,0.60845,1001,0.046953,7.26892,2.16,0.740331,0.259669,0.0,0.0
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,402287,0.729137,1090,0.511927,6.717805,2.72,0.0181,0.968326,0.0,0.013575
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,263813,0.478156,1286,0.157076,6.57647,4.81,0.97076,0.02924,0.0,0.0


In [701]:
# get internet access data and find the percent that have no internet access
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/Internet_2019.csv', header = 1)
temp['percent_no_internet'] = temp['Estimate!!Total:!!No Internet access'] / temp['Estimate!!Total:']
temp = temp[['Geography', 'percent_no_internet']]
temp.head()

Unnamed: 0,Geography,percent_no_internet
0,1500000US010010201001,0.298413
1,1500000US010010201002,0.19797
2,1500000US010010202001,0.393082
3,1500000US010010202002,0.227027
4,1500000US010010203001,0.119883


In [702]:
# merge dfWalk with internet access data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(173448, 27)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,regional_centrality,Population,percent_non_white,log_Median_Contract_Rent,avg_HH_size_renters,percent_gas_energy,percent_elec_energy,percent_solar_energy,percent_no_energy,percent_no_internet
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,0.785893,1141,0.423313,6.708084,2.55,0.22973,0.77027,0.0,0.0,0.121622
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.733281,528,0.748106,6.60665,2.05,0.0,0.94186,0.0,0.0,0.20155
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,0.60845,1001,0.046953,7.26892,2.16,0.740331,0.259669,0.0,0.0,0.016575
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,0.729137,1090,0.511927,6.717805,2.72,0.0181,0.968326,0.0,0.013575,0.126697
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,0.478156,1286,0.157076,6.57647,4.81,0.97076,0.02924,0.0,0.0,0.201754


In [703]:
# read in kitchen data
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/Kitchen_2019.csv', header = 1)

# get percentage that are lacking kitchen utilities
temp['percent_lacking_kitchen_renter'] = temp['Estimate!!Total:!!Renter occupied:!!Lacking complete kitchen facilities'] / temp['Estimate!!Total:!!Renter occupied:']
temp = temp[['Geography', 'percent_lacking_kitchen_renter']]
temp.head()

Unnamed: 0,Geography,percent_lacking_kitchen_renter
0,1500000US010010201001,0.0
1,1500000US010010201002,0.0
2,1500000US010010202001,0.0
3,1500000US010010202002,0.0
4,1500000US010010203001,0.0


In [704]:
# merge dfWalk with kitchen data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(173448, 28)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,Population,percent_non_white,log_Median_Contract_Rent,avg_HH_size_renters,percent_gas_energy,percent_elec_energy,percent_solar_energy,percent_no_energy,percent_no_internet,percent_lacking_kitchen_renter
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,1141,0.423313,6.708084,2.55,0.22973,0.77027,0.0,0.0,0.121622,0.0
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,528,0.748106,6.60665,2.05,0.0,0.94186,0.0,0.0,0.20155,0.0
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,1001,0.046953,7.26892,2.16,0.740331,0.259669,0.0,0.0,0.016575,0.0
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,1090,0.511927,6.717805,2.72,0.0181,0.968326,0.0,0.013575,0.126697,0.0
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,1286,0.157076,6.57647,4.81,0.97076,0.02924,0.0,0.0,0.201754,0.0


In [705]:
# read in living arrangement data
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/Living_Arange_2019/Living_Arange_2019.csv', header = 1)

# get percentage that live alone
temp['percent_lives_alone'] = temp['Estimate!!Total:!!Nonfamily households:!!Householder living alone'] / temp['Estimate!!Total:']
temp = temp[['Geography', 'percent_lives_alone']]
temp.head()

Unnamed: 0,Geography,percent_lives_alone
0,1500000US010010201001,0.307937
1,1500000US010010201002,0.060914
2,1500000US010010202001,0.367925
3,1500000US010010202002,0.443243
4,1500000US010010203001,0.182261


In [706]:
# merge dfWalk with living alone data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(173448, 29)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,percent_non_white,log_Median_Contract_Rent,avg_HH_size_renters,percent_gas_energy,percent_elec_energy,percent_solar_energy,percent_no_energy,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,0.423313,6.708084,2.55,0.22973,0.77027,0.0,0.0,0.121622,0.0,0.186937
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.748106,6.60665,2.05,0.0,0.94186,0.0,0.0,0.20155,0.0,0.468992
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,0.046953,7.26892,2.16,0.740331,0.259669,0.0,0.0,0.016575,0.0,0.121547
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,0.511927,6.717805,2.72,0.0181,0.968326,0.0,0.013575,0.126697,0.0,0.328054
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,0.157076,6.57647,4.81,0.97076,0.02924,0.0,0.0,0.201754,0.0,0.017544


In [707]:
# get median household income data
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-Local/Income_2019/Income_2019.csv', header = 1)
temp = temp.rename({'Estimate!!Median household income in the past 12 months (in 2019 inflation-adjusted dollars)': 'Median_Household_Income'}, axis = 1)
temp = temp[['Geography', 'Median_Household_Income']]

# remove observations without numbers
temp = temp[temp['Median_Household_Income'] != '-']
temp.head()

Unnamed: 0,Geography,Median_Household_Income
0,1500000US010010201001,35703
1,1500000US010010201002,79000
2,1500000US010010202001,26500
3,1500000US010010202002,51042
4,1500000US010010203001,59048


In [708]:
# merge dfWalk with median household income data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk[dfWalk['Median_Household_Income'] != '250,000+']
dfWalk = dfWalk[dfWalk['Median_Household_Income'] != '2,500-']
dfWalk = dfWalk[dfWalk['Median_Household_Income'].isna() == False]
dfWalk['log_Median_Household_Income'] = np.log(dfWalk['Median_Household_Income'].astype('int'))
dfWalk = dfWalk.drop(['Geography','Median_Household_Income'] , axis = 1)
print(dfWalk.shape)
dfWalk.head()

(168867, 30)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,log_Median_Contract_Rent,avg_HH_size_renters,percent_gas_energy,percent_elec_energy,percent_solar_energy,percent_no_energy,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone,log_Median_Household_Income
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,6.708084,2.55,0.22973,0.77027,0.0,0.0,0.121622,0.0,0.186937,10.870243
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,6.60665,2.05,0.0,0.94186,0.0,0.0,0.20155,0.0,0.468992,10.415053
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,7.26892,2.16,0.740331,0.259669,0.0,0.0,0.016575,0.0,0.121547,11.915052
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,6.717805,2.72,0.0181,0.968326,0.0,0.013575,0.126697,0.0,0.328054,10.400255
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,6.57647,4.81,0.97076,0.02924,0.0,0.0,0.201754,0.0,0.017544,10.941057


In [709]:
# get median rooms
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-Local/Rooms_2019/Rooms_2019.csv', header = 1)
temp = temp.rename({'Estimate!!Median number of rooms --!!Total:!!Renter occupied': 'Median_Num_Rooms'}, axis = 1)
temp = temp[['Geography', 'Median_Num_Rooms']]

# remove observations without numbers
temp = temp[temp['Median_Num_Rooms'] != '-']
temp.head()

Unnamed: 0,Geography,Median_Num_Rooms
0,1500000US010010201001,4.2
1,1500000US010010201002,5.3
2,1500000US010010202001,5.0
3,1500000US010010202002,5.6
4,1500000US010010203001,5.7


In [710]:
# merge dfWalk with rooms data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
dfWalk = dfWalk[dfWalk['Median_Num_Rooms'] != '9.0+']
dfWalk['Median_Num_Rooms'] = dfWalk['Median_Num_Rooms'].astype('float')
print(dfWalk.shape)
dfWalk.head()

(167484, 31)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,avg_HH_size_renters,percent_gas_energy,percent_elec_energy,percent_solar_energy,percent_no_energy,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone,log_Median_Household_Income,Median_Num_Rooms
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,2.55,0.22973,0.77027,0.0,0.0,0.121622,0.0,0.186937,10.870243,4.0
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,2.05,0.0,0.94186,0.0,0.0,0.20155,0.0,0.468992,10.415053,3.2
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,2.16,0.740331,0.259669,0.0,0.0,0.016575,0.0,0.121547,11.915052,2.2
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,2.72,0.0181,0.968326,0.0,0.013575,0.126697,0.0,0.328054,10.400255,4.0
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,4.81,0.97076,0.02924,0.0,0.0,0.201754,0.0,0.017544,10.941057,5.3


In [711]:
# get median year built data
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-Local/Structure_2019/Structure_2019.csv', header = 1)
temp = temp.rename({'Estimate!!Median year structure built --!!Renter occupied': 'Median_Year_Structure_Built'}, axis = 1)
temp = temp[['Geography', 'Median_Year_Structure_Built']]

#remove observations without numbers
temp = temp[temp['Median_Year_Structure_Built'] != '-']
temp.head()

Unnamed: 0,Geography,Median_Year_Structure_Built
0,1500000US010010201001,1984
1,1500000US010010201002,1985
2,1500000US010010202001,1975
3,1500000US010010202002,1971
4,1500000US010010203001,1975


In [712]:
# merge dfWalk with year data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
dfWalk = dfWalk[dfWalk['Median_Year_Structure_Built'] != '1939-']

# for 2014+ put 2017 since that is the middle year of the data
dfWalk.loc[dfWalk['Median_Year_Structure_Built'] == '2014+', 'Median_Year_Structure_Built'] = 2017
dfWalk['Median_Year_Structure_Built'] = dfWalk['Median_Year_Structure_Built'].astype('int')
print(dfWalk.shape)
dfWalk.head()

(149393, 32)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,percent_gas_energy,percent_elec_energy,percent_solar_energy,percent_no_energy,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone,log_Median_Household_Income,Median_Num_Rooms,Median_Year_Structure_Built
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,0.22973,0.77027,0.0,0.0,0.121622,0.0,0.186937,10.870243,4.0,1981
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.0,0.94186,0.0,0.0,0.20155,0.0,0.468992,10.415053,3.2,1984
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,0.740331,0.259669,0.0,0.0,0.016575,0.0,0.121547,11.915052,2.2,2017
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,0.0181,0.968326,0.0,0.013575,0.126697,0.0,0.328054,10.400255,4.0,1985
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,0.97076,0.02924,0.0,0.0,0.201754,0.0,0.017544,10.941057,5.3,1956


In [713]:
# get utilities facilities data
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/Utilities_2019.csv', header = 1)

# get percentage that have to pay extra
temp['percent_pay_extra'] = temp['Estimate!!Total:!!Pay extra for one or more utilities'] / temp['Estimate!!Total:']
temp = temp[['Geography', 'percent_pay_extra']]
temp.head()

Unnamed: 0,Geography,percent_pay_extra
0,1500000US010010201001,1.0
1,1500000US010010201002,1.0
2,1500000US010010202001,0.976048
3,1500000US010010202002,1.0
4,1500000US010010203001,0.956679


In [714]:
# merge dfWalk with utilities data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(149393, 33)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,percent_elec_energy,percent_solar_energy,percent_no_energy,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone,log_Median_Household_Income,Median_Num_Rooms,Median_Year_Structure_Built,percent_pay_extra
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,0.77027,0.0,0.0,0.121622,0.0,0.186937,10.870243,4.0,1981,1.0
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.94186,0.0,0.0,0.20155,0.0,0.468992,10.415053,3.2,1984,0.612403
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,0.259669,0.0,0.0,0.016575,0.0,0.121547,11.915052,2.2,2017,1.0
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,0.968326,0.0,0.013575,0.126697,0.0,0.328054,10.400255,4.0,1985,1.0
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,0.02924,0.0,0.0,0.201754,0.0,0.017544,10.941057,5.3,1956,1.0


In [715]:
# get plumbing facilities data
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/Plumbing_2019.csv', header = 1)

# get percentage with lackluster facilities
temp['percent_lacking_plumbing_renter'] = temp['Estimate!!Total:!!Renter occupied:!!Lacking plumbing facilities'] / temp['Estimate!!Total:!!Renter occupied:']
temp = temp[['Geography', 'percent_lacking_plumbing_renter']]
temp.head()

Unnamed: 0,Geography,percent_lacking_plumbing_renter
0,1500000US010010201001,0.0
1,1500000US010010201002,0.0
2,1500000US010010202001,0.0
3,1500000US010010202002,0.0
4,1500000US010010203001,0.0


In [716]:
# merge dfWalk with plumbing data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(149393, 34)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,percent_solar_energy,percent_no_energy,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone,log_Median_Household_Income,Median_Num_Rooms,Median_Year_Structure_Built,percent_pay_extra,percent_lacking_plumbing_renter
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,0.0,0.0,0.121622,0.0,0.186937,10.870243,4.0,1981,1.0,0.0
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.0,0.0,0.20155,0.0,0.468992,10.415053,3.2,1984,0.612403,0.0
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,0.0,0.0,0.016575,0.0,0.121547,11.915052,2.2,2017,1.0,0.0
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,0.0,0.013575,0.126697,0.0,0.328054,10.400255,4.0,1985,1.0,0.0
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,0.0,0.0,0.201754,0.0,0.017544,10.941057,5.3,1956,1.0,0.0


In [717]:
# get commute data
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/Means_of_transport_2019.csv', header = 1)

# get percentage with different commutes
temp['percent_drive'] = temp['Estimate!!Total:!!Car, truck, or van:'] / temp['Estimate!!Total:']
temp['percent_public_transport'] = temp['Estimate!!Total:!!Public transportation (excluding taxicab):'] / temp['Estimate!!Total:']
temp['percent_bike'] = temp['Estimate!!Total:!!Bicycle'] / temp['Estimate!!Total:']
temp['percent_walk'] = temp['Estimate!!Total:!!Walked'] / temp['Estimate!!Total:']
temp = temp[['Geography', 'percent_drive', 'percent_public_transport', 'percent_bike', 'percent_walk']]
temp.head()

Unnamed: 0,Geography,percent_drive,percent_public_transport,percent_bike,percent_walk
0,1500000US010010201001,1.0,0.0,0.0,0.0
1,1500000US010010201002,0.953198,0.0,0.0,0.0
2,1500000US010010202001,0.965779,0.0,0.0,0.0
3,1500000US010010202002,1.0,0.0,0.0,0.0
4,1500000US010010203001,0.94928,0.0,0.0,0.013708


In [718]:
# merge dfWalk with commute data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(149393, 38)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,percent_lives_alone,log_Median_Household_Income,Median_Num_Rooms,Median_Year_Structure_Built,percent_pay_extra,percent_lacking_plumbing_renter,percent_drive,percent_public_transport,percent_bike,percent_walk
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,0.186937,10.870243,4.0,1981,1.0,0.0,0.866516,0.061086,0.0,0.072398
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.468992,10.415053,3.2,1984,0.612403,0.0,0.946619,0.0,0.0,0.053381
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,0.121547,11.915052,2.2,2017,1.0,0.0,0.822785,0.025316,0.0,0.014768
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,0.328054,10.400255,4.0,1985,1.0,0.0,0.97314,0.0,0.0,0.012397
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,0.017544,10.941057,5.3,1956,1.0,0.0,0.981795,0.018205,0.0,0.0


In [719]:
# get time of departure data
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/Time_of_departure_2019.csv', header = 1)

# get percentage with different commutes
temp['before_6am'] = (temp['Estimate!!Total:!!12:00 a.m. to 4:59 a.m.'] + temp['Estimate!!Total:!!5:00 a.m. to 5:29 a.m.'] + temp['Estimate!!Total:!!5:30 a.m. to 5:59 a.m.']) / temp['Estimate!!Total:']
temp = temp[['Geography', 'before_6am']]
temp.head()

Unnamed: 0,Geography,before_6am
0,1500000US010010201001,0.09772
1,1500000US010010201002,0.064935
2,1500000US010010202001,0.169291
3,1500000US010010202002,0.197872
4,1500000US010010203001,0.107789


In [720]:
# merge dfWalk with commute data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(149393, 39)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,log_Median_Household_Income,Median_Num_Rooms,Median_Year_Structure_Built,percent_pay_extra,percent_lacking_plumbing_renter,percent_drive,percent_public_transport,percent_bike,percent_walk,before_6am
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,10.870243,4.0,1981,1.0,0.0,0.866516,0.061086,0.0,0.072398,0.19457
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,10.415053,3.2,1984,0.612403,0.0,0.946619,0.0,0.0,0.053381,0.039146
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,11.915052,2.2,2017,1.0,0.0,0.822785,0.025316,0.0,0.014768,0.061125
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,10.400255,4.0,1985,1.0,0.0,0.97314,0.0,0.0,0.012397,0.046122
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,10.941057,5.3,1956,1.0,0.0,0.981795,0.018205,0.0,0.0,0.13264


In [721]:
# get travel time data
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/travel_time_2019.csv', header = 1)

# get percentage with different commutes
temp['less_than_15_minutes'] = (temp['Estimate!!Total:!!Less than 5 minutes'] + temp['Estimate!!Total:!!5 to 9 minutes'] + temp['Estimate!!Total:!!10 to 14 minutes']) / temp['Estimate!!Total:']
temp['more_than_1_hour'] = (temp['Estimate!!Total:!!90 or more minutes'] + temp['Estimate!!Total:!!60 to 89 minutes']) / temp['Estimate!!Total:']
temp = temp[['Geography', 'less_than_15_minutes', 'more_than_1_hour']]
temp.head()

Unnamed: 0,Geography,less_than_15_minutes,more_than_1_hour
0,1500000US010010201001,0.547231,0.0
1,1500000US010010201002,0.318182,0.00487
2,1500000US010010202001,0.275591,0.019685
3,1500000US010010202002,0.231915,0.025532
4,1500000US010010203001,0.303199,0.04242


In [722]:
# merge dfWalk with commute data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(149393, 41)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,Median_Year_Structure_Built,percent_pay_extra,percent_lacking_plumbing_renter,percent_drive,percent_public_transport,percent_bike,percent_walk,before_6am,less_than_15_minutes,more_than_1_hour
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,1981,1.0,0.0,0.866516,0.061086,0.0,0.072398,0.19457,0.106335,0.332579
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,1984,0.612403,0.0,0.946619,0.0,0.0,0.053381,0.039146,0.117438,0.0
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,2017,1.0,0.0,0.822785,0.025316,0.0,0.014768,0.061125,0.271394,0.0
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,1985,1.0,0.0,0.97314,0.0,0.0,0.012397,0.046122,0.171908,0.092243
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,1956,1.0,0.0,0.981795,0.018205,0.0,0.0,0.13264,0.16515,0.081925


In [723]:
# get move in data
temp = pd.read_csv('/Users/austincoffelt/Documents/Rent-Walk-local/Move_in_2019/Move_in_2019.csv', header = 1)
temp = temp.rename({'Estimate!!Median year householder moved into unit --!!Total:!!Renter occupied': 'Median_Year_Move_in'}, axis = 1)
temp = temp[['Geography', 'Median_Year_Move_in']]

#remove observations without numbers
temp = temp[temp['Median_Year_Move_in'] != '-']
temp = temp[temp['Median_Year_Move_in'] != '1990-']
temp.loc[(temp['Median_Year_Move_in'] == '2017+'), 'Median_Year_Move_in'] = 2018
temp['Median_Year_Move_in'] = temp['Median_Year_Move_in'].astype('int')
temp.head()

Unnamed: 0,Geography,Median_Year_Move_in
0,1500000US010010201001,2014
1,1500000US010010201002,2014
2,1500000US010010202001,2011
3,1500000US010010202002,2012
4,1500000US010010203001,2012


In [724]:
# merge dfWalk with move in data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(146255, 42)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,percent_pay_extra,percent_lacking_plumbing_renter,percent_drive,percent_public_transport,percent_bike,percent_walk,before_6am,less_than_15_minutes,more_than_1_hour,Median_Year_Move_in
0,1500000US481130078254,48113,14.0,Texas,460.0,0.549,0.163121,0.744681,99,191,...,1.0,0.0,0.866516,0.061086,0.0,0.072398,0.19457,0.106335,0.332579,2013
1,1500000US481130078253,48113,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,138,...,0.612403,0.0,0.946619,0.0,0.0,0.053381,0.039146,0.117438,0.0,2015
2,1500000US481130078242,48113,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,404,...,1.0,0.0,0.822785,0.025316,0.0,0.014768,0.061125,0.271394,0.0,2018
3,1500000US481130078271,48113,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,206,...,1.0,0.0,0.97314,0.0,0.0,0.012397,0.046122,0.171908,0.092243,2014
4,1500000US481130093012,48113,8.0,Texas,451.0,0.644,0.029891,0.915761,117,137,...,1.0,0.0,0.981795,0.018205,0.0,0.0,0.13264,0.16515,0.081925,2011


In [725]:
list(dfWalk.columns)

['realGEOID',
 'county',
 'Walk_Index',
 'state',
 'count_housing_units',
 'percentage_work_age',
 'percent_no_car',
 'percent_two_car',
 'count_low_wage_workers',
 'count_high_wage_workers',
 'percent_low_wage_workers',
 'TotEmp',
 'housing_density',
 'population_density',
 'employent_housing_density',
 'jobs_per_household',
 'jobs_within_45_minutes_auto',
 'regional_centrality',
 'Population',
 'percent_non_white',
 'log_Median_Contract_Rent',
 'avg_HH_size_renters',
 'percent_gas_energy',
 'percent_elec_energy',
 'percent_solar_energy',
 'percent_no_energy',
 'percent_no_internet',
 'percent_lacking_kitchen_renter',
 'percent_lives_alone',
 'log_Median_Household_Income',
 'Median_Num_Rooms',
 'Median_Year_Structure_Built',
 'percent_pay_extra',
 'percent_lacking_plumbing_renter',
 'percent_drive',
 'percent_public_transport',
 'percent_bike',
 'percent_walk',
 'before_6am',
 'less_than_15_minutes',
 'more_than_1_hour',
 'Median_Year_Move_in']

In [726]:
# separate into X and Y
dfWalk = dfWalk.dropna()
y = dfWalk['log_Median_Contract_Rent']
X = dfWalk.drop(['county', 'log_Median_Contract_Rent', 'Population', 'state', 'realGEOID'], axis = 1)

# Standardize features 
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled = scaler.fit_transform(X)

X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_scaled.head()

Unnamed: 0,Walk_Index,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,TotEmp,housing_density,...,percent_pay_extra,percent_lacking_plumbing_renter,percent_drive,percent_public_transport,percent_bike,percent_walk,before_6am,less_than_15_minutes,more_than_1_hour,Median_Year_Move_in
0,0.990707,-0.554996,-0.577276,0.57382,0.942852,-0.630078,-0.4202,0.020348,-0.303321,0.158313,...,0.570197,-0.187836,0.027505,0.155121,-0.281897,0.789506,0.505759,-1.007085,2.780281,-0.249505
1,-0.340367,-0.787309,2.360603,-0.28057,-0.255816,-0.24618,-0.632937,0.939894,-0.334041,0.756261,...,-1.970617,-0.187836,0.569907,-0.419931,-0.281897,0.456822,-1.080325,-0.943503,-0.990139,0.487408
2,0.090274,-0.841108,-1.059447,-0.630642,1.172424,-0.713083,0.434764,-1.409829,-0.319146,-0.152327,...,0.570197,-0.187836,-0.268611,-0.181606,-0.281897,-0.218658,-0.856034,-0.061863,-0.990139,1.592777
3,-0.692711,-0.320239,-0.139958,-0.210451,-1.672659,-0.173551,-0.359991,0.084628,-0.332645,0.52994,...,0.570197,-0.187836,0.74949,-0.419931,-0.281897,-0.26014,-1.009139,-0.631577,0.055614,0.118952
4,-0.418666,-0.577005,0.487986,-0.506468,1.807818,-0.443317,-0.636951,0.071138,-0.333575,-0.15691,...,0.570197,-0.187836,0.808089,-0.248548,-0.281897,-0.477004,-0.126232,-0.670278,-0.061367,-0.986418


In [727]:
# use lasso to find best variables to use
lassoParam = {'alpha':np.logspace(-4, 2, 200)}
lasso = GridSearchCV(Lasso(), lassoParam, cv = RepeatedKFold(n_splits = 5, n_repeats = 3, random_state = 1), scoring = ('r2', 'neg_mean_squared_error'), refit = 'neg_mean_squared_error', n_jobs = -1)
lasso.fit(X_scaled, y)

# Get the best model (refitted on full data)
best_lasso = lasso.best_estimator_

# Extract coefficients and select non-zero features
coef = pd.Series(best_lasso.coef_, index=X.columns)
selected_features = coef[np.abs(coef) >= 0.01].index.tolist()

print("Selected features:", selected_features)
print("Features eliminated:", set(X.columns) - set(selected_features))

Selected features: ['Walk_Index', 'percent_no_car', 'percent_two_car', 'count_low_wage_workers', 'count_high_wage_workers', 'percent_low_wage_workers', 'housing_density', 'population_density', 'jobs_within_45_minutes_auto', 'regional_centrality', 'percent_non_white', 'avg_HH_size_renters', 'percent_gas_energy', 'percent_elec_energy', 'percent_solar_energy', 'percent_no_energy', 'percent_no_internet', 'percent_lacking_kitchen_renter', 'percent_lives_alone', 'log_Median_Household_Income', 'Median_Num_Rooms', 'Median_Year_Structure_Built', 'percent_pay_extra', 'percent_drive', 'percent_public_transport', 'percent_walk', 'before_6am', 'less_than_15_minutes', 'more_than_1_hour', 'Median_Year_Move_in']
Features eliminated: {'jobs_per_household', 'employent_housing_density', 'percentage_work_age', 'percent_lacking_plumbing_renter', 'count_housing_units', 'TotEmp', 'percent_bike'}


In [728]:
X = dfWalk[selected_features]
print(X.shape)
X.head()

(146247, 30)


Unnamed: 0,Walk_Index,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,housing_density,population_density,jobs_within_45_minutes_auto,regional_centrality,...,Median_Num_Rooms,Median_Year_Structure_Built,percent_pay_extra,percent_drive,percent_public_transport,percent_walk,before_6am,less_than_15_minutes,more_than_1_hour,Median_Year_Move_in
0,14.0,0.163121,0.744681,99,191,0.240291,6.250422,16.332625,433601,0.785893,...,4.0,1981,1.0,0.866516,0.061086,0.072398,0.19457,0.106335,0.332579,2013
1,8.333333,0.057751,0.507599,136,138,0.293737,13.843035,27.951553,404573,0.733281,...,3.2,1984,0.612403,0.946619,0.0,0.053381,0.039146,0.117438,0.0,2015
2,10.166667,0.014577,0.790087,91,404,0.157168,2.305992,6.373413,335700,0.60845,...,2.2,2017,1.0,0.822785,0.025316,0.014768,0.061125,0.271394,0.0,2018
3,6.833333,0.066398,0.227364,143,206,0.244027,10.969254,26.357776,402287,0.729137,...,4.0,1985,1.0,0.97314,0.0,0.012397,0.046122,0.171908,0.092243,2014
4,8.0,0.029891,0.915761,117,137,0.243243,2.247799,7.680394,263813,0.478156,...,5.3,1956,1.0,0.981795,0.018205,0.0,0.13264,0.16515,0.081925,2011


In [729]:
X.cov()

Unnamed: 0,Walk_Index,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,housing_density,population_density,jobs_within_45_minutes_auto,regional_centrality,...,Median_Num_Rooms,Median_Year_Structure_Built,percent_pay_extra,percent_drive,percent_public_transport,percent_walk,before_6am,less_than_15_minutes,more_than_1_hour,Median_Year_Move_in
Walk_Index,18.12399,0.155606,-0.338875,-3.538944,32.66676,-0.006063,12.840323,28.99758,296713.1,0.544371,...,-1.325811,-15.140124,-0.046199,-0.222366,0.158216,0.043998,-0.079066,-0.063266,0.021413,0.325226
percent_no_car,0.155606,0.01521,-0.017191,-1.398846,-5.493958,0.001145,0.664876,1.368305,7682.982,0.007667,...,-0.042923,-0.432087,-0.005496,-0.011005,0.008168,0.002611,-0.000827,-0.000935,0.001765,-0.046845
percent_two_car,-0.338875,-0.017191,0.03912,2.95576,12.40518,-0.002184,-0.845807,-1.622372,-10041.77,-0.018352,...,0.094679,0.634669,0.005738,0.013312,-0.00975,-0.003512,0.002362,-0.001543,-0.000721,0.008043
count_low_wage_workers,-3.538944,-1.398846,2.95576,9289.114686,18120.61,0.389151,-49.343426,-46.17501,-251922.0,0.332995,...,4.790875,492.601916,0.961052,1.421015,-0.687191,-0.561898,-0.132355,-2.091563,0.028499,21.856186
count_high_wage_workers,32.666762,-5.493958,12.405176,18120.609279,62067.84,-5.517496,99.015457,159.527,3250783.0,-3.612998,...,11.649909,1370.909627,2.243866,-0.314931,0.267625,-1.106649,-3.093065,-9.742198,1.573452,61.682257
percent_low_wage_workers,-0.006063,0.001145,-0.002184,0.389151,-5.517496,0.003378,-0.074162,-0.1215079,-1286.45,0.002503,...,0.000545,-0.129533,-0.000562,0.000514,-0.000442,0.000202,0.000523,0.002632,-0.000658,0.002358
housing_density,12.840323,0.664876,-0.845807,-49.343426,99.01546,-0.074162,161.234306,291.8756,835166.4,0.528691,...,-2.605948,-25.210985,-0.172419,-0.870801,0.651003,0.166289,-0.139227,-0.309042,0.150047,-3.234285
population_density,28.997582,1.368305,-1.622372,-46.175014,159.527,-0.121508,291.875556,610.819,1900123.0,1.13454,...,-4.731689,-60.558802,-0.327869,-1.825049,1.446841,0.312139,-0.230465,-0.748544,0.415098,-8.936953
jobs_within_45_minutes_auto,296713.145037,7682.981504,-10041.770266,-251922.012195,3250783.0,-1286.450274,835166.402584,1900123.0,20956090000.0,11508.144197,...,-33592.614698,-493568.721131,-1424.504615,-11781.658346,10207.789154,1308.450809,-2180.272386,-8629.693306,3897.745577,-65636.573243
regional_centrality,0.544371,0.007667,-0.018352,0.332995,-3.612998,0.002503,0.528691,1.13454,11508.14,0.079197,...,-0.054492,-0.692738,-0.001883,-0.006041,0.004973,0.00159,-0.005159,0.004966,-0.005203,0.070804


In [730]:
cov = pd.DataFrame(X.cov().loc[:, 'Walk_Index'])
cov = cov[np.abs(cov['Walk_Index']) >= 0.02]
cov.index.to_list()

['Walk_Index',
 'percent_no_car',
 'percent_two_car',
 'count_low_wage_workers',
 'count_high_wage_workers',
 'housing_density',
 'population_density',
 'jobs_within_45_minutes_auto',
 'regional_centrality',
 'percent_non_white',
 'avg_HH_size_renters',
 'percent_gas_energy',
 'percent_elec_energy',
 'percent_no_internet',
 'percent_lives_alone',
 'log_Median_Household_Income',
 'Median_Num_Rooms',
 'Median_Year_Structure_Built',
 'percent_pay_extra',
 'percent_drive',
 'percent_public_transport',
 'percent_walk',
 'before_6am',
 'less_than_15_minutes',
 'more_than_1_hour',
 'Median_Year_Move_in']

In [731]:
X = X[cov.index.to_list()]
X.head()

Unnamed: 0,Walk_Index,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,housing_density,population_density,jobs_within_45_minutes_auto,regional_centrality,percent_non_white,...,Median_Num_Rooms,Median_Year_Structure_Built,percent_pay_extra,percent_drive,percent_public_transport,percent_walk,before_6am,less_than_15_minutes,more_than_1_hour,Median_Year_Move_in
0,14.0,0.163121,0.744681,99,191,6.250422,16.332625,433601,0.785893,0.423313,...,4.0,1981,1.0,0.866516,0.061086,0.072398,0.19457,0.106335,0.332579,2013
1,8.333333,0.057751,0.507599,136,138,13.843035,27.951553,404573,0.733281,0.748106,...,3.2,1984,0.612403,0.946619,0.0,0.053381,0.039146,0.117438,0.0,2015
2,10.166667,0.014577,0.790087,91,404,2.305992,6.373413,335700,0.60845,0.046953,...,2.2,2017,1.0,0.822785,0.025316,0.014768,0.061125,0.271394,0.0,2018
3,6.833333,0.066398,0.227364,143,206,10.969254,26.357776,402287,0.729137,0.511927,...,4.0,1985,1.0,0.97314,0.0,0.012397,0.046122,0.171908,0.092243,2014
4,8.0,0.029891,0.915761,117,137,2.247799,7.680394,263813,0.478156,0.157076,...,5.3,1956,1.0,0.981795,0.018205,0.0,0.13264,0.16515,0.081925,2011


In [732]:
# define the function remove_predictors (from homework) 
def remove_predictors(data, threshold):    
    
    data_reduced=data.copy()
    
    while True:
        # Calculate the correlation matrix of the predictors
        corr_mat = data_reduced.corr()
        # Get the absolute pairwise correlations
        corr_mat = np.abs(corr_mat)
        np.fill_diagonal(corr_mat.values, 0)
        corr_mat=corr_mat.fillna(0)
        
        if corr_mat.max().max()<threshold:
            break
        

        # Determine the two predictors associated with the largest absolute pairwise correlation
        max_corr = np.unravel_index(np.argmax(corr_mat), corr_mat.shape)
        predictor_A = max_corr[0]
        predictor_B = max_corr[1]
        
        # Determine the average absolute correlation between A and the other variables
        avg_corr_A = np.mean(corr_mat.iloc[predictor_A, :])
        avg_corr_B = np.mean(corr_mat.iloc[predictor_B, :])
        
        # If A has a larger average correlation, remove it; otherwise, remove predictor B
        if avg_corr_A > avg_corr_B:
            remove_predictor = predictor_A
        else:
            remove_predictor = predictor_B
        
        # Remove the predictor
        
        print('Removed: '+str(data_reduced.columns.tolist()[remove_predictor]))
        
        del data_reduced[data_reduced.columns.tolist()[remove_predictor]]
        
        
        
    
    
    return data_reduced

In [733]:
Xreduced = remove_predictors(X, 0.75)

Removed: population_density
Removed: percent_public_transport
Removed: count_high_wage_workers


In [734]:
Xreduced.shape

(146247, 23)

In [735]:
# get dummies for the counties
countyDummies = pd.get_dummies(dfWalk.loc[:, 'county'] , drop_first = False, dtype = 'int')

# concatinate with our dataframe
XReducedDummies = pd.concat([Xreduced, countyDummies], axis = 1)

# drop state and california for multicoliniarity issues
XReducedDummies = XReducedDummies.drop('72137' , axis = 1)

#add intercept
XReducedDummies['Intercept'] = 1

XReducedDummies.head()

Unnamed: 0,Walk_Index,percent_no_car,percent_two_car,count_low_wage_workers,housing_density,jobs_within_45_minutes_auto,regional_centrality,percent_non_white,avg_HH_size_renters,percent_gas_energy,...,72135,72139,72141,72143,72145,72147,72149,72151,72153,Intercept
0,14.0,0.163121,0.744681,99,6.250422,433601,0.785893,0.423313,2.55,0.22973,...,0,0,0,0,0,0,0,0,0,1
1,8.333333,0.057751,0.507599,136,13.843035,404573,0.733281,0.748106,2.05,0.0,...,0,0,0,0,0,0,0,0,0,1
2,10.166667,0.014577,0.790087,91,2.305992,335700,0.60845,0.046953,2.16,0.740331,...,0,0,0,0,0,0,0,0,0,1
3,6.833333,0.066398,0.227364,143,10.969254,402287,0.729137,0.511927,2.72,0.0181,...,0,0,0,0,0,0,0,0,0,1
4,8.0,0.029891,0.915761,117,2.247799,263813,0.478156,0.157076,4.81,0.97076,...,0,0,0,0,0,0,0,0,0,1


In [736]:
mod = sm.OLS(y, XReducedDummies, axis = 1)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,log_Median_Contract_Rent,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.804
Method:,Least Squares,F-statistic:,186.0
Date:,"Mon, 28 Apr 2025",Prob (F-statistic):,0.0
Time:,13:26:31,Log-Likelihood:,14220.0
No. Observations:,146247,AIC:,-21970.0
Df Residuals:,143011,BIC:,10050.0
Df Model:,3235,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Walk_Index,0.0024,0.000,10.018,0.000,0.002,0.003
percent_no_car,-0.2892,0.009,-32.159,0.000,-0.307,-0.272
percent_two_car,-0.0663,0.006,-10.604,0.000,-0.079,-0.054
count_low_wage_workers,2.959e-05,7.28e-06,4.062,0.000,1.53e-05,4.39e-05
housing_density,0.0009,5.95e-05,15.935,0.000,0.001,0.001
jobs_within_45_minutes_auto,-7.484e-08,1.51e-08,-4.964,0.000,-1.04e-07,-4.53e-08
regional_centrality,0.1756,0.005,37.120,0.000,0.166,0.185
percent_non_white,-0.0743,0.004,-21.003,0.000,-0.081,-0.067
avg_HH_size_renters,0.0217,0.001,22.018,0.000,0.020,0.024

0,1,2,3
Omnibus:,19420.757,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,113550.965
Skew:,-0.503,Prob(JB):,0.0
Kurtosis:,7.198,Cond. No.,660000000.0


In [737]:
robust_results = res.get_robustcov_results(cov_type='HC1')
robust_results.summary()

0,1,2,3
Dep. Variable:,log_Median_Contract_Rent,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.804
Method:,Least Squares,F-statistic:,99680.0
Date:,"Mon, 28 Apr 2025",Prob (F-statistic):,0.0
Time:,13:26:51,Log-Likelihood:,14220.0
No. Observations:,146247,AIC:,-21970.0
Df Residuals:,143011,BIC:,10050.0
Df Model:,3235,,
Covariance Type:,HC1,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Walk_Index,0.0024,0.000,9.849,0.000,0.002,0.003
percent_no_car,-0.2892,0.011,-25.925,0.000,-0.311,-0.267
percent_two_car,-0.0663,0.007,-9.612,0.000,-0.080,-0.053
count_low_wage_workers,2.959e-05,6.81e-06,4.345,0.000,1.62e-05,4.29e-05
housing_density,0.0009,0.000,6.266,0.000,0.001,0.001
jobs_within_45_minutes_auto,-7.484e-08,1.69e-08,-4.418,0.000,-1.08e-07,-4.16e-08
regional_centrality,0.1756,0.005,35.039,0.000,0.166,0.185
percent_non_white,-0.0743,0.004,-19.919,0.000,-0.082,-0.067
avg_HH_size_renters,0.0217,0.001,20.463,0.000,0.020,0.024

0,1,2,3
Omnibus:,19420.757,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,113550.965
Skew:,-0.503,Prob(JB):,0.0
Kurtosis:,7.198,Cond. No.,660000000.0


In [765]:
# get crime data
acCrime = pd.read_csv('RW Data/Alameda_County_Violent_Crime_BG.csv')
phCrime = pd.read_csv('RW Data/Philly_Violent_Crime_BG.csv')
btnCrime = pd.read_csv('RW Data/Boston_Violent_Crime_BG.csv')
buffCrime = pd.read_csv('RW Data/Buffalo_Violent_Crime_BG.csv')
chattCrime = pd.read_csv('RW Data/Chatta_Violent_Crime_BG.csv')
cinciCrime = pd.read_csv('RW Data/Cinci_Violent_Crime_BG.csv')
DetCrime = pd.read_csv('RW Data/Detroit_Violent_Crime_BG.csv')
gCrime = pd.read_csv('RW Data/Gaines_Violent_Crime_BG.csv')
johnCrime = pd.read_csv('RW Data/J_Creek_violent_crime_by_BG.csv')
oakCrime = pd.read_csv('RW Data/Oakland_Violent_Crime_BG.csv')
ralCrime = pd.read_csv('RW Data/Ral_Violent_Crime_BG.csv')
laCrime = pd.read_csv('RW Data/LA_Violent_Crime_BG.csv')
noCrime = pd.read_csv('RW Data/NO_Violent_Crime_BG.csv')
auCrime = pd.read_csv('RW Data/AustinTX_violent_crime_by_BG.csv')
chiCrime = pd.read_csv('RW Data/Chicago_Violent_Crime_BG.csv')
montCrime = pd.read_csv('RW Data/MontMD_violent_crime_by_BG.csv')
nyCrime = pd.read_csv('RW Data/NY_Violent_Crime_BG.csv')
camCrime = pd.read_csv('RW Data/Cambridge_Violent_Crime_BG.csv')
huCrime = pd.read_csv('RW Data/Houston_Violent_Crime_BG.csv')
sfCrime = pd.read_csv('RW Data/SF_Violent_Crime_BG.csv')
dcCrime = pd.read_csv('RW Data/DC_Violent_Crime_BG.csv')
ohCrime = pd.read_csv('RW Data/Omaha_violent_crime_by_BG.csv')
tmCrime = pd.read_csv('RW Data/Tempe_Violent_Crime_BG.csv')
crime = pd.concat([laCrime, noCrime, auCrime, chiCrime, montCrime, nyCrime, camCrime, huCrime, sfCrime, acCrime, btnCrime, 
                   buffCrime, chattCrime, cinciCrime, DetCrime, gCrime, johnCrime, oakCrime, ralCrime, phCrime, dcCrime, ohCrime, tmCrime])
print(crime.shape)
crime.head()

(16433, 2)


Unnamed: 0,AFFGEOID,count
0,1500000US060372077101,248
1,1500000US060372260021,183
2,1500000US060372073012,154
3,1500000US060372088011,153
4,1500000US060371907001,137


In [767]:
# merge dfWalk with crime data on GEOID
dfWalk = dfWalk.merge(crime, how = 'inner', left_on = 'realGEOID', right_on = 'AFFGEOID')
dfWalk['violent_crime_rate'] = (dfWalk['count'] / dfWalk['Population']) * 100000
dfWalk = dfWalk.drop(['AFFGEOID', 'count'], axis = 1)
print(dfWalk.shape)
dfWalk.head()

(9506, 43)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,percent_lacking_plumbing_renter,percent_drive,percent_public_transport,percent_bike,percent_walk,before_6am,less_than_15_minutes,more_than_1_hour,Median_Year_Move_in,violent_crime_rate
0,1500000US481576721002,48157,11.0,Texas,899.0,0.547,0.11985,0.586767,151,582,...,0.0,0.815578,0.0,0.0,0.013746,0.142265,0.259669,0.096685,2017,94.562648
1,1500000US483396923005,48339,8.0,Texas,3249.0,0.722,0.0,0.857593,509,1717,...,0.0,0.950431,0.003412,0.0,0.016055,0.162229,0.207018,0.157276,2013,11.326311
2,1500000US484530023144,48453,11.0,Texas,702.0,0.811,0.234168,0.309278,186,257,...,0.027933,0.791239,0.052704,0.0,0.060233,0.216876,0.351464,0.158996,2017,138.121547
3,1500000US484530002041,48453,18.333333,Texas,795.0,0.822,0.231132,0.34434,126,418,...,0.0,0.589595,0.111272,0.147399,0.102601,0.01037,0.322963,0.051852,2016,485.044462
4,1500000US484530015032,48453,18.666667,Texas,358.0,0.974,0.162264,0.532075,64,137,...,0.0,0.76,0.107273,0.018182,0.0,0.063655,0.398357,0.13963,2018,448.430493


In [769]:
dfWalk = dfWalk.drop_duplicates(subset = ['realGEOID'])

In [771]:
crimeQuartile = np.quantile(dfWalk['violent_crime_rate'], [0.25, 0.75])

#get interaction
dfWalk['high_crime'] = 0
dfWalk.loc[(dfWalk['violent_crime_rate'] >= crimeQuartile[1]), 'high_crime'] = 1
dfWalk['low_crime'] = 0
dfWalk.loc[(dfWalk['violent_crime_rate'] >= crimeQuartile[0]), 'low_crime'] = 1
dfWalk['Walk_index_x_high_crime'] = dfWalk['high_crime'] * dfWalk['Walk_Index']
dfWalk['Walk_index_x_low_crime'] = dfWalk['low_crime'] * dfWalk['Walk_Index']

print(dfWalk.shape)
dfWalk.head()

(9496, 47)


Unnamed: 0,realGEOID,county,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,percent_walk,before_6am,less_than_15_minutes,more_than_1_hour,Median_Year_Move_in,violent_crime_rate,high_crime,low_crime,Walk_index_x_high_crime,Walk_index_x_low_crime
0,1500000US481576721002,48157,11.0,Texas,899.0,0.547,0.11985,0.586767,151,582,...,0.013746,0.142265,0.259669,0.096685,2017,94.562648,0,0,0.0,0.0
1,1500000US483396923005,48339,8.0,Texas,3249.0,0.722,0.0,0.857593,509,1717,...,0.016055,0.162229,0.207018,0.157276,2013,11.326311,0,0,0.0,0.0
2,1500000US484530023144,48453,11.0,Texas,702.0,0.811,0.234168,0.309278,186,257,...,0.060233,0.216876,0.351464,0.158996,2017,138.121547,0,0,0.0,0.0
3,1500000US484530002041,48453,18.333333,Texas,795.0,0.822,0.231132,0.34434,126,418,...,0.102601,0.01037,0.322963,0.051852,2016,485.044462,0,1,0.0,18.333333
4,1500000US484530015032,48453,18.666667,Texas,358.0,0.974,0.162264,0.532075,64,137,...,0.0,0.063655,0.398357,0.13963,2018,448.430493,0,1,0.0,18.666667


In [773]:
impColumns = list(Xreduced.columns) + ['high_crime', 'low_crime', 'Walk_index_x_high_crime', 'Walk_index_x_low_crime']
X = dfWalk.loc[:, impColumns]
print(X.shape)
X.head()

(9496, 27)


Unnamed: 0,Walk_Index,percent_no_car,percent_two_car,count_low_wage_workers,housing_density,jobs_within_45_minutes_auto,regional_centrality,percent_non_white,avg_HH_size_renters,percent_gas_energy,...,percent_drive,percent_walk,before_6am,less_than_15_minutes,more_than_1_hour,Median_Year_Move_in,high_crime,low_crime,Walk_index_x_high_crime,Walk_index_x_low_crime
0,11.0,0.11985,0.586767,151,1.285888,214839,0.392794,0.227423,1.76,0.590147,...,0.815578,0.013746,0.142265,0.259669,0.096685,2017,0,0,0.0,0.0
1,8.0,0.0,0.857593,509,1.115687,136603,0.249754,0.09129,2.13,0.571906,...,0.950431,0.016055,0.162229,0.207018,0.157276,2013,0,0,0.0,0.0
2,11.0,0.234168,0.309278,186,12.915369,173647,0.701369,0.381676,3.03,0.114525,...,0.791239,0.060233,0.216876,0.351464,0.158996,2017,0,0,0.0,0.0
3,18.333333,0.231132,0.34434,126,6.261999,236060,0.953458,0.198868,1.79,0.346812,...,0.589595,0.102601,0.01037,0.322963,0.051852,2016,0,1,0.0,18.333333
4,18.666667,0.162264,0.532075,64,4.377045,217234,0.877419,0.325859,2.5,0.341818,...,0.76,0.0,0.063655,0.398357,0.13963,2018,0,1,0.0,18.666667


In [775]:
# get dummies for the counties
countyDummies = pd.get_dummies(dfWalk.loc[:, 'county'] , drop_first = False, dtype = 'int')

# concatinate with our dataframe
XReducedDummiesCrime = pd.concat([X, countyDummies], axis = 1)

# drop state and california for multicoliniarity issues
XReducedDummiesCrime = XReducedDummiesCrime.drop('42091' , axis = 1)

#add intercept
XReducedDummiesCrime['Intercept'] = 1

XReducedDummiesCrime.head()

Unnamed: 0,Walk_Index,percent_no_car,percent_two_car,count_low_wage_workers,housing_density,jobs_within_45_minutes_auto,regional_centrality,percent_non_white,avg_HH_size_renters,percent_gas_energy,...,47065,48039,48157,48167,48201,48339,48407,48453,48491,Intercept
0,11.0,0.11985,0.586767,151,1.285888,214839,0.392794,0.227423,1.76,0.590147,...,0,0,1,0,0,0,0,0,0,1
1,8.0,0.0,0.857593,509,1.115687,136603,0.249754,0.09129,2.13,0.571906,...,0,0,0,0,0,1,0,0,0,1
2,11.0,0.234168,0.309278,186,12.915369,173647,0.701369,0.381676,3.03,0.114525,...,0,0,0,0,0,0,0,1,0,1
3,18.333333,0.231132,0.34434,126,6.261999,236060,0.953458,0.198868,1.79,0.346812,...,0,0,0,0,0,0,0,1,0,1
4,18.666667,0.162264,0.532075,64,4.377045,217234,0.877419,0.325859,2.5,0.341818,...,0,0,0,0,0,0,0,1,0,1


In [777]:
yCrime = dfWalk['log_Median_Contract_Rent']
mod = sm.OLS(yCrime, XReducedDummiesCrime, axis = 1)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,log_Median_Contract_Rent,R-squared:,0.763
Model:,OLS,Adj. R-squared:,0.761
Method:,Least Squares,F-statistic:,445.1
Date:,"Mon, 28 Apr 2025",Prob (F-statistic):,0.0
Time:,13:33:42,Log-Likelihood:,642.52
No. Observations:,9496,AIC:,-1147.0
Df Residuals:,9427,BIC:,-653.1
Df Model:,68,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Walk_Index,0.0040,0.002,2.322,0.020,0.001,0.007
percent_no_car,-0.1807,0.025,-7.147,0.000,-0.230,-0.131
percent_two_car,-0.1585,0.024,-6.692,0.000,-0.205,-0.112
count_low_wage_workers,4.429e-05,3.31e-05,1.336,0.182,-2.07e-05,0.000
housing_density,0.0011,0.000,7.432,0.000,0.001,0.001
jobs_within_45_minutes_auto,7.533e-08,5.19e-08,1.450,0.147,-2.65e-08,1.77e-07
regional_centrality,-0.0727,0.038,-1.903,0.057,-0.148,0.002
percent_non_white,-0.0899,0.012,-7.752,0.000,-0.113,-0.067
avg_HH_size_renters,0.0325,0.004,7.713,0.000,0.024,0.041

0,1,2,3
Omnibus:,1636.318,Durbin-Watson:,1.906
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7338.916
Skew:,-0.778,Prob(JB):,0.0
Kurtosis:,7.016,Cond. No.,313000000.0


In [779]:
robust_results = res.get_robustcov_results(cov_type='HC1')
robust_results.summary()

0,1,2,3
Dep. Variable:,log_Median_Contract_Rent,R-squared:,0.763
Model:,OLS,Adj. R-squared:,0.761
Method:,Least Squares,F-statistic:,14980.0
Date:,"Mon, 28 Apr 2025",Prob (F-statistic):,0.0
Time:,13:33:42,Log-Likelihood:,642.52
No. Observations:,9496,AIC:,-1147.0
Df Residuals:,9427,BIC:,-653.1
Df Model:,68,,
Covariance Type:,HC1,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Walk_Index,0.0040,0.002,2.426,0.015,0.001,0.007
percent_no_car,-0.1807,0.028,-6.446,0.000,-0.236,-0.126
percent_two_car,-0.1585,0.026,-6.137,0.000,-0.209,-0.108
count_low_wage_workers,4.429e-05,3.14e-05,1.409,0.159,-1.73e-05,0.000
housing_density,0.0011,0.000,6.868,0.000,0.001,0.001
jobs_within_45_minutes_auto,7.533e-08,5.67e-08,1.329,0.184,-3.58e-08,1.86e-07
regional_centrality,-0.0727,0.041,-1.762,0.078,-0.154,0.008
percent_non_white,-0.0899,0.012,-7.650,0.000,-0.113,-0.067
avg_HH_size_renters,0.0325,0.004,7.596,0.000,0.024,0.041

0,1,2,3
Omnibus:,1636.318,Durbin-Watson:,1.906
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7338.916
Skew:,-0.778,Prob(JB):,0.0
Kurtosis:,7.016,Cond. No.,313000000.0
