In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [3]:
# read in dataset
dfWalk = pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/EPA_SmartLocationDatabase_V3_Jan_2021_Final.csv')
dfWalk.head()

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area
0,1,481130000000.0,481130000000.0,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.184697,0.000476,0.137707,6,14,15,17,14.0,3110.36082,297836.0831
1,2,481130000000.0,481130000000.0,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.323221,0.000801,0.231868,3,10,12,14,10.833333,3519.46911,484945.1466
2,3,481130000000.0,481130000000.0,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.314628,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,106705.9281
3,4,481130000000.0,481130000000.0,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.229821,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,481828.4303
4,5,481130000000.0,481130000000.0,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.164863,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,687684.7752


In [5]:
# change state, county, tract, and blk group to strings
dfWalk.loc[:, ['STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE']] = dfWalk.loc[:, ['STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE']].astype('str')

In [7]:
# add 0s to the front to create the geoIDs
for i in range(len(dfWalk)):
    dfWalk.loc[i, 'STATEFP'] = dfWalk.loc[i, 'STATEFP'].zfill(2)
    dfWalk.loc[i, 'COUNTYFP'] = dfWalk.loc[i, 'COUNTYFP'].zfill(3)
    dfWalk.loc[i, 'TRACTCE'] = dfWalk.loc[i, 'TRACTCE'].zfill(6)
    
dfWalk.head()

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area
0,1,481130000000.0,481130000000.0,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.184697,0.000476,0.137707,6,14,15,17,14.0,3110.36082,297836.0831
1,2,481130000000.0,481130000000.0,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.323221,0.000801,0.231868,3,10,12,14,10.833333,3519.46911,484945.1466
2,3,481130000000.0,481130000000.0,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.314628,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,106705.9281
3,4,481130000000.0,481130000000.0,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.229821,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,481828.4303
4,5,481130000000.0,481130000000.0,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.164863,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,687684.7752


In [9]:
# get county names and FIP code 
counties = pd.read_csv('Counties - Sheet1.csv', header = None, names = ['FIPS Code', 'County'])

# Zfill FIP code to join with main dataframe
counties.loc[:, 'FIPS Code'] = counties.loc[:, 'FIPS Code'].astype('str')
for i in range(len(counties)):
    counties.loc[i, 'FIPS Code'] = counties.loc[i, 'FIPS Code'].zfill(5)


counties.head()

Unnamed: 0,FIPS Code,County
0,46137,Ziebach County
1,48507,Zavala County
2,48505,Zapata County
3,4027,Yuma County
4,8125,Yuma County


In [11]:
# create column to merge
dfWalk['FIPS_code_county'] = dfWalk['STATEFP'] + dfWalk['COUNTYFP']
dfWalk.head()

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area,FIPS_code_county
0,1,481130000000.0,481130000000.0,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000476,0.137707,6,14,15,17,14.0,3110.36082,297836.0831,48113
1,2,481130000000.0,481130000000.0,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000801,0.231868,3,10,12,14,10.833333,3519.46911,484945.1466,48113
2,3,481130000000.0,481130000000.0,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,106705.9281,48113
3,4,481130000000.0,481130000000.0,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,481828.4303,48113
4,5,481130000000.0,481130000000.0,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,687684.7752,48113


In [13]:
# merge dfWalk with counties
dfWalk = dfWalk.merge(counties, how = 'left', left_on = 'FIPS_code_county', right_on = 'FIPS Code')
dfWalk = dfWalk.drop(['FIPS_code_county', 'FIPS Code'], axis = 1)
print(dfWalk.shape)
dfWalk.head()

(220740, 118)


Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area,County
0,1,481130000000.0,481130000000.0,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000476,0.137707,6,14,15,17,14.0,3110.36082,297836.0831,Dallas County
1,2,481130000000.0,481130000000.0,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000801,0.231868,3,10,12,14,10.833333,3519.46911,484945.1466,Dallas County
2,3,481130000000.0,481130000000.0,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,106705.9281,Dallas County
3,4,481130000000.0,481130000000.0,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,481828.4303,Dallas County
4,5,481130000000.0,481130000000.0,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,687684.7752,Dallas County


In [15]:
# create realGeoID for merging
dfWalk['realGEOID'] = '1500000US' + dfWalk['STATEFP'] + dfWalk['COUNTYFP'] + dfWalk['TRACTCE'] + dfWalk['BLKGRPCE']
dfWalk.head()

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area,County,realGEOID
0,1,481130000000.0,481130000000.0,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.137707,6,14,15,17,14.0,3110.36082,297836.0831,Dallas County,1500000US481130078254
1,2,481130000000.0,481130000000.0,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.231868,3,10,12,14,10.833333,3519.46911,484945.1466,Dallas County,1500000US481130078252
2,3,481130000000.0,481130000000.0,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.213146,1,1,7,17,8.333333,1697.091802,106705.9281,Dallas County,1500000US481130078253
3,4,481130000000.0,481130000000.0,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.205018,16,10,17,17,15.666667,2922.609204,481828.4303,Dallas County,1500000US481130078241
4,5,481130000000.0,481130000000.0,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.125296,4,7,11,14,10.166667,3731.971773,687684.7752,Dallas County,1500000US481130078242


In [17]:
# create intercept constant
dfWalk['Intercept'] = 1

In [19]:
# select and rename the most important columns of our dataframe
imptCols = ['realGEOID','County', 'Intercept', 'NatWalkInd', 'STATEFP', 'CountHU', 'P_WrkAge', 'Pct_AO0',
            'Pct_AO2p', 'R_LowWageWk', 'R_HiWageWk', 'R_PCTLOWWAGE', 'TotEmp', 'D1A', 
            'D1B', 'D1D', 'D2A_JPHH', 'D4E', 'D5AR', 'D5BR', 'D5CRI']
dfWalk = dfWalk.loc[:, imptCols]
dfWalk = dfWalk.rename({'NatWalkInd':'Walk_Index',
                        'STATEFP':'state',
                        'CountHU':'count_housing_units',
                        'P_WrkAge':'percentage_work_age',
                        'Pct_AO0':'percent_no_car',
                        'Pct_AO2p':'percent_two_car',
                        'R_LowWageWk':'count_low_wage_workers',
                        'R_HiWageWk':'count_high_wage_workers',
                        'R_PCTLOWWAGE':'percent_low_wage_workers',
                        'D1A':'housing_density',
                        'D1B':'population_density',
                        'D1D':'employent_housing_density',
                        'D2A_JPHH':'jobs_per_household',
                        'D4E':'transit_frequency',
                        'D5AR':'jobs_within_45_minutes_auto',
                        'D5BR':'jobs_within_45_minutes_transit',
                        'D5CRI':'regional_centrality'}, axis = 1)
dfWalk.head()

Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,percent_low_wage_workers,TotEmp,housing_density,population_density,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality
0,1500000US481130078254,Dallas County,1,14.0,48,460.0,0.549,0.163121,0.744681,99,...,0.240291,66,6.250422,16.332625,7.147222,0.156028,0.003602,433601,135362,0.785893
1,1500000US481130078252,Dallas County,1,10.833333,48,409.0,0.466,0.0,0.589242,76,...,0.192405,25,3.430799,5.955666,3.640506,0.061125,0.006099,386504,236885,0.700531
2,1500000US481130078253,Dallas County,1,8.333333,48,365.0,0.811,0.057751,0.507599,136,...,0.293737,0,13.843035,27.951553,13.843035,0.0,0.004071,404573,230587,0.733281
3,1500000US481130078241,Dallas County,1,15.666667,48,384.0,0.638,0.0,0.888021,60,...,0.139211,253,3.225246,7.592767,5.350213,0.658854,0.007378,423099,168433,0.766859
4,1500000US481130078242,Dallas County,1,10.166667,48,343.0,0.506,0.014577,0.790087,91,...,0.157168,32,2.305992,6.373413,2.521128,0.093294,0.007036,335700,120826,0.60845


In [21]:
# change numbers to States
dfWalk.loc[dfWalk['state'] == '01', 'state'] = 'Alabama' 
dfWalk.loc[dfWalk['state'] == '02', 'state'] = 'Alaska'
dfWalk.loc[dfWalk['state'] == '04', 'state'] = 'Arizona'
dfWalk.loc[dfWalk['state'] == '05', 'state'] = 'Arkansas'
dfWalk.loc[dfWalk['state'] == '06', 'state'] = 'California'
dfWalk.loc[dfWalk['state'] == '08', 'state'] = 'Colorado'
dfWalk.loc[dfWalk['state'] == '09', 'state'] = 'Connecticut'
dfWalk.loc[dfWalk['state'] == '10', 'state'] = 'Deleware'
dfWalk.loc[dfWalk['state'] == '11', 'state'] = 'District of Columbia'
dfWalk.loc[dfWalk['state'] == '12', 'state'] = 'Florida'
dfWalk.loc[dfWalk['state'] == '13', 'state'] = 'Georgia'
dfWalk.loc[dfWalk['state'] == '15', 'state'] = 'Hawaii'
dfWalk.loc[dfWalk['state'] == '16', 'state'] = 'Idaho'
dfWalk.loc[dfWalk['state'] == '17', 'state'] = 'Illinois'
dfWalk.loc[dfWalk['state'] == '18', 'state'] = 'Indiana'
dfWalk.loc[dfWalk['state'] == '19', 'state'] = 'Iowa'
dfWalk.loc[dfWalk['state'] == '20', 'state'] = 'Kansas'
dfWalk.loc[dfWalk['state'] == '21', 'state'] = 'Kentucky'
dfWalk.loc[dfWalk['state'] == '22', 'state'] = 'Louisiana'
dfWalk.loc[dfWalk['state'] == '23', 'state'] = 'Maine'
dfWalk.loc[dfWalk['state'] == '24', 'state'] = 'Maryland'
dfWalk.loc[dfWalk['state'] == '25', 'state'] = 'Massachusetts'
dfWalk.loc[dfWalk['state'] == '26', 'state'] = 'Michigan'
dfWalk.loc[dfWalk['state'] == '27', 'state'] = 'Minnesota'
dfWalk.loc[dfWalk['state'] == '28', 'state'] = 'Mississippi'
dfWalk.loc[dfWalk['state'] == '29', 'state'] = 'Missouri'
dfWalk.loc[dfWalk['state'] == '30', 'state'] = 'Montana'
dfWalk.loc[dfWalk['state'] == '31', 'state'] = 'Nebraska'
dfWalk.loc[dfWalk['state'] == '32', 'state'] = 'Nevada'
dfWalk.loc[dfWalk['state'] == '33', 'state'] = 'New Hampshire'
dfWalk.loc[dfWalk['state'] == '34', 'state'] = 'New Jersey'
dfWalk.loc[dfWalk['state'] == '35', 'state'] = 'New Mexico'
dfWalk.loc[dfWalk['state'] == '36', 'state'] = 'New York'
dfWalk.loc[dfWalk['state'] == '37', 'state'] = 'North Carolina'
dfWalk.loc[dfWalk['state'] == '38', 'state'] = 'North Dakota'
dfWalk.loc[dfWalk['state'] == '39', 'state'] = 'Ohio'
dfWalk.loc[dfWalk['state'] == '40', 'state'] = 'Oklahoma'
dfWalk.loc[dfWalk['state'] == '41', 'state'] = 'Oregon'
dfWalk.loc[dfWalk['state'] == '42', 'state'] = 'Pennsylvania'
dfWalk.loc[dfWalk['state'] == '44', 'state'] = 'Rhode Island'
dfWalk.loc[dfWalk['state'] == '45', 'state'] = 'South Carolian'
dfWalk.loc[dfWalk['state'] == '46', 'state'] = 'South Dakota'
dfWalk.loc[dfWalk['state'] == '47', 'state'] = 'Tennessee'
dfWalk.loc[dfWalk['state'] == '48', 'state'] = 'Texas'
dfWalk.loc[dfWalk['state'] == '49', 'state'] = 'Utah'
dfWalk.loc[dfWalk['state'] == '50', 'state'] = 'Vermont'
dfWalk.loc[dfWalk['state'] == '51', 'state'] = 'Virginia'
dfWalk.loc[dfWalk['state'] == '53', 'state'] = 'Washington'
dfWalk.loc[dfWalk['state'] == '54', 'state'] = 'West Virginia'
dfWalk.loc[dfWalk['state'] == '55', 'state'] = 'Wisconsin'
dfWalk.loc[dfWalk['state'] == '56', 'state'] = 'Wyoming'
dfWalk.head()

Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,percent_low_wage_workers,TotEmp,housing_density,population_density,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,0.240291,66,6.250422,16.332625,7.147222,0.156028,0.003602,433601,135362,0.785893
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,0.192405,25,3.430799,5.955666,3.640506,0.061125,0.006099,386504,236885,0.700531
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,0.293737,0,13.843035,27.951553,13.843035,0.0,0.004071,404573,230587,0.733281
3,1500000US481130078241,Dallas County,1,15.666667,Texas,384.0,0.638,0.0,0.888021,60,...,0.139211,253,3.225246,7.592767,5.350213,0.658854,0.007378,423099,168433,0.766859
4,1500000US481130078242,Dallas County,1,10.166667,Texas,343.0,0.506,0.014577,0.790087,91,...,0.157168,32,2.305992,6.373413,2.521128,0.093294,0.007036,335700,120826,0.60845


In [23]:
# get race and population
temp = pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/DECENNIALPL2020.P1_2025-04-07T022822/DECENNIALPL2020.P1-Data.csv', header = 1)
temp['percent_non_white'] = 1 - (temp[' !!Total:!!Population of one race:!!White alone'] / temp[' !!Total:'])
temp = temp.rename({' !!Total:': 'Population'}, axis = 1)
temp = temp[['Geography', 'Population', 'percent_non_white']]
temp.head()

Unnamed: 0,Geography,Population,percent_non_white
0,1500000US010010201001,575,0.246957
1,1500000US010010201002,1200,0.203333
2,1500000US010010202001,974,0.680698
3,1500000US010010202002,1081,0.508788
4,1500000US010010203001,2377,0.296592


In [25]:
# merge dfWalk with rent data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(173583, 23)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,housing_density,population_density,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Population,percent_non_white
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,6.250422,16.332625,7.147222,0.156028,0.003602,433601,135362,0.785893,1508,0.259284
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,3.430799,5.955666,3.640506,0.061125,0.006099,386504,236885,0.700531,1027,0.771178
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,13.843035,27.951553,13.843035,0.0,0.004071,404573,230587,0.733281,662,0.853474
3,1500000US481130078241,Dallas County,1,15.666667,Texas,384.0,0.638,0.0,0.888021,60,...,3.225246,7.592767,5.350213,0.658854,0.007378,423099,168433,0.766859,1969,0.120874
4,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,10.969254,26.357776,11.028441,0.006036,0.002246,402287,138562,0.729137,1135,0.698678


In [27]:
# get median rent
temp = pd.read_csv('Median_Rent/Median_Rent.csv', header = 1)
temp = temp.rename({'Estimate!!Median contract rent': 'Median_Contract_Rent'}, axis = 1)
temp = temp[['Geography', 'Median_Contract_Rent']]

# remove observations without numbers
temp = temp[temp['Median_Contract_Rent'] != '-']
temp.head()

Unnamed: 0,Geography,Median_Contract_Rent
0,1500000US010010201001,579
1,1500000US010010201002,482
2,1500000US010010202001,1297
3,1500000US010010202002,679
4,1500000US010010203001,707


In [29]:
# merge dfWalk with rent data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(131048, 24)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,population_density,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Population,percent_non_white,Median_Contract_Rent
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,16.332625,7.147222,0.156028,0.003602,433601,135362,0.785893,1508,0.259284,942
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,5.955666,3.640506,0.061125,0.006099,386504,236885,0.700531,1027,0.771178,770
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,27.951553,13.843035,0.0,0.004071,404573,230587,0.733281,662,0.853474,806
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,26.357776,11.028441,0.006036,0.002246,402287,138562,0.729137,1135,0.698678,1075
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,7.680394,2.252783,0.002717,0.002596,263813,8873,0.478156,1367,0.734455,917


In [31]:
# get average household size
temp = pd.read_csv('Avg_Household_Size/Avg_Household_Size.csv', header = 1)
temp = temp.rename({'Estimate!!Average household size --!!Total:!!Renter occupied': 'avg_HH_size_renters'}, axis = 1)
temp = temp[['Geography', 'avg_HH_size_renters']]

# remove observations without values
temp = temp[temp['avg_HH_size_renters'] != '-']
temp.head()

Unnamed: 0,Geography,avg_HH_size_renters
0,1500000US010010201001,2.06
1,1500000US010010201002,2.05
2,1500000US010010202001,2.55
3,1500000US010010202002,3.37
4,1500000US010010203001,2.21


In [33]:
# merge dfWalk with household data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(130349, 25)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,employent_housing_density,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Population,percent_non_white,Median_Contract_Rent,avg_HH_size_renters
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,7.147222,0.156028,0.003602,433601,135362,0.785893,1508,0.259284,942,2.42
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,3.640506,0.061125,0.006099,386504,236885,0.700531,1027,0.771178,770,1.72
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,13.843035,0.0,0.004071,404573,230587,0.733281,662,0.853474,806,2.23
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,11.028441,0.006036,0.002246,402287,138562,0.729137,1135,0.698678,1075,2.27
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,2.252783,0.002717,0.002596,263813,8873,0.478156,1367,0.734455,917,4.1


In [35]:
# read in income inequality data (GINI Index)
temp = pd.read_csv('GINI_Index_2013/GINI_Index_2013.csv', header = 1)
temp = temp.rename({'Estimate!!Gini Index': 'GINI_Index'}, axis = 1)
temp = temp[['Geography', 'GINI_Index']]

#remove observations without numbers
temp = temp[temp['GINI_Index'] != '-']
temp.head()

Unnamed: 0,Geography,GINI_Index
0,1500000US010010201001,0.3386
1,1500000US010010201002,0.4121
2,1500000US010010202001,0.4151
3,1500000US010010202002,0.3972
4,1500000US010010203001,0.3981


In [37]:
# merge dfWalk with GINI Index data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(130237, 26)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,jobs_per_household,transit_frequency,jobs_within_45_minutes_auto,jobs_within_45_minutes_transit,regional_centrality,Population,percent_non_white,Median_Contract_Rent,avg_HH_size_renters,GINI_Index
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,0.156028,0.003602,433601,135362,0.785893,1508,0.259284,942,2.42,0.5129
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,0.061125,0.006099,386504,236885,0.700531,1027,0.771178,770,1.72,0.3731
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,0.0,0.004071,404573,230587,0.733281,662,0.853474,806,2.23,0.2593
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,0.006036,0.002246,402287,138562,0.729137,1135,0.698678,1075,2.27,0.4482
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,0.002717,0.002596,263813,8873,0.478156,1367,0.734455,917,4.1,0.3393


In [39]:
# read in household heating data
temp = pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Household_Heating/Household_Heating.csv', header = 1)

# get percentage of different heating methods
temp['percent_gas_energy'] = temp['Estimate!!Total:!!Utility gas'] / temp['Estimate!!Total:']
temp['percent_electric_energy'] = temp['Estimate!!Total:!!Electricity'] / temp['Estimate!!Total:']
temp['percent_solar_energy'] = temp['Estimate!!Total:!!Solar energy'] / temp['Estimate!!Total:']
temp['percent_no_heating'] = temp['Estimate!!Total:!!No fuel used'] / temp['Estimate!!Total:']
temp = temp[['Geography', 'percent_gas_energy', 'percent_electric_energy', 'percent_solar_energy', 'percent_no_heating']]
temp.head()

Unnamed: 0,Geography,percent_gas_energy,percent_electric_energy,percent_solar_energy,percent_no_heating
0,1500000US010010201001,0.337302,0.603175,0.0,0.0
1,1500000US010010201002,0.492997,0.473389,0.0,0.0
2,1500000US010010202001,0.305147,0.694853,0.0,0.0
3,1500000US010010202002,0.716463,0.262195,0.0,0.0
4,1500000US010010203001,0.646251,0.281943,0.0,0.002112


In [41]:
# merge dfWalk with household heating data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(130237, 30)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,regional_centrality,Population,percent_non_white,Median_Contract_Rent,avg_HH_size_renters,GINI_Index,percent_gas_energy,percent_electric_energy,percent_solar_energy,percent_no_heating
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,0.785893,1508,0.259284,942,2.42,0.5129,0.48533,0.51467,0.0,0.0
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,0.700531,1027,0.771178,770,1.72,0.3731,0.07377,0.840164,0.0,0.0
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,0.733281,662,0.853474,806,2.23,0.2593,0.191686,0.808314,0.0,0.0
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,0.729137,1135,0.698678,1075,2.27,0.4482,0.0,1.0,0.0,0.0
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,0.478156,1367,0.734455,917,4.1,0.3393,0.86236,0.02809,0.0,0.0


In [43]:
# get internet access data and find the percent that have no internet access
temp = pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Internet_Access/Internet_Access.csv', header = 1)
temp['percent_no_internet'] = temp['Estimate!!Total:!!No Internet access'] / temp['Estimate!!Total:']
temp = temp[['Geography', 'percent_no_internet']]
temp.head()

Unnamed: 0,Geography,percent_no_internet
0,1500000US010010201001,0.206349
1,1500000US010010201002,0.10084
2,1500000US010010202001,0.136029
3,1500000US010010202002,0.234756
4,1500000US010010203001,0.089757


In [45]:
# merge dfWalk with internet access data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(130237, 31)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,Population,percent_non_white,Median_Contract_Rent,avg_HH_size_renters,GINI_Index,percent_gas_energy,percent_electric_energy,percent_solar_energy,percent_no_heating,percent_no_internet
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,1508,0.259284,942,2.42,0.5129,0.48533,0.51467,0.0,0.0,0.095355
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,1027,0.771178,770,1.72,0.3731,0.07377,0.840164,0.0,0.0,0.114754
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,662,0.853474,806,2.23,0.2593,0.191686,0.808314,0.0,0.0,0.189376
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,1135,0.698678,1075,2.27,0.4482,0.0,1.0,0.0,0.0,0.098501
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,1367,0.734455,917,4.1,0.3393,0.86236,0.02809,0.0,0.0,0.078652


In [49]:
# read in kitchen data
temp = pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Kitchen_Tenure/Kitchen_Tenure.csv', header = 1)

# get percentage that are lacking kitchen utilities
temp['percent_lacking_kitchen_renter'] = temp['Estimate!!Total:!!Renter occupied:!!Lacking complete kitchen facilities'] / temp['Estimate!!Total:!!Renter occupied:']
temp = temp[['Geography', 'percent_lacking_kitchen_renter']]
temp.head()

Unnamed: 0,Geography,percent_lacking_kitchen_renter
0,1500000US010010201001,0.0
1,1500000US010010201002,0.0
2,1500000US010010202001,0.0
3,1500000US010010202002,0.0
4,1500000US010010203001,0.0


In [51]:
# merge dfWalk with kitchen data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(130237, 32)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,percent_non_white,Median_Contract_Rent,avg_HH_size_renters,GINI_Index,percent_gas_energy,percent_electric_energy,percent_solar_energy,percent_no_heating,percent_no_internet,percent_lacking_kitchen_renter
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,0.259284,942,2.42,0.5129,0.48533,0.51467,0.0,0.0,0.095355,0.0
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,0.771178,770,1.72,0.3731,0.07377,0.840164,0.0,0.0,0.114754,0.0
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,0.853474,806,2.23,0.2593,0.191686,0.808314,0.0,0.0,0.189376,0.0
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,0.698678,1075,2.27,0.4482,0.0,1.0,0.0,0.0,0.098501,0.0
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,0.734455,917,4.1,0.3393,0.86236,0.02809,0.0,0.0,0.078652,0.0


In [53]:
# read in living arrangement data
temp = pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Living_Arrangements/Living_Arrangements.csv', header = 1)

# get percentage that live alone
temp['percent_lives_alone'] = temp['Estimate!!Total:!!Lives alone'] / temp['Estimate!!Total:']
temp = temp[['Geography', 'percent_lives_alone']]
temp.head()

Unnamed: 0,Geography,percent_lives_alone
0,1500000US010010201001,0.132404
1,1500000US010010201002,0.051345
2,1500000US010010202001,0.09736
3,1500000US010010202002,0.114086
4,1500000US010010203001,0.074187


In [55]:
# merge dfWalk with living alone data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(130237, 33)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,Median_Contract_Rent,avg_HH_size_renters,GINI_Index,percent_gas_energy,percent_electric_energy,percent_solar_energy,percent_no_heating,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,942,2.42,0.5129,0.48533,0.51467,0.0,0.0,0.095355,0.0,0.238619
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,770,1.72,0.3731,0.07377,0.840164,0.0,0.0,0.114754,0.0,0.42228
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,806,2.23,0.2593,0.191686,0.808314,0.0,0.0,0.189376,0.0,0.295714
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,1075,2.27,0.4482,0.0,1.0,0.0,0.0,0.098501,0.0,0.328788
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,917,4.1,0.3393,0.86236,0.02809,0.0,0.0,0.078652,0.0,0.008937


In [57]:
# get median household income data
temp = pd.read_csv('Median_Household_Income/Median_Household_Income.csv', header = 1)
temp = temp.rename({'Estimate!!Median household income in the past 12 months (in 2021 inflation-adjusted dollars)': 'Median_Household_Income'}, axis = 1)
temp = temp[['Geography', 'Median_Household_Income']]

# remove observations without numbers
temp = temp[temp['Median_Household_Income'] != '-']
temp.head()

Unnamed: 0,Geography,Median_Household_Income
0,1500000US010010201001,41607
1,1500000US010010201002,66313
2,1500000US010010202001,42288
3,1500000US010010202002,52609
4,1500000US010010203001,75074


In [59]:
# merge dfWalk with median household income data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(124477, 34)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,avg_HH_size_renters,GINI_Index,percent_gas_energy,percent_electric_energy,percent_solar_energy,percent_no_heating,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone,Median_Household_Income
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,2.42,0.5129,0.48533,0.51467,0.0,0.0,0.095355,0.0,0.238619,40714
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,1.72,0.3731,0.07377,0.840164,0.0,0.0,0.114754,0.0,0.42228,67714
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,2.23,0.2593,0.191686,0.808314,0.0,0.0,0.189376,0.0,0.295714,49741
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,2.27,0.4482,0.0,1.0,0.0,0.0,0.098501,0.0,0.328788,36135
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,4.1,0.3393,0.86236,0.02809,0.0,0.0,0.078652,0.0,0.008937,59781


In [61]:
# get median rooms
temp = pd.read_csv('Median_Rooms/Median_Rooms.csv', header = 1)
temp = temp.rename({'Estimate!!Median number of rooms --!!Total:!!Renter occupied': 'Median_Num_Rooms'}, axis = 1)
temp = temp[['Geography', 'Median_Num_Rooms']]

# remove observations without numbers
temp = temp[temp['Median_Num_Rooms'] != '-']
temp.head()

Unnamed: 0,Geography,Median_Num_Rooms
0,1500000US010010201001,4.3
1,1500000US010010201002,4.2
2,1500000US010010202001,5.1
3,1500000US010010202002,5.4
4,1500000US010010203001,6.1


In [63]:
# merge dfWalk with rooms data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(124058, 35)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,GINI_Index,percent_gas_energy,percent_electric_energy,percent_solar_energy,percent_no_heating,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone,Median_Household_Income,Median_Num_Rooms
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,0.5129,0.48533,0.51467,0.0,0.0,0.095355,0.0,0.238619,40714,4.9
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,0.3731,0.07377,0.840164,0.0,0.0,0.114754,0.0,0.42228,67714,2.9
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,0.2593,0.191686,0.808314,0.0,0.0,0.189376,0.0,0.295714,49741,4.1
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,0.4482,0.0,1.0,0.0,0.0,0.098501,0.0,0.328788,36135,4.1
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,0.3393,0.86236,0.02809,0.0,0.0,0.078652,0.0,0.008937,59781,5.8


In [65]:
# get median year built data
temp = pd.read_csv('Median_Structure_Tenure/Median_Structure_Tenure.csv', header = 1)
temp = temp.rename({'Estimate!!Median year structure built --!!Renter occupied': 'Median_Year_Structure_Built'}, axis = 1)
temp = temp[['Geography', 'Median_Year_Structure_Built']]

#remove observations without numbers
temp = temp[temp['Median_Year_Structure_Built'] != '-']
temp.head()

Unnamed: 0,Geography,Median_Year_Structure_Built
0,1500000US010010201001,1965
1,1500000US010010201002,1959
2,1500000US010010202001,1993
3,1500000US010010202002,1968
4,1500000US010010203001,1975


In [67]:
# merge dfWalk with year data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(123067, 36)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,percent_gas_energy,percent_electric_energy,percent_solar_energy,percent_no_heating,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone,Median_Household_Income,Median_Num_Rooms,Median_Year_Structure_Built
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,0.48533,0.51467,0.0,0.0,0.095355,0.0,0.238619,40714,4.9,1977
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,0.07377,0.840164,0.0,0.0,0.114754,0.0,0.42228,67714,2.9,1977
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,0.191686,0.808314,0.0,0.0,0.189376,0.0,0.295714,49741,4.1,1982
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,0.0,1.0,0.0,0.0,0.098501,0.0,0.328788,36135,4.1,1985
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,0.86236,0.02809,0.0,0.0,0.078652,0.0,0.008937,59781,5.8,1955


In [69]:
# get plumbing facilities data
temp = pd.read_csv('/Users/austincoffelt/Downloads/Rent-Walk-local/Plumbing_Facilities_Tenure/Plumbing_Facilities_Tenure.csv', header = 1)

# get percentage with lackluster facilities
temp['percent_lacking_plumbing_renter'] = temp['Estimate!!Total:!!Renter occupied:!!Lacking plumbing facilities'] / temp['Estimate!!Total:!!Renter occupied:']
temp = temp[['Geography', 'percent_lacking_plumbing_renter']]
temp.head()

Unnamed: 0,Geography,percent_lacking_plumbing_renter
0,1500000US010010201001,0.0
1,1500000US010010201002,0.0
2,1500000US010010202001,0.0
3,1500000US010010202002,0.0
4,1500000US010010203001,0.0


In [71]:
# merge dfWalk with plumbing data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'Geography')
dfWalk = dfWalk.drop('Geography', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(123067, 37)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,percent_electric_energy,percent_solar_energy,percent_no_heating,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone,Median_Household_Income,Median_Num_Rooms,Median_Year_Structure_Built,percent_lacking_plumbing_renter
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,0.51467,0.0,0.0,0.095355,0.0,0.238619,40714,4.9,1977,0.0
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,0.840164,0.0,0.0,0.114754,0.0,0.42228,67714,2.9,1977,0.0
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,0.808314,0.0,0.0,0.189376,0.0,0.295714,49741,4.1,1982,0.0
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,1.0,0.0,0.0,0.098501,0.0,0.328788,36135,4.1,1985,0.0
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,0.02809,0.0,0.0,0.078652,0.0,0.008937,59781,5.8,1955,0.0


In [73]:
# get exam score data
temp = pd.read_csv('Block_Group_Exam_Score.csv')
temp.head()

Unnamed: 0,AFFGEOID,weight_avg_math_scores,weight_avg_ELA_scores
0,1500000US010010201001,0.236767,0.499152
1,1500000US010010201002,0.236767,0.499152
2,1500000US010010202001,0.236767,0.499152
3,1500000US010010202002,0.236767,0.499152
4,1500000US010010203001,0.236767,0.499152


In [75]:
# merge dfWalk with exam data on GEOID
dfWalk = dfWalk.merge(temp, how = 'inner', left_on = 'realGEOID', right_on = 'AFFGEOID')
dfWalk = dfWalk.drop('AFFGEOID', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(98596, 39)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,percent_no_heating,percent_no_internet,percent_lacking_kitchen_renter,percent_lives_alone,Median_Household_Income,Median_Num_Rooms,Median_Year_Structure_Built,percent_lacking_plumbing_renter,weight_avg_math_scores,weight_avg_ELA_scores
0,1500000US481130078254,Dallas County,1,14.0,Texas,460.0,0.549,0.163121,0.744681,99,...,0.0,0.095355,0.0,0.238619,40714,4.9,1977,0.0,0.403983,0.439323
1,1500000US481130078252,Dallas County,1,10.833333,Texas,409.0,0.466,0.0,0.589242,76,...,0.0,0.114754,0.0,0.42228,67714,2.9,1977,0.0,0.403983,0.439323
2,1500000US481130078253,Dallas County,1,8.333333,Texas,365.0,0.811,0.057751,0.507599,136,...,0.0,0.189376,0.0,0.295714,49741,4.1,1982,0.0,0.403983,0.439323
3,1500000US481130078271,Dallas County,1,6.833333,Texas,556.0,0.588,0.066398,0.227364,143,...,0.0,0.098501,0.0,0.328788,36135,4.1,1985,0.0,0.403983,0.439323
4,1500000US481130093012,Dallas County,1,8.0,Texas,451.0,0.644,0.029891,0.915761,117,...,0.0,0.078652,0.0,0.008937,59781,5.8,1955,0.0,0.315137,0.359102


In [77]:
# get crime data
laCrime = pd.read_csv('LA_Violent_Crime_BG.csv')
noCrime = pd.read_csv('NO_Violent_Crime_BG.csv')
auCrime = pd.read_csv('AustinTX_violent_crime_by_BG.csv')
chiCrime = pd.read_csv('Chicago_Violent_Crime_BG.csv')
montCrime = pd.read_csv('MontMD_violent_crime_by_BG.csv')
crime = pd.concat([laCrime, noCrime, auCrime, chiCrime, montCrime])
print(crime.shape)
crime.head()

(4745, 2)


Unnamed: 0,AFFGEOID,count
0,1500000US060372077111,160
1,1500000US060372063011,155
2,1500000US060372063031,148
3,1500000US060372088011,114
4,1500000US060372260021,113


In [79]:
# merge dfWalk with crime data on GEOID
dfWalk = dfWalk.merge(crime, how = 'inner', left_on = 'realGEOID', right_on = 'AFFGEOID')
dfWalk['violent_crime_rate'] = (dfWalk['count'] / dfWalk['Population']) * 100000
dfWalk = dfWalk.drop('AFFGEOID', axis = 1)
print(dfWalk.shape)
dfWalk.head()

(3085, 41)


Unnamed: 0,realGEOID,County,Intercept,Walk_Index,state,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,...,percent_lacking_kitchen_renter,percent_lives_alone,Median_Household_Income,Median_Num_Rooms,Median_Year_Structure_Built,percent_lacking_plumbing_renter,weight_avg_math_scores,weight_avg_ELA_scores,count,violent_crime_rate
0,1500000US484530023144,Travis County,1,11.0,Texas,702.0,0.811,0.234168,0.309278,186,...,0.0,0.367606,67978,3.1,1995,0.0,0.187373,0.253501,16,739.030023
1,1500000US484530002041,Travis County,1,18.333333,Texas,795.0,0.822,0.231132,0.34434,126,...,0.0,0.235353,46875,3.2,1970,0.0,0.330847,0.439981,1,81.566069
2,1500000US484530015032,Travis County,1,18.666667,Texas,358.0,0.974,0.162264,0.532075,64,...,0.0,0.159555,62935,4.2,1967,0.0,0.330847,0.439981,2,361.663653
3,1500000US484910205081,Williamson County,1,5.5,Texas,2051.0,0.726,0.024313,0.357294,376,...,0.0,0.347867,63713,3.4,2003,0.0,0.505469,0.581849,5,160.616768
4,1500000US484910204051,Williamson County,1,13.166667,Texas,450.0,0.602,0.0,0.321168,109,...,0.0,0.101083,69710,3.8,1983,0.0,0.505469,0.581849,3,424.328147


In [81]:
# get dummies for the states
stateDummies = pd.get_dummies(dfWalk.loc[:, 'state'] , drop_first = False, dtype = 'int')

# concatinate with our dataframe
dfWalk = pd.concat([dfWalk, stateDummies], axis = 1)

# drop state and california for multicoliniarity issues
dfWalk = dfWalk.drop(['state', 'California'], axis = 1)
dfWalk.head()

Unnamed: 0,realGEOID,County,Intercept,Walk_Index,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,...,Median_Year_Structure_Built,percent_lacking_plumbing_renter,weight_avg_math_scores,weight_avg_ELA_scores,count,violent_crime_rate,Illinois,Louisiana,Maryland,Texas
0,1500000US484530023144,Travis County,1,11.0,702.0,0.811,0.234168,0.309278,186,257,...,1995,0.0,0.187373,0.253501,16,739.030023,0,0,0,1
1,1500000US484530002041,Travis County,1,18.333333,795.0,0.822,0.231132,0.34434,126,418,...,1970,0.0,0.330847,0.439981,1,81.566069,0,0,0,1
2,1500000US484530015032,Travis County,1,18.666667,358.0,0.974,0.162264,0.532075,64,137,...,1967,0.0,0.330847,0.439981,2,361.663653,0,0,0,1
3,1500000US484910205081,Williamson County,1,5.5,2051.0,0.726,0.024313,0.357294,376,1225,...,2003,0.0,0.505469,0.581849,5,160.616768,0,0,0,1
4,1500000US484910204051,Williamson County,1,13.166667,450.0,0.602,0.0,0.321168,109,217,...,1983,0.0,0.505469,0.581849,3,424.328147,0,0,0,1


In [83]:
# remove observations on the borders for precise measurements
dfWalk = dfWalk[dfWalk['Median_Household_Income'] != '250,000+']
dfWalk = dfWalk[dfWalk['Median_Household_Income'] != '2,500-']
dfWalk = dfWalk[dfWalk['Median_Contract_Rent'] != '3,500+']
dfWalk = dfWalk[dfWalk['Median_Contract_Rent'] != '100-']

# log data for income and rent
dfWalk['log_Median_Household_Income'] = np.log(dfWalk['Median_Household_Income'].astype('int'))
dfWalk['log_Median_Contract_Rent'] = np.log(dfWalk['Median_Contract_Rent'].astype('int'))
dfWalk = dfWalk.drop(['realGEOID', 'County', 'Median_Household_Income', 'Median_Contract_Rent'], axis = 1)
dfWalk.head()

Unnamed: 0,Intercept,Walk_Index,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,TotEmp,...,weight_avg_math_scores,weight_avg_ELA_scores,count,violent_crime_rate,Illinois,Louisiana,Maryland,Texas,log_Median_Household_Income,log_Median_Contract_Rent
0,1,11.0,702.0,0.811,0.234168,0.309278,186,257,0.248663,24,...,0.187373,0.253501,16,739.030023,0,0,0,1,11.126939,7.149917
1,1,18.333333,795.0,0.822,0.231132,0.34434,126,418,0.171896,2237,...,0.330847,0.439981,1,81.566069,0,0,0,1,10.75524,7.075809
2,1,18.666667,358.0,0.974,0.162264,0.532075,64,137,0.211221,886,...,0.330847,0.439981,2,361.663653,0,0,0,1,11.049858,6.914731
3,1,5.5,2051.0,0.726,0.024313,0.357294,376,1225,0.169293,1297,...,0.505469,0.581849,5,160.616768,0,0,0,1,11.062144,7.060476
4,1,13.166667,450.0,0.602,0.0,0.321168,109,217,0.217131,801,...,0.505469,0.581849,3,424.328147,0,0,0,1,11.152099,7.057037


In [85]:
# remove observations on the borders for precise measurements and change type to float
dfWalk['avg_HH_size_renters'] = dfWalk['avg_HH_size_renters'].astype('float')
dfWalk['GINI_Index'] = dfWalk['GINI_Index'].astype('float')
dfWalk = dfWalk[dfWalk['Median_Num_Rooms'] != '9.0+']
dfWalk['Median_Num_Rooms'] = dfWalk['Median_Num_Rooms'].astype('float')
dfWalk = dfWalk[dfWalk['Median_Year_Structure_Built'] != '1939-']

# for 2020+ put 2021 since that is the year of the data
dfWalk.loc[dfWalk['Median_Year_Structure_Built'] == '2020+', 'Median_Year_Structure_Built'] = 2021
dfWalk['Median_Year_Structure_Built'] = dfWalk['Median_Year_Structure_Built'].astype('int')


dfWalk.shape

(2518, 42)

In [87]:
# separate into X and Y
y = dfWalk['log_Median_Contract_Rent']
X = dfWalk.drop(['Intercept', 'log_Median_Contract_Rent', axis = 1)

# get continuous columns
contCols = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 40]

# Standardize features 
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled.iloc[:, contCols] = scaler.fit_transform(X.iloc[:, contCols])

X_scaled.head()

Unnamed: 0,Intercept,Walk_Index,count_housing_units,percentage_work_age,percent_no_car,percent_two_car,count_low_wage_workers,count_high_wage_workers,percent_low_wage_workers,TotEmp,...,percent_lacking_plumbing_renter,weight_avg_math_scores,weight_avg_ELA_scores,count,violent_crime_rate,Illinois,Louisiana,Maryland,Texas,log_Median_Household_Income
0,1,-1.386585,0.27058,1.687035,0.471961,-0.652652,0.340505,-0.090311,0.032557,-0.133464,...,-0.201872,-1.203018,-1.469144,0.904923,0.077283,0,0,0,1,0.174358
1,1,1.559254,0.555829,1.800939,0.452086,-0.482251,-0.418629,0.716296,-1.368974,0.209071,...,-0.201872,0.921168,-0.091778,-0.669926,-0.231152,0,0,0,1,-0.512601
2,1,1.693156,-0.784536,3.374889,0.001223,0.430165,-1.203066,-0.691508,-0.651021,-4.1e-05,...,-0.201872,0.921168,-0.091778,-0.564936,-0.09975,0,0,0,1,0.031899
3,1,-3.595965,4.408229,0.806866,-0.901913,-0.419292,2.744426,4.759348,-1.416501,0.063575,...,-0.201872,3.506498,0.956077,-0.249966,-0.194067,0,0,0,1,0.054606
4,1,-0.516224,-0.502354,-0.477145,-1.061084,-0.594868,-0.633716,-0.29071,-0.543116,-0.013198,...,-0.201872,3.506498,0.956077,-0.459946,-0.070353,0,0,0,1,0.220857


In [89]:
# use lasso to find best variables to use
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_scaled, y)

# Get the coefficients and identify non-zero features
coef = pd.Series(lasso.coef_, index=X.columns)
selected_features = coef[coef != 0].index.tolist()

print("Selected features:", selected_features)
print("Features eliminated:", set(X.columns) - set(selected_features))

Selected features: ['Walk_Index', 'percentage_work_age', 'percent_no_car', 'count_high_wage_workers', 'housing_density', 'employent_housing_density', 'jobs_per_household', 'jobs_within_45_minutes_transit', 'regional_centrality', 'percent_non_white', 'GINI_Index', 'percent_no_heating', 'percent_no_internet', 'Median_Num_Rooms', 'Median_Year_Structure_Built', 'weight_avg_ELA_scores', 'Texas', 'log_Median_Household_Income']
Features eliminated: {'population_density', 'TotEmp', 'weight_avg_math_scores', 'Population', 'percent_lives_alone', 'Illinois', 'percent_gas_energy', 'Maryland', 'percent_solar_energy', 'Louisiana', 'count_low_wage_workers', 'count_housing_units', 'percent_electric_energy', 'percent_low_wage_workers', 'Intercept', 'percent_lacking_kitchen_renter', 'transit_frequency', 'percent_two_car', 'count', 'jobs_within_45_minutes_auto', 'avg_HH_size_renters', 'violent_crime_rate', 'percent_lacking_plumbing_renter'}


In [595]:
mod = sm.OLS(dfWalk['log_Median_Contract_Rent'], dfWalk.drop(['log_Median_Contract_Rent'], axis = 1))
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,log_Median_Contract_Rent,R-squared:,0.699
Model:,OLS,Adj. R-squared:,0.695
Method:,Least Squares,F-statistic:,170.5
Date:,"Mon, 07 Apr 2025",Prob (F-statistic):,0.0
Time:,16:07:36,Log-Likelihood:,320.48
No. Observations:,2972,AIC:,-559.0
Df Residuals:,2931,BIC:,-313.1
Df Model:,40,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.1408,0.641,-0.220,0.826,-1.398,1.116
Walk_Index,0.0021,0.002,1.040,0.298,-0.002,0.006
count_housing_units,4.545e-05,3.02e-05,1.506,0.132,-1.37e-05,0.000
percentage_work_age,0.4181,0.049,8.612,0.000,0.323,0.513
percent_no_car,-0.1778,0.047,-3.792,0.000,-0.270,-0.086
percent_two_car,-0.1373,0.038,-3.594,0.000,-0.212,-0.062
count_low_wage_workers,-0.0005,0.000,-4.422,0.000,-0.001,-0.000
count_high_wage_workers,0.0002,5.12e-05,3.651,0.000,8.66e-05,0.000
percent_low_wage_workers,0.1903,0.137,1.385,0.166,-0.079,0.460

0,1,2,3
Omnibus:,702.547,Durbin-Watson:,1.934
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3609.822
Skew:,-1.03,Prob(JB):,0.0
Kurtosis:,7.991,Cond. No.,96400000.0


pcas for edu
or add all 3 for edu
median high v low for crime
county fixed effects
control for observatbles
instrument 