In [27]:
import pandas as pd

In [28]:
# Load the raw EPA Smart Location Database extract into a DataFrame.
# The pure-Python parser is requested explicitly via `engine`.
raw_csv_path = "../../data/raw/EPA_SmartLocationDatabase_V3_Jan_2021_Final.csv"
df = pd.read_csv(raw_csv_path, engine="python")
df

Unnamed: 0,STATEFP,COUNTYFP,TotPop,NatWalkInd
0,48,113,1202,14.000000
1,48,113,710,10.833333
2,48,113,737,8.333333
3,48,113,904,15.666667
4,48,113,948,10.166667
...,...,...,...,...
220735,78,30,1471,7.333333
220736,78,30,940,7.333333
220737,78,30,878,7.333333
220738,78,30,1278,4.000000


In [29]:
# Limit to the necessary columns.
# Take an explicit copy so the subset is an independent frame: assigning new
# columns to a bare column selection can raise pandas' SettingWithCopyWarning.
df = df[["STATEFP", "COUNTYFP", "TotPop", "NatWalkInd"]].copy()
df

Unnamed: 0,STATEFP,COUNTYFP,TotPop,NatWalkInd
0,48,113,1202,14.000000
1,48,113,710,10.833333
2,48,113,737,8.333333
3,48,113,904,15.666667
4,48,113,948,10.166667
...,...,...,...,...
220735,78,30,1471,7.333333
220736,78,30,940,7.333333
220737,78,30,878,7.333333
220738,78,30,1278,4.000000


In [30]:
# Combine the state and county codes into a single integer county key:
# STATEFP * 1000 + COUNTYFP (e.g. 48 and 113 -> 48113).
# `assign` returns a new frame instead of writing into `df` in place, which avoids
# SettingWithCopyWarning when `df` is a column selection of the raw data.
df = df.assign(fips=df["STATEFP"] * 1000 + df["COUNTYFP"])
df

Unnamed: 0,STATEFP,COUNTYFP,TotPop,NatWalkInd,fips
0,48,113,1202,14.000000,48113
1,48,113,710,10.833333,48113
2,48,113,737,8.333333,48113
3,48,113,904,15.666667,48113
4,48,113,948,10.166667,48113
...,...,...,...,...,...
220735,78,30,1471,7.333333,78030
220736,78,30,940,7.333333,78030
220737,78,30,878,7.333333,78030
220738,78,30,1278,4.000000,78030


In [31]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how="any")
df

Unnamed: 0,STATEFP,COUNTYFP,TotPop,NatWalkInd,fips
0,48,113,1202,14.000000,48113
1,48,113,710,10.833333,48113
2,48,113,737,8.333333,48113
3,48,113,904,15.666667,48113
4,48,113,948,10.166667,48113
...,...,...,...,...,...
220735,78,30,1471,7.333333,78030
220736,78,30,940,7.333333,78030
220737,78,30,878,7.333333,78030
220738,78,30,1278,4.000000,78030


In [32]:
# Find total population of each county.
# Select TotPop before aggregating so only that column is summed, rather than
# summing every column and discarding all but one afterwards.
pop = (
    df.groupby("fips")["TotPop"]
    .sum()
    .rename("CountyPop")
    .reset_index()
)
pop

Unnamed: 0,fips,CountyPop
0,1001,55200
1,1003,208107
2,1005,25782
3,1007,22527
4,1009,57645
...,...,...
3228,72151,34149
3229,72153,36439
3230,78010,50601
3231,78020,4170


In [33]:
# Attach each county's total population to its rows, then weight every row's
# walkability score by that row's share of the county population.
county_totals = pop.set_index("fips")
df = df.set_index("fips").join(county_totals, how="left").reset_index()
df = df.assign(
    population_share=lambda frame: frame["TotPop"] / frame["CountyPop"],
    walkability_index=lambda frame: frame["population_share"] * frame["NatWalkInd"],
)
df

Unnamed: 0,fips,STATEFP,COUNTYFP,TotPop,NatWalkInd,CountyPop,population_share,walkability_index
0,1001,1,1,634,3.166667,55200,0.011486,0.036371
1,1001,1,1,2224,6.833333,55200,0.040290,0.275314
2,1001,1,1,592,1.333333,55200,0.010725,0.014300
3,1001,1,1,3099,3.500000,55200,0.056141,0.196495
4,1001,1,1,1345,1.500000,55200,0.024366,0.036549
...,...,...,...,...,...,...,...,...
220735,78030,78,30,1471,7.333333,51634,0.028489,0.208919
220736,78030,78,30,940,7.333333,51634,0.018205,0.133504
220737,78030,78,30,878,7.333333,51634,0.017004,0.124698
220738,78030,78,30,1278,4.000000,51634,0.024751,0.099005


In [34]:
# Group by fips code to find the weighted average for each county.
# The column is selected explicitly before aggregating: the first positional
# argument of GroupBy.sum() is `numeric_only`, not a column name, so
# `.sum("walkability_index")` would sum every column rather than just this one.
# NOTE(review): sum() skips NaN, so a county whose rows all have a NaN
# walkability_index (e.g. a 0 / 0 population share) comes out as 0.0, not NaN.
df = df.groupby("fips")["walkability_index"].sum().reset_index()
df

Unnamed: 0,fips,walkability_index
0,1001,5.483179
1,1003,6.812530
2,1005,5.156175
3,1007,5.241296
4,1009,4.674447
...,...,...
3228,72151,3.385946
3229,72153,4.541645
3230,78010,4.280686
3231,78020,4.179696


In [35]:
# Persist the cleaned frame as CSV, leaving the row index out of the file
df.to_csv(path_or_buf="../../data/processed/walkability.csv", index=False)