In [1]:
pip install census

Note: you may need to restart the kernel to use updated packages.


In [2]:
%matplotlib inline

In [3]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census
import os
import csv

# File path
# Relative location of the cleaned ATM CSV (presumably Visa ATM data, judging only
# by the file name). NOTE(review): no other cell visible here reads `atm_data`.
atm_data = "../Desktop/visa_atm_cleaned.csv"

# Census API Key
# The key is imported from a local `config` module rather than written into the notebook.
from config import census_api
# Census client created with year=2017; the queries below go through this object.
c = Census(census_api, year=2017)

In [4]:
# Query the 2017 ACS 5-year tables for every zip code tabulation area (ZCTA).
# Library documentation: https://github.com/CommerceDataService/census-wrapper
# Variable labels:       https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b

# Variable codes to request, kept in the same order as before.
acs_variables = (
    "NAME",
    "B19013_001E", "B01003_001E", "B01002_001E", "B19301_001E",
    "B23025_002E", "B23025_007E",
    "B17001A_002E", "B17001B_002E", "B17001D_002E", "B17001I_002E",
    "B02001_002E", "B02001_003E", "B02001_005E", "B03001_003E",
    "B25058_001E", "B25064_001E", "B25077_001E", "B25088_002E",
    "B15003_002E", "B15003_017E", "B15003_018E", "B15003_021E",
    "B15003_022E", "B15003_023E", "B15003_024E", "B15003_025E",
    "B17001_002E",
)

# Geography filter: the wildcard asks for all ZCTAs in a single request.
all_zctas = {'for': 'zip code tabulation area:*'}

census_data = c.acs5.get(acs_variables, all_zctas)


In [5]:
# Build a DataFrame from the per-ZCTA records returned by the Census API
census_pd = pd.DataFrame(census_data)

# TODO: find county code for column or state column and add below

# Rename the raw ACS variable codes to readable column names.
# Every code requested from the API needs an entry here: the column selection at
# the end of this cell raises a KeyError for any readable name that was never created.
# NOTE(review): some labels look looser than the ACS definitions and should be
# confirmed against the variable list -- B23025_002E/_007E appear to be
# "In labor force"/"Not in labor force" (not employed/unemployed), and the
# B25058/B25064/B25088 figures appear to be medians rather than averages.
# The names are left unchanged so downstream consumers of the CSV keep working.
census_pd = census_pd.rename(columns={
    "NAME": "Name",
    "zip code tabulation area": "Zipcode",
    # Population and race/ethnicity counts
    "B01003_001E": "Population",
    "B02001_002E": "White Population",
    "B02001_003E": "Black Population",
    "B03001_003E": "Hispanic Population",
    "B02001_005E": "Asian Population",
    "B01002_001E": "Median Age",
    # Labor force
    "B23025_002E": "Employed",
    "B23025_007E": "Unemployed",
    # Income
    "B19013_001E": "Household Income",
    "B19301_001E": "Per Capita Income",
    # Housing costs
    "B25058_001E": "Avg Rent",
    "B25064_001E": "Avg Rent & Utilities",
    "B25077_001E": "Median Home Value",
    "B25088_002E": "Avg Monthly Cost of Home",
    # Educational attainment
    "B15003_002E": "No HSch Ed",
    "B15003_017E": "HSch Ed",
    "B15003_018E": "GED",
    "B15003_021E": "Associate College",
    "B15003_022E": "Bachelors",
    "B15003_023E": "Masters",
    "B15003_024E": "Professional",
    "B15003_025E": "Doctorate",
    # Poverty counts, overall and by race/ethnicity
    "B17001_002E": "Poverty Count",
    "B17001A_002E": "White Poverty",
    "B17001B_002E": "Black Poverty",
    "B17001I_002E": "Hispanic Poverty",
    "B17001D_002E": "Asian Poverty",
})

# Poverty Rate (%) = Poverty Count / Population.
# to_numeric(errors="coerce") turns missing estimates into NaN instead of raising
# the way astype(int) does, and masking zero populations keeps the division from
# producing inf.
population = pd.to_numeric(census_pd["Population"], errors="coerce")
poverty_count = pd.to_numeric(census_pd["Poverty Count"], errors="coerce")
census_pd["Poverty Rate"] = 100 * poverty_count / population.where(population > 0)

# Combine high-school diploma and GED counts into one column
census_pd["HSch/GED"] = (
    pd.to_numeric(census_pd["HSch Ed"], errors="coerce")
    + pd.to_numeric(census_pd["GED"], errors="coerce")
)

# Final DataFrame: keep only the analysis columns, in presentation order
census_pd = census_pd[["Zipcode", "Population", "White Population", "Black Population", "Hispanic Population", "Asian Population",
                       "Median Age", "Employed", "Unemployed", "Household Income", "Per Capita Income",
                       "Avg Rent", "Avg Rent & Utilities", "Median Home Value", "Avg Monthly Cost of Home",
                       "No HSch Ed", "HSch/GED", "Associate College", "Bachelors", "Masters",
                       "Professional", "Doctorate", "Poverty Count", "Poverty Rate",
                       "White Poverty", "Black Poverty", "Hispanic Poverty", "Asian Poverty"]]

# Visualize
print(len(census_pd))
census_pd.head()

33120


Unnamed: 0,Zipcode,Population,White Population,Black Population,Hispanic Population,Asian Population,Median Age,Employed,Unemployed,Household Income,...,Bachelors,Masters,Professional,Doctorate,Poverty Count,Poverty Rate,White Poverty,Black Poverty,Hispanic Poverty,Asian Poverty
0,601,17599.0,13686.0,120.0,17533.0,0.0,38.9,5953.0,8078.0,11757.0,...,1787.0,269.0,15.0,80.0,11282.0,64.105915,8765.0,84.0,11223.0,0.0
1,602,39209.0,26213.0,1092.0,36736.0,0.0,40.9,14390.0,18022.0,16190.0,...,3694.0,1097.0,174.0,332.0,20428.0,52.100283,13813.0,557.0,19144.0,0.0
2,603,50135.0,35709.0,1985.0,48865.0,557.0,40.4,16044.0,25020.0,16645.0,...,5858.0,1605.0,285.0,234.0,25176.0,50.216416,17714.0,874.0,24744.0,382.0
3,606,6304.0,3045.0,160.0,6292.0,0.0,42.8,1707.0,3472.0,13387.0,...,321.0,77.0,17.0,17.0,4092.0,64.911168,1819.0,63.0,4080.0,0.0
4,610,27590.0,17038.0,845.0,26850.0,0.0,41.4,10048.0,12749.0,18741.0,...,2268.0,500.0,10.0,141.0,12553.0,45.498369,7471.0,426.0,12263.0,0.0


In [6]:
# census_pd holds 33,120 zip codes (rows) and needs to be reduced in size.
# Zip codes of interest: Richmond City plus the surrounding Henrico and
# Chesterfield Counties.
# NOTE(review): this list has 31 entries, while the saved outputs report 45 codes
# and 33 matching rows -- they predate the current list; re-run the notebook from
# a fresh kernel to refresh them.
zip_list = [
    '23005', '23059', '23060', '23063', '23111', '23112', '23113', '23114',
    '23116', '23139', '23219', '23221', '23223', '23224', '23225', '23226',
    '23227', '23228', '23229', '23230', '23231', '23232', '23233', '23234',
    '23235', '23236', '23294', '23298', '23806', '23831', '23860',
]
print(len(zip_list))

45


In [7]:
# Keep only the rows whose Zipcode appears in zip_list.
# isin() already tests every row against the whole list, so one vectorized filter
# is enough: no loop is needed, the builtin `zip` is not shadowed, and
# census_va_pd is defined even when zip_list is empty.
census_va_pd = census_pd[census_pd["Zipcode"].isin(zip_list)]

In [8]:
# Print the number of rows that survived the zip-code filter, then show the
# filtered frame as the cell's output
print(census_va_pd.shape[0])
census_va_pd

33


Unnamed: 0,Zipcode,Population,White Population,Black Population,Hispanic Population,Asian Population,Median Age,Employed,Unemployed,Household Income,...,Bachelors,Masters,Professional,Doctorate,Poverty Count,Poverty Rate,White Poverty,Black Poverty,Hispanic Poverty,Asian Poverty
7047,23059,34031.0,23175.0,3230.0,1277.0,6288.0,41.0,18054.0,7782.0,104878.0,...,7550.0,4350.0,1304.0,843.0,1044.0,3.067791,655.0,249.0,25.0,124.0
7048,23060,37381.0,23419.0,7305.0,1857.0,4823.0,37.8,21863.0,7733.0,83802.0,...,7825.0,3579.0,749.0,337.0,1737.0,4.646746,1038.0,486.0,83.0,144.0
7060,23075,9422.0,3449.0,5684.0,298.0,0.0,36.4,4948.0,2479.0,42962.0,...,810.0,170.0,0.0,51.0,1665.0,17.671407,379.0,1286.0,0.0,0.0
7078,23112,52485.0,39619.0,8325.0,2509.0,1855.0,37.7,29197.0,11339.0,89485.0,...,11002.0,4822.0,586.0,824.0,1713.0,3.26379,885.0,648.0,48.0,19.0
7079,23113,25195.0,21364.0,1758.0,965.0,1466.0,45.5,12434.0,7324.0,103683.0,...,6562.0,3026.0,973.0,535.0,1068.0,4.238936,844.0,81.0,76.0,124.0
7080,23114,18503.0,14790.0,1826.0,451.0,1355.0,39.2,10086.0,4132.0,100047.0,...,3889.0,2048.0,405.0,200.0,763.0,4.123656,342.0,91.0,11.0,305.0
7085,23120,8948.0,7770.0,568.0,245.0,273.0,35.4,4720.0,1551.0,132347.0,...,2151.0,898.0,206.0,57.0,143.0,1.598122,122.0,21.0,0.0,0.0
7100,23150,11961.0,8493.0,2954.0,214.0,58.0,41.7,6213.0,3208.0,58175.0,...,1093.0,302.0,50.0,20.0,1345.0,11.244879,993.0,264.0,0.0,15.0
7108,23173,2209.0,1306.0,316.0,180.0,401.0,19.5,1002.0,1207.0,-666666666.0,...,13.0,22.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7118,23219,4138.0,1675.0,1892.0,188.0,363.0,29.4,2204.0,1732.0,30917.0,...,763.0,320.0,205.0,42.0,1356.0,32.769454,442.0,784.0,93.0,81.0


In [9]:
# Replace the row labels inherited from the full table with a fresh 0..n-1 index.
# reset_index() keeps the old labels as a new "index" column (drop defaults to False).
# NOTE(review): that "index" column is carried into the exported CSV -- confirm it
# is wanted there.
census_reset = census_va_pd.reset_index()
print(census_reset.shape[0])
census_reset.head(10)

33


Unnamed: 0,index,Zipcode,Population,White Population,Black Population,Hispanic Population,Asian Population,Median Age,Employed,Unemployed,...,Bachelors,Masters,Professional,Doctorate,Poverty Count,Poverty Rate,White Poverty,Black Poverty,Hispanic Poverty,Asian Poverty
0,7047,23059,34031.0,23175.0,3230.0,1277.0,6288.0,41.0,18054.0,7782.0,...,7550.0,4350.0,1304.0,843.0,1044.0,3.067791,655.0,249.0,25.0,124.0
1,7048,23060,37381.0,23419.0,7305.0,1857.0,4823.0,37.8,21863.0,7733.0,...,7825.0,3579.0,749.0,337.0,1737.0,4.646746,1038.0,486.0,83.0,144.0
2,7060,23075,9422.0,3449.0,5684.0,298.0,0.0,36.4,4948.0,2479.0,...,810.0,170.0,0.0,51.0,1665.0,17.671407,379.0,1286.0,0.0,0.0
3,7078,23112,52485.0,39619.0,8325.0,2509.0,1855.0,37.7,29197.0,11339.0,...,11002.0,4822.0,586.0,824.0,1713.0,3.26379,885.0,648.0,48.0,19.0
4,7079,23113,25195.0,21364.0,1758.0,965.0,1466.0,45.5,12434.0,7324.0,...,6562.0,3026.0,973.0,535.0,1068.0,4.238936,844.0,81.0,76.0,124.0
5,7080,23114,18503.0,14790.0,1826.0,451.0,1355.0,39.2,10086.0,4132.0,...,3889.0,2048.0,405.0,200.0,763.0,4.123656,342.0,91.0,11.0,305.0
6,7085,23120,8948.0,7770.0,568.0,245.0,273.0,35.4,4720.0,1551.0,...,2151.0,898.0,206.0,57.0,143.0,1.598122,122.0,21.0,0.0,0.0
7,7100,23150,11961.0,8493.0,2954.0,214.0,58.0,41.7,6213.0,3208.0,...,1093.0,302.0,50.0,20.0,1345.0,11.244879,993.0,264.0,0.0,15.0
8,7108,23173,2209.0,1306.0,316.0,180.0,401.0,19.5,1002.0,1207.0,...,13.0,22.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,7118,23219,4138.0,1675.0,1892.0,188.0,363.0,29.4,2204.0,1732.0,...,763.0,320.0,205.0,42.0,1356.0,32.769454,442.0,784.0,93.0,81.0


In [10]:
# Drop rows with unusable values by inspecting the data, not by hard-coding row
# labels: labels such as 8 and 27 only match one particular filter result and
# silently remove valid rows once zip_list changes.
# A row is unusable when any measure is missing (NaN) or negative; unavailable
# estimates show up as large negative sentinels such as -666666666.
measure_cols = census_reset.columns.difference(["index", "Zipcode"])
measures = census_reset[measure_cols].apply(pd.to_numeric, errors="coerce")
is_bad_row = measures.isna().any(axis=1) | measures.lt(0).any(axis=1)
census_final = census_reset.loc[~is_bad_row]

In [11]:
# Print the number of rows left after cleaning, then show the cleaned frame as
# the cell's output
print(census_final.shape[0])
census_final

31


Unnamed: 0,index,Zipcode,Population,White Population,Black Population,Hispanic Population,Asian Population,Median Age,Employed,Unemployed,...,Bachelors,Masters,Professional,Doctorate,Poverty Count,Poverty Rate,White Poverty,Black Poverty,Hispanic Poverty,Asian Poverty
0,7047,23059,34031.0,23175.0,3230.0,1277.0,6288.0,41.0,18054.0,7782.0,...,7550.0,4350.0,1304.0,843.0,1044.0,3.067791,655.0,249.0,25.0,124.0
1,7048,23060,37381.0,23419.0,7305.0,1857.0,4823.0,37.8,21863.0,7733.0,...,7825.0,3579.0,749.0,337.0,1737.0,4.646746,1038.0,486.0,83.0,144.0
2,7060,23075,9422.0,3449.0,5684.0,298.0,0.0,36.4,4948.0,2479.0,...,810.0,170.0,0.0,51.0,1665.0,17.671407,379.0,1286.0,0.0,0.0
3,7078,23112,52485.0,39619.0,8325.0,2509.0,1855.0,37.7,29197.0,11339.0,...,11002.0,4822.0,586.0,824.0,1713.0,3.26379,885.0,648.0,48.0,19.0
4,7079,23113,25195.0,21364.0,1758.0,965.0,1466.0,45.5,12434.0,7324.0,...,6562.0,3026.0,973.0,535.0,1068.0,4.238936,844.0,81.0,76.0,124.0
5,7080,23114,18503.0,14790.0,1826.0,451.0,1355.0,39.2,10086.0,4132.0,...,3889.0,2048.0,405.0,200.0,763.0,4.123656,342.0,91.0,11.0,305.0
6,7085,23120,8948.0,7770.0,568.0,245.0,273.0,35.4,4720.0,1551.0,...,2151.0,898.0,206.0,57.0,143.0,1.598122,122.0,21.0,0.0,0.0
7,7100,23150,11961.0,8493.0,2954.0,214.0,58.0,41.7,6213.0,3208.0,...,1093.0,302.0,50.0,20.0,1345.0,11.244879,993.0,264.0,0.0,15.0
9,7118,23219,4138.0,1675.0,1892.0,188.0,363.0,29.4,2204.0,1732.0,...,763.0,320.0,205.0,42.0,1356.0,32.769454,442.0,784.0,93.0,81.0
10,7119,23220,34409.0,20703.0,10707.0,1209.0,1639.0,26.5,21441.0,10564.0,...,6585.0,2738.0,928.0,462.0,9908.0,28.794792,4754.0,4221.0,384.0,511.0


In [12]:
# Export the cleaned census table to a CSV file.
# Destination: ../Desktop/census_2017.csv, assembled with os.path.join.
output_path = os.path.join("..", "Desktop", "census_2017.csv")

# Write the column names as a header row and leave the DataFrame's row index out
census_final.to_csv(path_or_buf=output_path, header=True, index=False)

In [15]:
# Zip codes of interest (same values as the list defined earlier in the notebook)
zip_list = [
    '23005', '23059', '23060', '23063', '23111', '23112', '23113', '23114',
    '23116', '23139', '23219', '23221', '23223', '23224', '23225', '23226',
    '23227', '23228', '23229', '23230', '23231', '23232', '23233', '23234',
    '23235', '23236', '23294', '23298', '23806', '23831', '23860',
]

In [16]:
# Number of zip codes currently in zip_list
len(zip_list)

31