## Store and Process All Addresses in Cities of Interest
The goal of this notebook is to get addresses for the cities of interest listed below. Address data was downloaded from [OpenAddresses](https://batch.openaddresses.io/data#map=0/0/0). The collected addresses were then processed so that each one has values filled in for its incorporated place, city, state, zipcode, and census block group. For each city, a sample of the addresses was taken and stored such that each block group in the city is represented by the same number of addresses in the sample (`SAMPLE_SIZE`, which is 100 in the Boston example below).

    - Hartford, Connecticut
    - Dover, Delaware
    - District of Columbia
    - Portland, Maine
    - Baltimore, Maryland
    - Boston, Massachusetts
    - Nashua, New Hampshire
    - Newark, New Jersey
    - New York City, New York
    - Philadelphia, Pennsylvania
    - Providence, Rhode Island
    - Burlington, Vermont
    - Norfolk, Virginia
    - Charleston, West Virginia


In [1]:
import geopandas
import json
import gzip
from pandas_geojson import to_geojson
import censusgeocode as cg
import numpy as np
import pandas as pd

In [2]:
# transforms dataframe to geojson file
def df_to_geojson(df):
    """Convert an address dataframe into GeoJSON data via ``to_geojson``.

    The point coordinates are taken from the ``lat`` and ``lon`` columns; the
    columns listed in ``property_columns`` are passed as the feature properties.
    """
    property_columns = [
        'hash', 'number', 'street', 'unit', 'city', 'district', 'region',
        'postcode', 'id', 'state', 'incorporated_place', 'block_group',
    ]
    return to_geojson(df=df, lat='lat', lon='lon', properties=property_columns)

In [3]:
# saves geojson data to geojson.gz file
def compress_geojson(gejson_data, output_file):
    """Write ``gejson_data`` as JSON text into a gzip-compressed file.

    The file at ``output_file`` is opened in text mode with UTF-8 encoding and
    non-ASCII characters are written as-is rather than escaped.
    """
    with gzip.open(output_file, mode='wt', encoding='utf-8') as archive:
        json.dump(gejson_data, fp=archive, ensure_ascii=False)

In [4]:
def read_data(input_path, incorporated_place, city, state):
    """Load an address file and flatten it into a table with lon/lat columns.

    The file at ``input_path`` is read with ``geopandas.read_file``. Every row
    is stamped with the given ``incorporated_place``, ``state`` and ``city``
    (an existing column of the same name is overwritten), the x/y of the
    geometry are copied into ``lon``/``lat``, and the ``geometry`` column is
    dropped before the frame is returned.
    """
    addresses = geopandas.read_file(input_path)

    # the same constant value is written to every row of each column
    constant_columns = (
        ("incorporated_place", incorporated_place),
        ("state", state),
        ("city", city),
    )
    for column_name, value in constant_columns:
        addresses[column_name] = value

    # x is stored as longitude, y as latitude
    points = addresses["geometry"]
    addresses["lon"] = points.x
    addresses["lat"] = points.y

    addresses.drop(columns=["geometry"], inplace=True)

    return addresses

! pip install -U googlemaps

In [5]:
# import googlemaps

# gmaps = googlemaps.Client(key="")


# # get postcode and county for an address
# def get_postcode_county(add, gmaps=gmaps):
#     resp = gmaps.geocode(add)
#     if len(resp) < 1:
#         return [None, None]
#     if len(resp) >= 1:
#         resp = resp[0]
#     post, county = "", ""

#     for component in resp["address_components"]:
#         if "administrative_area_level_2" in component["types"]:
#             county = component["long_name"]
#         if "postal_code" in component["types"]:
#             post = component["short_name"]  # same difference for postal code
#     return [post, county]

In [6]:
# def add_postcode_county(df):
#     df["temp"] = df.apply(
#         lambda x: get_postcode_county(
#             x.number + " " + x.street + ", " + x.city + ", " + x.state + ", USA"
#         ),
#         axis=1,
#     )

#     df["postcode"] = df.loc[:, "temp"].apply(lambda x: x[0])
#     df["district"] = df.loc[:, "temp"].apply(lambda x: x[1])

#     df.drop(columns=["temp"], inplace=True)

#     return df

In [7]:
def get_census_info(number, street, city, state, zipcode):
    """Look up the 2020 census block group for a single street address.

    The address is sent to the geocoder through ``cg.address`` and the first
    entry under '2020 Census Blocks' of the first match is used.

    Parameters
    ----------
    number : house number; joined to ``street`` with a single space.
    street : street name.
    city, state, zipcode : forwarded to ``cg.address`` as ``city``, ``state``
        and ``zip``.

    Returns
    -------
    pandas.Series
        ``block_group``: the BLKGRP value as a string.
        ``geoid``: 12-character string made of the zero-padded state (2),
        county (3) and tract (6) codes followed by the block group.
        Both entries are NaN when the lookup fails for any reason (no match,
        request error, unexpected response shape).
    """
    try:
        # f-string instead of '+' so a non-string house number is still sent
        # to the geocoder rather than raising and being reported as "no match"
        matches = cg.address(f"{number} {street}", city=city, state=state, zip=zipcode)
        block = matches[0]['geographies']['2020 Census Blocks'][0]
        state_fips = int(block['STATE'])
        county_fips = int(block['COUNTY'])
        tract = int(block['TRACT'])
        block_group = int(block['BLKGRP'])
    except Exception:
        # Best-effort lookup: any failure yields missing values. `Exception`
        # (not a bare except) lets KeyboardInterrupt stop a long batch, and
        # `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.
        return pd.Series({'block_group': np.nan, 'geoid': np.nan})

    geoid = f"{state_fips:02}{county_fips:03}{tract:06}{block_group:01}"
    return pd.Series({'block_group': str(block_group), 'geoid': geoid})

Boston, Massachusetts

In [8]:
# where the raw Boston download lives and where the two processed outputs go
input_path = '../data/open_address/original/city_of_boston-addresses-city.geojson'
output_path_csv = '../data/open_address/processed/csv/city_of_boston-addresses-city.csv.gz'
output_path_geojson = '../data/open_address/processed/geojson/city_of_boston-addresses-city.geojson.gz'

# load the addresses and stamp every row with incorporated place, city and state
ma_df = read_data(input_path, incorporated_place='Boston city', city='Boston', state='MA')
# every row gets the same county value
ma_df['district'] = 'Suffolk County'
# a dashed house number is reduced to the part before the first dash
ma_df['number'] = ma_df['number'].map(lambda house_number: house_number.partition('-')[0])
print(ma_df.shape)
ma_df.head()

(323574, 13)


Unnamed: 0,hash,number,street,unit,city,district,region,postcode,id,incorporated_place,state,lon,lat
0,6db8d40a96a73ac7,6,A Street,,Boston,Suffolk County,,2136,,Boston city,MA,-71.125036,42.250618
1,96b30bef0eef7d86,7,A Street,,Boston,Suffolk County,,2136,,Boston city,MA,-71.1254,42.25046
2,a23223056b6593a0,10,A Street,,Boston,Suffolk County,,2127,,Boston city,MA,-71.0568,42.34088
3,d878c704ce81bba1,172,A Street,,Boston,Suffolk County,,2210,,Boston city,MA,-71.053148,42.344837
4,222c7a044af11e7a,176,A Street,,Boston,Suffolk County,,2210,,Boston city,MA,-71.05306,42.344958


In [9]:
# get a sample of 5,000 addresses (or every address if there are fewer);
# the fixed random_state makes a fresh-kernel re-run draw the same rows
ma_sample_df = ma_df.sample(n=min(5000, len(ma_df)), random_state=42)

In [10]:
# geocode each sampled address (one get_census_info call per row) and store
# the returned block group and geoid as two new columns
ma_sample_df[['block_group', 'geoid']] = ma_sample_df.apply(
    lambda address: get_census_info(
        address.number,
        address.street,
        address.city,
        address.state,
        address.postcode,
    ),
    axis=1,
)
ma_sample_df.head(5)

Unnamed: 0,hash,number,street,unit,city,district,region,postcode,id,incorporated_place,state,lon,lat,block_group,geoid
39601,d2aec566d471e747,16,Creighton Street,,Boston,Suffolk County,,2130,,Boston city,MA,-71.10852,42.32296,1,250251207001
121464,e3a1718dfa7ad40a,78,Train Street,1.0,Boston,Suffolk County,,2122,,Boston city,MA,-71.05414,42.29319,1,250251006011
174510,7bf2acccae085422,83,W Brookline Street,2.0,Boston,Suffolk County,,2118,,Boston city,MA,-71.07361,42.34072,2,250250705022
55948,175f2c0ed8d4b617,8,Garrison Street,408.0,Boston,Suffolk County,,2116,,Boston city,MA,-71.07964,42.34615,1,250250106001
35640,7a0fb8b823b6d0d3,250,Commonwealth Avenue,12.0,Boston,Suffolk County,,2116,,Boston city,MA,-71.08285,42.35019,1,250250107011


In [15]:
# maximum number of addresses to keep for each block-group label
SAMPLE_SIZE = 100
# block-group labels present in the geocoded sample, in order of first
# appearance; rows whose lookup failed carry NaN and are skipped.
# NOTE(review): get_census_info stores only the BLKGRP value in 'block_group',
# so one label covers that block-group number in every tract; the 'geoid'
# column is what distinguishes individual block groups — confirm this grouping
# is the intended one.
ma_block_groups = [
    blk_grp
    for blk_grp in ma_sample_df['block_group'].unique().tolist()
    if not pd.isna(blk_grp)
]
# keep the first SAMPLE_SIZE addresses of each label; the pieces are collected
# in a list and concatenated once instead of growing a frame inside a loop
per_block_samples = [
    ma_sample_df[ma_sample_df['block_group'] == blk_grp].head(SAMPLE_SIZE)
    for blk_grp in ma_block_groups
]
# pd.concat rejects an empty list, so fall back to an empty frame with the same columns
ma_sample = pd.concat(per_block_samples) if per_block_samples else ma_sample_df.iloc[0:0]

  ma_sample = pd.concat([ma_sample, temp_df[0:SAMPLE_SIZE]])


In [21]:
# report how many sampled addresses each block-group label received
for blk_grp in ma_block_groups:
    n_rows = ma_sample[ma_sample['block_group'] == blk_grp].shape[0]
    print(f'block {blk_grp}: {n_rows} datapoints')

block 1: 100 datapoints
block 2: 100 datapoints
block 4: 100 datapoints
block 6: 52 datapoints
block 3: 100 datapoints
block 5: 100 datapoints
block 7: 12 datapoints


In [50]:
# adding addresses in block 7 to data to balance the sample dataset
# unique streets from the sample that are in block 7
block_7_st = list(set(ma_sample[ma_sample['block_group']=='7'].street.values))
# indexes of the rows that are already in the sample
in_sample = list(ma_sample.index)
# addresses from all Boston data that are on block 7 streets and not yet in
# the sample; .copy() gives an independent frame so that adding the two columns
# below does not write into a slice of ma_df (SettingWithCopyWarning)
block_7_sample = ma_df[ma_df['street'].isin(block_7_st) & ~ma_df.index.isin(in_sample)].copy()
# geocode every candidate (one get_census_info call per row)
block_7_sample[['block_group', 'geoid']] = block_7_sample.apply(lambda row: get_census_info(row.number, row.street, row.city, row.state, row.postcode), axis=1)

In [52]:
# top block 7 up to SAMPLE_SIZE addresses: the number still missing is computed
# from the current sample instead of being hard-coded, and is capped at the
# number of geocoded candidates so .sample() cannot raise
block_7_candidates = block_7_sample[block_7_sample['block_group']=='7']
block_7_missing = max(SAMPLE_SIZE - int((ma_sample['block_group']=='7').sum()), 0)
# fixed random_state so a re-run picks the same rows
add_block_7 = block_7_candidates.sample(n=min(block_7_missing, len(block_7_candidates)), random_state=42)

In [58]:
# adding addresses in block 6 to data to balance the sample dataset
# unique streets from the sample that are in block 6
block_6_st = list(set(ma_sample[ma_sample['block_group']=='6'].street.values))
# indexes of the rows that are already in the sample
in_sample = list(ma_sample.index)
# addresses from all Boston data that are on block 6 streets and not yet in the sample
block_6_pool = ma_df[ma_df['street'].isin(block_6_st) & ~ma_df.index.isin(in_sample)]
# geocode at most 400 of them (all of them if there are fewer, so .sample()
# cannot raise); fixed random_state so a re-run picks the same rows
block_6_sample = block_6_pool.sample(n=min(400, len(block_6_pool)), random_state=42)
block_6_sample[['block_group', 'geoid']] = block_6_sample.apply(lambda row: get_census_info(row.number, row.street, row.city, row.state, row.postcode), axis=1)

In [67]:
# top block 6 up to SAMPLE_SIZE addresses: the number still missing is computed
# from the current sample instead of being hard-coded, and is capped at the
# number of geocoded candidates so .sample() cannot raise
block_6_candidates = block_6_sample[block_6_sample['block_group']=='6']
block_6_missing = max(SAMPLE_SIZE - int((ma_sample['block_group']=='6').sum()), 0)
# fixed random_state so a re-run picks the same rows
add_block_6 = block_6_candidates.sample(n=min(block_6_missing, len(block_6_candidates)), random_state=42)

In [68]:
# stack the per-block sample and the two top-up samples, in that order
sample_parts = [ma_sample, add_block_6, add_block_7]
ma_final_sample = pd.concat(sample_parts)

In [69]:
# report how many addresses each block-group label has in the final sample
for blk_grp in ma_block_groups:
    n_rows = ma_final_sample[ma_final_sample['block_group'] == blk_grp].shape[0]
    print(f'block {blk_grp}: {n_rows} datapoints')

block 1: 100 datapoints
block 2: 100 datapoints
block 4: 100 datapoints
block 6: 100 datapoints
block 3: 100 datapoints
block 5: 100 datapoints
block 7: 100 datapoints


In [70]:
# build the GeoJSON representation of the final Boston sample
ma_geojson = df_to_geojson(df=ma_final_sample)
# write the sample as a CSV (no index column) for use in later functions
ma_final_sample.to_csv(output_path_csv, index=False)
# write the GeoJSON data to the gzip-compressed output file
compress_geojson(ma_geojson, output_file=output_path_geojson)

\* __Note__: the cities below do not have all the necessary processing steps done, as we ran out of time. To get addresses for the cities below, follow what was done above for Boston, MA.

Hartford, Connecticut

In [24]:
# read in Hartford data as dataframe
input_path = '../data/open_address/original/city_of_hartford-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_hartford-addresses-city'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
ct_df = read_data(input_path, 'Hartford city', 'Hartford', 'CT')

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
ct_df = add_postcode_county(ct_df)

# print the (rows, columns) shape, then preview the first rows
print(ct_df.shape)
ct_df.head()

In [None]:
# build the GeoJSON representation of the frame
ct_geojson = df_to_geojson(ct_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
ct_csv = ct_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(ct_geojson, f"{output_path}.geojson.gz")

Dover, Delaware

In [None]:
# read in Dover data as dataframe
input_path = '../data/open_address/original/city_of_dover-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_dover-addresses-city'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
de_df = read_data(input_path, 'Dover city', 'Dover', 'DE')

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
de_df = add_postcode_county(de_df)

# print the (rows, columns) shape, then preview the first rows
print(de_df.shape)
de_df.head()

In [None]:
# build the GeoJSON representation of the frame
de_geojson = df_to_geojson(de_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
de_csv = de_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(de_geojson, f"{output_path}.geojson.gz")

District of Columbia

In [None]:
# read in DC data as dataframe
input_path = '../data/open_address/original/dc_statewide-addresses-city.geojson'
output_path = '../data/open_address/processed/dc_statewide-addresses-city'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
dc_df = read_data(input_path, 'Washington city', 'Washington', 'DC')

In [None]:
# get postcodes and county for each DC address; this cell works on dc_df
# (it previously read and overwrote the Delaware frame de_df)
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
dc_df = add_postcode_county(dc_df)

# print the (rows, columns) shape, then preview the first rows
print(dc_df.shape)
dc_df.head()

In [None]:
# build the GeoJSON representation of the frame
dc_geojson = df_to_geojson(dc_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
dc_csv = dc_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(dc_geojson, f"{output_path}.geojson.gz")

Portland, Maine

In [None]:
# read in Maine statewide data as dataframe
input_path = '../data/open_address/original/me_statewide-addresses-state.geojson'
output_path = '../data/open_address/processed/me_statewide-addresses-state'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
me_df = read_data(input_path, 'Portland city', 'Portland', 'ME')

In [None]:
# postcodes treated as belonging to Portland
portland_postcodes = ['04019', '04050', '04101', '04102', '04103', '04104', '04107', '04108', '04109', '04112', '04122', '04123', '04124']
# keep the rows with one of those postcodes, then of those only the rows
# whose district is 'Cumberland'
me_df = (
    me_df
    .loc[me_df['postcode'].isin(portland_postcodes)]
    .loc[lambda frame: frame['district'] == 'Cumberland']
)

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
me_df = add_postcode_county(me_df)

# print the (rows, columns) shape, then preview the first rows
print(me_df.shape)
me_df.head()

In [None]:
# build the GeoJSON representation of the frame
me_geojson = df_to_geojson(me_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
me_csv = me_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(me_geojson, f"{output_path}.geojson.gz")

Baltimore, Maryland

In [None]:
# read in Baltimore data as dataframe
input_path = '../data/open_address/original/city_of_baltimore-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_baltimore-addresses-city'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
md_df = read_data(input_path, 'Baltimore city', 'Baltimore', 'MD')

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
md_df = add_postcode_county(md_df)

# print the (rows, columns) shape, then preview the first rows
print(md_df.shape)
md_df.head()

In [None]:
# build the GeoJSON representation of the frame
md_geojson = df_to_geojson(md_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
md_csv = md_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(md_geojson, f"{output_path}.geojson.gz")

Nashua, New Hampshire

In [None]:
# read in Nashua data as dataframe
input_path = '../data/open_address/original/city_of_nashua-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_nashua-addresses-city'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
nh_df = read_data(input_path, 'Nashua city', 'Nashua', 'NH')

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
nh_df = add_postcode_county(nh_df)

# print the (rows, columns) shape, then preview the first rows
print(nh_df.shape)
nh_df.head()

In [None]:
# build the GeoJSON representation of the frame
nh_geojson = df_to_geojson(nh_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
nh_csv = nh_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(nh_geojson, f"{output_path}.geojson.gz")

Newark, New Jersey

In [None]:
# read in New Jersey statewide data as dataframe
input_path = '../data/open_address/original/nj_statewide-addresses-state.geojson'
output_path = '../data/open_address/processed/nj_statewide-addresses-state'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
nj_df = read_data(input_path, 'Newark city', 'Newark', 'NJ')

In [None]:
# keep only the rows whose 'city' column equals 'NEWARK'
# NOTE(review): read_data() assigns its `city` argument to every row of the
# 'city' column, so by this point the source file's own city values have been
# replaced — confirm this comparison still selects the intended rows.
nj_df = nj_df[nj_df['city']=='NEWARK']

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
nj_df = add_postcode_county(nj_df)

# print the (rows, columns) shape, then preview the first rows
print(nj_df.shape)
nj_df.head()

In [None]:
# build the GeoJSON representation of the frame
nj_geojson = df_to_geojson(nj_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
nj_csv = nj_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(nj_geojson, f"{output_path}.geojson.gz")

New York City, New York

In [None]:
# read in New York City data as dataframe
input_path = '../data/open_address/original/city_of_new_york-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_new_york-addresses-city'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
ny_df = read_data(input_path, 'New York city', 'New York', 'NY')

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
ny_df = add_postcode_county(ny_df)

# print the (rows, columns) shape, then preview the first rows
print(ny_df.shape)
ny_df.head()

In [None]:
# build the GeoJSON representation of the frame
ny_geojson = df_to_geojson(ny_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
ny_csv = ny_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(ny_geojson, f"{output_path}.geojson.gz")

Philadelphia, Pennsylvania

In [None]:
# read in Philadelphia data as dataframe
input_path = '../data/open_address/original/philadelphia-addresses-county.geojson'
output_path = '../data/open_address/processed/philadelphia-addresses-county'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
pa_df = read_data(input_path, 'Philadelphia city', 'Philadelphia', 'PA')

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
pa_df = add_postcode_county(pa_df)

# print the (rows, columns) shape, then preview the first rows
print(pa_df.shape)
pa_df.head()

In [None]:
# build the GeoJSON representation of the frame
pa_geojson = df_to_geojson(pa_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
pa_csv = pa_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(pa_geojson, f"{output_path}.geojson.gz")

Providence, Rhode Island

In [None]:
# read in Providence data as dataframe
input_path = '../data/open_address/original/providence-addresses-city.geojson'
output_path = '../data/open_address/processed/providence-addresses-city'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
ri_df = read_data(input_path, 'Providence city', 'Providence', 'RI')

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
ri_df = add_postcode_county(ri_df)

# print the (rows, columns) shape, then preview the first rows
print(ri_df.shape)
ri_df.head()

In [None]:
# build the GeoJSON representation of the frame
ri_geojson = df_to_geojson(ri_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
ri_csv = ri_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(ri_geojson, f"{output_path}.geojson.gz")

Burlington, Vermont

In [None]:
# read in Burlington data as dataframe
input_path = '../data/open_address/original/city_of_burlington-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_burlington-addresses-city'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
vt_df = read_data(input_path, 'Burlington city', 'Burlington', 'VT')

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
vt_df = add_postcode_county(vt_df)

# print the (rows, columns) shape, then preview the first rows
print(vt_df.shape)
vt_df.head()

In [None]:
# build the GeoJSON representation of the frame
vt_geojson = df_to_geojson(vt_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
vt_csv = vt_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(vt_geojson, f"{output_path}.geojson.gz")

In [None]:
# # note: no postcode values
# path = '../data/open_address/vt_df.csv.gz'
# vt_df = addresses_df(path, 'Burlington', 'VT', 'Burlington')
# vt_df.shape

Norfolk, Virginia

In [None]:
# read in Norfolk data as dataframe
input_path = '../data/open_address/original/city_of_norfolk-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_norfolk-addresses-city'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
va_df = read_data(input_path, 'Norfolk city', 'Norfolk', 'VA')

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
va_df = add_postcode_county(va_df)

# print the (rows, columns) shape, then preview the first rows
print(va_df.shape)
va_df.head()

In [None]:
# build the GeoJSON representation of the frame
va_geojson = df_to_geojson(va_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
va_csv = va_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(va_geojson, f"{output_path}.geojson.gz")

Charleston, West Virginia

In [None]:
# read in West Virginia statewide data as dataframe
input_path = '../data/open_address/original/wv_statewide-addresses-state.geojson'
output_path = '../data/open_address/processed/wv_statewide-addresses-state'

# read_data takes (input_path, incorporated_place, city, state); the place name
# follows the '<City> city' form used for Boston — TODO confirm the exact name
wv_df = read_data(input_path, 'Charleston city', 'Charleston', 'WV')

In [None]:
# keep only the rows whose 'city' column equals 'Charleston'
# NOTE(review): read_data() assigns its `city` argument to every row of the
# 'city' column, so by this point the source file's own city values have been
# replaced — confirm this comparison actually narrows the statewide data.
wv_df = wv_df[wv_df['city'] == 'Charleston']

In [None]:
# get postcodes and county for each address
# NOTE(review): add_postcode_county exists only as commented-out code in this
# notebook, so this call raises NameError until that helper is restored.
wv_df = add_postcode_county(wv_df)

# print the (rows, columns) shape, then preview the first rows
print(wv_df.shape)
wv_df.head()

In [None]:
# build the GeoJSON representation of the frame
wv_geojson = df_to_geojson(wv_df)
# write the addresses as CSV to "<output_path>.csv.gz" for use in later functions
wv_csv = wv_df.to_csv(f"{output_path}.csv.gz", index=False)
# write the GeoJSON data, gzip-compressed, to "<output_path>.geojson.gz"
compress_geojson(wv_geojson, f"{output_path}.geojson.gz")

## Reduce Address Datasets

In [None]:
from config import state2address


# `values` is a method on a dict but a plain attribute on a pandas object, so
# call it only when it is callable.
# NOTE(review): the type of state2address is not visible in this notebook — confirm.
address_links = state2address.values() if callable(state2address.values) else state2address.values

# load in the files
for link in address_links:
    # each "original" path is mapped to its "processed" counterpart and read back
    # NOTE(review): the cells above write ".csv.gz" / ".geojson.gz" files and
    # df_to_geojson expects 'lat'/'lon' columns — confirm this path and the
    # frame returned by read_file match what those helpers produce and need.
    reduce = link.replace("original", "processed")
    df = geopandas.read_file(reduce)

    # SPLIT ON UNIQUE county , state!
    # -----

    out = link.replace("original", "reduced")
    out_geojson = df_to_geojson(df)
    # write the converted GeoJSON data; the dataframe itself is not something
    # json.dump can serialize
    compress_geojson(out_geojson, out)