### Store All Addresses in Cities of Interest
Addresses were downloaded from [OpenAddresses](https://batch.openaddresses.io/data#map=0/0/0). The addresses collected correspond to the following cities and states.
- Hartford, Connecticut
- Dover, Delaware
- District of Columbia
- Portland, Maine
- Baltimore, Maryland
- Boston, Massachusetts
- Nashua, New Hampshire
- Newark, New Jersey
- New York City, New York
- Philadelphia, Pennsylvania
- Providence, Rhode Island
- Burlington, Vermont
- Norfolk, Virginia
- Charleston, West Virginia


In [None]:
import geopandas
import json
import gzip
from pandas_geojson import to_geojson
from geopy.geocoders import Nominatim 
import regex as re

In [None]:
# transforms dataframe to geojson file
def _lon_lat(geom):
    """Return (lon, lat) as floats for a point geometry or its 'POINT (x y)' text."""
    if hasattr(geom, 'x') and hasattr(geom, 'y'):
        return float(geom.x), float(geom.y)
    # fallback for geometries held as text: drop the leading 'POINT ' tag and
    # the parentheses, then read the first two numbers
    lon_text, lat_text = str(geom)[6:].strip('() ').split()[:2]
    return float(lon_text), float(lat_text)


def df_to_geojson(df):
    """Build GeoJSON point features from a dataframe of addresses.

    Adds 'lon' and 'lat' columns to ``df`` in place (taken from each row's
    'geometry' value) and hands the frame to ``to_geojson``.

    Parameters
    ----------
    df : dataframe
        Must contain a 'geometry' column of point geometries (or their
        'POINT (x y)' text) and every property column listed below.

    Returns
    -------
    The value returned by ``to_geojson`` for the given frame.
    """
    # Coordinates are read from the geometry itself rather than sliced out of
    # its text form, and stored as floats: GeoJSON positions are numbers, so
    # string coordinates would produce an invalid file.
    lon_lat_pairs = [_lon_lat(geom) for geom in df['geometry']]
    df['lon'] = [lon for lon, _ in lon_lat_pairs]
    df['lat'] = [lat for _, lat in lon_lat_pairs]

    geojson_file = to_geojson(df=df, lat = 'lat', lon = 'lon', properties=['hash', 'number', 'street', 'unit', 'city', 'district', 'region', 'postcode', 'id', 'state', 'incorporated_place'])

    return geojson_file

In [None]:
# saves geojson data to geojson.gz file
def compress_geojson(gejson_data, output_file):
    """Write ``gejson_data`` as JSON into a gzip-compressed UTF-8 text file.

    Parameters
    ----------
    gejson_data : JSON-serializable object to store.
    output_file : path of the gzip file to create or overwrite.
    """
    # text mode + ensure_ascii=False keeps non-ASCII characters as real UTF-8
    # instead of \uXXXX escapes
    with gzip.open(output_file, mode='wt', encoding='utf-8') as gz_handle:
        json.dump(gejson_data, fp=gz_handle, ensure_ascii=False)

In [None]:
# get postcode for an address
def get_postcode(address: str):
    """Look up the postcode of an address through the Nominatim geocoder.

    Parameters
    ----------
    address : str
        Free-form address query sent to the geocoder.

    Returns
    -------
    str or None
        The postcode found for the geocoder's match, or None when the address
        cannot be geocoded or the result carries no recognizable postcode.
    """
    # initialize Nominatim API
    geolocator = Nominatim(user_agent="ne_isp")

    # request the structured address breakdown so the postcode can be read
    # from its own field instead of being guessed from the display string
    location = geolocator.geocode(address, addressdetails=True)
    if location is None:
        return None

    postcode = location.raw.get('address', {}).get('postcode')
    if postcode:
        return postcode

    # Fallback: take the last US ZIP-shaped token (12345 or 12345-6789) in the
    # display name. Picking "the second run of digits" could return a street
    # number, and indexing the matches raised IndexError when there were none.
    zip_matches = re.findall(r'\b\d{5}(?:-\d{4})?\b', location.raw.get('display_name', ''))
    return zip_matches[-1] if zip_matches else None

Hartford, Connecticut

In [None]:
# Hartford, CT: source file to read and compressed file to write
input_path = '../data/open_address/original/city_of_hartford-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_hartford-addresses-city.geojson.gz'

# load the addresses as a dataframe and fill in the missing column information
ct_df = geopandas.read_file(input_path).assign(
    city='Hartford',
    incorporated_place='Hartford',
    state='CT',
)

# show the frame's dimensions and preview its first rows
print(ct_df.shape)
ct_df.head()

In [None]:
# TODO:
# join ct_df on census data

In [None]:
# # get postcodes for each address
# ct_df['postcode'] = ct_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the Hartford dataframe into geojson data
ct_geojson = df_to_geojson(df=ct_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=ct_geojson, output_file=output_path)

Dover, Delaware

In [None]:
# Dover, DE: source file to read and compressed file to write
input_path = '../data/open_address/original/city_of_dover-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_dover-addresses-city.geojson.gz'

# load the addresses as a dataframe and fill in the missing column information
de_df = geopandas.read_file(input_path).assign(
    city='Dover',
    incorporated_place='Dover',
    state='DE',
)

# show the frame's dimensions and preview its first rows
print(de_df.shape)
de_df.head()

In [None]:
# TODO:
# join de_df on census data

In [None]:
# # get postcodes for each address
# de_df['postcode'] = de_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the Dover dataframe into geojson data
de_geojson = df_to_geojson(df=de_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=de_geojson, output_file=output_path)

District of Columbia

In [None]:
# District of Columbia: source file to read and compressed file to write
input_path = '../data/open_address/original/dc_statewide-addresses-city.geojson'
output_path = '../data/open_address/processed/dc_statewide-addresses-city.geojson.gz'

# load the addresses as a dataframe and fill in the missing column information
# NOTE(review): 'city' is not set here, unlike most city-level cells —
# presumably the source file already provides it; confirm
dc_df = geopandas.read_file(input_path).assign(
    incorporated_place='Washington',
    state='DC',
)

# show the frame's dimensions and preview its first rows
print(dc_df.shape)
dc_df.head()

In [None]:
# TODO:
# join dc_df on census data

In [None]:
# # get postcodes for each address
# dc_df['postcode'] = dc_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the District of Columbia dataframe into geojson data
dc_geojson = df_to_geojson(df=dc_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=dc_geojson, output_file=output_path)

Portland, Maine

In [None]:
# Maine (statewide file): source file to read and compressed file to write
input_path = '../data/open_address/original/me_statewide-addresses-state.geojson'
output_path = '../data/open_address/processed/me_statewide-addresses-state.geojson.gz'

# load every address in the file as a dataframe
me_df = geopandas.read_file(filename=input_path)

# show the frame's dimensions and preview its first rows
print(me_df.shape)
me_df.head()

In [None]:
# filter by addresses in Portland
portland_postcodes = ['04019', '04050', '04101', '04102', '04103', '04104', '04107', '04108', '04109', '04112', '04122', '04123', '04124']
# keep only rows with a Portland postcode whose district is 'Cumberland';
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the statewide data (pandas' SettingWithCopyWarning)
me_df = me_df[me_df['postcode'].isin(portland_postcodes) & (me_df['district']=='Cumberland')].copy()

# add missing column information
# (these rows are the Portland selection; they were previously labelled 'Dover')
me_df['city'] = 'Portland'
me_df['incorporated_place'] = 'Portland'
me_df['state'] = 'ME'

In [None]:
# TODO:
# join me_df on census data

In [None]:
# # get postcodes for each address
# me_df['postcode'] = me_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the filtered Maine dataframe into geojson data
me_geojson = df_to_geojson(df=me_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=me_geojson, output_file=output_path)

Baltimore, Maryland

In [None]:
# Baltimore, MD: source file to read and compressed file to write
input_path = '../data/open_address/original/city_of_baltimore-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_baltimore-addresses-city.geojson.gz'

# load the addresses as a dataframe and fill in the missing column information
md_df = geopandas.read_file(input_path).assign(
    city='Baltimore',
    incorporated_place='Baltimore',
    state='MD',
)

# show the frame's dimensions and preview its first rows
print(md_df.shape)
md_df.head()

In [None]:
# TODO:
# join md_df on census data

In [None]:
# # get postcodes for each address
# md_df['postcode'] = md_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the Baltimore dataframe into geojson data
md_geojson = df_to_geojson(df=md_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=md_geojson, output_file=output_path)

Boston, Massachusetts

In [None]:
# Boston, MA: source file to read and compressed file to write
input_path = '../data/open_address/original/city_of_boston-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_boston-addresses-city.geojson.gz'

# load the addresses as a dataframe and fill in the missing column information
ma_df = geopandas.read_file(input_path).assign(
    city='Boston',
    incorporated_place='Boston',
    state='MA',
)

# show the frame's dimensions and preview its first rows
print(ma_df.shape)
ma_df.head()

In [None]:
# TODO:
# join ma_df on census data

In [None]:
# # get postcodes for each address
# ma_df['postcode'] = ma_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the Boston dataframe into geojson data
ma_geojson = df_to_geojson(df=ma_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=ma_geojson, output_file=output_path)

Nashua, New Hampshire

In [None]:
# Nashua, NH: source file to read and compressed file to write
input_path = '../data/open_address/original/city_of_nashua-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_nashua-addresses-city.geojson.gz'

# load the addresses as a dataframe and fill in the missing column information
# NOTE(review): 'city' is not set here, unlike most city-level cells —
# presumably the source file already provides it; confirm
nh_df = geopandas.read_file(input_path).assign(
    incorporated_place='Nashua',
    state='NH',
)

# show the frame's dimensions and preview its first rows
print(nh_df.shape)
nh_df.head()

In [None]:
# TODO:
# join nh_df on census data

In [None]:
# # get postcodes for each address
# nh_df['postcode'] = nh_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the Nashua dataframe into geojson data
nh_geojson = df_to_geojson(df=nh_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=nh_geojson, output_file=output_path)

Newark, New Jersey

In [None]:
# New Jersey (statewide file): source file to read and compressed file to write
input_path = '../data/open_address/original/nj_statewide-addresses-state.geojson'
output_path = '../data/open_address/processed/nj_statewide-addresses-state.geojson.gz'

# load every address in the file as a dataframe
nj_df = geopandas.read_file(filename=input_path)

# show the frame's dimensions and preview its first rows
print(nj_df.shape)
nj_df.head()

In [None]:
# keep only the rows whose city is 'NEWARK';
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the statewide data (pandas' SettingWithCopyWarning)
nj_df = nj_df[nj_df['city']=='NEWARK'].copy()

# add missing column information
nj_df['incorporated_place'] = 'Newark'
nj_df['state'] = 'NJ'

In [None]:
# TODO:
# join nj_df on census data

In [None]:
# # get postcodes for each address
# nj_df['postcode'] = nj_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the filtered New Jersey dataframe into geojson data
nj_geojson = df_to_geojson(df=nj_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=nj_geojson, output_file=output_path)

New York City, New York

In [None]:
# New York City, NY: source file to read and compressed file to write
input_path = '../data/open_address/original/city_of_new_york-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_new_york-addresses-city.geojson.gz'

# load the addresses as a dataframe and fill in the missing column information
ny_df = geopandas.read_file(input_path).assign(
    city='New York',
    incorporated_place='New York',
    state='NY',
)

# show the frame's dimensions and preview its first rows
print(ny_df.shape)
ny_df.head()

In [None]:
# TODO:
# join ny_df on census data

In [None]:
# # get postcodes for each address
# ny_df['postcode'] = ny_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the New York dataframe into geojson data
ny_geojson = df_to_geojson(df=ny_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=ny_geojson, output_file=output_path)

Philadelphia, Pennsylvania

In [None]:
# Philadelphia, PA: source file to read and compressed file to write
input_path = '../data/open_address/original/philadelphia-addresses-county.geojson'
output_path = '../data/open_address/processed/philadelphia-addresses-county.geojson.gz'

# load the addresses as a dataframe and fill in the missing column information
pa_df = geopandas.read_file(input_path).assign(
    city='Philadelphia',
    incorporated_place='Philadelphia',
    state='PA',
)

# show the frame's dimensions and preview its first rows
print(pa_df.shape)
pa_df.head()

In [None]:
# TODO:
# join pa_df on census data

In [None]:
# # get postcodes for each address
# pa_df['postcode'] = pa_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the Philadelphia dataframe into geojson data
pa_geojson = df_to_geojson(df=pa_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=pa_geojson, output_file=output_path)

Providence, Rhode Island

In [None]:
# Providence, RI: source file to read and compressed file to write
input_path = '../data/open_address/original/providence-addresses-city.geojson'
output_path = '../data/open_address/processed/providence-addresses-city.geojson.gz'

# load the addresses as a dataframe and fill in the missing column information
ri_df = geopandas.read_file(input_path).assign(
    city='Providence',
    incorporated_place='Providence',
    state='RI',
)

# show the frame's dimensions and preview its first rows
print(ri_df.shape)
ri_df.head()

In [None]:
# TODO:
# join ri_df on census data

In [None]:
# # get postcodes for each address
# ri_df['postcode'] = ri_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the Providence dataframe into geojson data
ri_geojson = df_to_geojson(df=ri_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=ri_geojson, output_file=output_path)

Burlington, Vermont

In [None]:
# Burlington, VT: source file to read and compressed file to write
input_path = '../data/open_address/original/city_of_burlington-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_burlington-addresses-city.geojson.gz'

# load the addresses as a dataframe and fill in the missing column information
vt_df = geopandas.read_file(input_path).assign(
    city='Burlington',
    incorporated_place='Burlington',
    state='VT',
)

# show the frame's dimensions and preview its first rows
print(vt_df.shape)
vt_df.head()

In [None]:
# TODO:
# join vt_df on census data

In [None]:
# # get postcodes for each address
# vt_df['postcode'] = vt_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the Burlington dataframe into geojson data
vt_geojson = df_to_geojson(df=vt_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=vt_geojson, output_file=output_path)

In [None]:
# # note: no postcode values
# path = '../data/open_address/vt_df.csv.gz'
# vt_df = addresses_df(path, 'Burlington', 'VT', 'Burlington')
# vt_df.shape

Norfolk, Virginia

In [None]:
# Norfolk, VA: source file to read and compressed file to write
input_path = '../data/open_address/original/city_of_norfolk-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_norfolk-addresses-city.geojson.gz'

# load the addresses as a dataframe and fill in the missing column information
va_df = geopandas.read_file(input_path).assign(
    city='Norfolk',
    incorporated_place='Norfolk',
    state='VA',
)

# show the frame's dimensions and preview its first rows
print(va_df.shape)
va_df.head()

In [None]:
# TODO:
# join va_df on census data

In [None]:
# # get postcodes for each address
# va_df['postcode'] = va_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the Norfolk dataframe into geojson data
va_geojson = df_to_geojson(df=va_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=va_geojson, output_file=output_path)

Charleston, West Virginia

In [None]:
# West Virginia (statewide file): source file to read and compressed file to write
input_path = '../data/open_address/original/wv_statewide-addresses-state.geojson'
output_path = '../data/open_address/processed/wv_statewide-addresses-state.geojson.gz'

# load every address in the file as a dataframe
wv_df = geopandas.read_file(filename=input_path)

# show the frame's dimensions and preview its first rows
print(wv_df.shape)
wv_df.head()

In [None]:
# filter on addresses in Charleston;
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the statewide data (pandas' SettingWithCopyWarning)
wv_df = wv_df[wv_df['city'] == 'Charleston'].copy()
# add missing column information
wv_df['incorporated_place'] = 'Charleston'
wv_df['state'] = 'WV'

In [None]:
# TODO:
# join wv_df on census data

In [None]:
# # get postcodes for each address
# wv_df['postcode'] = wv_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# turn the filtered West Virginia dataframe into geojson data
wv_geojson = df_to_geojson(df=wv_df)
# write that geojson data to the gzip-compressed output file
compress_geojson(gejson_data=wv_geojson, output_file=output_path)