### Store All Addresses in Cities of Interest
Addresses were downloaded from [OpenAddresses](https://batch.openaddresses.io/data#map=0/0/0). The addresses collected correspond to the following cities and states.
- Hartford, Connecticut
- Dover, Delaware
- District of Columbia
- Portland, Maine
- Baltimore, Maryland
- Boston, Massachusetts
- Nashua, New Hampshire
- Newark, New Jersey
- New York City, New York
- Philadelphia, Pennsylvania
- Providence, Rhode Island
- Burlington, Vermont
- Norfolk, Virginia
- Charleston, West Virginia


In [1]:
import geopandas
import json
import gzip
from pandas_geojson import to_geojson
from geopy.geocoders import Nominatim 
import regex as re
import numpy as np

In [2]:
# transforms dataframe to geojson data
def df_to_geojson(df):
    """Convert a dataframe of point addresses into GeoJSON data.

    Parameters:
        df: dataframe with a 'geometry' column of point geometries (objects
            exposing ``x`` and ``y``) and the columns 'hash', 'number',
            'street', 'unit', 'city', 'district', 'region', 'postcode', 'id',
            'state' and 'incorporated_place', which become feature properties.

    Returns:
        Whatever ``pandas_geojson.to_geojson`` returns for these rows.

    The input dataframe is left unmodified; the helper 'lon'/'lat' columns are
    added to a copy only.
    """
    # Read the coordinates from the geometry objects instead of slicing their
    # text form: this does not depend on the exact WKT layout and yields
    # numbers, so the GeoJSON coordinates are numeric rather than strings.
    with_coords = df.assign(
        lon=df['geometry'].apply(lambda point: float(point.x)),
        lat=df['geometry'].apply(lambda point: float(point.y)),
    )

    return to_geojson(
        df=with_coords,
        lat='lat',
        lon='lon',
        properties=['hash', 'number', 'street', 'unit', 'city', 'district', 'region', 'postcode', 'id', 'state', 'incorporated_place'],
    )

In [3]:
# saves geojson data to geojson.gz file
def compress_geojson(gejson_data, output_file):
    """Write JSON-serialisable GeoJSON data to a gzip-compressed text file.

    Parameters:
        gejson_data: the JSON-serialisable object to store.
        output_file: destination path of the ``.gz`` file (overwritten).

    The text is UTF-8 encoded and non-ASCII characters are written as-is
    rather than as ``\\uXXXX`` escapes.
    """
    gz_stream = gzip.open(output_file, mode='wt', encoding='utf-8')
    with gz_stream:
        # stream straight into the gzip handle so the full JSON text is never
        # held in memory at once
        json.dump(gejson_data, fp=gz_stream, ensure_ascii=False)

In [27]:
# matches a US ZIP code: five digits, optionally followed by a ZIP+4 suffix
_ZIP_CODE_PATTERN = r'\b[0-9]{5}(?:-[0-9]{4})?\b'


def _parse_postcode_county(display_name):
    """Extract [postcode, county] from a Nominatim ``display_name`` string."""
    # Take the LAST ZIP-shaped token: house numbers and numbered streets come
    # first in the display name, the postcode sits near the end.
    zip_matches = re.findall(_ZIP_CODE_PATTERN, display_name)
    postcode = zip_matches[-1] if zip_matches else np.nan

    # assumes the display name ends with "..., county, state, postcode, country"
    # (the layout the original positional lookup relied on) -- TODO confirm for
    # places without a county or without a postcode
    parts = [part.strip() for part in display_name.split(',')]
    # with fewer than four components a negative index would silently wrap
    # around (or raise), so report the county as missing instead
    county = parts[-4] if len(parts) >= 4 else np.nan

    return [postcode, county]


# get postcode and county for an address
def get_postcode_county(address: str):
    """Geocode *address* with Nominatim and return ``[postcode, county]``.

    Parameters:
        address: free-form address string passed to the geocoder.

    Returns:
        A two-item list ``[postcode, county]``. Either item is ``np.nan`` when
        it cannot be determined; both are ``np.nan`` when the geocoder finds
        no match.

    Errors raised by the geocoder call itself are not caught here.
    """
    # initialize Nominatim API
    geolocator = Nominatim(user_agent="ne_isp")

    location = geolocator.geocode(address)
    if location is None:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        return [np.nan, np.nan]

    return _parse_postcode_county(location.raw['display_name'])

In [28]:
def read_data(input_path, city, state):
    """Read an address file and fill in the place columns.

    Parameters:
        input_path: path of the file handed to ``geopandas.read_file``.
        city: name stored in 'incorporated_place' for every row; also used as
            the 'city' value when the file carries no city information.
        state: value stored in the 'state' column for every row.

    Returns:
        The dataframe read from *input_path* with 'incorporated_place' and
        'state' set, and 'city' filled when it was absent or entirely blank.
    """
    df = geopandas.read_file(input_path)

    # add missing column information
    df['incorporated_place'] = city
    df['state'] = state

    if 'city' not in df.columns:
        df['city'] = city
    else:
        # A column with no usable values still has one "unique" value (NaN,
        # None or ''), so its unique() length is never 0 for a non-empty frame.
        # Test for "all missing or blank" explicitly instead.
        city_values = df['city']
        is_blank = city_values.isna() | (city_values.astype(str).str.strip() == '')
        if is_blank.all():
            df['city'] = city

    return df

In [29]:
def _format_address(number, street, city, state):
    """Build the geocoder query "<number> <street>, <city>, <state>, USA".

    Missing (None/NaN) or blank parts are skipped instead of raising a
    TypeError or leaving empty ", ," segments in the query.
    """
    def clean(value):
        # NaN is the only float that is not equal to itself
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value).strip()

    street_line = ' '.join(part for part in (clean(number), clean(street)) if part)
    parts = (street_line, clean(city), clean(state), 'USA')
    return ', '.join(part for part in parts if part)


def add_postcode_county(df):
    """Fill 'postcode' and 'district' by geocoding every address in *df*.

    Parameters:
        df: dataframe with 'number', 'street', 'city' and 'state' columns.

    Returns:
        The same dataframe, modified in place: 'postcode' receives the first
        item and 'district' the second item returned by
        ``get_postcode_county`` for each row's address.
    """
    # identical addresses are looked up once; every lookup is a geocoder call
    lookups = {}
    postcodes = []
    districts = []

    rows = zip(df['number'], df['street'], df['city'], df['state'])
    for number, street, city, state in rows:
        query = _format_address(number, street, city, state)
        if query not in lookups:
            lookups[query] = get_postcode_county(query)
        postcode, county = lookups[query]
        postcodes.append(postcode)
        districts.append(county)

    # assigning plain lists avoids a scratch column and works for empty frames
    df['postcode'] = postcodes
    df['district'] = districts

    return df

Hartford, Connecticut

In [30]:
# Read the raw OpenAddresses export for Hartford, CT into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
input_path = '../data/open_address/original/city_of_hartford-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_hartford-addresses-city.geojson.gz'

# read_data also sets incorporated_place='Hartford' and state='CT' on every row
ct_df = read_data(input_path,'Hartford', 'CT')

# show the (rows, columns) shape and preview the first rows
print(ct_df.shape)
ct_df.head()

(37923, 12)


Unnamed: 0,id,unit,number,street,city,district,region,postcode,hash,geometry,incorporated_place,state
0,,,1,CIVIC CENTER PLZ,,,,,3d6b7182020b27b1,POINT (-72.67718 41.76845),Hartford,CT
1,,,1,CIVIC CENTER PLZ,,,,,7d7e3e812f9143de,POINT (-72.67722 41.76835),Hartford,CT
2,,,80,STATE HOUSE SQ,,,,,8e0c8d2fd7daf0b4,POINT (-72.67206 41.76637),Hartford,CT
3,,,1,CIVIC CENTER PLZ,,,,,4a96226ee988b5ff,POINT (-72.67696 41.76764),Hartford,CT
4,,,30,STATE HOUSE SQ,,,,,07ff199dc68a5d04,POINT (-72.67259 41.76732),Hartford,CT


In [31]:
# TODO:
# join ct_df on census data

In [33]:
# # get postcodes and county for each address
# ct_df = add_postcode_county(ct_df)

In [None]:
# Convert the Hartford dataframe to GeoJSON data.
ct_geojson = df_to_geojson(ct_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the Hartford read cell.
compress_geojson(ct_geojson, output_path)

Dover, Delaware

In [None]:
# Read the raw OpenAddresses export for Dover, DE into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
input_path = '../data/open_address/original/city_of_dover-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_dover-addresses-city.geojson.gz'

# read_data also sets incorporated_place='Dover' and state='DE' on every row
de_df = read_data(input_path, 'Dover', 'DE')

# show the (rows, columns) shape and preview the first rows
print(de_df.shape)
de_df.head()

In [None]:
# TODO:
# join de_df on census data

In [None]:
# get postcodes and county for each address
# de_df = add_postcode_county(de_df)

In [None]:
# Convert the Dover dataframe to GeoJSON data.
de_geojson = df_to_geojson(de_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the Dover read cell.
compress_geojson(de_geojson, output_path)

District of Columbia

In [None]:
# Read the raw OpenAddresses export for the District of Columbia into a
# dataframe. input_path is the downloaded GeoJSON; output_path is where the
# processed, gzip-compressed copy is saved.
input_path = '../data/open_address/original/dc_statewide-addresses-city.geojson'
output_path = '../data/open_address/processed/dc_statewide-addresses-city.geojson.gz'

# read_data also sets incorporated_place='Washington' and state='DC' on every row
dc_df = read_data(input_path, 'Washington', 'DC')

# show the (rows, columns) shape and preview the first rows
print(dc_df.shape)
dc_df.head()

In [None]:
# TODO:
# join dc_df on census data

In [None]:
# get postcodes and county for each address
# dc_df = add_postcode_county(dc_df)

In [None]:
# Convert the District of Columbia dataframe to GeoJSON data.
dc_geojson = df_to_geojson(dc_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the District of Columbia read cell.
compress_geojson(dc_geojson, output_path)

Portland, Maine

In [None]:
# Read the raw OpenAddresses export for Maine into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
# NOTE(review): this is the statewide Maine file, so the 'Portland' label
# below is applied to every address in it; a later cell narrows me_df to
# Portland postcodes in the Cumberland district, while output_path keeps the
# 'me_statewide' name -- confirm that naming is intended.
input_path = '../data/open_address/original/me_statewide-addresses-state.geojson'
output_path = '../data/open_address/processed/me_statewide-addresses-state.geojson.gz'

# read_data also sets incorporated_place='Portland' and state='ME' on every row
me_df = read_data(input_path, 'Portland', 'ME')

# show the (rows, columns) shape and preview the first rows
print(me_df.shape)
me_df.head()

In [None]:
# Narrow the Maine addresses down to Portland: first keep the rows whose
# postcode is one of the values listed below, then keep only those whose
# district is 'Cumberland'.
portland_postcodes = [
    '04019', '04050', '04101', '04102', '04103', '04104', '04107',
    '04108', '04109', '04112', '04122', '04123', '04124',
]
me_df = (
    me_df
    .loc[me_df['postcode'].isin(portland_postcodes)]
    .loc[lambda frame: frame['district'] == 'Cumberland']
)

In [None]:
# TODO:
# join me_df on census data

In [None]:
# get postcodes and county for each address
# me_df = add_postcode_county(me_df)

In [None]:
# Convert the Maine (Portland) dataframe to GeoJSON data.
me_geojson = df_to_geojson(me_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the Maine read and filter cells.
compress_geojson(me_geojson, output_path)

Baltimore, Maryland

In [None]:
# Read the raw OpenAddresses export for Baltimore, MD into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
input_path = '../data/open_address/original/city_of_baltimore-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_baltimore-addresses-city.geojson.gz'

# read_data also sets incorporated_place='Baltimore' and state='MD' on every row
md_df = read_data(input_path, 'Baltimore', 'MD')


# show the (rows, columns) shape and preview the first rows
print(md_df.shape)
md_df.head()

In [None]:
# TODO:
# join md_df on census data

In [None]:
# get postcodes and county for each address
# md_df = add_postcode_county(md_df)

In [None]:
# Convert the Baltimore dataframe to GeoJSON data.
md_geojson = df_to_geojson(md_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the Baltimore read cell.
compress_geojson(md_geojson, output_path)

Boston, Massachusetts

In [None]:
# Read the raw OpenAddresses export for Boston, MA into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
input_path = '../data/open_address/original/city_of_boston-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_boston-addresses-city.geojson.gz'

# read_data also sets incorporated_place='Boston' and state='MA' on every row
ma_df = read_data(input_path, 'Boston', 'MA')

# show the (rows, columns) shape and preview the first rows
print(ma_df.shape)
ma_df.head()

In [None]:
# TODO:
# join ma_df on census data

In [None]:
# get postcodes and county for each address
# ma_df = add_postcode_county(ma_df)

In [None]:
# Convert the Boston dataframe to GeoJSON data.
ma_geojson = df_to_geojson(ma_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the Boston read cell.
compress_geojson(ma_geojson, output_path)

Nashua, New Hampshire

In [None]:
# Read the raw OpenAddresses export for Nashua, NH into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
input_path = '../data/open_address/original/city_of_nashua-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_nashua-addresses-city.geojson.gz'

# read_data also sets incorporated_place='Nashua' and state='NH' on every row
nh_df = read_data(input_path, 'Nashua', 'NH')


# show the (rows, columns) shape and preview the first rows
print(nh_df.shape)
nh_df.head()

In [None]:
# TODO:
# join nh_df on census data

In [None]:
# get postcodes and county for each address
# nh_df = add_postcode_county(nh_df)

In [None]:
# Convert the Nashua dataframe to GeoJSON data.
nh_geojson = df_to_geojson(nh_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the Nashua read cell.
compress_geojson(nh_geojson, output_path)

Newark, New Jersey

In [None]:
# Read the raw OpenAddresses export for New Jersey into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
# NOTE(review): this is the statewide New Jersey file, so the 'Newark' label
# below is applied to every address in it; a later cell keeps only the rows
# whose city is 'NEWARK', while output_path keeps the 'nj_statewide' name --
# confirm that naming is intended.
input_path = '../data/open_address/original/nj_statewide-addresses-state.geojson'
output_path = '../data/open_address/processed/nj_statewide-addresses-state.geojson.gz'

# read_data also sets incorporated_place='Newark' and state='NJ' on every row
nj_df = read_data(input_path, 'Newark', 'NJ')

# show the (rows, columns) shape and preview the first rows
print(nj_df.shape)
nj_df.head()

In [None]:
# Keep only the rows whose 'city' value is exactly 'NEWARK' (the comparison is
# case-sensitive; postcode and district are not part of this filter).
nj_df = nj_df.loc[nj_df['city'] == 'NEWARK']

In [None]:
# TODO:
# join nj_df on census data

In [None]:
# get postcodes and county for each address
# nj_df = add_postcode_county(nj_df)

In [None]:
# Convert the New Jersey (Newark) dataframe to GeoJSON data.
nj_geojson = df_to_geojson(nj_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the New Jersey read and filter cells.
compress_geojson(nj_geojson, output_path)

New York City, New York

In [None]:
# Read the raw OpenAddresses export for New York City, NY into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
input_path = '../data/open_address/original/city_of_new_york-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_new_york-addresses-city.geojson.gz'

# read_data also sets incorporated_place='New York' and state='NY' on every row
ny_df  = read_data(input_path, 'New York', 'NY')


# show the (rows, columns) shape and preview the first rows
print(ny_df.shape)
ny_df.head()

In [None]:
# TODO:
# join ny_df on census data

In [None]:
# get postcodes and county for each address
# ny_df = add_postcode_county(ny_df)

In [None]:
# Convert the New York City dataframe to GeoJSON data.
ny_geojson = df_to_geojson(ny_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the New York City read cell.
compress_geojson(ny_geojson, output_path)

Philadelphia, Pennsylvania

In [None]:
# Read the raw OpenAddresses export for Philadelphia, PA into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
input_path = '../data/open_address/original/philadelphia-addresses-county.geojson'
output_path = '../data/open_address/processed/philadelphia-addresses-county.geojson.gz'

# read_data also sets incorporated_place='Philadelphia' and state='PA' on every row
pa_df = read_data(input_path, 'Philadelphia', 'PA')

# show the (rows, columns) shape and preview the first rows
print(pa_df.shape)
pa_df.head()

In [None]:
# TODO:
# join pa_df on census data

In [None]:
# get postcodes and county for each address
# pa_df = add_postcode_county(pa_df)

In [None]:
# Convert the Philadelphia dataframe to GeoJSON data.
pa_geojson = df_to_geojson(pa_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the Philadelphia read cell.
compress_geojson(pa_geojson, output_path)

Providence, Rhode Island

In [None]:
# Read the raw OpenAddresses export for Providence, RI into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
input_path = '../data/open_address/original/providence-addresses-city.geojson'
output_path = '../data/open_address/processed/providence-addresses-city.geojson.gz'

# read_data also sets incorporated_place='Providence' and state='RI' on every row
ri_df = read_data(input_path, 'Providence', 'RI')


# show the (rows, columns) shape and preview the first rows
print(ri_df.shape)
ri_df.head()

In [None]:
# TODO:
# join ri_df on census data

In [None]:
# get postcodes and county for each address
# ri_df = add_postcode_county(ri_df)

In [None]:
# Convert the Providence dataframe to GeoJSON data.
ri_geojson = df_to_geojson(ri_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the Providence read cell.
compress_geojson(ri_geojson, output_path)

Burlington, Vermont

In [None]:
# Read the raw OpenAddresses export for Burlington, VT into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
input_path = '../data/open_address/original/city_of_burlington-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_burlington-addresses-city.geojson.gz'

# read_data also sets incorporated_place='Burlington' and state='VT' on every row
vt_df = read_data(input_path, 'Burlington', 'VT')

# show the (rows, columns) shape and preview the first rows
print(vt_df.shape)
vt_df.head()

In [None]:
# TODO:
# join vt_df on census data

In [None]:
# get postcodes and county for each address
# vt_df = add_postcode_county(vt_df)

In [None]:
# Convert the Burlington dataframe to GeoJSON data.
vt_geojson = df_to_geojson(vt_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the Burlington read cell.
compress_geojson(vt_geojson, output_path)

In [None]:
# # note: no postcode values
# path = '../data/open_address/vt_df.csv.gz'
# vt_df = addresses_df(path, 'Burlington', 'VT', 'Burlington')
# vt_df.shape

Norfolk, Virginia

In [None]:
# Read the raw OpenAddresses export for Norfolk, VA into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
input_path = '../data/open_address/original/city_of_norfolk-addresses-city.geojson'
output_path = '../data/open_address/processed/city_of_norfolk-addresses-city.geojson.gz'

# read_data also sets incorporated_place='Norfolk' and state='VA' on every row
va_df = read_data(input_path, 'Norfolk', 'VA')

# show the (rows, columns) shape and preview the first rows
print(va_df.shape)
va_df.head()

In [None]:
# TODO:
# join va_df on census data

In [None]:
# get postcodes and county for each address
# va_df = add_postcode_county(va_df)

In [None]:
# Convert the Norfolk dataframe to GeoJSON data.
va_geojson = df_to_geojson(va_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the Norfolk read cell.
compress_geojson(va_geojson, output_path)

Charleston, West Virginia

In [None]:
# Read the raw OpenAddresses export for West Virginia into a dataframe.
# input_path is the downloaded GeoJSON; output_path is where the processed,
# gzip-compressed copy is saved.
# NOTE(review): this is the statewide West Virginia file, so the 'Charleston'
# label below is applied to every address in it; a later cell keeps only the
# rows whose city is 'Charleston', while output_path keeps the 'wv_statewide'
# name -- confirm that naming is intended.
input_path = '../data/open_address/original/wv_statewide-addresses-state.geojson'
output_path = '../data/open_address/processed/wv_statewide-addresses-state.geojson.gz'

# read_data also sets incorporated_place='Charleston' and state='WV' on every row
wv_df = read_data(input_path, 'Charleston', 'WV')

# show the (rows, columns) shape and preview the first rows
print(wv_df.shape)
wv_df.head()

In [None]:
# Keep only the rows whose 'city' value is exactly 'Charleston' (the
# comparison is case-sensitive).
wv_df = wv_df.loc[wv_df['city'] == 'Charleston']

In [None]:
# TODO:
# join wv_df on census data

In [None]:
# get postcodes and county for each address
# wv_df = add_postcode_county(wv_df)

In [None]:
# Convert the West Virginia (Charleston) dataframe to GeoJSON data.
wv_geojson = df_to_geojson(wv_df)
# Write it gzip-compressed to output_path. NOTE: output_path is a notebook
# global that every city's read cell reassigns, so run this cell right after
# the West Virginia read and filter cells.
compress_geojson(wv_geojson, output_path)