### Store All Addresses in Cities of Interest
Addresses were downloaded from [OpenAddresses](https://batch.openaddresses.io/data#map=0/0/0). The addresses collected correspond to the following cities and states.
- Hartford, Connecticut
- Dover, Delaware
- District of Columbia
- Portland, Maine
- Baltimore, Maryland
- Boston, Massachusetts
- Nashua, New Hampshire
- Newark, New Jersey
- New York City, New York
- Philadelphia, Pennsylvania
- Providence, Rhode Island
- Burlington, Vermont
- Norfolk, Virginia
- Charleston, West Virginia


!pip install geopandas

In [15]:
# !pip install geopy

In [16]:
import geopandas
import json
import gzip
import pandas as pd
from pandas_geojson import to_geojson
from geopy.geocoders import Nominatim
import regex as re

In [17]:
# builds a GeoJSON structure from an address dataframe
def df_to_geojson(df):
    """Convert an address dataframe into GeoJSON via ``to_geojson``.

    The text form of every value in ``df['geometry']`` is parsed into a
    longitude and a latitude, which are stored as *string* columns ``'lon'``
    and ``'lat'`` on ``df`` itself (the caller's frame is modified in place).

    Args:
        df: Dataframe with a ``'geometry'`` column plus the address columns
            listed in ``property_columns`` below.

    Returns:
        Whatever ``to_geojson`` returns for the frame.
    """
    def _wkt_part(geom, position, paren):
        # Skip the first six characters of the text form ("POINT " for the
        # point geometries previewed in this notebook), split the remainder
        # on spaces and strip the leftover parenthesis.
        return str(geom)[6:].split(' ')[position].replace(paren, '')

    geometry = df['geometry']
    df['lon'] = geometry.apply(lambda geom: _wkt_part(geom, 0, '('))
    df['lat'] = geometry.apply(lambda geom: _wkt_part(geom, 1, ')'))

    # columns handed to to_geojson as the feature properties
    property_columns = [
        'hash', 'number', 'street', 'unit', 'city', 'district',
        'region', 'postcode', 'id', 'state', 'incorporated_place',
    ]
    return to_geojson(df=df, lat='lat', lon='lon', properties=property_columns)

In [18]:
# writes GeoJSON data to disk as gzip-compressed JSON text
def compress_geojson(gejson_data, output_file):
    """Serialize ``gejson_data`` as JSON into a gzip-compressed text file.

    Args:
        gejson_data: JSON-serializable object to store.
        output_file: Destination path handed to ``gzip.open``.

    The file is written as UTF-8 text and non-ASCII characters are stored
    as-is rather than as ``\\uXXXX`` escapes.
    """
    with gzip.open(output_file, mode='wt', encoding='utf-8') as gz_handle:
        json.dump(gejson_data, gz_handle, ensure_ascii=False)

In [19]:
# get postcode for an address
def get_postcode(address: str):
    """Geocode ``address`` with Nominatim and pull a postcode out of the result.

    Args:
        address: Free-form address string sent to the geocoder.

    Returns:
        The second digit group found in the result's ``display_name`` (the
        first one when there is only one), or ``None`` when the geocoder
        returns no result or the display name contains no digits.

    NOTE(review): taking the second digit group assumes the display name
    starts with the house number and that the postcode is the next number
    after it -- confirm against real geocoder output.
    """
    # initialize Nominatim API
    geolocator = Nominatim(user_agent="ne_isp")

    # look the address up; nothing to parse when there is no result
    location = geolocator.geocode(address)
    if location is None:
        return None

    address_info = location.raw["display_name"]
    # Digit groups, optionally joined by hyphens (e.g. "6-10", "04101-1234").
    # Requiring at least one digit keeps a bare "-" from a hyphenated place
    # name from being counted as a match and returned as the "postcode".
    postcode_matches = re.findall(r"[0-9]+(?:-[0-9]+)*", address_info)
    if not postcode_matches:
        # no digits at all: indexing the empty list used to raise IndexError
        return None
    if len(postcode_matches) > 1:
        return postcode_matches[1]
    return postcode_matches[0]

Hartford, Connecticut

In [36]:
# Hartford, CT: processed-output path stem and source GeoJSON
# (note: no postcode data)
output_path = '../data/open_address/processed/city_of_hartford-addresses-city'
input_path = '../data/open_address/original/city_of_hartford-addresses-city.geojson'

# load the addresses and set the city / place / state columns to constants
ct_df = geopandas.read_file(input_path).assign(
    city='Hartford',
    incorporated_place='Hartford',
    state='CT',
)


# print(ct_df.shape)
# ct_df.head()

      id    unit number            street      city district region postcode  \
0                     1  CIVIC CENTER PLZ  Hartford                            
1                     1  CIVIC CENTER PLZ  Hartford                            
2                    80    STATE HOUSE SQ  Hartford                            
3                     1  CIVIC CENTER PLZ  Hartford                            
4                    30    STATE HOUSE SQ  Hartford                            
...   ..     ...    ...               ...       ...      ...    ...      ...   
37918        301   1678          BROAD ST  Hartford                            
37919        302   1678          BROAD ST  Hartford                            
37920     CNDASC   1678          BROAD ST  Hartford                            
37921     CNDASC   1678          BROAD ST  Hartford                            
37922     CNDASC    152                    Hartford                            

                   hash                

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to ct_df
# in place, so the CSV written next includes them.
ct_geojson = df_to_geojson(ct_df)
# gzip-compressed CSV of the addresses, for use in later functions
ct_csv = ct_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(ct_geojson, f"{output_path}.geojson.gz")

Dover, Delaware

In [21]:
# Dover, DE: processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/city_of_dover-addresses-city'
input_path = '../data/open_address/original/city_of_dover-addresses-city.geojson'

# load the addresses and set the city / place / state columns to constants
de_df = geopandas.read_file(input_path).assign(
    city='Dover',
    incorporated_place='Dover',
    state='DE',
)

# print(de_df.shape)
# de_df.head()

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to de_df
# in place, so the CSV written next includes them.
de_geojson = df_to_geojson(de_df)
# gzip-compressed CSV of the addresses, for use in later functions
de_csv = de_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(de_geojson, f"{output_path}.geojson.gz")

District of Columbia

In [22]:
# District of Columbia: processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/dc_statewide-addresses-city'
input_path = '../data/open_address/original/dc_statewide-addresses-city.geojson'

# load the addresses and set the place / state columns to constants
# (the 'city' column is left as it comes from the source file)
dc_df = geopandas.read_file(input_path).assign(
    incorporated_place='Washington',
    state='DC',
)

# print(dc_df.shape)
# dc_df.head()

In [None]:
# # get postcodes for each address
# dc_df['postcode'] = dc_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to dc_df
# in place, so the CSV written next includes them.
dc_geojson = df_to_geojson(dc_df)
# gzip-compressed CSV of the addresses, for use in later functions
dc_csv = dc_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(dc_geojson, f"{output_path}.geojson.gz")

Portland, Maine

In [23]:
# Maine (statewide file): processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/me_statewide-addresses-state'
input_path = '../data/open_address/original/me_statewide-addresses-state.geojson'

# load the statewide addresses into a dataframe
me_df = geopandas.read_file(input_path)

# print(me_df.shape)
# me_df.head()

(738706, 10)


Unnamed: 0,hash,number,street,unit,city,district,region,postcode,id,geometry
0,71c7b7c56b09b22e,1026,E Pond Rd,,,Somerset,ME,4978,,POINT (-69.79555 44.65301)
1,5a92e4335ec43693,1031,E Pond Rd,,,Somerset,ME,4978,,POINT (-69.79567 44.65340)
2,449254049adc0dd5,1033,E Pond Rd,,,Somerset,ME,4978,,POINT (-69.79575 44.65352)
3,66917a4a325d90ca,1043,E Pond Rd,,,Somerset,ME,4978,,POINT (-69.79614 44.65410)
4,1ae72f93c14415f8,1075,E Pond Rd,,,Somerset,ME,4978,,POINT (-69.79735 44.65599)


In [24]:
# postcodes used to pick the Portland addresses out of the statewide Maine data
portland_postcodes = ['04019', '04050', '04101', '04102', '04103', '04104', '04107', '04108', '04109', '04112', '04122', '04123', '04124']

# Compare on zero-padded 5-character strings so the match still works if the
# postcode column lost its leading zero (the preview of this frame shows
# values such as 4978). The 'postcode' column itself is left untouched.
portland_mask = me_df['postcode'].astype(str).str.zfill(5).isin(portland_postcodes)
# keep rows that have a listed postcode AND sit in the Cumberland district;
# .copy() makes the result an independent frame, so the column assignments
# below do not write to a slice of the statewide data
me_df = me_df[portland_mask & (me_df['district'] == 'Cumberland')].copy()

# add missing column information
# (was 'Dover', left over from the Delaware cell; this section is Portland)
me_df['city'] = 'Portland'
me_df['incorporated_place'] = 'Portland'
me_df['state'] = 'ME'

In [None]:
# # get postcodes for each address
# me_df['postcode'] = me_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to me_df
# in place, so the CSV written next includes them.
me_geojson = df_to_geojson(me_df)
# gzip-compressed CSV of the addresses, for use in later functions
me_csv = me_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(me_geojson, f"{output_path}.geojson.gz")

Baltimore, Maryland

In [25]:
# Baltimore, MD: processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/city_of_baltimore-addresses-city'
input_path = '../data/open_address/original/city_of_baltimore-addresses-city.geojson'

# load the addresses and set the city / place / state columns to constants
md_df = geopandas.read_file(input_path).assign(
    city='Baltimore',
    incorporated_place='Baltimore',
    state='MD',
)

# print(md_df.shape)
# md_df.head()

In [None]:
# # get postcodes for each address
# md_df['postcode'] = md_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to md_df
# in place, so the CSV written next includes them.
md_geojson = df_to_geojson(md_df)
# gzip-compressed CSV of the addresses, for use in later functions
md_csv = md_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(md_geojson, f"{output_path}.geojson.gz")

Boston, Massachusetts

In [26]:
# Boston, MA: processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/city_of_boston-addresses-city'
input_path = '../data/open_address/original/city_of_boston-addresses-city.geojson'

# load the addresses and set the city / place / state columns to constants
ma_df = geopandas.read_file(input_path).assign(
    city='Boston',
    incorporated_place='Boston',
    state='MA',
)

# report the frame's dimensions and preview its first rows
print(ma_df.shape)
ma_df.head()

(323574, 12)


Unnamed: 0,hash,number,street,unit,city,district,region,postcode,id,geometry,incorporated_place,state
0,6db8d40a96a73ac7,6-10,A Street,,Boston,,,2136,,POINT (-71.12504 42.25062),Boston,MA
1,96b30bef0eef7d86,7,A Street,,Boston,,,2136,,POINT (-71.12540 42.25046),Boston,MA
2,a23223056b6593a0,10,A Street,,Boston,,,2127,,POINT (-71.05680 42.34088),Boston,MA
3,d878c704ce81bba1,172-174,A Street,,Boston,,,2210,,POINT (-71.05315 42.34484),Boston,MA
4,222c7a044af11e7a,176-178,A Street,,Boston,,,2210,,POINT (-71.05306 42.34496),Boston,MA


In [None]:
# # get postcodes for each address
# ma_df['postcode'] = ma_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to ma_df
# in place, so the CSV written next includes them.
ma_geojson = df_to_geojson(ma_df)
# gzip-compressed CSV of the addresses, for use in later functions
ma_csv = ma_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(ma_geojson, f"{output_path}.geojson.gz")

Nashua, New Hampshire

In [27]:
# Nashua, NH: processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/city_of_nashua-addresses-city'
input_path = '../data/open_address/original/city_of_nashua-addresses-city.geojson'

# load the addresses and set the place / state columns to constants
# (the 'city' column is left as it comes from the source file)
nh_df = geopandas.read_file(input_path).assign(
    incorporated_place='Nashua',
    state='NH',
)

# print(nh_df.shape)
# nh_df.head()

In [None]:
# # get postcodes for each address
# nh_df['postcode'] = nh_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to nh_df
# in place, so the CSV written next includes them.
nh_geojson = df_to_geojson(nh_df)
# gzip-compressed CSV of the addresses, for use in later functions
nh_csv = nh_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(nh_geojson, f"{output_path}.geojson.gz")

Newark, New Jersey

In [28]:
# New Jersey (statewide file): processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/nj_statewide-addresses-state'
input_path = '../data/open_address/original/nj_statewide-addresses-state.geojson'

# load the statewide addresses into a dataframe
nj_df = geopandas.read_file(input_path)

# print(nj_df.shape)
# nj_df.head()

DriverError: ../data/open_address/original/nj_statewide-addresses-state.geojson: No such file or directory

In [None]:
# keep only the rows whose 'city' value is exactly 'NEWARK'
# (no postcode or district filter is applied here);
# .copy() makes the result an independent frame, so the column assignments
# below do not write to a slice of the statewide data
nj_df = nj_df[nj_df['city'] == 'NEWARK'].copy()

# add missing column information
nj_df['incorporated_place'] = 'Newark'
nj_df['state'] = 'NJ'

In [None]:
# # get postcodes for each address
# nj_df['postcode'] = nj_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to nj_df
# in place, so the CSV written next includes them.
nj_geojson = df_to_geojson(nj_df)
# gzip-compressed CSV of the addresses, for use in later functions
nj_csv = nj_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(nj_geojson, f"{output_path}.geojson.gz")

New York City, New York

In [29]:
# New York City, NY: processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/city_of_new_york-addresses-city'
input_path = '../data/open_address/original/city_of_new_york-addresses-city.geojson'

# load the addresses and set the city / place / state columns to constants
ny_df = geopandas.read_file(input_path).assign(
    city='New York',
    incorporated_place='New York',
    state='NY',
)

# print(ny_df.shape)
# ny_df.head()

(967381, 12)


Unnamed: 0,hash,number,street,unit,city,district,region,postcode,id,geometry,incorporated_place,state
0,b48fa74626b88b0d,54,MACON ST,,New York,,,11216,3066687,POINT (-73.94891 40.68102),New York,NY
1,725c1c60ee5577b7,438,GATES AVE,,New York,,,11216,3064205,POINT (-73.94867 40.68630),New York,NY
2,67ea895ba357a0e5,442,GREENE AVE,,New York,,,11216,3063204,POINT (-73.95303 40.68805),New York,NY
3,7b0492ccc02c7520,411,TOMPKINS AVE,,New York,,,11221,3065757,POINT (-73.94380 40.68348),New York,NY
4,410c0c2df82022d3,290,HALSEY ST,,New York,,,11216,3066531,POINT (-73.94227 40.68252),New York,NY


In [None]:
# # get postcodes for each address
# ny_df['postcode'] = ny_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to ny_df
# in place, so the CSV written next includes them.
ny_geojson = df_to_geojson(ny_df)
# gzip-compressed CSV of the addresses, for use in later functions
ny_csv = ny_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(ny_geojson, f"{output_path}.geojson.gz")

Philadelphia, Pennsylvania

In [30]:
# Philadelphia, PA: processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/philadelphia-addresses-county'
input_path = '../data/open_address/original/philadelphia-addresses-county.geojson'

# load the addresses and set the city / place / state columns to constants
pa_df = geopandas.read_file(input_path).assign(
    city='Philadelphia',
    incorporated_place='Philadelphia',
    state='PA',
)

# print(pa_df.shape)
# pa_df.head()

In [None]:
# # get postcodes for each address
# pa_df['postcode'] = pa_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to pa_df
# in place, so the CSV written next includes them.
pa_geojson = df_to_geojson(pa_df)
# gzip-compressed CSV of the addresses, for use in later functions
pa_csv = pa_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(pa_geojson, f"{output_path}.geojson.gz")

Providence, Rhode Island

In [31]:
# Providence, RI: processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/providence-addresses-city'
input_path = '../data/open_address/original/providence-addresses-city.geojson'

# load the addresses and set the city / place / state columns to constants
ri_df = geopandas.read_file(input_path).assign(
    city='Providence',
    incorporated_place='Providence',
    state='RI',
)

# print(ri_df.shape)
# ri_df.head()

In [None]:
# # get postcodes for each address
# ri_df['postcode'] = ri_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to ri_df
# in place, so the CSV written next includes them.
ri_geojson = df_to_geojson(ri_df)
# gzip-compressed CSV of the addresses, for use in later functions
ri_csv = ri_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(ri_geojson, f"{output_path}.geojson.gz")

Burlington, Vermont

In [32]:
# Burlington, VT: processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/city_of_burlington-addresses-city'
input_path = '../data/open_address/original/city_of_burlington-addresses-city.geojson'

# load the addresses and set the city / place / state columns to constants
vt_df = geopandas.read_file(input_path).assign(
    city='Burlington',
    incorporated_place='Burlington',
    state='VT',
)

# print(vt_df.shape)
# vt_df.head()

In [None]:
# # get postcodes for each address
# vt_df['postcode'] = vt_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to vt_df
# in place, so the CSV written next includes them.
vt_geojson = df_to_geojson(vt_df)
# gzip-compressed CSV of the addresses, for use in later functions
vt_csv = vt_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(vt_geojson, f"{output_path}.geojson.gz")

In [None]:
# # note: no postcode values
# path = '../data/open_address/vt_df.csv.gz'
# vt_df = addresses_df(path, 'Burlington', 'VT', 'Burlington')
# vt_df.shape

Norfolk, Virginia

In [33]:
# Norfolk, VA: processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/city_of_norfolk-addresses-city'
input_path = '../data/open_address/original/city_of_norfolk-addresses-city.geojson'

# load the addresses and set the city / place / state columns to constants
va_df = geopandas.read_file(input_path).assign(
    city='Norfolk',
    incorporated_place='Norfolk',
    state='VA',
)

# print(va_df.shape)
# va_df.head()

In [None]:
# # get postcodes for each address
# va_df['postcode'] = va_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to va_df
# in place, so the CSV written next includes them.
va_geojson = df_to_geojson(va_df)
# gzip-compressed CSV of the addresses, for use in later functions
va_csv = va_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(va_geojson, f"{output_path}.geojson.gz")

Charleston, West Virginia

In [34]:
# West Virginia (statewide file): processed-output path stem and source GeoJSON
output_path = '../data/open_address/processed/wv_statewide-addresses-state'
input_path = '../data/open_address/original/wv_statewide-addresses-state.geojson'

# load the statewide addresses into a dataframe
wv_df = geopandas.read_file(input_path)

# print(wv_df.shape)
# wv_df.head()

In [35]:
# keep only the rows whose 'city' value is exactly 'Charleston';
# .copy() makes the result an independent frame, so the column assignments
# below do not write to a slice of the statewide data
wv_df = wv_df[wv_df['city'] == 'Charleston'].copy()
# add missing column information
wv_df['incorporated_place'] = 'Charleston'
wv_df['state'] = 'WV'

In [None]:
# # get postcodes for each address
# wv_df['postcode'] = wv_df.apply(lambda x: get_postcode(x.number+' '+x.street+', '+x.city+', '+x.state+', USA'), axis=1)

In [None]:
# Build the GeoJSON first: df_to_geojson adds 'lon'/'lat' columns to wv_df
# in place, so the CSV written next includes them.
wv_geojson = df_to_geojson(wv_df)
# gzip-compressed CSV of the addresses, for use in later functions
wv_csv = wv_df.to_csv(f"{output_path}.csv.gz", index=False)
# gzip-compressed GeoJSON next to the CSV
compress_geojson(wv_geojson, f"{output_path}.geojson.gz")