In [1]:
import os
import sys
import csv

import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt

In [2]:
# list shp files recursively
os.chdir('..')
abs_path = os.getcwd()

HIFLD_path = os.path.join(abs_path, 'data/HIFLD')
shp_files = [os.path.join(root, name) \
             for root, dirs, files in os.walk(HIFLD_path) \
             for name in files \
             if name.endswith(('.shp'))]

In [3]:
# number of observations per geoshape file 
for file in shp_files:
    with open(file, 'rb') as f:
        row_count = len(f.readlines()) - 1
        
        basename = os.path.basename(file).split('/')[0]
        fname = os.path.basename(basename).split('.')[0]
        
        print('There are {} observations in {}.'.format(row_count, fname))

There are 267622 observations in AllPlacesOfWorship.
There are 93348 observations in FDIC_Insured_Banks.
There are 54785 observations in Fire_Stations.
There are 5056 observations in Prison_Boundaries.
There are 112783 observations in PublicSchools.
There are 5298 observations in UrgentCareFacs.


# Explore Geoshape Files
* Columns
* Data types
* CRS

In [4]:
def read_shp(file, rows=100):
    """
    Read geoshapes file

    Parameters
    ----------
        rows (int): number of rows per file to read

    Returns
    -------
        df (GeoDataFrame)
    """
    df = gpd.read_file(file, rows=rows)

    return df

In [5]:
# read all geoshape files into dictionary
dict_geo = {}
for file in shp_files:
    basename = os.path.basename(file).split('/')[0]
    fname = os.path.basename(basename).split('.')[0]

    # print(file)
    dict_geo[fname] = read_shp(file, rows=5000)

## Columns

In [6]:
dict_geo['FDIC_Insured_Banks'].columns

Index(['OBJECTID', 'ADDRESBR', 'BRNUM', 'BRSERTYP', 'CBSABR', 'CBSANAMB',
       'CITYBR', 'CNTRYNAB', 'CNTYNAMB', 'DEPSUMBR', 'GEOCODE_CE', 'NAMEBR',
       'STALPBR', 'STCNTYBR', 'STNAMEBR', 'UNINUMBR', 'ZIPBR', 'CERT',
       'ADDRESS', 'ASSET', 'BKCLASS', 'CITY', 'CNTRYNA', 'DENOVO', 'DEPDOM',
       'NAMEFULL', 'NAMEHCR', 'REGAGNT', 'REPDTE', 'RSSDID', 'STALP', 'STCNTY',
       'STNAME', 'ZIP', 'BKMO', 'LOC_NAME', 'STATUS', 'SCORE', 'x', 'y',
       'GeocodeSou', 'STD_ADDR_B', 'STD_ADDR', 'ZIP4BR', 'geometry'],
      dtype='object')

In [7]:
dict_geo['Prison_Boundaries'].columns

Index(['FID', 'FACILITYID', 'NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP', 'ZIP4',
       'TELEPHONE', 'TYPE', 'STATUS', 'POPULATION', 'COUNTY', 'COUNTYFIPS',
       'COUNTRY', 'NAICS_CODE', 'NAICS_DESC', 'SOURCE', 'SOURCEDATE',
       'VAL_METHOD', 'VAL_DATE', 'WEBSITE', 'SECURELVL', 'CAPACITY',
       'SHAPE_Leng', 'GlobalID', 'CreationDa', 'Creator', 'EditDate', 'Editor',
       'SHAPE_Le_1', 'SHAPE_Area', 'geometry'],
      dtype='object')

## CRS

In [8]:
dict_geo['FDIC_Insured_Banks'].crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [9]:
dict_geo['Prison_Boundaries'].crs

<Projected CRS: EPSG:3857>
Name: WGS 84 / Pseudo-Mercator
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: World between 85.06°S and 85.06°N.
- bounds: (-180.0, -85.06, 180.0, 85.06)
Coordinate Operation:
- name: Popular Visualisation Pseudo-Mercator
- method: Popular Visualisation Pseudo Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

# Full Address String
* Output full address string (Street, City, State ZIP) for each GeoDataFrame

In [10]:
def full_address(df):
    """
    Concatenate each address component to a full address string
         (Street, City, State ZIP)
         
    Parameters
    ----------
        df (GeoDataFrame)

    Returns
    -------
        df (GeoDataFrame): with 'full_address' column
    """
    if 'ZIP' in df.columns:
        zip_col = 'ZIP'
    else:
        zip_col = 'ZIPCODE'

    if 'STATE' in df.columns:
        state_col = 'STATE'
    else:
        state_col = 'STNAME'

    if 'ADDRESS' in df.columns:
        addr_col = 'ADDRESS'
    elif 'ADDRESS' not in df.columns:
        addr_col = 'STREET'
    elif ('STREET' not in df.columns) and ('ADDRESS' not in df.columns):
        addr_col = 'STD_ADDR_B'

    df['Full_Address'] = df[[addr_col, 'CITY', state_col]].fillna('NaN').agg(', '.join, axis=1) + ' ' + df[zip_col].astype('str')
             
    return df

In [11]:
dict_address = {}
for fname in dict_geo:
    dict_address[fname] = full_address(dict_geo[fname])

In [12]:
# check full address string
dict_address['FDIC_Insured_Banks']

Unnamed: 0,OBJECTID,ADDRESBR,BRNUM,BRSERTYP,CBSABR,CBSANAMB,CITYBR,CNTRYNAB,CNTYNAMB,DEPSUMBR,...,STATUS,SCORE,x,y,GeocodeSou,STD_ADDR_B,STD_ADDR,ZIP4BR,geometry,Full_Address
0,1,22 Main Street,7636,11,25540,"Hartford-West Hartford-East Hartford, CT",Southington,United States,Hartford,76878,...,M,100.0,-72.878648,41.601435,HSIP USA_ZIP4 Composite,22 MAIN ST,100 N TRYON ST,2501,POINT (-72.87865 41.60144),"100 North Tryon St, Charlotte, North Carolina ..."
1,2,One City Center,8203,11,38860,"Portland-South Portland, ME",Portland,United States,Cumberland,1037118,...,M,100.0,-70.256668,43.657432,HSIP USA_ZIP4 Composite,1 CITY CTR,100 N TRYON ST,6420,POINT (-70.25667 43.65743),"100 North Tryon St, Charlotte, North Carolina ..."
2,3,66 Main Street,28,11,0,<None>,Ellsworth,United States,Hancock,75152,...,M,100.0,-68.424621,44.541326,HSIP USA_ZIP4 Composite,66 MAIN ST,2 ELM ST,1970,POINT (-68.42462 44.54133),"2 Elm Street, Camden, Maine 04843"
3,4,1 Lincoln St. Fl 1,0,11,14460,"Boston-Cambridge-Newton, MA-NH",Boston,United States,Suffolk,112389655,...,M,100.0,-71.057959,42.352824,HSIP USA_ZIP4 Composite,1 LINCOLN ST FL 1,1 LINCOLN ST FL 1,2901,POINT (-71.05796 42.35282),"1 Lincoln St. Fl 1, Boston, Massachusetts 02111"
4,5,1414 Massachusetts Avenue,7332,11,14460,"Boston-Cambridge-Newton, MA-NH",Cambridge,United States,Middlesex,1415083,...,T,100.0,-71.118913,42.373832,HSIP USA_ZIP4 Composite,1414 MASSACHUSETTS AVE,100 N TRYON ST,3807,POINT (-71.11891 42.37383),"100 North Tryon St, Charlotte, North Carolina ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,305-7 East Main Street,624,11,47900,"Washington-Arlington-Alexandria, DC-VA-MD-WV",Front Royal,United States,Warren,16261,...,M,100.0,-78.190446,38.917555,HSIP USA_ZIP4 Composite,305-7 EAST MAIN ST,239 FAYETTEVILLE ST,0000,POINT (-78.19045 38.91756),"239 Fayetteville Street, Raleigh, North Caroli..."
4996,4997,410 Main Street,1014,11,0,<None>,South Boston,United States,Halifax,68520,...,M,100.0,-78.901473,36.697827,HSIP USA_ZIP4 Composite,410 MAIN ST,303 PEACHTREET ST NORTHEAST,3222,POINT (-78.90147 36.69783),"303 Peachtreet Street, Northeast, Atlanta, Geo..."
4997,4998,5758 Main Street,0,11,49220,"Wisconsin Rapids-Marshfield, WI",Auburndale,United States,Wood,47131,...,M,100.0,-90.007378,44.627107,HSIP USA_ZIP4 Composite,5758 MAIN ST,5758 MAIN ST,9047,POINT (-90.00738 44.62711),"5758 Main Street, Auburndale, Wisconsin 54412"
4998,4999,1255 Lincoln Street,10,11,0,<None>,Rhinelander,United States,Oneida,31898,...,M,100.0,-89.394191,45.633037,HSIP USA_ZIP4 Composite,1255 LINCOLN ST,1905 STEWART AVE,3623,POINT (-89.39419 45.63304),"1905 West Stewart Avenue, Wausau, Wisconsin 54401"


# Calculate Latitutde / Longitude
1. Convert each GeoDataFrame to 'EPSG:4326'
2. Calculate centroids from geometry in each GeoDataFrame
3. Extract latitude and longitude from each centroid

In [13]:
def convert_EPSG4326(dict):
    """
    Convert each GeoDataFrame to 'EPSG:4326'
         
    Parameters
    ----------
        dict (dictionary): of GeoDataFrames

    Returns
    -------
        dict (dictionary): GeoDataFrames of 'EPSG:4326' CRS
    """
    for fname in dict:
        dict[fname] = dict[fname].to_crs("EPSG:4326")

    return dict

In [14]:
def get_centroid(dict):
    """
    Convert each GeoDataFrame to 'EPSG:4326'
         
    Parameters
    ----------
        dict (dictionary): GeoDataFrames of 'EPSG:4326' CRS

    Returns
    -------
        dict (dictionary): GeoDataFrames with extracted centroids from geoshapes
    """
    dict_centroids = {}
    for fname in dict:
        if 'x' in dict[fname].columns:
            dict[fname] = dict[fname].rename(columns={'x': 'source_lon', 'y': 'source_lat'})
            dict[fname]['Place_type'] = os.path.basename(fname)
            dict[fname]['source_centroid'] = dict[fname]['geometry']
        else:
            dict[fname]['source_centroid'] = dict[fname]['geometry'].centroid
            dict[fname]['source_lon'] = dict[fname]['geometry'].centroid.x
            dict[fname]['source_lat'] = dict[fname]['geometry'].centroid.y
            dict[fname]['Place_type'] = os.path.basename(fname)

        dict_centroids[fname] = keep_columns(dict[fname])
        
    return dict_centroids

In [15]:
def keep_columns(df):
    """
    Convert each GeoDataFrame to 'EPSG:4326'
         
    Parameters
    ----------
        df (GeoDataFrame): GeoDataFrames of 'EPSG:4326' CRS with extracted centroids

    Returns
    -------
        new_df (GeoDataFrame): GeoDataFrames with only source centroid and full address columns
    """
    cols = ['Full_Address', 'Place_type', 'source_centroid', 'source_lon', 'source_lat']
    new_df = df[cols]

    return new_df

In [16]:
# if you get an error on the first pass, try re-running the cell
dict_EPSG4326 = convert_EPSG4326(dict_address)
dict_centroid = get_centroid(dict_EPSG4326)


  dict[fname]['source_centroid'] = dict[fname]['geometry'].centroid

  dict[fname]['source_lon'] = dict[fname]['geometry'].centroid.x

  dict[fname]['source_lat'] = dict[fname]['geometry'].centroid.y

  dict[fname]['source_centroid'] = dict[fname]['geometry'].centroid

  dict[fname]['source_lon'] = dict[fname]['geometry'].centroid.x

  dict[fname]['source_lat'] = dict[fname]['geometry'].centroid.y

  dict[fname]['source_centroid'] = dict[fname]['geometry'].centroid

  dict[fname]['source_lon'] = dict[fname]['geometry'].centroid.x

  dict[fname]['source_lat'] = dict[fname]['geometry'].centroid.y

  dict[fname]['source_centroid'] = dict[fname]['geometry'].centroid

  dict[fname]['source_lon'] = dict[fname]['geometry'].centroid.x

  dict[fname]['source_lat'] = dict[fname]['geometry'].centroid.y

  dict[fname]['source_centroid'] = dict[fname]['geometry'].centroid

  dict[fname]['source_lon'] = dict[fname]['geometry'].centroid.x

  dict[fname]['source_lat'] = dict[fname]['geometry'].centro

In [17]:
# check centroids
dict_centroid['FDIC_Insured_Banks']

Unnamed: 0,Full_Address,Place_type,source_centroid,source_lon,source_lat
0,"100 North Tryon St, Charlotte, North Carolina ...",FDIC_Insured_Banks,POINT (-72.87865 41.60144),-72.878648,41.601435
1,"100 North Tryon St, Charlotte, North Carolina ...",FDIC_Insured_Banks,POINT (-70.25667 43.65743),-70.256668,43.657432
2,"2 Elm Street, Camden, Maine 04843",FDIC_Insured_Banks,POINT (-68.42462 44.54133),-68.424621,44.541326
3,"1 Lincoln St. Fl 1, Boston, Massachusetts 02111",FDIC_Insured_Banks,POINT (-71.05796 42.35282),-71.057959,42.352824
4,"100 North Tryon St, Charlotte, North Carolina ...",FDIC_Insured_Banks,POINT (-71.11891 42.37383),-71.118913,42.373832
...,...,...,...,...,...
4995,"239 Fayetteville Street, Raleigh, North Caroli...",FDIC_Insured_Banks,POINT (-78.19045 38.91756),-78.190446,38.917555
4996,"303 Peachtreet Street, Northeast, Atlanta, Geo...",FDIC_Insured_Banks,POINT (-78.90147 36.69783),-78.901473,36.697827
4997,"5758 Main Street, Auburndale, Wisconsin 54412",FDIC_Insured_Banks,POINT (-90.00738 44.62711),-90.007378,44.627107
4998,"1905 West Stewart Avenue, Wausau, Wisconsin 54401",FDIC_Insured_Banks,POINT (-89.39419 45.63304),-89.394191,45.633037


# Save Geoshape Files

In [18]:
def save_shp(dict, save_dir):
    """
    Save each GeoDataFrame to individual geoshape files
         
    Parameters
    ----------
        dict (dictionary): GeoDataFrames of 'EPSG:4326' CRS with extracted centroids
        save_dir (str): path of desired output directory
    """
    for fname in dict:
        shp_file = dict[fname].set_geometry('source_centroid')
        
        save_path = os.path.join(save_dir, f"{fname}")
        create_dir(save_path)
        
        shp_file.to_file(save_path, driver='ESRI Shapefile')

In [19]:
def create_dir(save_dir):
    """
    Creates directory if it does not exist
         
    Parameters
    ----------
        save_dir (str): path of desired output directory
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

In [20]:
# save files
save_dir = os.path.join(abs_path, 'output/HIFLD/centroids')
create_dir(save_dir)

save_shp(dict_centroid, save_dir)

  shp_file.to_file(save_path, driver='ESRI Shapefile')
  shp_file.to_file(save_path, driver='ESRI Shapefile')
  shp_file.to_file(save_path, driver='ESRI Shapefile')
  shp_file.to_file(save_path, driver='ESRI Shapefile')
  shp_file.to_file(save_path, driver='ESRI Shapefile')
  shp_file.to_file(save_path, driver='ESRI Shapefile')
