In [1]:
import os
import sys
import csv

import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# list shp files recursively
os.chdir('..')
abs_path = os.getcwd()

HIFLD_path = os.path.join(abs_path, 'data/HIFLD')
shp_files = [os.path.join(root, name) \
             for root, dirs, files in os.walk(HIFLD_path) \
             for name in files \
             if name.endswith(('.shp'))]

In [3]:
def full_address(df):
    """
    Concatenate each address component to a full address string
         (Street, City, State ZIP)
         
    Parameters
    ----------
        df (GeoDataFrame)

    Returns
    -------
        df (GeoDataFrame): with 'full_address' column
    """
    if 'ZIP' in df.columns:
        zip_col = 'ZIP'
    else:
        zip_col = 'ZIPCODE'

    if 'STATE' in df.columns:
        state_col = 'STATE'
    else:
        state_col = 'STNAME'

    if 'ADDRESS' in df.columns:
        addr_col = 'ADDRESS'
    elif 'ADDRESS' not in df.columns:
        addr_col = 'STREET'
    elif ('STREET' not in df.columns) and ('ADDRESS' not in df.columns):
        addr_col = 'STD_ADDR_B'

    # add 0 to zip code if only 4 digits
    df['zip'] = df[zip_col].astype('str')
    df['zip'] = df['zip'].apply(lambda x: '0' + x if len(x) <5 else x)
    
    df[addr_col] = df[addr_col].astype('str')
    df['CITY'] = df['CITY'].astype('str')
    df[state_col] = df[state_col].astype('str')
    
    df['Full_Address'] = df[[addr_col, 'CITY', state_col]].fillna('NaN').agg(', '.join, axis=1) + ' ' + df['zip']
    df['Full_Address'] = df['Full_Address'].astype('str')
             
    return df

In [4]:
def keep_columns(df):
    """
    Convert each GeoDataFrame to 'EPSG:4326'
         
    Parameters
    ----------
        df (GeoDataFrame): GeoDataFrames of 'EPSG:4326' CRS with extracted centroids

    Returns
    -------
        new_df (GeoDataFrame): GeoDataFrames with only source centroid and full address columns
    """
    cols = ['Full_Address', 'Place_type', 'source_centroid', 'source_lon', 'source_lat']
    new_df = df[cols]

    return new_df

In [5]:
allplacesofworship = gpd.read_file(shp_files[0], rows=slice(52000,53000))
allplacesofworship.columns

Index(['OBJECTID', 'EIN', 'NAME', 'STREET', 'CITY', 'STATE', 'ZIP',
       'AFFILIATIO', 'RULING', 'FOUNDATION', 'ACTIVITY', 'ORGANIZATI',
       'TAX_PERIOD', 'ACCT_PD', 'NTEE_CD', 'SORT_NAME', 'LOC_NAME',
       'GEOCODED_S', 'SCORE', 'MATCH_TYPE', 'MATCH_ADDR', 'ADDR_TYPE',
       'ADDNUM', 'SIDE', 'STPREDIR', 'STPRETYPE', 'STNAME', 'STTYPE', 'STDIR',
       'STADDR', 'CITY_2', 'SUBREGION', 'REGION', 'REGIONABBR', 'POSTAL',
       'COUNTRY', 'LANGCODE', 'DISTANCE', 'X', 'Y', 'DISPLAYX', 'DISPLAYY',
       'XMIN', 'XMAX', 'YMIN', 'YMAX', 'ADDNUMFROM', 'ADDNUMTO', 'RANK',
       'ARC_ADDRES', 'ARC_CITY', 'ARC_REGION', 'ARC_POSTAL', 'geometry'],
      dtype='object')

In [6]:
def load2gdf_slices(in_fc, use_fields=None, chunk_size=1000):
    i = 0
    out_gdf = gpd.GeoDataFrame()
    gdf = []
    f_geom = 'geometry'
    
    #import pdb; pdb.set_trace()
    while True:
        chunk = slice(i, i + chunk_size, 1)
        gdf_iter = gpd.read_file(in_fc, rows=chunk)
    
        if gdf_iter.shape[0] == 0:
            break
        else:
            if use_fields:
                if f_geom not in use_fields:
                    #use_fields.append('geometry')
                    pass
                gdf_iter = gdf_iter[use_fields]
    
            gdf.append(gdf_iter)
            i += chunk_size
        gdf_concat = pd.concat(gdf)
        out_gdf = gpd.GeoDataFrame(gdf_concat)
    return out_gdf

# All Places of Worship Sample

In [7]:
%%time

out_gdf = load2gdf_slices(shp_files[0], use_fields=['STATE', 'NAME', 'STREET', 'CITY', 'geometry', 'ZIP'], chunk_size=1000)

CPU times: total: 3min 19s
Wall time: 5min 15s


In [8]:
states = ['AK', 'NM', 'AZ', 'UT', 'WY', 'CO', 'NV', 'ND', 'SD', 'NE', 'MT', 'ID', 'OR']

allplacesofworship_states = out_gdf[out_gdf.STATE.isin(states)]
allplacesofworship_states

Unnamed: 0,STATE,NAME,STREET,CITY,geometry,ZIP
194,SD,RIVER OF LIFE COMMUNITY CHURCH,PO BOX 156,CANTON,POINT (-10752229.057 5358265.198),57013
213,ND,ABUNDANT FAITH MINISTRIES,PO BOX 181,LANGDON,POINT (-10950872.454 6234753.691),58249
262,SD,LIVING WATER CHRISTIAN FELLOWSHIP,413 W 15TH ST,YANKTON,POINT (-10842292.536 5294238.058),57078
317,SD,MAIN STREET LIVING INC,1400 S DULUTH AVE,SIOUX FALLS,POINT (-10768354.464 5393393.579),57105
399,NE,TEMPLO FUENTE DE VIDA,1020 E 8TH ST,SCOTTSBLUFF,POINT (-11538117.277 5139260.369),69361
...,...,...,...,...,...,...
709,OR,MT OLIVET BAPTIST CHURCH,8725 N CHAUTAUQUA BLVD,PORTLAND,POINT (-13659745.433 5714168.005),97217
717,NM,LIGHT OF THE LIVING RAINBOW CHURCH,PO BOX 90843,ALBUQUERQUE,POINT (-11863972.915 4185723.432),87199
719,AZ,LIVING FAITH CHRISTIAN CENTER,PO BOX 873,SNOWFLAKE,POINT (-12254284.208 4097221.050),85937
728,AZ,EARTH VISION FOUNDATION INC,PO BOX 3341,SEDONA,POINT (-12445181.884 4145357.494),86340


In [9]:
allplacesofworship_address = full_address(allplacesofworship_states)

allplacesofworship_states_EPSG4326 = allplacesofworship_address.to_crs("EPSG:4326")
allplacesofworship_states_EPSG4326['source_centroid'] = allplacesofworship_states_EPSG4326['geometry'].centroid
allplacesofworship_states_EPSG4326['source_lon'] = allplacesofworship_states_EPSG4326['geometry'].centroid.x
allplacesofworship_states_EPSG4326['source_lat'] = allplacesofworship_states_EPSG4326['geometry'].centroid.y
allplacesofworship_states_EPSG4326['Place_type'] = 'AllPlacesOfWorship'

allplacesofworship_states_EPSG4326 = keep_columns(allplacesofworship_states_EPSG4326)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [10]:
def create_dir(save_dir):
    """
    Creates directory if it does not exist
         
    Parameters
    ----------
        save_dir (str): path of desired output directory
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

In [11]:
shp_file = allplacesofworship_states_EPSG4326.set_geometry('source_centroid')

save_dir = os.path.join(abs_path, 'output/HIFLD/centroids')
save_path = os.path.join(save_dir, 'AllPlacesofWorship_Sample')
create_dir(save_path)

shp_file.to_file(save_path, driver='ESRI Shapefile')

  shp_file.to_file(save_path, driver='ESRI Shapefile')


# Public Schools Sample

In [15]:
gpd.read_file(shp_files[6], rows=10).columns

Index(['OBJECTID', 'NCESID', 'NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP', 'ZIP4',
       'TELEPHONE', 'TYPE', 'STATUS', 'POPULATION', 'COUNTY', 'COUNTYFIPS',
       'COUNTRY', 'LATITUDE', 'LONGITUDE', 'NAICS_CODE', 'NAICS_DESC',
       'SOURCE', 'SOURCEDATE', 'VAL_METHOD', 'VAL_DATE', 'WEBSITE', 'LEVEL_',
       'ENROLLMENT', 'ST_GRADE', 'END_GRADE', 'DISTRICTID', 'FT_TEACHER',
       'SHELTER_ID', 'geometry'],
      dtype='object')

In [12]:
%%time

out_gdf = load2gdf_slices(shp_files[6], use_fields=['STATE', 'NAME', 'ADDRESS', 'CITY', 'geometry', 'ZIP'], chunk_size=1000)

CPU times: total: 44.4 s
Wall time: 1min 8s


In [13]:
states = ['AK', 'NM', 'AZ', 'UT', 'WY', 'CO', 'NV', 'ND', 'SD', 'NE', 'MT', 'ID', 'OR']

publicschools_states = out_gdf[out_gdf.STATE.isin(states)]
publicschools_states

Unnamed: 0,STATE,NAME,ADDRESS,CITY,geometry,ZIP
3,AZ,DR. GARY AND ANNETTE AUXIER ELEMENTARY SCHOOL,22700 S POWER RD,QUEEN CREEK,POINT (-12432974.033 3927587.870),85142
4,AZ,DR. CAMILLE CASTEEL HIGH SCHOOL,24901 S POWER RD,QUEEN CREEK,POINT (-12432580.229 3924830.599),85142
5,AZ,BRIDGES ELEMENTARY SCHOOL,5205 S SOBOBA ST,GILBERT,POINT (-12435059.846 3929308.122),85298
6,AZ,LAS BRISAS ACADEMY,18211 W LAS BRISAS DR,GOODYEAR,POINT (-12518034.063 3949789.464),85338
7,AZ,MANSON MESA HIGH SCHOOL,500 SOUTH NAVAJO,PAGE,POINT (-12407477.267 4426920.575),86040
...,...,...,...,...,...,...
289,SD,VIRTUAL ACADEMY - 90,201 E 38TH ST,SIOUX FALLS,POINT (-10767317.419 5391010.343),57105
290,SD,BEN REIFEL MIDDLE SCHOOL - 68,6700 E 41ST ST,SIOUX FALLS,POINT (-10758172.775 5390921.640),57110
291,SD,GARFIELD EDUCATION CENTER - 14,216 10TH ST SE,WATERTOWN,POINT (-10809059.301 5604875.725),57201
292,SD,PLATTE-GEDDES CYBER MIDDLE SCHOOL - 93,400 S ILLINOIS AVE,PLATTE,POINT (-11002815.772 5371021.071),57369


In [14]:
publicschools_address = full_address(publicschools_states)

publicschools_address_EPSG4326 = publicschools_address.to_crs("EPSG:4326")
publicschools_address_EPSG4326['source_centroid'] = publicschools_address_EPSG4326['geometry'].centroid
publicschools_address_EPSG4326['source_lon'] = publicschools_address_EPSG4326['geometry'].centroid.x
publicschools_address_EPSG4326['source_lat'] = publicschools_address_EPSG4326['geometry'].centroid.y
publicschools_address_EPSG4326['Place_type'] = 'PublicSchools'

publicschools_address_EPSG4326 = keep_columns(publicschools_address_EPSG4326)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [15]:
shp_file = publicschools_address_EPSG4326.set_geometry('source_centroid')

save_dir = os.path.join(abs_path, 'output/HIFLD/centroids')
save_path = os.path.join(save_dir, 'PublicSchools_Sample')
create_dir(save_path)

shp_file.to_file(save_path, driver='ESRI Shapefile')

  shp_file.to_file(save_path, driver='ESRI Shapefile')


# FDIC Insured Banks Sample

In [16]:
%%time

out_gdf = load2gdf_slices(shp_files[2], use_fields=['STNAME', 'STD_ADDR_B', 'CITY', 'geometry', 'ZIP'], chunk_size=1000)

CPU times: total: 52.3 s
Wall time: 1min 19s


In [17]:
states = ['Alaska', 'New Mexico', 'Arizona', 'Utah', 'Wyoming', 'Colorado', 'Nevada', 'North Dakota', 'South Dakota', 'Nebraska', 'Montana', 'Idaho', 'Oregon']

FDIC_Insured_Banks_states = out_gdf[out_gdf.STNAME.isin(states)]
FDIC_Insured_Banks_states

Unnamed: 0,STNAME,STD_ADDR_B,CITY,geometry,ZIP
31,South Dakota,22769 LAKESHORE BLVD,Sioux Falls,POINT (-86.32616 31.00276),57104
65,South Dakota,200 W BASE ST,Sioux Falls,POINT (-83.41673 30.46948),57104
66,South Dakota,1601 MAIN ST,Sioux Falls,POINT (-81.42171 30.33902),57104
70,South Dakota,130 N RIDGEWOOD AVE,Sioux Falls,POINT (-81.02357 29.21204),57104
71,South Dakota,200 W MAIN ST,Sioux Falls,POINT (-83.58330 30.11796),57104
...,...,...,...,...,...
998,Utah,105 S 500 W,Logan,POINT (-111.89220 40.88831),84321
999,Utah,30 E MAIN ST,Logan,POINT (-111.85432 41.63400),84321
27,South Dakota,247 E 86TH ST,Sioux Falls,POINT (-73.91820 40.64675),57104
48,Nebraska,5931 S 58TH ST,Cook,POINT (-96.64162 40.75183),68329


In [18]:
# FDIC_Insured_Banks_address = full_address(FDIC_Insured_Banks_states)
FDIC_Insured_Banks_states['ZIP'] = FDIC_Insured_Banks_states['ZIP'].astype('str')
FDIC_Insured_Banks_states['ZIP'] = FDIC_Insured_Banks_states['ZIP'].apply(lambda x: '0' + x if len(x) < 5 else x)

FDIC_Insured_Banks_states['STD_ADDR_B'] = FDIC_Insured_Banks_states['STD_ADDR_B'].astype('str')
FDIC_Insured_Banks_states['CITY'] = FDIC_Insured_Banks_states['CITY'].astype('str')
FDIC_Insured_Banks_states['STNAME'] = FDIC_Insured_Banks_states['STNAME'].astype('str')

FDIC_Insured_Banks_states['Full_Address'] = FDIC_Insured_Banks_states[['STD_ADDR_B', 'CITY', 'STNAME']].fillna('NaN').agg(', '.join, axis=1) + ' ' + FDIC_Insured_Banks_states['ZIP']
FDIC_Insured_Banks_states['Full_Address'] = FDIC_Insured_Banks_states['Full_Address'].astype('str')

FDIC_Insured_Banks_address = FDIC_Insured_Banks_states.to_crs("EPSG:4326")
FDIC_Insured_Banks_address['source_centroid'] = FDIC_Insured_Banks_address['geometry'].centroid
FDIC_Insured_Banks_address['source_lon'] = FDIC_Insured_Banks_address['geometry'].centroid.x
FDIC_Insured_Banks_address['source_lat'] = FDIC_Insured_Banks_address['geometry'].centroid.y
FDIC_Insured_Banks_address['Place_type'] = 'FDIC_Insured_Banks'

FDIC_Insured_Banks_address_address_EPSG4326 = keep_columns(publicschools_address_EPSG4326)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [19]:
shp_file = FDIC_Insured_Banks_address_address_EPSG4326.set_geometry('source_centroid')

save_dir = os.path.join(abs_path, 'output/HIFLD/centroids')
save_path = os.path.join(save_dir, 'FDIC_Insured_Banks_Sample')
create_dir(save_path)

shp_file.to_file(save_path, driver='ESRI Shapefile')

  shp_file.to_file(save_path, driver='ESRI Shapefile')
