In [1]:
import os
import sys
import re
import csv

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

In [2]:
# list shp files recursively
os.chdir('..')
abs_path = os.getcwd()

HIFLD_path = os.path.join(abs_path, 'output/HIFLD/centroids')
shp_files = [os.path.join(root, name) \
             for root, dirs, files in os.walk(HIFLD_path) \
             for name in files \
             if name.endswith(('.shp'))]

tribal_WA_folder = os.path.join(abs_path, 'data/Tribal_Lands_WA')
tribal_WA_file = 'TribalLands.shp'
tribal_WA_path = os.path.join(tribal_WA_folder, tribal_WA_file)

tribal_priority_folder = os.path.join(abs_path, 'output/Tribal_Priority_2_5')
tribal_priority_file = 'Tribal_Priority_2_5_combined.shp'
tribal_priority_path = os.path.join(tribal_priority_folder, tribal_priority_file)

In [3]:
def read_shp(file, rows=100):
    """
    Read geoshapes file

    Parameters
    ----------
        rows (int): number of rows per file to read

    Returns
    -------
        df (GeoDataFrame)
    """
    df = gpd.read_file(file, rows=rows)

    return df

In [4]:
# read files
tribal_priority_shapes = gpd.read_file(tribal_priority_path)
# for all shp_files
centroids_dict = dict()
for file in shp_files:
    basename = os.path.basename(file).split('/')[0]
    fname = os.path.basename(basename).split('.')[0]

    # print(file)
    df = gpd.read_file(file)
    centroids_dict[fname] = df
    # df_address[fname]['Facility_type'] = fname

In [5]:
# number of observations per geoshape file 
for file in shp_files:
    with open(file, 'rb') as f:
        row_count = len(f.readlines()) - 1
        
        basename = os.path.basename(file).split('/')[0]
        fname = os.path.basename(basename).split('.')[0]
        
        print('There are {} observations in {}.'.format(row_count, fname))

There are 5510 observations in AllPlacesOfWorship.
There are 24277 observations in AllPlacesofWorship_Sample.
There are 5478 observations in CollegesUniversities.
There are 5524 observations in FDIC_Insured_Banks.
There are 12553 observations in FDIC_Insured_Banks_Sample.
There are 5497 observations in Fire_Stations.
There are 5497 observations in Hospitals.
There are 5506 observations in NCUA_Insured_Credit_Unions_V_2024_Q1.
There are 5507 observations in Pharmacies_.
There are 5536 observations in Prison_Boundaries.
There are 5531 observations in Private_Schools.
There are 5491 observations in PublicSchools.
There are 15249 observations in PublicSchools_Sample.
There are 5332 observations in UrgentCareFacs.


# Convert to EPSG:4326
* The centroids GeoDataFrames should be in CRS EPSG:4326 but we will convert again to make sure
* Also convert the tribal lands geoshape file to CRS EPSG:4236

In [6]:
def convert_EPSG4326(dict):
    """
    Convert each GeoDataFrame to 'EPSG:4326'
         
    Parameters
    ----------
        dict (dictionary): of GeoDataFrames

    Returns
    -------
        dict (dictionary): GeoDataFrames of 'EPSG:4326' CRS
    """
    for fname in dict:
        dict[fname] = dict[fname].to_crs("EPSG:4326")

    return dict

In [7]:
dict_EPSG4326 = convert_EPSG4326(centroids_dict)
tribal_priority_shapes = tribal_priority_shapes.to_crs("EPSG:4326")

# Spatial Joins
* Goal: determine what centroids fall within a tribal land geoshape for Washington state
* If 'NA' value is returned, it means that the centroid did not fall into any geoshape

In [8]:
def spatial_join(dict, gdf, how='left'):
    """
    Spatial join centroids to geoshape file

    Parameters
    ----------
        dict (dictionary): of GeoDataFrames
        gdf (GeoDataFrame): of land shapes
        how (parameter): sjoin parameter, default = 'left'

    Returns
    -------
        dict_sjoin (dictionary): GeoDataFrames of 'EPSG:4326' CRS
    """
    dict_sjoin = {}
    for fname in dict:
        df = dict[fname].sjoin(gdf, how=how)
        
        # if centroid in tribal polygon, label as 1
        df['Tribal'] = df.index_right.apply(lambda x: 0 if pd.isna(x) else 1) 
        dict_sjoin[fname] = df

    return dict_sjoin    

In [9]:
dict_sjoin = spatial_join(dict_EPSG4326, tribal_priority_shapes)

In [10]:
for fname in dict_sjoin:
    print(fname)

AllPlacesOfWorship
AllPlacesofWorship_Sample
CollegesUniversities
FDIC_Insured_Banks
FDIC_Insured_Banks_Sample
Fire_Stations
Hospitals
NCUA_Insured_Credit_Unions_V_2024_Q1
Pharmacies_
Prison_Boundaries
Private_Schools
PublicSchools
PublicSchools_Sample
UrgentCareFacs


## Confirm there are valid spatial joins

In [11]:
# we see that there are urgent care facilities in WA tribal lands
dict_sjoin['AllPlacesofWorship_Sample'].notna().sum()

Full_Addre     23135
Place_type     23135
source_lon     23135
source_lat     23135
geometry       23135
index_right     1097
OBJECTID        1097
GEOID_1         1097
NAME_1          1097
NAMELSAD_1      1097
Eligible_L      1097
Shape_Leng      1097
Shape_Le_1      1097
Shape_Area      1097
Tribal         23135
dtype: int64

In [12]:
# same goes for public schools in WA tribal lands
dict_sjoin['PublicSchools_Sample'].notna().sum()

Full_Addre     14484
Place_type     14484
source_lon     14484
source_lat     14484
geometry       14484
index_right     1213
OBJECTID        1213
GEOID_1         1213
NAME_1          1213
NAMELSAD_1      1213
Eligible_L      1213
Shape_Leng      1212
Shape_Le_1      1212
Shape_Area      1212
Tribal         14484
dtype: int64

In [13]:
# look at the 167 public schools that are in WA tribal lands
# we can see from the full address strings that they are indeed WA addresses
dict_sjoin['PublicSchools_Sample'].loc[dict_sjoin['PublicSchools_Sample'].OBJECTID.notna()]

Unnamed: 0,Full_Addre,Place_type,source_lon,source_lat,geometry,index_right,OBJECTID,GEOID_1,NAME_1,NAMELSAD_1,Eligible_L,Shape_Leng,Shape_Le_1,Shape_Area,Tribal
13,"101 MAIN STREET, PORCUPINE, SD 57772",PublicSchools,-102.329859,43.248148,POINT (-102.32986 43.24815),65.0,38.0,2810R,Pine Ridge,Pine Ridge Reservation,Yes,5.941682,5.941682,1.251797,1
20,"18001 BARANOFF ST, EAGLE RIVER, AK 99577",PublicSchools,-149.546730,61.314887,POINT (-149.54673 61.31489),282.0,638.0,0217140,Cook Inlet,Cook Inlet Alaska Native Regional Corporation,Yes,31.129071,31.129071,19.395772,1
22,"1415 GLACIER AVE - MARIE DRAKE, JUNEAU, AK 99801",PublicSchools,-134.426121,58.303662,POINT (-134.42612 58.30366),139.0,633.0,0267940,Sealaska,Sealaska Alaska Native Regional Corporation,Yes,33.190960,33.190960,19.810494,1
23,"10015 W REDINGTON SCHOOL RD, WASILLA, AK 99623",PublicSchools,-149.702835,61.501207,POINT (-149.70284 61.50121),282.0,638.0,0217140,Cook Inlet,Cook Inlet Alaska Native Regional Corporation,Yes,31.129071,31.129071,19.395772,1
23,"10015 W REDINGTON SCHOOL RD, WASILLA, AK 99623",PublicSchools,-149.702835,61.501207,POINT (-149.70284 61.50121),247.0,351.0,6785R,Knik,Knik ANVSA,Yes,15.884392,15.884392,3.248782,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14106,"9189 1/2 HWY 24, FORT YATES, ND 58538",PublicSchools,-100.676957,46.093675,POINT (-100.67696 46.09367),608.0,217.0,3970R,Standing Rock,Standing Rock Reservation,Yes,6.764861,6.764861,1.098482,1
14161,"715 BAYVIEW BLVD, KLAWOCK, AK 99925",PublicSchools,-133.099004,55.553616,POINT (-133.09900 55.55362),140.0,384.0,6765R,Klawock,Klawock ANVSA,Yes,0.112933,0.112933,0.000355,1
14161,"715 BAYVIEW BLVD, KLAWOCK, AK 99925",PublicSchools,-133.099004,55.553616,POINT (-133.09900 55.55362),139.0,633.0,0267940,Sealaska,Sealaska Alaska Native Regional Corporation,Yes,33.190960,33.190960,19.810494,1
14187,"1209 GUILBERT GODON STREET, BELCOURT, ND 58316",PublicSchools,-99.742824,48.841002,POINT (-99.74282 48.84100),607.0,230.0,4345R,Turtle Mountain,Turtle Mountain Reservation,Yes,0.699192,0.699192,0.022795,1


# Summary Statistics
* Total counts of centroids that fall within WA tribal lands

In [14]:
summary_tribal = []
for fname in dict_sjoin:
    tmp = dict()
    
    tmp['Dataset'] = fname
    tmp['not_tribal'] = dict_sjoin[fname].Tribal.value_counts().tolist()[0]
    tmp['tribal'] = dict_sjoin[fname].shape[0] - dict_sjoin[fname].Tribal.value_counts().tolist()[0]
    
    summary_tribal.append(tmp)

In [15]:
pd.DataFrame(summary_tribal)

Unnamed: 0,Dataset,not_tribal,tribal
0,AllPlacesOfWorship,4999,1
1,AllPlacesofWorship_Sample,22038,1097
2,CollegesUniversities,4920,83
3,FDIC_Insured_Banks,4788,215
4,FDIC_Insured_Banks_Sample,11554,163
5,Fire_Stations,4979,22
6,Hospitals,4818,199
7,NCUA_Insured_Credit_Unions_V_2024_Q1,4966,35
8,Pharmacies_,4927,100
9,Prison_Boundaries,4857,162


In [16]:
gdf = pd.concat([dict_sjoin[x] for x in dict_sjoin], ignore_index=True)
gdf

Unnamed: 0,Full_Addre,Place_type,source_lon,source_lat,geometry,index_right,OBJECTID,GEOID_1,NAME_1,NAMELSAD_1,Eligible_L,Shape_Leng,Shape_Le_1,Shape_Area,Tribal
0,"523 E BROADWAY, SOUTH BOSTON, MA 02127",AllPlacesOfWorship,-71.043522,42.335472,POINT (-71.04352 42.33547),,,,,,,,,,0
1,"454 ESSEX ST, LAWRENCE, MA 01840",AllPlacesOfWorship,-71.164940,42.706213,POINT (-71.16494 42.70621),,,,,,,,,,0
2,"569 BROADWAY, NEWARK, NJ 07104",AllPlacesOfWorship,-74.162821,40.769935,POINT (-74.16282 40.76993),,,,,,,,,,0
3,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",AllPlacesOfWorship,-78.747816,42.798535,POINT (-78.74782 42.79853),,,,,,,,,,0
4,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",AllPlacesOfWorship,-70.113777,44.428610,POINT (-70.11378 44.42861),,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104247,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",UrgentCareFacs,-84.095174,34.041727,POINT (-84.09517 34.04173),,,,,,,,,,0
104248,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",UrgentCareFacs,-84.101318,33.966797,POINT (-84.10132 33.96680),,,,,,,,,,0
104249,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",UrgentCareFacs,-83.902215,34.068832,POINT (-83.90222 34.06883),,,,,,,,,,0
104250,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",UrgentCareFacs,-84.494106,37.996508,POINT (-84.49411 37.99651),,,,,,,,,,0


In [17]:
gdf.to_csv(os.path.join(abs_path, 'output/sample_spatial_join.csv'), index=False)