In [1]:
import csv
import os
import re
import sys

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shapely.wkt

In [2]:
# Build the absolute paths of every input and output file used in this notebook.
# The project root is the parent of the current working directory. It is
# computed without calling os.chdir, so re-running this cell always yields the
# same paths instead of climbing one directory higher on each execution.
abs_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# RUCA code workbook
RUCA_folder = os.path.join(abs_path, 'data')
RUCA_file = 'ruca2010revised.xlsx'
RUCA_path = os.path.join(RUCA_folder, RUCA_file)

# CSV that is read into OMOP_df further down
output_folder = os.path.join(abs_path, 'output')
OMOP_file = 'sample_spatial_join_OMOP_clean.csv'
OMOP_path = os.path.join(output_folder, OMOP_file)

# Census tract shapefile
tract_folder = os.path.join(abs_path, 'data', 'cb_2019_us_tract_500k')
tract_file = 'cb_2019_us_tract_500k.shp'
tract_path = os.path.join(tract_folder, tract_file)

# RUCA Dataset
* 2010 Revised

In [3]:
# Read the 'Data' sheet of the RUCA workbook; header=1 takes the column names
# from the second row of the sheet. The tract FIPS column's long original
# header is shortened to the name used as the merge key further down.
RUCA_df = pd.read_excel(
    RUCA_path,
    sheet_name='Data',
    engine="openpyxl",
    header=1,
).rename(
    columns={
        'State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)':
            'State-County-Tract FIPS Code',
    }
)
RUCA_df

Unnamed: 0,State-County FIPS Code,Select State,Select County,State-County-Tract FIPS Code,Primary RUCA Code 2010,"Secondary RUCA Code, 2010 (see errata)","Tract Population, 2010","Land Area (square miles), 2010","Population Density (per square mile), 2010"
0,1001,AL,Autauga County,1001020100,1,1.0,1912,3.787641,504.799727
1,1001,AL,Autauga County,1001020200,1,1.0,2170,1.289776,1682.462370
2,1001,AL,Autauga County,1001020300,1,1.0,3373,2.065366,1633.124331
3,1001,AL,Autauga County,1001020400,1,1.0,4386,2.464376,1779.760676
4,1001,AL,Autauga County,1001020500,1,1.0,10766,4.400686,2446.436531
...,...,...,...,...,...,...,...,...,...
73997,72153,PR,Yauco Municipio,72153750501,1,1.0,4960,0.693337,7153.810353
73998,72153,PR,Yauco Municipio,72153750502,1,1.0,2092,0.263530,7938.381391
73999,72153,PR,Yauco Municipio,72153750503,1,1.0,2357,1.280621,1840.512771
74000,72153,PR,Yauco Municipio,72153750601,1,1.0,5315,4.251936,1250.018953


# TIGER/Line Shapefiles
* 2019 Cartographic Boundary census tracts (`cb_2019_us_tract_500k`)

In [4]:
# Read the census tract shapefile at tract_path with geopandas and display it.
tract_shapes = gpd.read_file(tract_path)
tract_shapes

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,34,005,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036..."
1,39,153,504200,1400000US39153504200,39153504200,5042,CT,808151,0,"POLYGON ((-81.51807 41.05957, -81.51805 41.062..."
2,36,071,000502,1400000US36071000502,36071000502,5.02,CT,967431,969216,"POLYGON ((-74.02226 41.49281, -74.02180 41.496..."
3,06,013,370000,1400000US06013370000,06013370000,3700,CT,999356,0,"POLYGON ((-122.32697 37.93502, -122.32610 37.9..."
4,48,113,019204,1400000US48113019204,48113019204,192.04,CT,2851229,9966,"POLYGON ((-96.76902 32.94655, -96.76899 32.950..."
...,...,...,...,...,...,...,...,...,...,...
73863,48,245,011303,1400000US48245011303,48245011303,113.03,CT,155358411,14272444,"POLYGON ((-94.28353 29.88008, -94.26230 29.910..."
73864,01,073,012302,1400000US01073012302,01073012302,123.02,CT,168942340,1269965,"POLYGON ((-87.16295 33.49312, -87.16185 33.495..."
73865,72,113,072400,1400000US72113072400,72113072400,724,CT,11230767,184996,"POLYGON ((-66.58227 18.03935, -66.58177 18.039..."
73866,06,081,608023,1400000US06081608023,06081608023,6080.23,CT,805954,119028,"POLYGON ((-122.27898 37.54565, -122.27590 37.5..."


In [5]:
# List the column names of the tract table.
tract_shapes.columns

Index(['STATEFP', 'COUNTYFP', 'TRACTCE', 'AFFGEOID', 'GEOID', 'NAME', 'LSAD',
       'ALAND', 'AWATER', 'geometry'],
      dtype='object')

In [6]:
# Build the tract identifier by concatenating the state, county and tract code
# strings column-wise (vectorised, rather than joining them row by row), then
# cast it to int64 so it can be matched against the RUCA key, which is
# displayed as an integer. The cast drops leading zeros,
# e.g. '06013370000' becomes 6013370000.
tract_shapes['State-County-Tract FIPS Code'] = (
    tract_shapes['STATEFP'] + tract_shapes['COUNTYFP'] + tract_shapes['TRACTCE']
).astype(np.int64)
tract_shapes

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry,State-County-Tract FIPS Code
0,34,005,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",34005702808
1,39,153,504200,1400000US39153504200,39153504200,5042,CT,808151,0,"POLYGON ((-81.51807 41.05957, -81.51805 41.062...",39153504200
2,36,071,000502,1400000US36071000502,36071000502,5.02,CT,967431,969216,"POLYGON ((-74.02226 41.49281, -74.02180 41.496...",36071000502
3,06,013,370000,1400000US06013370000,06013370000,3700,CT,999356,0,"POLYGON ((-122.32697 37.93502, -122.32610 37.9...",6013370000
4,48,113,019204,1400000US48113019204,48113019204,192.04,CT,2851229,9966,"POLYGON ((-96.76902 32.94655, -96.76899 32.950...",48113019204
...,...,...,...,...,...,...,...,...,...,...,...
73863,48,245,011303,1400000US48245011303,48245011303,113.03,CT,155358411,14272444,"POLYGON ((-94.28353 29.88008, -94.26230 29.910...",48245011303
73864,01,073,012302,1400000US01073012302,01073012302,123.02,CT,168942340,1269965,"POLYGON ((-87.16295 33.49312, -87.16185 33.495...",1073012302
73865,72,113,072400,1400000US72113072400,72113072400,724,CT,11230767,184996,"POLYGON ((-66.58227 18.03935, -66.58177 18.039...",72113072400
73866,06,081,608023,1400000US06081608023,06081608023,6080.23,CT,805954,119028,"POLYGON ((-122.27898 37.54565, -122.27590 37.5...",6081608023


# Merge RUCA to Census Tract

In [7]:
# Left join: every RUCA row is kept and the tract attributes and geometry are
# attached wherever the FIPS key matches. RUCA rows without a matching tract
# get missing values in the tract columns.
RUCA_tracts = pd.merge(
    RUCA_df,
    tract_shapes,
    how='left',
    on='State-County-Tract FIPS Code',
)
RUCA_tracts

Unnamed: 0,State-County FIPS Code,Select State,Select County,State-County-Tract FIPS Code,Primary RUCA Code 2010,"Secondary RUCA Code, 2010 (see errata)","Tract Population, 2010","Land Area (square miles), 2010","Population Density (per square mile), 2010",STATEFP,COUNTYFP,TRACTCE,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,1001,AL,Autauga County,1001020100,1,1.0,1912,3.787641,504.799727,01,001,020100,1400000US01001020100,01001020100,201,CT,9817813.0,28435.0,"POLYGON ((-86.50916 32.47344, -86.50620 32.475..."
1,1001,AL,Autauga County,1001020200,1,1.0,2170,1.289776,1682.462370,01,001,020200,1400000US01001020200,01001020200,202,CT,3325680.0,5669.0,"POLYGON ((-86.48093 32.48154, -86.47945 32.485..."
2,1001,AL,Autauga County,1001020300,1,1.0,3373,2.065366,1633.124331,01,001,020300,1400000US01001020300,01001020300,203,CT,5349273.0,9054.0,"POLYGON ((-86.47087 32.47573, -86.46964 32.478..."
3,1001,AL,Autauga County,1001020400,1,1.0,4386,2.464376,1779.760676,01,001,020400,1400000US01001020400,01001020400,204,CT,6384276.0,8408.0,"POLYGON ((-86.45394 32.49318, -86.45308 32.493..."
4,1001,AL,Autauga County,1001020500,1,1.0,10766,4.400686,2446.436531,01,001,020500,1400000US01001020500,01001020500,205,CT,11408866.0,43534.0,"POLYGON ((-86.43816 32.45069, -86.43773 32.451..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73997,72153,PR,Yauco Municipio,72153750501,1,1.0,4960,0.693337,7153.810353,72,153,750501,1400000US72153750501,72153750501,7505.01,CT,1820185.0,0.0,"POLYGON ((-66.87736 18.03493, -66.87529 18.034..."
73998,72153,PR,Yauco Municipio,72153750502,1,1.0,2092,0.263530,7938.381391,72,153,750502,1400000US72153750502,72153750502,7505.02,CT,689930.0,0.0,"POLYGON ((-66.86686 18.02211, -66.86401 18.024..."
73999,72153,PR,Yauco Municipio,72153750503,1,1.0,2357,1.280621,1840.512771,72,153,750503,1400000US72153750503,72153750503,7505.03,CT,3298433.0,1952.0,"POLYGON ((-66.88788 18.03583, -66.88326 18.036..."
74000,72153,PR,Yauco Municipio,72153750601,1,1.0,5315,4.251936,1250.018953,72,153,750601,1400000US72153750601,72153750601,7506.01,CT,10987037.0,4527.0,"POLYGON ((-66.86634 18.01929, -66.85396 18.025..."


In [8]:
def create_dir(save_dir: str) -> None:
    """
    Creates directory (including missing parents) if it does not exist

    Calling it for a directory that already exists is a no-op.

    Parameters
    ----------
        save_dir (str): path of desired output directory

    Raises
    ------
        FileExistsError: if save_dir exists but is not a directory
    """
    # exist_ok=True replaces the separate exists() check, so there is no window
    # in which another process can create the directory between check and create.
    os.makedirs(save_dir, exist_ok=True)

In [9]:
# Write the merged RUCA/tract table to <abs_path>/output/RUCA_tracts as an
# ESRI Shapefile. set_geometry marks 'geometry' as the active geometry column.
# NOTE(review): this write emitted a warning when the notebook was last run
# (see the cell output) -- worth checking what it reports.
shp_file = RUCA_tracts.set_geometry('geometry')

save_dir = os.path.join(abs_path, 'output')
save_path = os.path.join(save_dir, 'RUCA_tracts')
create_dir(save_path)

shp_file.to_file(filename=save_path, driver='ESRI Shapefile')

  shp_file.to_file(save_path, driver='ESRI Shapefile')


In [54]:
# Wrap the merged table as a GeoDataFrame on its 'geometry' column and
# reproject it to EPSG:4326 in one chained expression.
RUCA_gdf = gpd.GeoDataFrame(RUCA_tracts, geometry='geometry').to_crs("EPSG:4326")

# Spatial Join OMOP to RUCA Tracts

In [42]:
# Load the CSV at OMOP_path and keep only the rows that have a geometry value;
# the cell displays the shape of the filtered table.
# (shapely.wkt, used by the following cells, is imported in the top import cell.)
OMOP_df = pd.read_csv(OMOP_path)
OMOP_df = OMOP_df.loc[OMOP_df.geometry.notna()]
OMOP_df.shape

  OMOP_df = pd.read_csv(OMOP_path)


(104097, 21)

In [43]:
# Preview only: cast the geometry column to str and parse each value as WKT
# with shapely. The parsed result is displayed but not stored.
OMOP_df['geometry'].astype('str').apply(shapely.wkt.loads)

0         POINT (-71.04352199999346 42.33547200002667)
1         POINT (-71.16493999969576 42.70621300000711)
2         POINT (-74.16282099988595 40.76993499987205)
3         POINT (-78.74781599997843 42.79853499984358)
4         POINT (-70.11377699970232 44.42860999977688)
                              ...                     
104097    POINT (-84.09517421962246 34.04172694900986)
104098    POINT (-84.10131848955332 33.96679709299809)
104099    POINT (-83.90221543184028 34.06883234729177)
104100    POINT (-84.49410603994914 37.99650802456039)
104101    POINT (-81.35019228948175 40.86915505731037)
Name: geometry, Length: 104097, dtype: object

In [50]:
# Parse the WKT text in the geometry column into shapely objects and build a
# GeoDataFrame whose coordinate reference system is declared as EPSG:4326.
OMOP_gdf = gpd.GeoDataFrame(
    OMOP_df,
    geometry=OMOP_df['geometry'].apply(shapely.wkt.loads),
    crs="EPSG:4326",
)

In [59]:
# Drop the 'index_right' column carried in by the loaded table before the
# spatial join below, which adds an 'index_right' column of its own.
# errors='ignore' keeps this cell safe to re-run once the column is gone
# (previously a second run raised KeyError).
OMOP_gdf = OMOP_gdf.drop(columns=['index_right'], errors='ignore')

In [60]:
# Spatially join each OMOP point to the RUCA tract table. how='left' keeps
# every point, including those that match no tract.
OMOP_RUCA = gpd.sjoin(OMOP_gdf, RUCA_gdf, how='left')
OMOP_RUCA

Unnamed: 0,Full_Addre,Place_type,source_lon,source_lat,geometry,OBJECTID,GEOID_1,NAME_1,NAMELSAD_1,Eligible_L,...,"Population Density (per square mile), 2010",STATEFP,COUNTYFP,TRACTCE,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER
0,"523 E BROADWAY, SOUTH BOSTON, MA 02127",AllPlacesOfWorship,-71.043522,42.335472,POINT (-71.04352 42.33547),,,,,,...,31810.671141,25,025,060400,1400000US25025060400,25025060400,604,CT,399279.0,0.0
1,"454 ESSEX ST, LAWRENCE, MA 01840",AllPlacesOfWorship,-71.164940,42.706213,POINT (-71.16494 42.70621),,,,,,...,7077.781854,25,009,250100,1400000US25009250100,25009250100,2501,CT,852256.0,198604.0
2,"569 BROADWAY, NEWARK, NJ 07104",AllPlacesOfWorship,-74.162821,40.769935,POINT (-74.16282 40.76993),,,,,,...,10667.291472,34,013,009600,1400000US34013009600,34013009600,96,CT,1089628.0,139136.0
3,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",AllPlacesOfWorship,-78.747816,42.798535,POINT (-78.74782 42.79853),,,,,,...,798.869054,36,029,013702,1400000US36029013702,36029013702,137.02,CT,18369144.0,0.0
4,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",AllPlacesOfWorship,-70.113777,44.428610,POINT (-70.11378 44.42861),,,,,,...,161.930309,23,001,044000,1400000US23001044000,23001044000,440,CT,50966670.0,1924400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104097,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",UrgentCareFacs,-84.095174,34.041727,POINT (-84.09517 34.04173),,,,,,...,1236.558511,13,135,050210,1400000US13135050210,13135050210,502.10,CT,12738728.0,96904.0
104098,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",UrgentCareFacs,-84.101318,33.966797,POINT (-84.10132 33.96680),,,,,,...,991.926210,13,135,050215,1400000US13135050215,13135050215,502.15,CT,7288214.0,27626.0
104099,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",UrgentCareFacs,-83.902215,34.068832,POINT (-83.90222 34.06883),,,,,,...,1178.524050,13,135,050610,1400000US13135050610,13135050610,506.10,CT,32703063.0,360953.0
104100,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",UrgentCareFacs,-84.494106,37.996508,POINT (-84.49411 37.99651),,,,,,...,4060.929442,21,067,002302,1400000US21067002302,21067002302,23.02,CT,3315849.0,74.0


In [61]:
# List the column names of the spatially joined table.
OMOP_RUCA.columns

Index(['Full_Addre', 'Place_type', 'source_lon', 'source_lat', 'geometry',
       'OBJECTID', 'GEOID_1', 'NAME_1', 'NAMELSAD_1', 'Eligible_L',
       'Shape_Leng', 'Shape_Le_1', 'Shape_Area', 'Tribal', 'address_1',
       'address_2', 'city', 'state', 'zip', 'STATE', 'index_right',
       'State-County FIPS Code', 'Select State', 'Select County',
       'State-County-Tract FIPS Code', 'Primary RUCA Code 2010',
       'Secondary RUCA Code, 2010 (see errata)', 'Tract Population, 2010',
       'Land Area (square miles), 2010',
       'Population Density (per square mile), 2010', 'STATEFP', 'COUNTYFP',
       'TRACTCE', 'AFFGEOID', 'GEOID', 'NAME', 'LSAD', 'ALAND', 'AWATER'],
      dtype='object')

In [62]:
# Write the joined OMOP/RUCA table to <abs_path>/output/OMOP_RUCA as an ESRI
# Shapefile. set_geometry marks 'geometry' as the active geometry column.
# NOTE(review): this write emitted a warning when the notebook was last run
# (see the cell output) -- worth checking what it reports.
shp_file = OMOP_RUCA.set_geometry('geometry')

save_dir = os.path.join(abs_path, 'output')
save_path = os.path.join(save_dir, 'OMOP_RUCA')
create_dir(save_path)

shp_file.to_file(filename=save_path, driver='ESRI Shapefile')

  shp_file.to_file(save_path, driver='ESRI Shapefile')


# RUCA and Tribal Summary

In [67]:
# Count how many joined rows fall under each primary RUCA code, shown as a table.
OMOP_RUCA['Primary RUCA Code 2010'].value_counts().to_frame()

Unnamed: 0_level_0,count
Primary RUCA Code 2010,Unnamed: 1_level_1
1.0,63703
4.0,10436
10.0,9975
2.0,7687
7.0,7143
5.0,2686
8.0,1079
3.0,585
6.0,333
9.0,248


In [69]:
# Flag each joined row as rural-and-tribal (Rural_Tribal = 1) or not (0).
# In the USDA RUCA scheme primary codes 1-3 are the metropolitan classes, so
# this tuple lists the codes that are NOT rural; a row counts as rural when its
# primary code falls outside it.
metro_codes = (1.0, 2.0, 3.0)
omop_ruca_lst = []
for _, row in OMOP_RUCA.iterrows():
    # Rows whose Tribal value is neither 0 nor 1 (e.g. missing) are left out of
    # the result; this matches the earlier behaviour but is now explicit.
    if row.Tribal != 0 and row.Tribal != 1:
        continue

    # NOTE(review): a missing RUCA code (point matched no tract) is not in
    # metro_codes and is therefore treated as rural -- confirm this is intended.
    is_rural = row['Primary RUCA Code 2010'] not in metro_codes

    # Only the rural + tribal combination is flagged 1; every other one is 0.
    row['Rural_Tribal'] = 1 if (is_rural and row.Tribal == 1) else 0
    # The next cell assembles these per-row Series into a single table.
    omop_ruca_lst.append(row)

In [73]:
# Place the collected row Series side by side as columns, then flip the result
# so that each Series becomes one row of the final table.
omop_ruca_tribal = pd.concat(omop_ruca_lst, axis='columns').T

In [74]:
# Tabulate how many rows carry each Rural_Tribal flag value.
omop_ruca_tribal['Rural_Tribal'].value_counts().to_frame()

Unnamed: 0_level_0,count
Rural_Tribal,Unnamed: 1_level_1
0,100972
1,3125
