## Preliminaries



Prepare

In [1]:
%%bash
rm -rf counties && rm -rf states

rm -rf *.log

rm -rf *.zip && rm -rf *.csv

wget -q https://github.com/miscellane/cartographs/raw/develop/cartographs.zip

<br>

Unzip

In [2]:
%%bash
rm -rf cartographs
unzip -u -q cartographs.zip
rm -r cartographs.zip

<br>

### Packages

In [3]:
!pip install geopandas &> geopandas.log

In [4]:
!pip install dask[complete] &> dask.log

<br>

### Libraries

In [5]:
import pandas as pd
import numpy as np
import os
import logging
import dask

<br>

### Logging

In [6]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

<br>
<br>

## Custom Classes

In [7]:
import cartographs.boundaries.us.boundaries
import cartographs.boundaries.us.settings

<br>
<br>

### Boundaries

Important notes courtesy of Wikipedia

* **Shannon County**: Shannon County has been renamed Oglala Lakota County (Shannon County until May 2015)
* **Wade Hampton County**: Wade Hampton County/Census Area has been renamed Kusilvak Census Area.  The Governor of Alaska formally notified the U.S. Census Bureau in July 2015.

In [8]:
settings = cartographs.boundaries.us.settings.Settings()
boundaries = cartographs.boundaries.us.boundaries.Boundaries(settings.crs)

<br>

Counties

In [9]:
counties = boundaries.counties(year=settings.latest)
counties.rename(columns={'GEOID': 'COUNTYGEOID', 'NAME': 'COUNTY'}, inplace=True)
counties.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,COUNTYGEOID,COUNTY,LSAD,ALAND,AWATER,geometry
0,21,7,516850,0500000US21007,21007,Ballard,6,639387454,69473325,"POLYGON ((-89.18137 37.04630, -89.17938 37.053..."
1,21,17,516855,0500000US21017,21017,Bourbon,6,750439351,4829777,"POLYGON ((-84.44266 38.28324, -84.44114 38.283..."
2,21,31,516862,0500000US21031,21031,Butler,6,1103571974,13943044,"POLYGON ((-86.94486 37.07341, -86.94346 37.074..."
3,21,65,516879,0500000US21065,21065,Estill,6,655509930,6516335,"POLYGON ((-84.12662 37.64540, -84.12483 37.646..."
4,21,69,516881,0500000US21069,21069,Fleming,6,902727151,7182793,"POLYGON ((-83.98428 38.44549, -83.98246 38.450..."


<br>

State

In [10]:
states = boundaries.states(year=settings.latest)
states.rename(columns={'GEOID': 'STATEGEOID', 'NAME': 'STATE'}, inplace=True)
states.head()

Unnamed: 0,STATEFP,STATENS,AFFGEOID,STATEGEOID,STUSPS,STATE,LSAD,ALAND,AWATER,geometry
0,28,1779790,0400000US28,28,MS,Mississippi,0,121533519481,3926919758,"MULTIPOLYGON (((-88.50297 30.21523, -88.49176 ..."
1,37,1027616,0400000US37,37,NC,North Carolina,0,125923656064,13466071395,"MULTIPOLYGON (((-75.72681 35.93584, -75.71827 ..."
2,40,1102857,0400000US40,40,OK,Oklahoma,0,177662925723,3374587997,"POLYGON ((-103.00257 36.52659, -103.00219 36.6..."
3,51,1779803,0400000US51,51,VA,Virginia,0,102257717110,8528531774,"MULTIPOLYGON (((-75.74241 37.80835, -75.74151 ..."
4,54,1779805,0400000US54,54,WV,West Virginia,0,62266474513,489028543,"POLYGON ((-82.64320 38.16909, -82.64300 38.169..."


<br>

Tracts

In [11]:
computations = [dask.delayed(boundaries.tracts)(stategeoid, settings.latest) for stategeoid in states.STATEGEOID.values]
dask.visualize(computations, filename='tracts', format='pdf')

matrix = dask.compute(computations, scheduler='processes')[0]

tracts = pd.concat(matrix, axis=0, ignore_index=True)
tracts.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry,COUNTYGEOID
0,28,75,300,1400000US28075000300,28075000300,3.0,CT,2082376,0,"POLYGON ((-88.71853 32.37918, -88.71708 32.379...",28075
1,28,75,600,1400000US28075000600,28075000600,6.0,CT,3638314,0,"POLYGON ((-88.72393 32.35019, -88.72332 32.352...",28075
2,28,77,960100,1400000US28077960100,28077960100,9601.0,CT,416065599,5340857,"POLYGON ((-90.12452 31.65834, -90.12392 31.660...",28077
3,28,81,950301,1400000US28081950301,28081950301,9503.01,CT,62665855,122061,"POLYGON ((-88.67838 34.28574, -88.67665 34.286...",28081
4,28,85,950600,1400000US28085950600,28085950600,9506.0,CT,18741571,82001,"POLYGON ((-90.47957 31.56210, -90.47946 31.564...",28085


<br>

Gazetteer

In [12]:
gazetteer = counties[['STATEFP', 'COUNTYGEOID', 'COUNTY']].merge(states[['STATEFP', 'STUSPS', 'STATE']], how='left', on='STATEFP')
gazetteer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3233 entries, 0 to 3232
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   STATEFP      3233 non-null   object
 1   COUNTYGEOID  3233 non-null   object
 2   COUNTY       3233 non-null   object
 3   STUSPS       3233 non-null   object
 4   STATE        3233 non-null   object
dtypes: object(5)
memory usage: 151.5+ KB


<br>
<br>

### Anomalies

* `zip(['Shannon', 'Wade Hampton'], ['46113', '02270'], ['Ogala Lakota', 'Kusilvak'], ['46102', '02158'])`

In [13]:
obsolete =  {0: {'COUNTY': 'Shannon', 'COUNTYGEOID': '46113', 'COUNTY_': 'Ogala Lakota', 'COUNTYGEOID_': '46102'},
                1: {'COUNTY': 'Wade Hampton', 'COUNTYGEOID': '02270', 'COUNTY_': 'Kusilvak', 'COUNTYGEOID_': '02158'}}

In [14]:
class Anomalies:

    def __init__(self, targets: dict):

        self.targets = targets

    def tractseries(self, blob: pd.DataFrame):

        for k, v in self.targets.items():
            condition = blob['Tract'].str.startswith(v['COUNTYGEOID'])
            blob.loc[condition, 'Tract'] = blob[condition]['Tract'].str.replace(v['COUNTYGEOID'], v['COUNTYGEOID_'], regex=True)

        return blob


    def countyseries(self, blob: pd.DataFrame):

        for k, v in self.targets.items():
            condition = (blob['COUNTY'] == v['COUNTY']) & (blob['COUNTYGEOID'] == v['COUNTYGEOID'])
            blob.loc[condition, ['COUNTY', 'COUNTYGEOID']] = [v['COUNTY_'], v['COUNTYGEOID_']]
            blob.loc[condition, 'Tract'] = blob[condition]['Tract'].str.replace(v['COUNTYGEOID'], v['COUNTYGEOID_'], regex=True)

        return blob

In [15]:
anomalies = Anomalies(targets=obsolete)

<br>
<br>

## Risks



In [16]:
urlstrings = ['https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_cancerrisk_by_tract_srcgrp.xlsx',
              'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_cancerrisk_by_tract_poll.xlsx',
              'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_resphi_by_tract_srcgrp.xlsx',
              'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_resphi_by_tract_poll.xlsx',
              'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_neurhi_by_tract_srcgrp.xlsx', 
              'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_neurhi_by_tract_poll.xlsx', 
              'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_liverhi_by_tract_srcgrp.xlsx', 
              'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_liverhi_by_tract_poll.xlsx', 
              'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_kidnhi_by_tract_srcgrp.xlsx', 
              'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_kidnhi_by_tract_poll.xlsx', 
              'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_immuhi_by_tract_srcgrp.xlsx', 
              'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_immuhi_by_tract_poll.xlsx']

In [17]:
names = ['cancerRiskByGroup', 'cancerRiskByPollutant', 'respRiskByGroup', 'respRiskByPollutant', 
         'neurologicalRiskByGroup', 'neurologicalRiskByPollutant', 'liverRiskByGroup', 'liverRiskByPollutant',
         'kidneyRiskByGroup', 'kidneyRiskByPollutant', 'immunologicalRiskByGroup', 'immunologicalRiskByPollutant']

<br>

### Risk Data

In [18]:

for urlstring, name in zip(urlstrings, names):

    # Reading-in
    data = pd.read_excel(io=urlstring, header=0, sheet_name=0, dtype={'FIPS': str, 'Tract': str})
    data.rename(columns={'State': 'STUSPS', 'County': 'COUNTY', 'FIPS': 'COUNTYGEOID'}, inplace=True)


    # The counties that do not exist
    # indices = ~data.COUNTYGEOID.isin(gazetteer.COUNTYGEOID.values) & ~data.COUNTY.str.contains('Entire State')
    

    # Anomalies
    data = anomalies.countyseries(blob=data.copy())
    

    # Focus on valid county level records
    readings = data.merge(gazetteer[['STUSPS', 'COUNTYGEOID', 'STATE']], how='inner', on=['STUSPS', 'COUNTYGEOID'])
    readings.reset_index(drop=True, inplace=True)

    readings = readings[readings['Tract'].str.endswith('000000')]
    readings.reset_index(drop=True, inplace=True)

    readings = readings.drop(columns=['EPA Region', 'Tract', 'COUNTY', 'STATE'], inplace=False)


    # Melting
    risks = readings.melt(id_vars=['STUSPS', 'COUNTYGEOID', 'Population'], var_name='riskType', value_name='risk')
    logger.info('\n\n{}\n'.format(risks.info()))

    risks.to_csv(path_or_buf=os.path.join(os.getcwd(), name + '.csv'), header=True, index=False, encoding='utf-8')
    logger.info('\n\n{}\n'.format(risks.head()))


INFO:__main__:

None



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125697 entries, 0 to 125696
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   STUSPS       125697 non-null  object 
 1   COUNTYGEOID  125697 non-null  object 
 2   Population   125697 non-null  int64  
 3   riskType     125697 non-null  object 
 4   risk         125697 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.8+ MB


INFO:__main__:

  STUSPS COUNTYGEOID  Population                         riskType       risk
0     AL       01001       54571  Total Cancer Risk (per million)  49.511426
1     AL       01003      182265  Total Cancer Risk (per million)  35.610599
2     AL       01005       27457  Total Cancer Risk (per million)  45.671354
3     AL       01007       22915  Total Cancer Risk (per million)  46.044593
4     AL       01009       57322  Total Cancer Risk (per million)  39.198849

INFO:__main__:

None



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232056 entries, 0 to 232055
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   STUSPS       232056 non-null  object 
 1   COUNTYGEOID  232056 non-null  object 
 2   Population   232056 non-null  int64  
 3   riskType     232056 non-null  object 
 4   risk         232056 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 8.9+ MB


INFO:__main__:

  STUSPS COUNTYGEOID  Population                         riskType       risk
0     AL       01001       54571  Total Cancer Risk (per million)  49.511426
1     AL       01003      182265  Total Cancer Risk (per million)  35.610599
2     AL       01005       27457  Total Cancer Risk (per million)  45.671354
3     AL       01007       22915  Total Cancer Risk (per million)  46.044593
4     AL       01009       57322  Total Cancer Risk (per million)  39.198849

INFO:__main__:

None



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125697 entries, 0 to 125696
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   STUSPS       125697 non-null  object 
 1   COUNTYGEOID  125697 non-null  object 
 2   Population   125697 non-null  int64  
 3   riskType     125697 non-null  object 
 4   risk         125697 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.8+ MB


INFO:__main__:

  STUSPS COUNTYGEOID  Population                             riskType      risk
0     AL       01001       54571  Total Respiratory (hazard quotient)  0.799127
1     AL       01003      182265  Total Respiratory (hazard quotient)  0.523780
2     AL       01005       27457  Total Respiratory (hazard quotient)  0.807123
3     AL       01007       22915  Total Respiratory (hazard quotient)  0.723578
4     AL       01009       57322  Total Respiratory (hazard quotient)  0.572684

INFO:__main__:

None



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141812 entries, 0 to 141811
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   STUSPS       141812 non-null  object 
 1   COUNTYGEOID  141812 non-null  object 
 2   Population   141812 non-null  int64  
 3   riskType     141812 non-null  object 
 4   risk         141812 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 5.4+ MB


INFO:__main__:

  STUSPS COUNTYGEOID  Population                             riskType      risk
0     AL       01001       54571  Total Respiratory (hazard quotient)  0.799127
1     AL       01003      182265  Total Respiratory (hazard quotient)  0.523780
2     AL       01005       27457  Total Respiratory (hazard quotient)  0.807123
3     AL       01007       22915  Total Respiratory (hazard quotient)  0.723578
4     AL       01009       57322  Total Respiratory (hazard quotient)  0.572684

INFO:__main__:

None



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125697 entries, 0 to 125696
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   STUSPS       125697 non-null  object 
 1   COUNTYGEOID  125697 non-null  object 
 2   Population   125697 non-null  int64  
 3   riskType     125697 non-null  object 
 4   risk         125697 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.8+ MB


INFO:__main__:

  STUSPS COUNTYGEOID  ...                              riskType      risk
0     AL       01001  ...  Total Neurological (hazard quotient)  0.038198
1     AL       01003  ...  Total Neurological (hazard quotient)  0.027525
2     AL       01005  ...  Total Neurological (hazard quotient)  0.025123
3     AL       01007  ...  Total Neurological (hazard quotient)  0.024480
4     AL       01009  ...  Total Neurological (hazard quotient)  0.030272

[5 rows x 5 columns]

INFO:__main__:

None



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80575 entries, 0 to 80574
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   STUSPS       80575 non-null  object 
 1   COUNTYGEOID  80575 non-null  object 
 2   Population   80575 non-null  int64  
 3   riskType     80575 non-null  object 
 4   risk         80575 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 3.1+ MB


INFO:__main__:

  STUSPS COUNTYGEOID  ...                              riskType      risk
0     AL       01001  ...  Total Neurological (hazard quotient)  0.038198
1     AL       01003  ...  Total Neurological (hazard quotient)  0.027525
2     AL       01005  ...  Total Neurological (hazard quotient)  0.025123
3     AL       01007  ...  Total Neurological (hazard quotient)  0.024480
4     AL       01009  ...  Total Neurological (hazard quotient)  0.030272

[5 rows x 5 columns]

INFO:__main__:

None



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125697 entries, 0 to 125696
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   STUSPS       125697 non-null  object 
 1   COUNTYGEOID  125697 non-null  object 
 2   Population   125697 non-null  int64  
 3   riskType     125697 non-null  object 
 4   risk         125697 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.8+ MB


INFO:__main__:

  STUSPS COUNTYGEOID  Population                       riskType      risk
0     AL       01001       54571  Total Liver (hazard quotient)  0.011782
1     AL       01003      182265  Total Liver (hazard quotient)  0.008522
2     AL       01005       27457  Total Liver (hazard quotient)  0.008279
3     AL       01007       22915  Total Liver (hazard quotient)  0.009019
4     AL       01009       57322  Total Liver (hazard quotient)  0.009509

INFO:__main__:

None



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93467 entries, 0 to 93466
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   STUSPS       93467 non-null  object 
 1   COUNTYGEOID  93467 non-null  object 
 2   Population   93467 non-null  int64  
 3   riskType     93467 non-null  object 
 4   risk         93467 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 3.6+ MB


INFO:__main__:

  STUSPS COUNTYGEOID  Population                       riskType      risk
0     AL       01001       54571  Total Liver (hazard quotient)  0.011782
1     AL       01003      182265  Total Liver (hazard quotient)  0.008522
2     AL       01005       27457  Total Liver (hazard quotient)  0.008279
3     AL       01007       22915  Total Liver (hazard quotient)  0.009019
4     AL       01009       57322  Total Liver (hazard quotient)  0.009509

INFO:__main__:

None



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125697 entries, 0 to 125696
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   STUSPS       125697 non-null  object 
 1   COUNTYGEOID  125697 non-null  object 
 2   Population   125697 non-null  int64  
 3   riskType     125697 non-null  object 
 4   risk         125697 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.8+ MB


INFO:__main__:

  STUSPS COUNTYGEOID  Population                        riskType      risk
0     AL       01001       54571  Total Kidney (hazard quotient)  0.005948
1     AL       01003      182265  Total Kidney (hazard quotient)  0.002870
2     AL       01005       27457  Total Kidney (hazard quotient)  0.002833
3     AL       01007       22915  Total Kidney (hazard quotient)  0.003757
4     AL       01009       57322  Total Kidney (hazard quotient)  0.004126

INFO:__main__:

None

INFO:__main__:

  STUSPS COUNTYGEOID  Population                        riskType      risk
0     AL       01001       54571  Total Kidney (hazard quotient)  0.005948
1     AL       01003      182265  Total Kidney (hazard quotient)  0.002870
2     AL       01005       27457  Total Kidney (hazard quotient)  0.002833
3     AL       01007       22915  Total Kidney (hazard quotient)  0.003757
4     AL       01009       57322  Total Kidney (hazard quotient)  0.004126



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32230 entries, 0 to 32229
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   STUSPS       32230 non-null  object 
 1   COUNTYGEOID  32230 non-null  object 
 2   Population   32230 non-null  int64  
 3   riskType     32230 non-null  object 
 4   risk         32230 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.2+ MB


INFO:__main__:

None



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125697 entries, 0 to 125696
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   STUSPS       125697 non-null  object 
 1   COUNTYGEOID  125697 non-null  object 
 2   Population   125697 non-null  int64  
 3   riskType     125697 non-null  object 
 4   risk         125697 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.8+ MB


INFO:__main__:

  STUSPS COUNTYGEOID  ...                               riskType      risk
0     AL       01001  ...  Total Immunological (hazard quotient)  0.017973
1     AL       01003  ...  Total Immunological (hazard quotient)  0.012382
2     AL       01005  ...  Total Immunological (hazard quotient)  0.012559
3     AL       01007  ...  Total Immunological (hazard quotient)  0.013410
4     AL       01009  ...  Total Immunological (hazard quotient)  0.014146

[5 rows x 5 columns]

INFO:__main__:

None

INFO:__main__:

  STUSPS COUNTYGEOID  ...                               riskType      risk
0     AL       01001  ...  Total Immunological (hazard quotient)  0.017973
1     AL       01003  ...  Total Immunological (hazard quotient)  0.012382
2     AL       01005  ...  Total Immunological (hazard quotient)  0.012559
3     AL       01007  ...  Total Immunological (hazard quotient)  0.013410
4     AL       01009  ...  Total Immunological (hazard quotient)  0.014146

[5 rows x 5 columns]



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12892 entries, 0 to 12891
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   STUSPS       12892 non-null  object 
 1   COUNTYGEOID  12892 non-null  object 
 2   Population   12892 non-null  int64  
 3   riskType     12892 non-null  object 
 4   risk         12892 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 503.7+ KB


<br>
<br>

## Hazards

In [19]:
urlhazards = 'https://www.epa.gov/sites/production/files/2018-08/nata2014v2_national_allhi.xlsx'

<br>
<br>

### Hazards Data

In [20]:
hazards = pd.read_excel(io=urlhazards, header=0, sheet_name=0, dtype={'Tract': str})
hazards.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76727 entries, 0 to 76726
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Tract             76727 non-null  object 
 1   Population        76727 non-null  int64  
 2   Respiratory HI    76727 non-null  float64
 3   Neurological HI   76727 non-null  float64
 4   Liver HI          76727 non-null  float64
 5   Developmental HI  76727 non-null  float64
 6   Reproductive HI   76727 non-null  float64
 7   Kidney HI         76727 non-null  float64
 8   Ocular HI         76727 non-null  float64
 9   Endocrine HI      76727 non-null  float64
 10  Hematological HI  76727 non-null  float64
 11  Immunological HI  76727 non-null  float64
 12  Skeletal HI       76727 non-null  float64
 13  Spleen HI         76727 non-null  float64
 14  Thyroid HI        76727 non-null  float64
 15  Whole Body HI     76727 non-null  float64
dtypes: float64(14), int64(1), object(1)
memo

<br>

Anomalies

* `indices = hazards.Tract.str.startswith('46113') | hazards.Tract.str.startswith('02270')`
* `hazards[indices][['Tract']]`

In [21]:
hazards = anomalies.tractseries(blob=hazards.copy())

<br>

Tracts Only

In [22]:
readings = hazards.merge(tracts[['STATEFP', 'COUNTYGEOID', 'GEOID']], how='inner', left_on='Tract', right_on='GEOID')
readings.drop(columns=['GEOID'], inplace=True)
readings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73430 entries, 0 to 73429
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Tract             73430 non-null  object 
 1   Population        73430 non-null  int64  
 2   Respiratory HI    73430 non-null  float64
 3   Neurological HI   73430 non-null  float64
 4   Liver HI          73430 non-null  float64
 5   Developmental HI  73430 non-null  float64
 6   Reproductive HI   73430 non-null  float64
 7   Kidney HI         73430 non-null  float64
 8   Ocular HI         73430 non-null  float64
 9   Endocrine HI      73430 non-null  float64
 10  Hematological HI  73430 non-null  float64
 11  Immunological HI  73430 non-null  float64
 12  Skeletal HI       73430 non-null  float64
 13  Spleen HI         73430 non-null  float64
 14  Thyroid HI        73430 non-null  float64
 15  Whole Body HI     73430 non-null  float64
 16  STATEFP           73430 non-null  object

<br>

Melt

In [23]:
hazardindices = readings.melt(id_vars=['Tract', 'Population', 'STATEFP', 'COUNTYGEOID'], var_name='hazardIndexType', value_name='hazardIndex')
hazardindices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028020 entries, 0 to 1028019
Data columns (total 6 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   Tract            1028020 non-null  object 
 1   Population       1028020 non-null  int64  
 2   STATEFP          1028020 non-null  object 
 3   COUNTYGEOID      1028020 non-null  object 
 4   hazardIndexType  1028020 non-null  object 
 5   hazardIndex      1028020 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 47.1+ MB


In [24]:
hazardindices.to_csv(path_or_buf=os.path.join(os.getcwd(), 'hazardIndices.csv'), header=True, index=False, encoding='utf-8')
logger.info('\n\n{}\n'.format(hazardindices.head()))

INFO:__main__:

         Tract  Population STATEFP COUNTYGEOID hazardIndexType  hazardIndex
0  01001020100        1912      01       01001  Respiratory HI     0.788052
1  01001020200        2170      01       01001  Respiratory HI     0.808744
2  01001020300        3373      01       01001  Respiratory HI     0.816773
3  01001020400        4386      01       01001  Respiratory HI     0.835904
4  01001020500       10766      01       01001  Respiratory HI     0.841417

