# Data Cleaning

In [2]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from config import state_abbrev, reverse_state_abbrev, state_codes

import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)

## US Air Quality Index (AQI) Data - by County
By state/county, with FIPS data for choropleth plotting.

In [2]:
# Read in and combine AQI data
county_aqi_df = pd.DataFrame()
for yr in range(1980,2018):
    aqi = pd.read_csv('raw_data/annual_aqi_by_county_{}.csv'.format(yr))
    county_aqi_df = county_aqi_df.append(aqi, ignore_index=True)

# Make state and county names lowercase
for y in ['State', 'County']:
    county_aqi_df[y] = county_aqi_df[y].apply(lambda x: x.lower())

county_aqi_df.head()
# county_aqi_df.loc[county_aqi_df['State']=='louisiana']
# county_aqi_df['State'].unique()

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days SO2,Days PM2.5,Days PM10
0,alabama,autauga,1980,179,122,35,18,4,0,0,177,108,40,0,0,122,57,0,0
1,alabama,colbert,1980,274,127,45,63,39,0,0,200,165,56,0,0,0,274,0,0
2,alabama,jackson,1980,366,85,110,92,79,0,0,200,200,94,0,0,0,366,0,0
3,alabama,jefferson,1980,342,174,105,37,19,7,0,221,140,50,202,0,140,0,0,0
4,alabama,lauderdale,1980,274,120,58,77,19,0,0,200,139,56,0,0,0,274,0,0


In [3]:
# Get dataframe with state/county FIPS codes
fips = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/laucnty16.csv')

# Parse FIPS county/state data
fips["County"] = fips['County Name/State Abbreviation'].apply(lambda x: x.split(",")[0].replace(" County", "").replace(" Parish", "").replace(" Borough", "").replace(" Census Area", "").replace("/city", "").replace("/municipality", "").replace(" Municipality", "").lower())
# fips["County"] = fips['County Name/State Abbreviation'].apply(lambda x: x.split(",")[0].lower())
fips["State Abbreviation"] = fips['County Name/State Abbreviation'].apply(lambda x: x.split(", ")[-1])
fips["State"] = fips["State Abbreviation"].apply(lambda x: state_abbrev[x].lower())

# Pad FIPS codes with 0's, concat for combined FIPS code
fips['State FIPS Code'] = fips['State FIPS Code'].apply(lambda x: str(x).zfill(2))
fips['County FIPS Code'] = fips['County FIPS Code'].apply(lambda x: str(x).zfill(3))
fips['FIPS'] = fips['State FIPS Code'] + fips['County FIPS Code']

# Select out specific columns
fips = fips[['County','State','State Abbreviation','FIPS','State FIPS Code','County FIPS Code']]

fips.loc[fips['State']=='hawaii']
fips.head()

Unnamed: 0,County,State,State Abbreviation,FIPS,State FIPS Code,County FIPS Code
0,autauga,alabama,AL,1001,1,1
1,baldwin,alabama,AL,1003,1,3
2,barbour,alabama,AL,1005,1,5
3,bibb,alabama,AL,1007,1,7
4,blount,alabama,AL,1009,1,9


In [4]:
# Export FIPS data to csv
fips.to_csv("clean_data/fips_data.csv")
print('Done.')

Done.


In [5]:
# Combine fips df with county AQI df
county_aqi_1980_2018 = county_aqi_df.merge(fips, how='inner', on=['State','County'])
county_aqi_1980_2018.head()
# county_aqi_1980_2018.loc[county_aqi_1980_2018['State']=='louisiana']

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,...,Days CO,Days NO2,Days Ozone,Days SO2,Days PM2.5,Days PM10,State Abbreviation,FIPS,State FIPS Code,County FIPS Code
0,alabama,autauga,1980,179,122,35,18,4,0,0,...,0,0,122,57,0,0,AL,1001,1,1
1,alabama,autauga,1981,357,289,49,15,4,0,0,...,0,0,241,116,0,0,AL,1001,1,1
2,alabama,autauga,1982,245,203,36,5,0,1,0,...,0,0,166,79,0,0,AL,1001,1,1
3,alabama,autauga,1989,63,54,9,0,0,0,0,...,0,0,63,0,0,0,AL,1001,1,1
4,alabama,autauga,1990,266,183,64,18,1,0,0,...,0,0,266,0,0,0,AL,1001,1,1


In [6]:
county_aqi_1980_2018.keys()

Index(['State', 'County', 'Year', 'Days with AQI', 'Good Days',
       'Moderate Days', 'Unhealthy for Sensitive Groups Days',
       'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days', 'Max AQI',
       '90th Percentile AQI', 'Median AQI', 'Days CO', 'Days NO2',
       'Days Ozone', 'Days SO2', 'Days PM2.5', 'Days PM10',
       'State Abbreviation', 'FIPS', 'State FIPS Code', 'County FIPS Code'],
      dtype='object')

In [7]:
# Add columns with percentage data for good/hazardous days
county_aqi_1980_2018['% Days AQI Unhealthy-Hazardous'] = ((
    county_aqi_1980_2018["Unhealthy Days"] + county_aqi_1980_2018["Very Unhealthy Days"] + 
    county_aqi_1980_2018["Hazardous Days"]) / county_aqi_1980_2018["Days with AQI"]) * 100

county_aqi_1980_2018['% Days CO'] = county_aqi_1980_2018["Days CO"] / county_aqi_1980_2018["Days with AQI"] * 100
county_aqi_1980_2018['% Days NO2'] = county_aqi_1980_2018["Days NO2"] / county_aqi_1980_2018["Days with AQI"] * 100
county_aqi_1980_2018['% Days SO2'] = county_aqi_1980_2018["Days SO2"] / county_aqi_1980_2018["Days with AQI"] * 100
county_aqi_1980_2018['% Days Ozone'] = county_aqi_1980_2018["Days Ozone"] / county_aqi_1980_2018["Days with AQI"] * 100
county_aqi_1980_2018['% Days PM2.5'] = county_aqi_1980_2018["Days PM2.5"] / county_aqi_1980_2018["Days with AQI"] * 100
county_aqi_1980_2018['% Days PM10'] = county_aqi_1980_2018["Days PM10"] / county_aqi_1980_2018["Days with AQI"] * 100

# county_aqi_1980_2018['% Days Good'] = county_aqi_1980_2018["Good Days"] / county_aqi_1980_2018["Days with AQI"] * 100
# county_aqi_1980_2018['% Days Moderate'] = county_aqi_1980_2018["Moderate Days"] / county_aqi_1980_2018["Days with AQI"] * 100
# county_aqi_1980_2018['% Days Unhealthy for Sensitive Groups'] = county_aqi_1980_2018["Unhealthy for Sensitive Groups Days"] / county_aqi_1980_2018["Days with AQI"] * 100
# county_aqi_1980_2018['% Days Unhealthy'] = county_aqi_1980_2018["Unhealthy Days"] / county_aqi_1980_2018["Days with AQI"] * 100
# county_aqi_1980_2018['% Days Very Unhealthy'] = county_aqi_1980_2018["Very Unhealthy Days"] / county_aqi_1980_2018["Days with AQI"] * 100
# county_aqi_1980_2018['% Days Hazardous'] = county_aqi_1980_2018["Hazardous Days"] / county_aqi_1980_2018["Days with AQI"] * 100
county_aqi_1980_2018.keys()

Index(['State', 'County', 'Year', 'Days with AQI', 'Good Days',
       'Moderate Days', 'Unhealthy for Sensitive Groups Days',
       'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days', 'Max AQI',
       '90th Percentile AQI', 'Median AQI', 'Days CO', 'Days NO2',
       'Days Ozone', 'Days SO2', 'Days PM2.5', 'Days PM10',
       'State Abbreviation', 'FIPS', 'State FIPS Code', 'County FIPS Code',
       '% Days AQI Unhealthy-Hazardous', '% Days CO', '% Days NO2',
       '% Days SO2', '% Days Ozone', '% Days PM2.5', '% Days PM10'],
      dtype='object')

In [8]:
# county_aqi_1980_2018.loc[county_aqi_1980_2018['State']=='louisiana']
# county_aqi_1980_2018.dtypes

In [9]:
# Export county aqi data to csv
county_aqi_1980_2018.to_csv("clean_data/county_aqi_1980_2018.csv")
print('Done.')

Done.


### AQI Data with relevant quant data + state/county/fips/etc + year (1980-2018)

In [10]:
# Separate out useful data
aqi_1980_2018 = county_aqi_1980_2018[['State', 'County', 'State Abbreviation', 'Year', 'FIPS',
                               'State FIPS Code', 'County FIPS Code', 'Max AQI','90th Percentile AQI', 
                               '% Days AQI Unhealthy-Hazardous', '% Days CO', '% Days NO2',
                               '% Days SO2', '% Days Ozone', '% Days PM2.5', '% Days PM10']]
aqi_1980_2018

Unnamed: 0,State,County,State Abbreviation,Year,FIPS,State FIPS Code,County FIPS Code,Max AQI,90th Percentile AQI,% Days AQI Unhealthy-Hazardous,% Days CO,% Days NO2,% Days SO2,% Days Ozone,% Days PM2.5,% Days PM10
0,alabama,autauga,AL,1980,01001,01,001,177,108,2.234637,0.0,0.000000,31.843575,68.156425,0.000000,0.000000
1,alabama,autauga,AL,1981,01001,01,001,195,77,1.120448,0.0,0.000000,32.492997,67.507003,0.000000,0.000000
2,alabama,autauga,AL,1982,01001,01,001,206,67,0.408163,0.0,0.000000,32.244898,67.755102,0.000000,0.000000
3,alabama,autauga,AL,1989,01001,01,001,100,64,0.000000,0.0,0.000000,0.000000,100.000000,0.000000,0.000000
4,alabama,autauga,AL,1990,01001,01,001,151,93,0.375940,0.0,0.000000,0.000000,100.000000,0.000000,0.000000
5,alabama,colbert,AL,1980,01033,01,033,200,165,14.233577,0.0,0.000000,100.000000,0.000000,0.000000,0.000000
6,alabama,colbert,AL,1981,01033,01,033,200,149,9.776536,0.0,0.000000,100.000000,0.000000,0.000000,0.000000
7,alabama,colbert,AL,1982,01033,01,033,200,148,9.577465,0.0,0.000000,100.000000,0.000000,0.000000,0.000000
8,alabama,colbert,AL,1983,01033,01,033,200,129,4.678363,0.0,0.000000,100.000000,0.000000,0.000000,0.000000
9,alabama,colbert,AL,1984,01033,01,033,200,127,5.459770,0.0,0.000000,100.000000,0.000000,0.000000,0.000000


In [11]:
# Export county aqi data to csv
aqi_1980_2018.to_csv("clean_data/aqi_1980_2018.csv")
print('Done.')

Done.


## US AQI Data - by CBSA

In [12]:
# Read in and combine AQI data
cbsa_aqi_df = pd.DataFrame()
for yr in range(1980,2018):
    aqi = pd.read_csv('raw_data/annual_aqi_by_cbsa_{}.csv'.format(yr))
    cbsa_aqi_df = cbsa_aqi_df.append(aqi, ignore_index=True)

# Parse CBSA column into city and state columns
cbsa_aqi_df['State Abbreviation'] = cbsa_aqi_df['CBSA'].apply(lambda x: x.split(', ')[-1])
cbsa_aqi_df['CBSA'] = cbsa_aqi_df['CBSA'].apply(lambda x: x.split(', ')[0])
    
cbsa_aqi_df

Unnamed: 0,CBSA,CBSA Code,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days SO2,Days PM2.5,Days PM10,State Abbreviation
0,Akron,10420,1980,366,17,119,171,57,2,0,205,164,109,8,5,55,298,0,0,OH
1,Albany-Schenectady-Troy,10580,1980,365,95,183,72,15,0,0,200,122,69,1,0,90,274,0,0,NY
2,Albuquerque,10740,1980,366,161,155,29,12,9,0,251,118,54,121,49,196,0,0,0,NM
3,Alexandria,10780,1980,183,152,25,5,1,0,0,169,71,35,0,0,183,0,0,0,LA
4,Allentown-Bethlehem-Easton,10900,1980,365,104,157,58,36,9,1,357,159,69,10,92,124,139,0,0,PA-NJ
5,Alma,10940,1980,359,275,36,46,2,0,0,152,120,27,0,0,0,359,0,0,MI
6,Altoona,11020,1980,147,111,26,7,3,0,0,177,84,33,0,0,37,110,0,0,PA
7,Amarillo,11100,1980,113,93,16,4,0,0,0,112,77,38,0,0,113,0,0,0,TX
8,Ames,11180,1980,180,147,23,9,1,0,0,163,74,7,0,0,0,180,0,0,IA
9,Anchorage,11260,1980,366,205,116,18,14,13,0,280,126,44,366,0,0,0,0,0,AK


In [13]:
print(len(cbsa_aqi_df))
cbsa_aqi_df.keys()

18189


Index(['CBSA', 'CBSA Code', 'Year', 'Days with AQI', 'Good Days',
       'Moderate Days', 'Unhealthy for Sensitive Groups Days',
       'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days', 'Max AQI',
       '90th Percentile AQI', 'Median AQI', 'Days CO', 'Days NO2',
       'Days Ozone', 'Days SO2', 'Days PM2.5', 'Days PM10',
       'State Abbreviation'],
      dtype='object')

In [14]:
# Add columns with percentage data for good/hazardous days
cbsa_aqi_df['% Days Good'] = cbsa_aqi_df["Good Days"] / cbsa_aqi_df["Days with AQI"] * 100
cbsa_aqi_df['% Days Moderate'] = cbsa_aqi_df["Moderate Days"] / cbsa_aqi_df["Days with AQI"] * 100
cbsa_aqi_df['% Days Unhealthy for Sensitive Groups'] = cbsa_aqi_df["Unhealthy for Sensitive Groups Days"] / cbsa_aqi_df["Days with AQI"] * 100
cbsa_aqi_df['% Days Unhealthy'] = cbsa_aqi_df["Unhealthy Days"] / cbsa_aqi_df["Days with AQI"] * 100
cbsa_aqi_df['% Days Very Unhealthy'] = cbsa_aqi_df["Very Unhealthy Days"] / cbsa_aqi_df["Days with AQI"] * 100
cbsa_aqi_df['% Days Hazardous'] = cbsa_aqi_df["Hazardous Days"] / cbsa_aqi_df["Days with AQI"] * 100
cbsa_aqi_df.head()

Unnamed: 0,CBSA,CBSA Code,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,...,Days SO2,Days PM2.5,Days PM10,State Abbreviation,% Days Good,% Days Moderate,% Days Unhealthy for Sensitive Groups,% Days Unhealthy,% Days Very Unhealthy,% Days Hazardous
0,Akron,10420,1980,366,17,119,171,57,2,0,...,298,0,0,OH,4.644809,32.513661,46.721311,15.57377,0.546448,0.0
1,Albany-Schenectady-Troy,10580,1980,365,95,183,72,15,0,0,...,274,0,0,NY,26.027397,50.136986,19.726027,4.109589,0.0,0.0
2,Albuquerque,10740,1980,366,161,155,29,12,9,0,...,0,0,0,NM,43.989071,42.349727,7.923497,3.278689,2.459016,0.0
3,Alexandria,10780,1980,183,152,25,5,1,0,0,...,0,0,0,LA,83.060109,13.661202,2.73224,0.546448,0.0,0.0
4,Allentown-Bethlehem-Easton,10900,1980,365,104,157,58,36,9,1,...,139,0,0,PA-NJ,28.493151,43.013699,15.890411,9.863014,2.465753,0.273973


In [15]:
# Export county aqi data to csv
cbsa_aqi_df.to_csv("clean_data/cbsa_aqi_1980_2018.csv")
print('Done.')

Done.


# Pollution Data

In [15]:
pollution = pd.read_csv('../pollution_us_2000_2016.csv')

In [17]:
pollution = pollution[['State','County','State Code','County Code','City',
                      'Date Local','NO2 AQI','O3 AQI','SO2 AQI','CO AQI']]
pollution.head()

Unnamed: 0,State,County,State Code,County Code,City,Date Local,NO2 AQI,O3 AQI,SO2 AQI,CO AQI
0,Arizona,Maricopa,4,13,Phoenix,2000-01-01,46,34,13.0,
1,Arizona,Maricopa,4,13,Phoenix,2000-01-01,46,34,13.0,25.0
2,Arizona,Maricopa,4,13,Phoenix,2000-01-01,46,34,,
3,Arizona,Maricopa,4,13,Phoenix,2000-01-01,46,34,,25.0
4,Arizona,Maricopa,4,13,Phoenix,2000-01-02,34,27,4.0,


In [20]:
# Export pollution data to csv
pollution.to_csv("clean_data/pollution_us_2000_2016", compression='gzip')
print('Done.')

Done.


## US Cancer Data - by State

In [16]:
# Read in csvs as dataframes
df_cancer_1998_2008 = pd.read_csv("raw_data/state_yearly_monthly_cancer_statistics_1999_to_2008.csv")
df_cancer_2009_2014 = pd.read_csv("raw_data/state_yearly_monthly_cancer_statistics_2009_to_2014.csv")

# Remove lines without data
df_cancer_1998_2008 = df_cancer_1998_2008.loc[df_cancer_1998_2008["State"].isin(reverse_state_abbrev)]
df_cancer_2009_2014 = df_cancer_2009_2014.loc[df_cancer_2009_2014["State"].isin(reverse_state_abbrev)]

# Combine dataframes
df_cancer_1998_2014 = df_cancer_1998_2008.append(df_cancer_2009_2014, ignore_index=True)

# Add column with state abbreviation for making choropleth
df_cancer_1998_2014["State Abbreviation"] = df_cancer_1998_2014["State"].apply(lambda x: reverse_state_abbrev[x])

# Change numeric dtypes
df_cancer_1998_2014[['State Code', 'Year', 'Year Code', 'Count']\
                   ] = df_cancer_1998_2014[['State Code', 'Year', 'Year Code', 'Count']].astype(int)

state_cancer_1998_2014 = df_cancer_1998_2014[['State','State Abbreviation','Year','Year Code','Cancer Sites','Cancer Sites Code','Count']]
state_cancer_1998_2014.head()

Unnamed: 0,State,State Abbreviation,Year,Year Code,Cancer Sites,Cancer Sites Code,Count
0,Connecticut,CT,1999,1999,All Invasive Cancer Sites Combined,0,18694
1,Connecticut,CT,2000,2000,All Invasive Cancer Sites Combined,0,19200
2,Connecticut,CT,2001,2001,All Invasive Cancer Sites Combined,0,19626
3,Connecticut,CT,2002,2002,All Invasive Cancer Sites Combined,0,19570
4,Connecticut,CT,2003,2003,All Invasive Cancer Sites Combined,0,19516


In [17]:
# Export state cancer data to csv
state_cancer_1998_2014.to_csv("clean_data/state_cancer_1998_2014.csv")
print('Done.')

Done.


In [18]:
# Group data by state
cancer_by_state = state_cancer_1998_2014.groupby(['State Abbreviation','State'])[['Count']].sum()
# cancer_by_state = cancer_by_state.reset_index('State Abbreviation')
cancer_by_state.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
State Abbreviation,State,Unnamed: 2_level_1
AK,Alaska,125735
AL,Alabama,1215322
AR,Arkansas,668683
AZ,Arizona,1356728
CA,California,7956919


In [19]:
# state_cancer_1998_2014.keys()
cancer_1998_2014 = state_cancer_1998_2014[['State', 'State Abbreviation', 'Year', 'Cancer Sites', 'Count']]
cancer_1998_2014['State'] = cancer_1998_2014['State'].apply(lambda x: x.lower())
cancer_1998_2014

Unnamed: 0,State,State Abbreviation,Year,Cancer Sites,Count
0,connecticut,CT,1999,All Invasive Cancer Sites Combined,18694
1,connecticut,CT,2000,All Invasive Cancer Sites Combined,19200
2,connecticut,CT,2001,All Invasive Cancer Sites Combined,19626
3,connecticut,CT,2002,All Invasive Cancer Sites Combined,19570
4,connecticut,CT,2003,All Invasive Cancer Sites Combined,19516
5,connecticut,CT,2004,All Invasive Cancer Sites Combined,19496
6,connecticut,CT,2005,All Invasive Cancer Sites Combined,19806
7,connecticut,CT,2006,All Invasive Cancer Sites Combined,20562
8,connecticut,CT,2007,All Invasive Cancer Sites Combined,20637
9,connecticut,CT,2008,All Invasive Cancer Sites Combined,20733


## US Cancer Data - by County

In [20]:
# Read in county data for cancer incidences
csv = "Choropleth Testing/Resources/county_cancer_data.csv"
county_cancer_data = pd.read_csv(csv, encoding='latin-1')
county_cancer_data = county_cancer_data.rename(columns = {"County": "Geography"})

# Parse geographic info into county and state
county_cancer_data['County'] = county_cancer_data['Geography'].apply(lambda x: x.split(', ')[0].lower())
county_cancer_data['County'] = county_cancer_data['County'].apply(lambda x: x.replace(' county', '').replace(' parish', '').replace(' borough', ''))
county_cancer_data['State'] = county_cancer_data['Geography'].apply(lambda x: x.split(', ')[-1].split('(')[0])
county_cancer_data['FIPS'] = county_cancer_data[' FIPS']

# Remove values that don't contain state data
county_cancer_data = county_cancer_data.loc[county_cancer_data['State'] != 'US ']
county_cancer_data = county_cancer_data.loc[county_cancer_data['State'] != 'Puerto Rico']

# Remove null datapoints
county_cancer_data = county_cancer_data.loc[county_cancer_data['Average Annual Count'] != '¶ ']
county_cancer_data = county_cancer_data.loc[county_cancer_data['Average Annual Count'] != '¶¶']
county_cancer_data = county_cancer_data.loc[county_cancer_data['Average Annual Count'] != '¶']
county_cancer_data = county_cancer_data.loc[county_cancer_data['Met Healthy People Objective of ***?'] != '<font color=grey>*</font>']

# Add column for state abbreviation
county_cancer_data['State Abbreviation'] = county_cancer_data['State'].apply(lambda x: reverse_state_abbrev[x])

# Make state name lowercase
county_cancer_data['State'] = county_cancer_data['State'].apply(lambda x: x.lower())

county_cancer_data.head()

Unnamed: 0,Geography,FIPS,Met Healthy People Objective of ***?,"Age-Adjusted Incidence Rate() - cases per 100,000",Lower 95% Confidence Interval,Upper 95% Confidence Interval,Average Annual Count,Recent Trend,Recent 5-Year Trend () in Incidence Rates,Lower 95% Confidence Interval.1,Upper 95% Confidence Interval.1,County,State,FIPS.1,State Abbreviation
1,"Union County, Florida(6,10)",12125,***,215.1,183.2,251.1,38,stable,-2.5,-24.0,25.0,union,florida,12125,FL
2,"Owsley County, Kentucky(7,8)",21189,***,179.7,134.8,236.3,11,rising,4.3,0.9,7.7,owsley,kentucky,21189,KY
3,"McCreary County, Kentucky(7,8)",21147,***,154.1,130.5,180.9,32,stable,-0.7,-3.0,1.7,mccreary,kentucky,21147,KY
4,"North Slope Borough, Alaska(6,10)",2185,***,150.8,97.0,220.1,7,stable,-0.2,-36.8,57.8,north slope,alaska,2185,AK
5,"Powell County, Kentucky(7,8)",21197,***,140.9,115.2,171.0,22,stable,0.4,-3.3,4.3,powell,kentucky,21197,KY


In [21]:
# Export county cancer data to csv
# county_cancer_data_years = [???]
county_cancer_data.to_csv("clean_data/county_cancer_trends_2011_2015.csv")
print('Done.')

Done.


## County Population Data

In [22]:
# Read in census data with county population estimates
census_csv = "Choropleth Testing/Resources/county_populations_census_2010-17.csv"
county_pop = pd.read_csv(census_csv, encoding='latin-1')

# Rename columns
county_pop = county_pop.rename(columns = {'April 1, 2010 - Census': '2010 Census',
                                          'April 1, 2010 - Estimates Base': 'Estimate Base',
                                          'Population Estimate (as of July 1) - 2010': '2010 Estimate',
                                          'Population Estimate (as of July 1) - 2011': '2011 Estimate',
                                          'Population Estimate (as of July 1) - 2012': '2012 Estimate',
                                          'Population Estimate (as of July 1) - 2013': '2013 Estimate',
                                          'Population Estimate (as of July 1) - 2014': '2014 Estimate',
                                          'Population Estimate (as of July 1) - 2015': '2015 Estimate',
                                          'Population Estimate (as of July 1) - 2016': '2016 Estimate',
                                          'Population Estimate (as of July 1) - 2017': '2017 Estimate'})

# Parse geography column into county/state columns
county_pop["County"] = county_pop['Geography'].apply(lambda x: x.split(", ")[0].lower())
county_pop["County"] = county_pop["County"].apply(lambda x: x.replace(" county", ""))
county_pop["County"] = county_pop["County"].apply(lambda x: x.replace(" parish", ""))
county_pop["State"] = county_pop['Geography'].apply(lambda x: x.split(", ")[-1])
county_pop["FIPS"] = county_pop['Id'].apply(lambda x: x[-5:])
county_pop["State Abbreviation"] = county_pop['State'].apply(lambda x: reverse_state_abbrev[x])
county_pop["State"] = county_pop['State'].apply(lambda x: x.lower())

# Remove unnecessary cols
county_pop = county_pop[['FIPS','County','State','State Abbreviation','2010 Census','2011 Estimate','2012 Estimate','2013 Estimate','2014 Estimate','2015 Estimate','2016 Estimate','2017 Estimate']]

county_pop.head()

Unnamed: 0,FIPS,County,State,State Abbreviation,2010 Census,2011 Estimate,2012 Estimate,2013 Estimate,2014 Estimate,2015 Estimate,2016 Estimate,2017 Estimate
0,1001,autauga,alabama,AL,54571,55199,54927,54695,54864,54838,55278,55504
1,1003,baldwin,alabama,AL,182265,186534,190048,194736,199064,202863,207509,212628
2,1005,barbour,alabama,AL,27457,27351,27175,26947,26749,26264,25774,25270
3,1007,bibb,alabama,AL,22915,22745,22658,22503,22533,22561,22633,22668
4,1009,blount,alabama,AL,57322,57562,57595,57623,57546,57590,57562,58013


In [23]:
# Change county population data format to match other datasets (new rows for each year)?
county_pop_2010_2017 = []
year_cols = ['2010 Census','2011 Estimate','2012 Estimate','2013 Estimate','2014 Estimate','2015 Estimate','2016 Estimate','2017 Estimate']

for i in range(len(county_pop)):
    county_fips = county_pop.iloc[i,0]
    county_name = county_pop.iloc[i,1]
    county_state = county_pop.iloc[i,2]
    county_st_abbrev = county_pop.iloc[i,3]
    for j in range(4,len(county_pop.iloc[i])-1):
        year = year_cols[int(j-4)].split()[0]
        county_yr_pop = county_pop.iloc[i,j]
        county_yr_row = {'FIPS':county_fips,
                          'County':county_name,
                          'State':county_state,
                          'State Abbreviation':county_st_abbrev,
                          'Year':year,
                          'Population':county_yr_pop}
        county_pop_2010_2017.append(county_yr_row)
        
county_pop_2010_2017 = pd.DataFrame(county_pop_2010_2017)
county_pop_2010_2017.head()

Unnamed: 0,County,FIPS,Population,State,State Abbreviation,Year
0,autauga,1001,54571,alabama,AL,2010
1,autauga,1001,55199,alabama,AL,2011
2,autauga,1001,54927,alabama,AL,2012
3,autauga,1001,54695,alabama,AL,2013
4,autauga,1001,54864,alabama,AL,2014


In [24]:
# Export county population data to csv
county_pop.to_csv("clean_data/county_pop.csv")
county_pop_2010_2017.to_csv("clean_data/county_pop_2010_2017.csv")
print('Done.')

Done.


### Combine relevant datasets

In [25]:
print(aqi_1980_2018.keys())
print(cancer_1998_2014.keys())

Index(['State', 'County', 'State Abbreviation', 'Year', 'FIPS',
       'State FIPS Code', 'County FIPS Code', 'Max AQI', '90th Percentile AQI',
       '% Days AQI Unhealthy-Hazardous', '% Days CO', '% Days NO2',
       '% Days SO2', '% Days Ozone', '% Days PM2.5', '% Days PM10'],
      dtype='object')
Index(['State', 'State Abbreviation', 'Year', 'Cancer Sites', 'Count'], dtype='object')


In [26]:
aqi_1980_2018.to_csv("clean_data/aqi_1980_2018.csv")
cancer_1998_2014.to_csv("clean_data/cancer_1998_2014.csv")
print('Done.')

Done.


In [47]:
# Group AQI data by state and year to merge with cancer data, average other data
aqi_med = aqi_1980_2018.groupby(['State', 'State Abbreviation', 'Year'])[['Max AQI', '90th Percentile AQI',
                                                                '% Days AQI Unhealthy-Hazardous', 
                                                                '% Days CO', '% Days NO2', '% Days SO2', 
                                                                '% Days Ozone', '% Days PM2.5', '% Days PM10']].median()
aqi_med = aqi.reset_index()

# Rename columns
newcols = {'State':'State', 'State Abbreviation':'State Abbreviation', 'Year':'Year',
           'Max AQI':'Median Max AQI', '90th Percentile AQI':'Median 90th Percentile AQI',
           '% Days AQI Unhealthy-Hazardous':'Median % Days AQI Unhealthy-Hazardous', 
           '% Days CO':'Median % Days CO', '% Days NO2':'Median % Days NO2',
           '% Days SO2':'Median % Days SO2', '% Days Ozone':'Median % Days Ozone', 
           '% Days PM2.5':'Median % Days PM2.5', '% Days PM10':'Median % Days PM10',
           'Cancer Sites':'Cancer Sites', 'Count':'Cancer Count'}
aqi_med.rename(columns = newcols, inplace = True)

aqi_med = aqi_med[['State', 'State Abbreviation', 'Year', 'Median Max AQI',
       'Median 90th Percentile AQI', 'Median % Days AQI Unhealthy-Hazardous',
       'Median % Days CO', 'Median % Days NO2', 'Median % Days SO2',
       'Median % Days Ozone', 'Median % Days PM2.5', 'Median % Days PM10']]

aqi_med.head()
# aqi_med.keys()

Unnamed: 0,State,State Abbreviation,Year,Median Max AQI,Median 90th Percentile AQI,Median % Days AQI Unhealthy-Hazardous,Median % Days CO,Median % Days NO2,Median % Days SO2,Median % Days Ozone,Median % Days PM2.5,Median % Days PM10
0,alabama,AL,1980,188.2,130.0,6.395587,6.061712,8.737746,46.318783,38.881758,0.0,0.0
1,alabama,AL,1981,180.636364,117.818182,6.239544,1.594022,2.597403,35.983619,59.824956,0.0,0.0
2,alabama,AL,1982,180.181818,100.727273,2.636585,1.867995,0.762873,34.233762,63.135369,0.0,0.0
3,alabama,AL,1983,196.888889,107.444444,3.270985,4.048706,0.0,43.916337,52.034956,0.0,0.0
4,alabama,AL,1984,183.25,104.375,2.609055,5.703552,0.0,54.478193,39.818255,0.0,0.0


In [43]:
# Group AQI data by state and year to merge with cancer data, average other data
aqi_avg = aqi_1980_2018.groupby(['State', 'State Abbreviation', 'Year'])[['Max AQI', '90th Percentile AQI',
                                                                '% Days AQI Unhealthy-Hazardous', 
                                                                '% Days CO', '% Days NO2', '% Days SO2', 
                                                                '% Days Ozone', '% Days PM2.5', '% Days PM10']].mean()
aqi_avg = aqi.reset_index()

# Rename columns
newcols = {'State':'State', 'State Abbreviation':'State Abbreviation', 'Year':'Year', 
           'Max AQI':'Average Max AQI', '90th Percentile AQI':'Average 90th Percentile AQI',
           '% Days AQI Unhealthy-Hazardous':'Average % Days AQI Unhealthy-Hazardous', 
           '% Days CO':'Average % Days CO', '% Days NO2':'Average % Days NO2',
           '% Days SO2':'Average % Days SO2', '% Days Ozone':'Average % Days Ozone', 
           '% Days PM2.5':'Average % Days PM2.5', '% Days PM10':'Average % Days PM10',
           'Cancer Sites':'Cancer Sites', 'Count':'Cancer Count'}
aqi_avg.rename(columns = newcols, inplace = True)

aqi_avg = aqi_avg[['State', 'State Abbreviation', 'Year', 'Average Max AQI',
       'Average 90th Percentile AQI', 'Average % Days AQI Unhealthy-Hazardous',
       'Average % Days CO', 'Average % Days NO2', 'Average % Days SO2',
       'Average % Days Ozone', 'Average % Days PM2.5', 'Average % Days PM10']]

aqi_avg.head()

Unnamed: 0,State,State Abbreviation,Year,Average Max AQI,Average 90th Percentile AQI,Average % Days AQI Unhealthy-Hazardous,Average % Days CO,Average % Days NO2,Average % Days SO2,Average % Days Ozone,Average % Days PM2.5,Average % Days PM10
0,alabama,AL,1980,188.2,130.0,6.395587,6.061712,8.737746,46.318783,38.881758,0.0,0.0
1,alabama,AL,1981,180.636364,117.818182,6.239544,1.594022,2.597403,35.983619,59.824956,0.0,0.0
2,alabama,AL,1982,180.181818,100.727273,2.636585,1.867995,0.762873,34.233762,63.135369,0.0,0.0
3,alabama,AL,1983,196.888889,107.444444,3.270985,4.048706,0.0,43.916337,52.034956,0.0,0.0
4,alabama,AL,1984,183.25,104.375,2.609055,5.703552,0.0,54.478193,39.818255,0.0,0.0


In [55]:
# Left merge cancer + aqi means data (includes NaNs)
cancer_aqi = pd.merge(left=aqi_avg, right=cancer_1998_2014, how='left', on=['State', 'State Abbreviation', 'Year'])
cancer_aqi.to_csv('clean_data/cancer_aqi_1980_2018.csv')
cancer_aqi.head()

Unnamed: 0,State,State Abbreviation,Year,Average Max AQI,Average 90th Percentile AQI,Average % Days AQI Unhealthy-Hazardous,Average % Days CO,Average % Days NO2,Average % Days SO2,Average % Days Ozone,Average % Days PM2.5,Average % Days PM10,Cancer Sites,Count
0,alabama,AL,1980,188.2,130.0,6.395587,6.061712,8.737746,46.318783,38.881758,0.0,0.0,,
1,alabama,AL,1981,180.636364,117.818182,6.239544,1.594022,2.597403,35.983619,59.824956,0.0,0.0,,
2,alabama,AL,1982,180.181818,100.727273,2.636585,1.867995,0.762873,34.233762,63.135369,0.0,0.0,,
3,alabama,AL,1983,196.888889,107.444444,3.270985,4.048706,0.0,43.916337,52.034956,0.0,0.0,,
4,alabama,AL,1984,183.25,104.375,2.609055,5.703552,0.0,54.478193,39.818255,0.0,0.0,,


In [54]:
# Inner merge cancer + aqi means data
cancer_aqi_df = pd.merge(left=aqi_avg, right=cancer_1998_2014, how='inner', on=['State', 'State Abbreviation', 'Year'])
print(cancer_aqi_df.keys())

# Export to csv
cancer_aqi_df.to_csv('clean_data/DATA_avgs.csv')
cancer_aqi_df.head()

Index(['State', 'State Abbreviation', 'Year', 'Average Max AQI',
       'Average 90th Percentile AQI', 'Average % Days AQI Unhealthy-Hazardous',
       'Average % Days CO', 'Average % Days NO2', 'Average % Days SO2',
       'Average % Days Ozone', 'Average % Days PM2.5', 'Average % Days PM10',
       'Cancer Sites', 'Count'],
      dtype='object')


Unnamed: 0,State,State Abbreviation,Year,Average Max AQI,Average 90th Percentile AQI,Average % Days AQI Unhealthy-Hazardous,Average % Days CO,Average % Days NO2,Average % Days SO2,Average % Days Ozone,Average % Days PM2.5,Average % Days PM10,Cancer Sites,Count
0,alabama,AL,1999,144.954545,89.909091,1.725947,0.535731,0.0,11.765212,25.312181,40.137894,22.248982,All Invasive Cancer Sites Combined,19290
1,alabama,AL,1999,144.954545,89.909091,1.725947,0.535731,0.0,11.765212,25.312181,40.137894,22.248982,Oral Cavity and Pharynx,489
2,alabama,AL,1999,144.954545,89.909091,1.725947,0.535731,0.0,11.765212,25.312181,40.137894,22.248982,Lip,67
3,alabama,AL,1999,144.954545,89.909091,1.725947,0.535731,0.0,11.765212,25.312181,40.137894,22.248982,Tongue,110
4,alabama,AL,1999,144.954545,89.909091,1.725947,0.535731,0.0,11.765212,25.312181,40.137894,22.248982,Salivary Gland,44


In [53]:
# Inner merge cancer + aqi medians data
cancer_aqi_df = pd.merge(left=aqi_med, right=cancer_1998_2014, how='inner', on=['State', 'State Abbreviation', 'Year'])
print(cancer_aqi_df.keys())

# Export to csv
cancer_aqi_df.to_csv('clean_data/DATA_meds.csv')
cancer_aqi_df.head()

Index(['State', 'State Abbreviation', 'Year', 'Median Max AQI',
       'Median 90th Percentile AQI', 'Median % Days AQI Unhealthy-Hazardous',
       'Median % Days CO', 'Median % Days NO2', 'Median % Days SO2',
       'Median % Days Ozone', 'Median % Days PM2.5', 'Median % Days PM10',
       'Cancer Sites', 'Count'],
      dtype='object')


Unnamed: 0,State,State Abbreviation,Year,Median Max AQI,Median 90th Percentile AQI,Median % Days AQI Unhealthy-Hazardous,Median % Days CO,Median % Days NO2,Median % Days SO2,Median % Days Ozone,Median % Days PM2.5,Median % Days PM10,Cancer Sites,Count
0,alabama,AL,1999,144.954545,89.909091,1.725947,0.535731,0.0,11.765212,25.312181,40.137894,22.248982,All Invasive Cancer Sites Combined,19290
1,alabama,AL,1999,144.954545,89.909091,1.725947,0.535731,0.0,11.765212,25.312181,40.137894,22.248982,Oral Cavity and Pharynx,489
2,alabama,AL,1999,144.954545,89.909091,1.725947,0.535731,0.0,11.765212,25.312181,40.137894,22.248982,Lip,67
3,alabama,AL,1999,144.954545,89.909091,1.725947,0.535731,0.0,11.765212,25.312181,40.137894,22.248982,Tongue,110
4,alabama,AL,1999,144.954545,89.909091,1.725947,0.535731,0.0,11.765212,25.312181,40.137894,22.248982,Salivary Gland,44
