# Reading and Cleaning Climate Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# cdc wonder Daily Air Temp by County (aggregated by month)
# Jan 1, 1979 - Dec 31, 2011
#temp_df = pd.read_csv('data/climate_data/2011_AirTemp_byState.txt', delimiter="\t", header=0)
#particulateMatter_df = pd.read_csv('data/climate_data/2011_fineParticulateMatter_µgm3.txt', delimiter="\t", header=0)

# Read in Temperature Data

In [3]:
temperature_files = os.listdir('../data/01_climate_data/monthlyAirTemp_byCounty_RAW/')
temperature_files.remove('.ipynb_checkpoints')

In [4]:
# making the above easier with a function to grab data in folder, read it in, remove unwanted stuff and return a clean df
def process_temp_data(filename):
    file_string = '../data/01_climate_data/monthlyAirTemp_byCounty_RAW/' + filename
    temp_df = pd.read_csv(file_string, delimiter='\t', header=0, parse_dates=['Month, Year Code'], dtype={'County Code': object})
    
    #drop unneccesary rows
    temp_df = temp_df[temp_df['County'].isna() == False]
    temp_df = temp_df[temp_df['Notes'] != 'Total'] #drop the total rows
    
    # create UID
    temp_df['UID'] = temp_df['County'] + " - " + temp_df['Month, Year']
    
    #get only needed columns
    output_df = temp_df[['UID', 'County', 'County Code', 'Month, Year', 'Month, Year Code','Avg Daily Max Air Temperature (F)', 'Min Temp for Daily Max Air Temp (F)', 'Max Temp for Daily Max Air Temp (F)']]
    output_df = output_df.rename(columns={'Avg Daily Max Air Temperature (F)': 'avg_dailyMaxAirTemp_F', 'Min Temp for Daily Max Air Temp (F)': 'min_dailyMaxAirTemp_F', 'Max Temp for Daily Max Air Temp (F)': 'max_dailyMaxAirTemp_F'})
    return output_df

In [5]:
# run process data function on all data in temperature files
air_temperature_df = [process_temp_data(file) for file in temperature_files]
air_temperature_df = pd.concat(air_temperature_df)

In [6]:
# turn objects to categories to save memory
air_temperature_df['County'] = air_temperature_df['County'].astype('category')
air_temperature_df['Month, Year'] = air_temperature_df['Month, Year'].astype('category')
air_temperature_df['Month, Year Code'] = air_temperature_df['Month, Year Code'].astype('category')

# Read in Precipitation Files

In [7]:
precipitation_files = os.listdir('../data/01_climate_data/monthlyPrecipitation_RAW/')
#precipitation_files

In [8]:
#writing a function to process precip data
def process_precip(filename):
    file_string = '../data/01_climate_data/monthlyPrecipitation_RAW/' + filename
    precip_df = pd.read_csv(file_string, delimiter='\t', header=0, dtype={'County Code': object}, parse_dates=['Month, Year Code'])
    
    #drop unneccesary rows
    precip_df = precip_df[precip_df['County'].isna() == False]
    precip_df = precip_df[precip_df['Notes'] != 'Total'] #drop the total rows
    
    #county code to int
    #precip_df = precip_df.astype({'County Code': 'int64'})
    
    # create UID
    precip_df['UID'] = precip_df['County'] + " - " + precip_df['Month, Year']
    
    #get only needed columns
    output_df = precip_df[['UID', 'County', 'County Code', 'Month, Year', 'Month, Year Code', 'Avg Daily Precipitation (mm)' ,'Min Daily Precipitation', 'Max Daily Precipitation']]
    final_df = output_df.rename(columns={'Avg Daily Precipitation (mm)': 'avg_daily_precip_mm', 'Min Daily Precipitation': 'min_daily_precip_mm', 'Max Daily Precipitation': 'max_daily_precip_mm'})
    return final_df

In [9]:
precipitation_df = [process_precip(file) for file in precipitation_files]
precipitation_df = pd.concat(precipitation_df)

In [10]:
# turn objects to categories to save memory
precipitation_df['County'] = precipitation_df['County'].astype('category')
precipitation_df['Month, Year'] = precipitation_df['Month, Year'].astype('category')

# Combining Air Temp and Precip Climate Dfs into One

Here we will combine all climate data into one master data frame (sans the heat wave data, since that is not monthly), and write out to a zipped csv. This is to save on file size for github file size limits.


Ultimately, we ended up not using this data, but for the sake of showing functions one might use to read in and combine such data, I've included the workflow in this notebook.

In [11]:
precip_airTemp = pd.merge(air_temperature_df, precipitation_df, left_on='UID', right_on='UID')

In [12]:
precip_airTemp.drop(columns=['UID','County_y', 'County Code_y', 'Month, Year_y', 'Month, Year Code_y'], inplace=True)

#renaming for later
precip_airTemp.rename(columns={'County_x': 'county_name', 'County Code_x': 'county_FIPS', 'Month, Year_x': 'month_year_long', 'Month, Year Code_x': 'month_year_short'}, inplace=True)

In [13]:
# turn objects to categories to save memory
precip_airTemp['county_name'] = precip_airTemp['county_name'].astype('category')
precip_airTemp['county_FIPS'] = precip_airTemp['county_FIPS'].astype('category')
precip_airTemp['month_year_long'] = precip_airTemp['month_year_long'].astype('category')

In [14]:
# writing out to a zipped csv for later use
final_compression_opts = dict(method='zip',
                        archive_name='precip_AirTemp_monthly_1979_2011.csv')

precip_airTemp.to_csv('../data/cleaned/precip_AirTemp_monthly_1979_2011.zip', compression=final_compression_opts)

# Read in the Heat Wave Days
Again, this was part of the climate data we didn't really use. 

Included to show process.

In [15]:
heat_files = os.listdir('../data/01_climate_data/heatWaveDays_RAW/')

In [16]:
def process_heat_wave_data(filename):
    file_string = '../data/01_climate_data/heatWaveDays_RAW/' + filename
    hw_df = pd.read_csv(file_string, delimiter='\t', header=0, parse_dates=['Year'], dtype={'County Code': object})
    
    #drop unneccesary rows
    hw_df = hw_df[hw_df['County'].isna() == False]
    hw_df = hw_df[hw_df['Notes'] != 'Total'] #drop the total rows
    
    #county code to int
    #hw_df = hw_df.astype({'County Code': 'int64'})
    
    # No UID in this df, since it doesn't have the Month, Year col

    
    #get only needed columns
    output_df = hw_df[['County', 'County Code', 'Year', 'Heat Wave Days Based on Daily Maximum Temperature', 'Heat Wave Days Based on Daily Maximum Heat Index', 'Heat Wave Days Based on Net Daily Heat Stress']]
    final_df = output_df.rename(columns={'Heat Wave Days Based on Daily Maximum Temperature': 'count_hwDays_onDailyMaxTemp', 'Heat Wave Days Based on Daily Maximum Heat Index': 'count_hwDays_onDailyMaxHeatIndex', 'Heat Wave Days Based on Net Daily Heat Stress': 'count_hwDays_onDailyNetHeatStress'})
    return final_df

In [17]:
heat_wave_df = [process_heat_wave_data(file) for file in heat_files]
heat_wave_df = pd.concat(heat_wave_df)
heat_wave_df.sort_values(by=['County Code', 'Year'], inplace=True)

In [18]:
#heat_wave_df.memory_usage(deep=True) / 1_000_000

In [19]:
heat_wave_df.to_csv('../data/cleaned/heat_wave_days_1981_2010.csv', index=False)

# Inspecting Health Data

In [20]:
cvd_files = os.listdir('../data/03_health_data/IHME_USA_COUNTY_CVD_MORTALITY_RATES_1980_2014/')
inf_files = os.listdir('../data/03_health_data/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014/')
resp_files = os.listdir('../data/03_health_data/IHME_USA_COUNTY_RESP_DISEASE_MORTALITY_1980_2014/')
substance_injury_files = os.listdir('../data/03_health_data/IHME_USA_COUNTY_USE_INJ_MORTALITY_1980_2014/')

cvd_files.remove('.ipynb_checkpoints')
inf_files.remove('.ipynb_checkpoints')
substance_injury_files.remove('.ipynb_checkpoints')

NameError: name 'cancer_files' is not defined

In [21]:
# universal function to process data in all folders
def process_health_data(file):
    state = file.split('_')[-2].title()
    stat_type = file.split('_')[3]
    
    #set filepath based on stat_type
    if stat_type == 'CVD':
        folder = '../data/03_health_data/IHME_USA_COUNTY_CVD_MORTALITY_RATES_1980_2014/'
    elif stat_type == 'INFECT':
        folder = '../data/03_health_data/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014/'
    elif stat_type == 'RESP':
        folder = '../data/03_health_data/IHME_USA_COUNTY_RESP_DISEASE_MORTALITY_1980_2014/'
    elif stat_type == 'CANCER':
        folder = '../data/03_health_data/IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014/'
    elif stat_type == 'USE':
        folder = '../data/03_health_data/IHME_USA_COUNTY_USE_INJ_MORTALITY_1980_2014/'
    full_path = folder + file
    #print(full_path)
    
    #read in csv
    df = pd.read_csv(full_path, dtype={'FIPS': object})
    # drop rows that are sums of whole state
    df = df[(df['location_name'] != state) & (df['FIPS'].isna() == False)]
    
    #recase cols to save memory and so we can create a UID later
    # UID is formatted: 'FIPS-cause_id-sex_id-year_id'
    #df[['cause_id', 'sex', 'cause_name', 'location_name', 'FIPS']] = df[['location_name', 'FIPS', 'cause_id', 'sex', 'cause_name']].astype('category')
    #df['UID'] = df['FIPS'] + '-' + df['cause_id'] + '-' + df['sex_id'] + '-' + df['year_id']
    df['state'] = state
    df.drop(columns=['sex_id', 'location_id'], inplace=True)
    
    #recast variables as category for file size handling
    #df[['location_name', 'FIPS', 'cause_id', 'cause_name', 'sex_id', 'sex', 'year_id', 'UID']] = df[['location_name', 'FIPS', 'cause_id', 'cause_name', 'sex_id', 'sex', 'year_id', 'UID']].astype('category')
    
    return df

In [22]:
# process cardiovascular mortality data
full_cvd_df = [process_health_data(file) for file in cvd_files]
full_cvd_df = pd.concat(full_cvd_df)

full_cvd_df['FIPS'] = full_cvd_df['FIPS'].str.zfill(5)

In [23]:
# process infectious disease mortality data
full_inf_df = [process_health_data(file) for file in inf_files]
full_inf_df = pd.concat(full_inf_df)

full_inf_df['FIPS'] = full_inf_df['FIPS'].str.zfill(5)

In [24]:
# process respiratory disease mortality data
full_resp_df = [process_health_data(file) for file in resp_files]
full_resp_df = pd.concat(full_resp_df)

full_resp_df['FIPS'] = full_resp_df['FIPS'].str.zfill(5)

In [25]:
# process substance abuse/self injury disease mortality data
full_subInj_df = [process_health_data(file) for file in substance_injury_files]
full_subInj_df = pd.concat(full_subInj_df)

full_subInj_df['FIPS'] = full_subInj_df['FIPS'].str.zfill(5)

In [26]:
# dropping all the unnecessary columns
full_cvd_df.drop(columns=['measure_id', 'measure_name', 'cause_id', 'age_id', 'age_name', 'metric', 'upper', 'lower'], inplace=True)
full_inf_df.drop(columns=['measure_id', 'measure_name', 'cause_id', 'age_id', 'age_name', 'metric', 'upper', 'lower'], inplace=True)
full_resp_df.drop(columns=['measure_id', 'measure_name', 'cause_id', 'age_id', 'age_name', 'metric', 'upper', 'lower'], inplace=True)
full_subInj_df.drop(columns=['measure_id', 'measure_name', 'cause_id', 'age_id', 'age_name', 'metric', 'upper', 'lower', 'measure_ID'], inplace=True)

In [27]:
full_cvd_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id', 'state']] = full_cvd_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id', 'state']].astype('category')

full_inf_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id', 'state']] = full_inf_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id', 'state']].astype('category')

full_resp_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id', 'state']] = full_resp_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id', 'state']].astype('category')

full_subInj_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id', 'state']] = full_subInj_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id', 'state']].astype('category')

In [28]:
# creating a Unique Identifier to join later with other climate dfs
cvd_df_compression_opts = dict(method='zip',
                        archive_name='cvd_mortality.csv')

full_cvd_df.to_csv('../data/cleaned/cvd_mortality.zip', index=False, compression=cvd_df_compression_opts)

In [29]:
# creating a Unique Identifier to join later with other climate dfs
inf_df_compression_opts = dict(method='zip',
                        archive_name='inf_mortality.csv')

full_inf_df.to_csv('../data/cleaned/inf_mortality.zip', index=False, compression=inf_df_compression_opts)

In [30]:
# creating a Unique Identifier to join later with other climate dfs
resp_df_compression_opts = dict(method='zip',
                        archive_name='resp_mortality.csv')

full_resp_df.to_csv('../data/cleaned/resp_mortality.zip', index=False, compression=resp_df_compression_opts)

In [31]:
# creating a Unique Identifier to join later with other climate dfs
subInj_df_compression_opts = dict(method='zip',
                        archive_name='substanceAbuse_selfInjury_mortality.csv')

full_subInj_df.to_csv('../data/cleaned/substanceAbuse_selfInjury_mortality.zip', index=False, compression=subInj_df_compression_opts)

---

# Creating Texas Only Data for Streamlit App

For the purposes of building our interactive data visualization application, we choose to implement a web app using streamlit. Because we had not yet covered databases, we opted to load and visualize a smaller dataset -- 

This is our ***"Texas Case Study"*** where we visualize climate, health and demographic data, as well as our K-Means Clustering. 

But First, I'm going to go ahead and create some Texas Data to improve our applications rendering time. 

In [32]:
# This is all of the Texas Health Data
tx_cvd_df = full_cvd_df[full_cvd_df['state'] == 'Texas']
tx_inf_df = full_inf_df[full_inf_df['state'] == 'Texas']
tx_resp_df = full_resp_df[full_resp_df['state'] == 'Texas']
tx_subInj_df = full_subInj_df[full_subInj_df['state'] == 'Texas']

In [33]:
# create the texas climate data
precip_airTemp['county_name'] = precip_airTemp['county_name'].astype(object)
#precip_airTemp

climate_df = pd.read_csv('../data/cleaned/precip_AirTemp_monthly_1979_2011.zip', parse_dates=['month_year_long'], dtype={'county_FIPS': object, 'county_name': object})
#climate_df

#climate_df['county_name'].str.split(', ')[1000][1]

climate_df['state'] = [i.split(', ')[1] for i in climate_df['county_name']]

climate_df.drop(columns=['Unnamed: 0', 'month_year_short'], inplace=True)

tx_climate_df = climate_df[climate_df['state'] == 'TX']

heat_wave_df['state'] = [i.split(', ')[1] for i in heat_wave_df['County']]

In [34]:
tx_heat_wave_df = heat_wave_df[heat_wave_df['state'] == 'TX']

In [36]:
# create a tx demographics file
demographics_df = pd.read_csv('../data/cleaned/final_combined.csv', dtype={'fips': object})
tx_demographics = demographics_df[demographics_df['state'] == ' Texas']

In [38]:
# write out all TX data frames & stuff to CSV

# tx_subInj_df, tx_cvd_df, tx_inf_df, tx_resp_df
tx_cvd_df.to_csv('../data/cleaned/tx_for_streamlit/tx_cvd.csv', index=False)
tx_inf_df.to_csv('../data/cleaned/tx_for_streamlit/tx_inf.csv', index=False)
tx_resp_df.to_csv('../data/cleaned/tx_for_streamlit/tx_resp.csv', index=False)
tx_subInj_df.to_csv('../data/cleaned/tx_for_streamlit/tx_subInj.csv', index=False)

# tx_demographics, tx_heat_wave_df, tx_climate_df
tx_demographics.to_csv('../data/cleaned/tx_for_streamlit/tx_demographics.csv', index=False)
tx_heat_wave_df.to_csv('../data/cleaned/tx_for_streamlit/tx_heat_wave.csv', index=False)
tx_climate_df.to_csv('../data/cleaned/tx_for_streamlit/tx_climate_wave.csv', index=False)