# Reading and Cleaning Climate Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# cdc wonder Daily Air Temp by County (aggregated by month)
# Jan 1, 1979 - Dec 31, 2011
#temp_df = pd.read_csv('data/climate_data/2011_AirTemp_byState.txt', delimiter="\t", header=0)
#particulateMatter_df = pd.read_csv('data/climate_data/2011_fineParticulateMatter_µgm3.txt', delimiter="\t", header=0)

# Read in Temperature Data

In [52]:
# List the raw monthly air-temperature exports. Hidden entries (for example the
# '.ipynb_checkpoints' folder Jupyter creates) are filtered out instead of being
# removed with list.remove(), which raises ValueError when the entry is absent.
temperature_files = [
    entry
    for entry in os.listdir('../data/01_climate_data/monthlyAirTemp_byCounty_RAW/')
    if not entry.startswith('.')
]

In [4]:
#temperature_files

In [6]:
# temp_1979_df = pd.read_csv('./data/01_climate_data/monthlyAirTemp_byCounty_RAW/dailyAirTemp_byCounty_1979.txt', delimiter='\t', header=0)
# temp_1979_df = temp_1979_df[temp_1979_df['County'].isna() == False]
# temp_1979_df = temp_1979_df[temp_1979_df['Notes'] != 'Total']

# temp_1979_df.columns

# temp_1979_df = temp_1979_df[['County', 'County Code', 'Month, Year', 'Month, Year Code', 'Record Count for Daily Max Air Temp (F)' ,'Avg Daily Max Air Temperature (F)', 'Min Temp for Daily Max Air Temp (F)', 'Max Temp for Daily Max Air Temp (F)']]

In [53]:
# making the above easier with a function to grab data in folder, read it in, remove unwanted stuff and return a clean df
def process_temp_data(filename, data_dir='../data/01_climate_data/monthlyAirTemp_byCounty_RAW/'):
    """Read one raw air-temperature export and return a cleaned DataFrame.

    Parameters
    ----------
    filename : str
        Name of a tab-delimited file located in ``data_dir``.
    data_dir : str, optional
        Folder containing the raw file. Defaults to the project's raw
        monthly air-temperature folder.

    Returns
    -------
    pandas.DataFrame
        Columns: 'UID', 'County', 'County Code', 'Month, Year',
        'Month, Year Code', 'avg_dailyMaxAirTemp_F', 'min_dailyMaxAirTemp_F',
        'max_dailyMaxAirTemp_F'. 'UID' is '<County> - <Month, Year>'.
    """
    file_string = os.path.join(data_dir, filename)
    # County Code is read as object so it is not converted to a number.
    temp_df = pd.read_csv(file_string, delimiter='\t', header=0, parse_dates=['Month, Year Code'], dtype={'County Code': object})

    # Drop rows without a County value and the rows flagged 'Total' in Notes.
    # .copy() gives an independent frame, so adding the UID column below does
    # not write into a slice of the raw data (SettingWithCopyWarning).
    keep = temp_df['County'].notna() & (temp_df['Notes'] != 'Total')
    temp_df = temp_df.loc[keep].copy()

    # Key used later to join against the other monthly climate frames.
    temp_df['UID'] = temp_df['County'] + " - " + temp_df['Month, Year']

    # Keep only the needed columns and give the measures short names.
    output_df = temp_df[['UID', 'County', 'County Code', 'Month, Year', 'Month, Year Code','Avg Daily Max Air Temperature (F)', 'Min Temp for Daily Max Air Temp (F)', 'Max Temp for Daily Max Air Temp (F)']]
    return output_df.rename(columns={'Avg Daily Max Air Temperature (F)': 'avg_dailyMaxAirTemp_F', 'Min Temp for Daily Max Air Temp (F)': 'min_dailyMaxAirTemp_F', 'Max Temp for Daily Max Air Temp (F)': 'max_dailyMaxAirTemp_F'})

In [54]:
# run process data function on all data in temperature files
# Clean every raw temperature file, stack the results into a single frame and
# order the rows by county code, then by month.
air_temperature_df = pd.concat(
    [process_temp_data(raw_file) for raw_file in temperature_files]
).sort_values(by=['County Code', 'Month, Year Code'])

In [55]:
# Per-column memory footprint in MB; deep=True also counts the string payloads.
air_temperature_df.memory_usage(deep=True).div(1_000_000)

Index                      9.855648
UID                      107.115228
County                    92.331756
County Code               76.381272
Month, Year               81.309096
Month, Year Code           9.855648
avg_dailyMaxAirTemp_F      9.855648
min_dailyMaxAirTemp_F      9.855648
max_dailyMaxAirTemp_F      9.855648
dtype: float64

In [56]:
# Store the repeated label columns as pandas categoricals to save memory.
air_temperature_df = air_temperature_df.astype({
    'County': 'category',
    'Month, Year': 'category',
    'Month, Year Code': 'category',
})

In [416]:
# #write out clean data to csv
# temp_compression_opts = dict(method='zip',
#                         archive_name='airTempMonthly_1979_2011.csv')


# air_temperature_df.to_csv('./data/01_climate_data/01_climate_data_CLEAN/airTempMonthly_1979_2011.zip', index=False, compression=temp_compression_opts)

In [408]:
# # read in clean csv just to test it works
# data = pd.read_csv('./data/01_climate_data/01_climate_data_CLEAN/airTempMonthly_1979_2011.csv')
# data.info()

# Read in the Particulate Matter Data

This dataset has significant issues, so we probably won't be using it. The processing code below is mostly commented out and kept only for reference.

In [10]:
# List the raw fine-particulate-matter files. This is the only live statement
# in this cell; the exploratory code below is commented out.
particle_files = os.listdir('../data/01_climate_data/monthlyParticulateMatter_RAW/')
# #particle_files

# test_pfm_df = pd.read_csv('./data/01_climate_data/monthlyParticulateMatter_RAW/monthlyFPM_byCounty_2009.txt', delimiter="\t", header=0)

# test_pfm_df.head(2)

# test_pfm_df = test_pfm_df[test_pfm_df['County'].isna() == False]
# test_pfm_df = test_pfm_df[test_pfm_df['Notes'] != 'Total']

# test_pfm_df.columns

In [1]:
# def process_fpm_data(filename):
#     file_string = '../data/01_climate_data/monthlyParticulateMatter_RAW/' + filename
#     fpm_df = pd.read_csv(file_string, delimiter='\t', header=0, parse_dates=['Month, Year Code'], dtype={'County Code': object}, encoding='unicode_escape')
    
#     #drop unneccesary rows
#     fpm_df = fpm_df[fpm_df['County'].isna() == False]
#     fpm_df = fpm_df[fpm_df['Notes'] != 'Total'] #drop the total rows
    
#     # create UID
#     fpm_df['UID'] = fpm_df['County'] + " - " + fpm_df['Month, Year']
    
#     #get only needed columns
#     output_df = fpm_df[['UID', 'County', 'County Code', 'Month, Year', 'Month, Year Code', 'Avg Fine Particulate Matter (�g/m�)' ,'Min Fine Particulate Matter', 'Max Fine Particulate Matter']]
#     final_df = output_df.rename(columns={'Avg Fine Particulate Matter (�g/m�)': 'avg_FPM', 'Min Fine Particulate Matter': 'min_FPM', 'Max Fine Particulate Matter': 'max_FPM'})
#     return final_df

In [13]:
# #process_fpm_data(particle_files[2]).head(2)

# # run process data function on all data in temperature files
# fine_particulate_matter_df = [process_fpm_data(file) for file in particle_files]
# fine_particulate_matter_df = pd.concat(fine_particulate_matter_df)

# # sort the values
# fine_particulate_matter_df.sort_values(by=['County Code', 'Month, Year Code'], inplace=True)

# # turn objects to categories to save memory
# fine_particulate_matter_df['County'] = fine_particulate_matter_df['County'].astype('category')
# fine_particulate_matter_df['Month, Year'] = fine_particulate_matter_df['Month, Year'].astype('category')
# fine_particulate_matter_df['Month, Year Code'] = fine_particulate_matter_df['Month, Year Code'].astype('category')

# fine_particulate_matter_df

In [426]:
# # creating a Unique Identifier to join later with other climate dfs
# fpm_compression_opts = dict(method='zip',
#                         archive_name='fpm_monthlyByCounty_2003_2011.csv')


# fine_particulate_matter_df.to_csv('./data/01_climate_data/01_climate_data_CLEAN/fpm_monthly_byCounty_2003_2011.zip', index=False, compression=fpm_compression_opts)

# Read in the Heat Wave Days

In [57]:
# List the raw heat-wave-day exports, skipping hidden entries such as
# '.ipynb_checkpoints' so that only data files reach process_heat_wave_data.
heat_files = [
    entry
    for entry in os.listdir('../data/01_climate_data/heatWaveDays_RAW/')
    if not entry.startswith('.')
]

In [25]:
# test_df = pd.read_csv('./data/01_climate_data/heatWaveDays_RAW/heatWaveDays_1982_1985.txt', delimiter='\t', header=0, dtype={'County Code': object})
# test_df[test_df['County'].isna() == False].head(1)

# test_df.columns

In [58]:
def process_heat_wave_data(filename, data_dir='../data/01_climate_data/heatWaveDays_RAW/'):
    """Read one raw heat-wave-days export and return a cleaned DataFrame.

    Parameters
    ----------
    filename : str
        Name of a tab-delimited file located in ``data_dir``.
    data_dir : str, optional
        Folder containing the raw file. Defaults to the project's raw
        heat-wave-days folder.

    Returns
    -------
    pandas.DataFrame
        Columns: 'County', 'County Code', 'Year',
        'count_hwDays_onDailyMaxTemp', 'count_hwDays_onDailyMaxHeatIndex',
        'count_hwDays_onDailyNetHeatStress'. No UID is built because this
        data has no 'Month, Year' column.
    """
    file_string = os.path.join(data_dir, filename)
    # County Code is read as object so it is not converted to a number.
    hw_df = pd.read_csv(file_string, delimiter='\t', header=0, parse_dates=['Year'], dtype={'County Code': object})

    # Drop rows without a County value and the rows flagged 'Total' in Notes.
    keep = hw_df['County'].notna() & (hw_df['Notes'] != 'Total')
    hw_df = hw_df.loc[keep]

    # NOTE(review): the memory_usage output recorded in this notebook suggests
    # the three count columns load as object (string) dtype -- confirm whether
    # the raw files contain non-numeric markers before doing arithmetic on them.
    output_df = hw_df[['County', 'County Code', 'Year', 'Heat Wave Days Based on Daily Maximum Temperature', 'Heat Wave Days Based on Daily Maximum Heat Index', 'Heat Wave Days Based on Net Daily Heat Stress']]
    return output_df.rename(columns={'Heat Wave Days Based on Daily Maximum Temperature': 'count_hwDays_onDailyMaxTemp', 'Heat Wave Days Based on Daily Maximum Heat Index': 'count_hwDays_onDailyMaxHeatIndex', 'Heat Wave Days Based on Net Daily Heat Stress': 'count_hwDays_onDailyNetHeatStress'})

In [59]:
# Clean every heat-wave file, stack the results and order by county and year.
heat_wave_df = pd.concat(
    [process_heat_wave_data(raw_file) for raw_file in heat_files]
).sort_values(by=['County Code', 'Year'])

In [60]:
# Per-column memory footprint in MB; deep=True also counts the string payloads.
heat_wave_df.memory_usage(deep=True).div(1_000_000)

Index                                0.746160
County                               6.990030
County Code                          5.782740
Year                                 0.746160
count_hwDays_onDailyMaxTemp          5.431019
count_hwDays_onDailyMaxHeatIndex     5.428730
count_hwDays_onDailyNetHeatStress    5.428157
dtype: float64

In [61]:
# Persist the cleaned heat-wave data without the row index.
heat_wave_df.to_csv(
    path_or_buf='../data/cleaned/heat_wave_days_1981_2010.csv',
    index=False,
)

# Read in Precipitation

In [62]:
# List the raw monthly precipitation exports, skipping hidden entries such as
# '.ipynb_checkpoints' so that only data files reach process_precip.
precipitation_files = [
    entry
    for entry in os.listdir('../data/01_climate_data/monthlyPrecipitation_RAW/')
    if not entry.startswith('.')
]

In [63]:
def process_precip(filename, data_dir='../data/01_climate_data/monthlyPrecipitation_RAW/'):
    """Read one raw precipitation export and return a cleaned DataFrame.

    Parameters
    ----------
    filename : str
        Name of a tab-delimited file located in ``data_dir``.
    data_dir : str, optional
        Folder containing the raw file. Defaults to the project's raw
        monthly precipitation folder.

    Returns
    -------
    pandas.DataFrame
        Columns: 'UID', 'County', 'County Code', 'Month, Year',
        'Month, Year Code', 'avg_daily_precip_mm', 'min_daily_precip_mm',
        'max_daily_precip_mm'. 'UID' is '<County> - <Month, Year>'.
    """
    file_string = os.path.join(data_dir, filename)
    # County Code is read as object so it is not converted to a number.
    precip_df = pd.read_csv(file_string, delimiter='\t', header=0, dtype={'County Code': object}, parse_dates=['Month, Year Code'])

    # Drop rows without a County value and the rows flagged 'Total' in Notes.
    # .copy() gives an independent frame, so adding the UID column below does
    # not write into a slice of the raw data (SettingWithCopyWarning).
    keep = precip_df['County'].notna() & (precip_df['Notes'] != 'Total')
    precip_df = precip_df.loc[keep].copy()

    # Key used later to join against the other monthly climate frames.
    precip_df['UID'] = precip_df['County'] + " - " + precip_df['Month, Year']

    # Keep only the needed columns and give the measures short names.
    output_df = precip_df[['UID', 'County', 'County Code', 'Month, Year', 'Month, Year Code', 'Avg Daily Precipitation (mm)' ,'Min Daily Precipitation', 'Max Daily Precipitation']]
    return output_df.rename(columns={'Avg Daily Precipitation (mm)': 'avg_daily_precip_mm', 'Min Daily Precipitation': 'min_daily_precip_mm', 'Max Daily Precipitation': 'max_daily_precip_mm'})

In [64]:
# Clean every precipitation file and stack the results into one frame.
precipitation_df = pd.concat(
    [process_precip(raw_file) for raw_file in precipitation_files]
)

In [65]:
# Store the repeated label columns as pandas categoricals to save memory.
precipitation_df = precipitation_df.astype({
    'County': 'category',
    'Month, Year': 'category',
})

In [66]:
# Per-column memory footprint in MB; deep=True also counts the string payloads.
precipitation_df.memory_usage(deep=True).div(1_000_000)

Index                    9.855648
UID                    107.115228
County                   2.763161
County Code             76.381272
Month, Year              2.506600
Month, Year Code         9.855648
avg_daily_precip_mm      9.855648
min_daily_precip_mm      9.855648
max_daily_precip_mm      9.855648
dtype: float64

In [67]:
# Preview the first two cleaned precipitation rows.
precipitation_df.iloc[:2]

Unnamed: 0,UID,County,County Code,"Month, Year","Month, Year Code",avg_daily_precip_mm,min_daily_precip_mm,max_daily_precip_mm
0,"Autauga County, AL - Jan, 1993","Autauga County, AL",1001,"Jan, 1993",1993-01-01,5.18,0.0,46.3
1,"Autauga County, AL - Feb, 1993","Autauga County, AL",1001,"Feb, 1993",1993-02-01,3.4,0.0,28.0


In [440]:
# creating a Unique Identifier to join later with other climate dfs
# precip_compression_opts = dict(method='zip',
#                         archive_name='precipitation_monthlyByCounty_1979_2011.csv')

# precipitation_df.to_csv('./data/01_climate_data/01_climate_data_CLEAN/precipitation_monthlyByCounty_1979_2011.zip', compression=precip_compression_opts)

# Combining all Climate Dfs into One

Here we combine the monthly climate data into one master data frame (excluding the heat wave data, since that is not monthly) and write it out as a zipped CSV. Zipping keeps the file within GitHub's file size limits.

In [68]:
# Frames available to merge: precipitation_df, fine_particulate_matter_df, air_temperature_df
# (heat_wave_df is contextual only).
# Inner-join air temperature and precipitation on the shared 'UID' key.
# validate='one_to_one' makes pandas raise MergeError if a UID is duplicated on
# either side, instead of silently multiplying rows in the combined frame.
precip_airTemp = pd.merge(
    air_temperature_df,
    precipitation_df,
    how='inner',
    on='UID',
    validate='one_to_one',
)
#combined_df = pd.merge(precip_airTemp, fine_particulate_matter_df, left_on='UID', right_on='UID', how='outer')

In [28]:
# Inspect the merged column names; columns present in both inputs carry
# the _x (air temperature) and _y (precipitation) suffixes.
precip_airTemp.columns

Index(['UID', 'County_x', 'County Code_x', 'Month, Year_x',
       'Month, Year Code_x', 'avg_dailyMaxAirTemp_F', 'min_dailyMaxAirTemp_F',
       'max_dailyMaxAirTemp_F', 'County_y', 'County Code_y', 'Month, Year_y',
       'Month, Year Code_y', 'avg_daily_precip_mm', 'min_daily_precip_mm',
       'max_daily_precip_mm'],
      dtype='object')

In [69]:
# Discard the join key and the duplicated precipitation-side label columns.
precip_airTemp = precip_airTemp.drop(
    columns=['UID','County_y', 'County Code_y', 'Month, Year_y', 'Month, Year Code_y']
)

In [70]:
# Give the remaining label columns their final names.
precip_airTemp = precip_airTemp.rename(columns={
    'County_x': 'county_name',
    'County Code_x': 'county_FIPS',
    'Month, Year_x': 'month_year_long',
    'Month, Year Code_x': 'month_year_short',
})

In [31]:
# precip_airTemp.memory_usage(deep=True) / 1_000_000

# precip_airTemp.info()

In [71]:
# Store the repeated label columns as pandas categoricals to save memory.
precip_airTemp = precip_airTemp.astype({
    'county_name': 'category',
    'county_FIPS': 'category',
    'month_year_long': 'category',
})

In [72]:
# Per-column memory footprint in MB; deep=True also counts the string payloads.
precip_airTemp.memory_usage(deep=True).div(1_000_000)

Index                    9.855648
county_name              2.763161
county_FIPS              2.722882
month_year_long          2.506600
month_year_short         2.483632
avg_dailyMaxAirTemp_F    9.855648
min_dailyMaxAirTemp_F    9.855648
max_dailyMaxAirTemp_F    9.855648
avg_daily_precip_mm      9.855648
min_daily_precip_mm      9.855648
max_daily_precip_mm      9.855648
dtype: float64

In [73]:
# Write the combined climate frame to a zip archive holding a single CSV.
# NOTE(review): index=False is not passed, so the row index is written as an
# unnamed first column; the reload cell later in this notebook drops 'Unnamed: 0'.
final_compression_opts = {
    'method': 'zip',
    'archive_name': 'precip_AirTemp_monthly_1979_2011.csv',
}

precip_airTemp.to_csv(
    '../data/cleaned/precip_AirTemp_monthly_1979_2011.zip',
    compression=final_compression_opts,
)

In [35]:
#precip_airTemp

# Inspecting Health Data

In [79]:
# tx_county_inf_df = pd.read_csv('./data/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014_TEXAS_Y2018M03D27.csv')

# tx_county_diseases_df = tx_county_inf_df[tx_county_inf_df['location_name'] != 'Texas']

# travis_county = tx_county_diseases_df[tx_county_diseases_df['location_name'] == 'Travis County']

# travis_county.groupby(by='cause_name')[['mx']].mean()

# travis_county = travis_county[travis_county['sex'] == 'Both']

# travis_county.groupby(by=['cause_name', 'year_id'])[['mx']].mean()

# sns.histplot(travis_county.groupby(by=['cause_name', 'year_id'])[['mx']].mean())

In [2]:
# List the per-state mortality files for each IHME dataset. Hidden entries
# (for example '.ipynb_checkpoints') are filtered out of every list, because
# list.remove() raises ValueError when the entry does not exist and the
# respiratory list was previously not cleaned at all.
cvd_files = [f for f in os.listdir('../data/03_health_data/IHME_USA_COUNTY_CVD_MORTALITY_RATES_1980_2014/') if not f.startswith('.')]
inf_files = [f for f in os.listdir('../data/03_health_data/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014/') if not f.startswith('.')]
resp_files = [f for f in os.listdir('../data/03_health_data/IHME_USA_COUNTY_RESP_DISEASE_MORTALITY_1980_2014/') if not f.startswith('.')]
cancer_files = [f for f in os.listdir('../data/03_health_data/IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014/') if not f.startswith('.')]
substance_injury_files = [f for f in os.listdir('../data/03_health_data/IHME_USA_COUNTY_USE_INJ_MORTALITY_1980_2014/') if not f.startswith('.')]

In [31]:
# universal function to process data in all folders
def process_health_data(file):
    """Read one per-state IHME mortality CSV and return its county-level rows.

    The file name is split on underscores: the fourth token selects the
    dataset folder ('CVD', 'INFECT', 'RESP', 'CANCER' or 'USE') and the state
    name is taken from the tokens between the last four-digit token and the
    final token.

    Parameters
    ----------
    file : str
        File name (not a path) of the CSV to load.

    Returns
    -------
    pandas.DataFrame
        The rows whose 'location_name' differs from the state name and whose
        'FIPS' is present, with a 'state' column added and the 'sex_id' and
        'location_id' columns removed.

    Raises
    ------
    ValueError
        If the file name does not contain a recognised dataset token.
    """
    folders = {
        'CVD': '../data/03_health_data/IHME_USA_COUNTY_CVD_MORTALITY_RATES_1980_2014/',
        'INFECT': '../data/03_health_data/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014/',
        'RESP': '../data/03_health_data/IHME_USA_COUNTY_RESP_DISEASE_MORTALITY_1980_2014/',
        'CANCER': '../data/03_health_data/IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014/',
        'USE': '../data/03_health_data/IHME_USA_COUNTY_USE_INJ_MORTALITY_1980_2014/',
    }
    parts = file.split('_')
    stat_type = parts[3] if len(parts) > 3 else None
    # Previously an unknown token left `folder` unassigned and surfaced as an
    # UnboundLocalError; fail with an explicit message instead.
    if stat_type not in folders:
        raise ValueError(f"Cannot determine the health dataset for file {file!r}")

    # Taking only parts[-2] keeps just the last word of a multi-word state
    # (e.g. 'YORK', 'CAROLINA'). Collect every token after the year range.
    # NOTE(review): assumes multi-word states are underscore-separated in the
    # file names -- confirm against the raw folders.
    year_positions = [i for i, token in enumerate(parts[:-1]) if len(token) == 4 and token.isdigit()]
    state_tokens = parts[year_positions[-1] + 1:-1] if year_positions else []
    if not state_tokens:
        state_tokens = parts[-2:-1]  # fall back to the original single-token rule
    state = ' '.join(state_tokens).title()

    # FIPS is read as object so it is not converted to a number.
    df = pd.read_csv(folders[stat_type] + file, dtype={'FIPS': object})

    # Drop the whole-state summary rows and rows without a FIPS code; .copy()
    # gives an independent frame so the column changes below do not act on a slice.
    is_county_row = (df['location_name'] != state) & df['FIPS'].notna()
    df = df.loc[is_county_row].copy()

    df['state'] = state
    return df.drop(columns=['sex_id', 'location_id'])

In [32]:
# Cardiovascular mortality: clean each state file, stack the results and
# left-pad the FIPS codes with zeros to five characters.
full_cvd_df = pd.concat(
    [process_health_data(state_file) for state_file in cvd_files]
).assign(FIPS=lambda frame: frame['FIPS'].str.zfill(5))

In [35]:
# Infectious disease mortality: clean each state file, stack the results and
# left-pad the FIPS codes with zeros to five characters.
full_inf_df = pd.concat(
    [process_health_data(state_file) for state_file in inf_files]
).assign(FIPS=lambda frame: frame['FIPS'].str.zfill(5))

In [36]:
# Respiratory disease mortality: clean each state file, stack the results and
# left-pad the FIPS codes with zeros to five characters.
full_resp_df = pd.concat(
    [process_health_data(state_file) for state_file in resp_files]
).assign(FIPS=lambda frame: frame['FIPS'].str.zfill(5))

In [37]:
# Substance use / self-injury mortality: clean each state file, stack the
# results and left-pad the FIPS codes with zeros to five characters.
full_subInj_df = pd.concat(
    [process_health_data(state_file) for state_file in substance_injury_files]
).assign(FIPS=lambda frame: frame['FIPS'].str.zfill(5))

In [38]:
# Cancer mortality: clean each state file, stack the results and left-pad
# the FIPS codes with zeros to five characters.
full_cancer_df = pd.concat(
    [process_health_data(state_file) for state_file in cancer_files]
).assign(FIPS=lambda frame: frame['FIPS'].str.zfill(5))

In [39]:
# Remove the columns the analysis does not use. Three frames share one list;
# the cancer and substance/injury frames each get their own variation.
shared_unused = ['measure_id', 'measure_name', 'cause_id', 'age_id', 'age_name', 'metric', 'upper', 'lower']
for frame in (full_cvd_df, full_inf_df, full_resp_df):
    frame.drop(columns=shared_unused, inplace=True)
full_cancer_df.drop(columns=['cause_id', 'upper', 'lower'], inplace=True)
full_subInj_df.drop(columns=shared_unused + ['measure_ID'], inplace=True)

In [40]:
# Display the cleaned cardiovascular frame to check the remaining columns.
full_cvd_df

Unnamed: 0,location_name,FIPS,cause_name,sex,year_id,mx,state
105,Beaverhead County,30001,Cardiovascular diseases,Male,1980,629.352118,Montana
106,Beaverhead County,30001,Cardiovascular diseases,Male,1981,622.091399,Montana
107,Beaverhead County,30001,Cardiovascular diseases,Male,1982,590.455653,Montana
108,Beaverhead County,30001,Cardiovascular diseases,Male,1983,573.687746,Montana
109,Beaverhead County,30001,Cardiovascular diseases,Male,1984,566.645135,Montana
...,...,...,...,...,...,...,...
121480,Wyandot County,39175,Other cardiovascular and circulatory diseases,Both,2010,9.689966,Ohio
121481,Wyandot County,39175,Other cardiovascular and circulatory diseases,Both,2011,9.641477,Ohio
121482,Wyandot County,39175,Other cardiovascular and circulatory diseases,Both,2012,9.653720,Ohio
121483,Wyandot County,39175,Other cardiovascular and circulatory diseases,Both,2013,9.765255,Ohio


In [41]:
# Store the label columns of every mortality frame as pandas categoricals.
category_cols = ['location_name', 'FIPS', 'cause_name', 'sex', 'year_id', 'state']
for frame in (full_cvd_df, full_inf_df, full_resp_df, full_subInj_df, full_cancer_df):
    frame[category_cols] = frame[category_cols].astype('category')

In [44]:
# Write the cardiovascular mortality frame to a zip archive holding one CSV.
cvd_df_compression_opts = {'method': 'zip', 'archive_name': 'cvd_mortality.csv'}

full_cvd_df.to_csv(
    '../data/cleaned/cvd_mortality.zip',
    index=False,
    compression=cvd_df_compression_opts,
)

In [45]:
# Write the infectious disease mortality frame to a zip archive holding one CSV.
inf_df_compression_opts = {'method': 'zip', 'archive_name': 'inf_mortality.csv'}

full_inf_df.to_csv(
    '../data/cleaned/inf_mortality.zip',
    index=False,
    compression=inf_df_compression_opts,
)

In [46]:
# Write the respiratory disease mortality frame to a zip archive holding one CSV.
resp_df_compression_opts = {'method': 'zip', 'archive_name': 'resp_mortality.csv'}

full_resp_df.to_csv(
    '../data/cleaned/resp_mortality.zip',
    index=False,
    compression=resp_df_compression_opts,
)

In [47]:
# Write the substance use / self-injury mortality frame to a zip archive
# holding one CSV.
subInj_df_compression_opts = {
    'method': 'zip',
    'archive_name': 'substanceAbuse_selfInjury_mortality.csv',
}

full_subInj_df.to_csv(
    '../data/cleaned/substanceAbuse_selfInjury_mortality.zip',
    index=False,
    compression=subInj_df_compression_opts,
)

---

# Creating Texas Only Data for Streamlit App

To build our interactive data visualization application, we chose to implement a web app using Streamlit. Because we had not yet covered databases, we opted to load and visualize a smaller dataset: Texas only.

This is our ***"Texas Case Study"*** where we visualize climate, health and demographic data, as well as our K-Means Clustering. 

But first, I'm going to create some Texas-only data to improve our application's rendering time.

In [50]:
# Texas-only subsets of the mortality frames.
tx_cvd_df = full_cvd_df.loc[full_cvd_df['state'] == 'Texas']
tx_inf_df = full_inf_df.loc[full_inf_df['state'] == 'Texas']
tx_resp_df = full_resp_df.loc[full_resp_df['state'] == 'Texas']
tx_subInj_df = full_subInj_df.loc[full_subInj_df['state'] == 'Texas']

In [88]:
# Convert county_name from a categorical back to plain objects.
precip_airTemp = precip_airTemp.astype({'county_name': object})

In [93]:
# Reload the combined climate export from disk, parsing month_year_long as
# dates and keeping the FIPS code and county name as plain objects.
climate_df = pd.read_csv(
    '../data/cleaned/precip_AirTemp_monthly_1979_2011.zip',
    parse_dates=['month_year_long'],
    dtype={'county_FIPS': object, 'county_name': object},
)

In [107]:
#climate_df['county_name'].str.split(', ')[1000][1]

In [105]:
# The state abbreviation is the text after ', ' in names like 'Anderson County, TX'.
climate_df['state'] = climate_df['county_name'].map(lambda name: name.split(', ')[1])

In [108]:
# Remove the saved row-index column and the unused short month column.
climate_df = climate_df.drop(columns=['Unnamed: 0', 'month_year_short'])

In [109]:
# Texas-only subset of the climate data.
is_texas = climate_df['state'] == 'TX'
tx_climate_df = climate_df.loc[is_texas]
tx_climate_df

Unnamed: 0,county_name,county_FIPS,month_year_long,avg_dailyMaxAirTemp_F,min_dailyMaxAirTemp_F,max_dailyMaxAirTemp_F,avg_daily_precip_mm,min_daily_precip_mm,max_daily_precip_mm,state
985644,"Anderson County, TX",48001,1979-01-01,45.30,26.0,74.3,3.50,0.0,32.7,TX
985645,"Anderson County, TX",48001,1979-02-01,57.64,31.4,81.2,3.11,0.0,24.6,TX
985646,"Anderson County, TX",48001,1979-03-01,69.99,48.5,81.3,6.22,0.0,62.1,TX
985647,"Anderson County, TX",48001,1979-04-01,75.90,61.8,87.5,3.08,0.0,30.0,TX
985648,"Anderson County, TX",48001,1979-05-01,81.19,68.4,91.4,5.60,0.0,49.2,TX
...,...,...,...,...,...,...,...,...,...,...
1086223,"Zavala County, TX",48507,2011-08-01,105.02,101.0,110.2,0.35,0.0,21.8,TX
1086224,"Zavala County, TX",48507,2011-09-01,99.11,90.5,107.7,0.81,0.0,13.9,TX
1086225,"Zavala County, TX",48507,2011-10-01,86.49,64.9,96.0,2.32,0.0,58.2,TX
1086226,"Zavala County, TX",48507,2011-11-01,78.56,59.8,92.2,0.75,0.0,41.4,TX


In [112]:
# The state abbreviation is the text after ', ' in names like 'Anderson County, TX'.
# to_numpy() assigns the values positionally rather than by index label.
heat_wave_df['state'] = heat_wave_df['County'].map(lambda name: name.split(', ')[1]).to_numpy()

In [114]:
# Texas-only subset of the heat-wave data.
is_texas = heat_wave_df['state'] == 'TX'
tx_heat_wave_df = heat_wave_df.loc[is_texas]
tx_heat_wave_df

Unnamed: 0,County,County Code,Year,count_hwDays_onDailyMaxTemp,count_hwDays_onDailyMaxHeatIndex,count_hwDays_onDailyNetHeatStress,state
4978,"Anderson County, TX",48001,1981-01-01,0,6,6,TX
12445,"Anderson County, TX",48001,1982-01-01,6,0,0,TX
12446,"Anderson County, TX",48001,1983-01-01,0,0,0,TX
12447,"Anderson County, TX",48001,1984-01-01,7,2,0,TX
12448,"Anderson County, TX",48001,1985-01-01,4,0,0,TX
...,...,...,...,...,...,...,...
16452,"Zavala County, TX",48507,2006-01-01,2,5,12,TX
16453,"Zavala County, TX",48507,2007-01-01,0,4,4,TX
16454,"Zavala County, TX",48507,2008-01-01,0,0,5,TX
16455,"Zavala County, TX",48507,2009-01-01,27,15,14,TX


In [117]:
# Load the combined demographics table, keeping the FIPS code as an object.
demographics_df = pd.read_csv('../data/cleaned/final_combined_df.csv', dtype={'fips': object})
# The original filter matched the literal ' Texas' (leading space), so the
# stored state values appear to carry surrounding whitespace. Stripping before
# comparing keeps matching those rows and also matches a clean 'Texas'.
tx_demographics = demographics_df[demographics_df['state'].str.strip() == 'Texas']

In [121]:
# Write every Texas-only frame to its own CSV for the Streamlit app. Health
# frames come first, then demographics, heat wave and climate.
tx_exports = {
    '../data/cleaned/tx_for_streamlit/tx_cvd.csv': tx_cvd_df,
    '../data/cleaned/tx_for_streamlit/tx_inf.csv': tx_inf_df,
    '../data/cleaned/tx_for_streamlit/tx_resp.csv': tx_resp_df,
    '../data/cleaned/tx_for_streamlit/tx_subInj.csv': tx_subInj_df,
    '../data/cleaned/tx_for_streamlit/tx_demographics.csv': tx_demographics,
    '../data/cleaned/tx_for_streamlit/tx_heat_wave.csv': tx_heat_wave_df,
    '../data/cleaned/tx_for_streamlit/tx_climate_wave.csv': tx_climate_df,
}
for csv_path, tx_frame in tx_exports.items():
    tx_frame.to_csv(csv_path, index=False)

In [127]:
# Display the Texas demographics subset that was written out above.
tx_demographics

Unnamed: 0,fips,total_pop_age_sex,median_age,sex_ratio_males,under_18_percent,over_18_percent,over_65_percent,county,state,total_hh,...,Metro2013,rural_urban_continuum_code_2013,retirement_destination_2015_update,metro_adjacent_2013,female_2011_lower_bound_percent,female_2011_upper_bound_percent,male_2011_lower_bound_percent,male_2011_upper_bound_percent,FIPS,mx
2513,48001,57810.0,39.3,155.7,19.5,80.5,14.3,Anderson County,Texas,16677.0,...,7.0,0.0,0.0,0.0,33.68,43.69,32.53,43.64,48001,68.349249
2514,48003,18036.0,30.8,103.5,31.5,68.5,10.0,Andrews County,Texas,5573.0,...,6.0,0.0,0.0,1.0,35.02,47.33,33.59,45.76,48003,69.646064
2515,48005,87322.0,37.5,95.3,25.7,74.3,15.9,Angelina County,Texas,31035.0,...,5.0,0.0,0.0,0.0,36.60,47.15,32.62,43.40,48005,76.484950
2516,48007,24462.0,50.6,94.8,18.8,81.2,27.4,Aransas County,Texas,9548.0,...,2.0,1.0,1.0,0.0,33.18,44.83,31.73,43.06,48007,52.755955
2517,48009,8716.0,44.5,99.5,21.7,78.3,19.5,Archer County,Texas,3452.0,...,3.0,1.0,0.0,0.0,30.82,42.92,32.06,44.20,48009,54.978487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2762,48499,44366.0,47.9,98.9,19.4,80.6,26.7,Wood County,Texas,16510.0,...,6.0,0.0,1.0,1.0,33.24,43.90,34.65,46.06,48499,67.041721
2763,48501,8631.0,30.3,107.9,33.0,67.0,11.5,Yoakum County,Texas,2617.0,...,7.0,0.0,0.0,0.0,35.33,47.81,33.88,46.02,48501,56.447088
2764,48503,18036.0,41.1,95.0,24.3,75.7,20.4,Young County,Texas,7307.0,...,7.0,0.0,0.0,0.0,31.94,43.70,32.59,44.60,48503,65.106934
2765,48505,14304.0,29.5,98.4,33.5,66.5,13.8,Zapata County,Texas,4503.0,...,6.0,0.0,0.0,1.0,44.36,57.57,38.81,51.67,48505,51.928444


In [None]:
# NOTE(review): unexecuted scratch cell. `pwd` only holds the relative path of
# the Texas CVD export and is not used anywhere in the visible notebook.
pwd = '../data/cleaned/tx_for_streamlit/tx_cvd.csv'