# Reading and Cleaning Climate Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# cdc wonder Daily Air Temp by County (aggregated by month)
# Jan 1, 1979 - Dec 31, 2011
#temp_df = pd.read_csv('data/climate_data/2011_AirTemp_byState.txt', delimiter="\t", header=0)
#particulateMatter_df = pd.read_csv('data/climate_data/2011_fineParticulateMatter_µgm3.txt', delimiter="\t", header=0)

# Read in Temperature Data

In [5]:
temperature_files = os.listdir('../data/01_climate_data/monthlyAirTemp_byCounty_RAW/')
temperature_files.remove('.ipynb_checkpoints')

In [4]:
#temperature_files

In [6]:
# temp_1979_df = pd.read_csv('./data/01_climate_data/monthlyAirTemp_byCounty_RAW/dailyAirTemp_byCounty_1979.txt', delimiter='\t', header=0)
# temp_1979_df = temp_1979_df[temp_1979_df['County'].isna() == False]
# temp_1979_df = temp_1979_df[temp_1979_df['Notes'] != 'Total']

# temp_1979_df.columns

# temp_1979_df = temp_1979_df[['County', 'County Code', 'Month, Year', 'Month, Year Code', 'Record Count for Daily Max Air Temp (F)' ,'Avg Daily Max Air Temperature (F)', 'Min Temp for Daily Max Air Temp (F)', 'Max Temp for Daily Max Air Temp (F)']]

In [6]:
# making the above easier with a function to grab data in folder, read it in, remove unwanted stuff and return a clean df
def process_temp_data(filename):
    file_string = '../data/01_climate_data/monthlyAirTemp_byCounty_RAW/' + filename
    temp_df = pd.read_csv(file_string, delimiter='\t', header=0, parse_dates=['Month, Year Code'], dtype={'County Code': object})
    
    #drop unneccesary rows
    temp_df = temp_df[temp_df['County'].isna() == False]
    temp_df = temp_df[temp_df['Notes'] != 'Total'] #drop the total rows
    
    # create UID
    temp_df['UID'] = temp_df['County'] + " - " + temp_df['Month, Year']
    
    #get only needed columns
    output_df = temp_df[['UID', 'County', 'County Code', 'Month, Year', 'Month, Year Code','Avg Daily Max Air Temperature (F)', 'Min Temp for Daily Max Air Temp (F)', 'Max Temp for Daily Max Air Temp (F)']]
    output_df = output_df.rename(columns={'Avg Daily Max Air Temperature (F)': 'avg_dailyMaxAirTemp_F', 'Min Temp for Daily Max Air Temp (F)': 'min_dailyMaxAirTemp_F', 'Max Temp for Daily Max Air Temp (F)': 'max_dailyMaxAirTemp_F'})
    return output_df

In [7]:
# run process data function on all data in temperature files
air_temperature_df = [process_temp_data(file) for file in temperature_files]
air_temperature_df = pd.concat(air_temperature_df)

# sort the values
air_temperature_df.sort_values(by=['County Code', 'Month, Year Code'], inplace=True)

In [8]:
air_temperature_df.memory_usage(deep=True) / 1_000_000

Index                      9.855648
UID                      107.115228
County                    92.331756
County Code               76.381272
Month, Year               81.309096
Month, Year Code           9.855648
avg_dailyMaxAirTemp_F      9.855648
min_dailyMaxAirTemp_F      9.855648
max_dailyMaxAirTemp_F      9.855648
dtype: float64

In [9]:
# turn objects to categories to save memory
air_temperature_df['County'] = air_temperature_df['County'].astype('category')
air_temperature_df['Month, Year'] = air_temperature_df['Month, Year'].astype('category')
air_temperature_df['Month, Year Code'] = air_temperature_df['Month, Year Code'].astype('category')

In [416]:
# #write out clean data to csv
# temp_compression_opts = dict(method='zip',
#                         archive_name='airTempMonthly_1979_2011.csv')


# air_temperature_df.to_csv('./data/01_climate_data/01_climate_data_CLEAN/airTempMonthly_1979_2011.zip', index=False, compression=temp_compression_opts)

In [408]:
# # read in clean csv just to test it works
# data = pd.read_csv('./data/01_climate_data/01_climate_data_CLEAN/airTempMonthly_1979_2011.csv')
# data.info()

# Read in the Particulate Matter Data

Significant issues, we probs won't be using this data

In [10]:
# # make list of files in particulate matter
particle_files = os.listdir('../data/01_climate_data/monthlyParticulateMatter_RAW/')
# #particle_files

# test_pfm_df = pd.read_csv('./data/01_climate_data/monthlyParticulateMatter_RAW/monthlyFPM_byCounty_2009.txt', delimiter="\t", header=0)

# test_pfm_df.head(2)

# test_pfm_df = test_pfm_df[test_pfm_df['County'].isna() == False]
# test_pfm_df = test_pfm_df[test_pfm_df['Notes'] != 'Total']

# test_pfm_df.columns

In [1]:
# def process_fpm_data(filename):
#     file_string = '../data/01_climate_data/monthlyParticulateMatter_RAW/' + filename
#     fpm_df = pd.read_csv(file_string, delimiter='\t', header=0, parse_dates=['Month, Year Code'], dtype={'County Code': object}, encoding='unicode_escape')
    
#     #drop unneccesary rows
#     fpm_df = fpm_df[fpm_df['County'].isna() == False]
#     fpm_df = fpm_df[fpm_df['Notes'] != 'Total'] #drop the total rows
    
#     # create UID
#     fpm_df['UID'] = fpm_df['County'] + " - " + fpm_df['Month, Year']
    
#     #get only needed columns
#     output_df = fpm_df[['UID', 'County', 'County Code', 'Month, Year', 'Month, Year Code', 'Avg Fine Particulate Matter (�g/m�)' ,'Min Fine Particulate Matter', 'Max Fine Particulate Matter']]
#     final_df = output_df.rename(columns={'Avg Fine Particulate Matter (�g/m�)': 'avg_FPM', 'Min Fine Particulate Matter': 'min_FPM', 'Max Fine Particulate Matter': 'max_FPM'})
#     return final_df

In [13]:
# #process_fpm_data(particle_files[2]).head(2)

# # run process data function on all data in temperature files
# fine_particulate_matter_df = [process_fpm_data(file) for file in particle_files]
# fine_particulate_matter_df = pd.concat(fine_particulate_matter_df)

# # sort the values
# fine_particulate_matter_df.sort_values(by=['County Code', 'Month, Year Code'], inplace=True)

# # turn objects to categories to save memory
# fine_particulate_matter_df['County'] = fine_particulate_matter_df['County'].astype('category')
# fine_particulate_matter_df['Month, Year'] = fine_particulate_matter_df['Month, Year'].astype('category')
# fine_particulate_matter_df['Month, Year Code'] = fine_particulate_matter_df['Month, Year Code'].astype('category')

# fine_particulate_matter_df

In [426]:
# # creating a Unique Identifier to join later with other climate dfs
# fpm_compression_opts = dict(method='zip',
#                         archive_name='fpm_monthlyByCounty_2003_2011.csv')


# fine_particulate_matter_df.to_csv('./data/01_climate_data/01_climate_data_CLEAN/fpm_monthly_byCounty_2003_2011.zip', index=False, compression=fpm_compression_opts)

# Read in the Heat Wave Days

In [14]:
heat_files = os.listdir('../data/01_climate_data/heatWaveDays_RAW/')

In [25]:
# test_df = pd.read_csv('./data/01_climate_data/heatWaveDays_RAW/heatWaveDays_1982_1985.txt', delimiter='\t', header=0, dtype={'County Code': object})
# test_df[test_df['County'].isna() == False].head(1)

# test_df.columns

In [15]:
def process_heat_wave_data(filename):
    file_string = '../data/01_climate_data/heatWaveDays_RAW/' + filename
    hw_df = pd.read_csv(file_string, delimiter='\t', header=0, parse_dates=['Year'], dtype={'County Code': object})
    
    #drop unneccesary rows
    hw_df = hw_df[hw_df['County'].isna() == False]
    hw_df = hw_df[hw_df['Notes'] != 'Total'] #drop the total rows
    
    #county code to int
    #hw_df = hw_df.astype({'County Code': 'int64'})
    
    # No UID in this df, since it doesn't have the Month, Year col

    
    #get only needed columns
    output_df = hw_df[['County', 'County Code', 'Year', 'Heat Wave Days Based on Daily Maximum Temperature', 'Heat Wave Days Based on Daily Maximum Heat Index', 'Heat Wave Days Based on Net Daily Heat Stress']]
    final_df = output_df.rename(columns={'Heat Wave Days Based on Daily Maximum Temperature': 'count_hwDays_onDailyMaxTemp', 'Heat Wave Days Based on Daily Maximum Heat Index': 'count_hwDays_onDailyMaxHeatIndex', 'Heat Wave Days Based on Net Daily Heat Stress': 'count_hwDays_onDailyNetHeatStress'})
    return final_df

In [16]:
heat_wave_df = [process_heat_wave_data(file) for file in heat_files]
heat_wave_df = pd.concat(heat_wave_df)
heat_wave_df.sort_values(by=['County Code', 'Year'], inplace=True)

In [17]:
heat_wave_df.memory_usage(deep=True) / 1_000_000

Index                                0.746160
County                               6.990030
County Code                          5.782740
Year                                 0.746160
count_hwDays_onDailyMaxTemp          5.431019
count_hwDays_onDailyMaxHeatIndex     5.428730
count_hwDays_onDailyNetHeatStress    5.428157
dtype: float64

In [18]:
heat_wave_df.to_csv('../data/cleaned/heat_wave_days_1981_2010.csv', index=False)

# Read in Precipitation

In [19]:
precipitation_files = os.listdir('../data/01_climate_data/monthlyPrecipitation_RAW/')
#precipitation_files

In [20]:
test_precip = pd.read_csv('../data/01_climate_data/monthlyPrecipitation_RAW/monthlyPrecip_byCounty_1979.txt', delimiter="\t", header=0, dtype={'County Code': object})
#test_precip
test_precip = test_precip[test_precip['County'].isna() != True]
test_precip.head(2)

Unnamed: 0,Notes,County,County Code,"Month, Year","Month, Year Code",Avg Daily Precipitation (mm),# of Observations for Daily Precipitation,Min Daily Precipitation,Max Daily Precipitation
0,,"Autauga County, AL",1001,"Jan, 1979",1979/01,5.59,341.0,0.0,52.6
1,,"Autauga County, AL",1001,"Feb, 1979",1979/02,5.33,308.0,0.0,34.8


In [22]:
def process_precip(filename):
    file_string = '../data/01_climate_data/monthlyPrecipitation_RAW/' + filename
    precip_df = pd.read_csv(file_string, delimiter='\t', header=0, dtype={'County Code': object}, parse_dates=['Month, Year Code'])
    
    #drop unneccesary rows
    precip_df = precip_df[precip_df['County'].isna() == False]
    precip_df = precip_df[precip_df['Notes'] != 'Total'] #drop the total rows
    
    #county code to int
    #precip_df = precip_df.astype({'County Code': 'int64'})
    
    # create UID
    precip_df['UID'] = precip_df['County'] + " - " + precip_df['Month, Year']
    
    #get only needed columns
    output_df = precip_df[['UID', 'County', 'County Code', 'Month, Year', 'Month, Year Code', 'Avg Daily Precipitation (mm)' ,'Min Daily Precipitation', 'Max Daily Precipitation']]
    final_df = output_df.rename(columns={'Avg Daily Precipitation (mm)': 'avg_daily_precip_mm', 'Min Daily Precipitation': 'min_daily_precip_mm', 'Max Daily Precipitation': 'max_daily_precip_mm'})
    return final_df

In [23]:
precipitation_df = [process_precip(file) for file in precipitation_files]
precipitation_df = pd.concat(precipitation_df)

In [24]:
precipitation_df.sort_values(by=['County', 'Month, Year Code'], inplace=True)

In [437]:
# precipitation_df.memory_usage(deep=True) / 1_000_000

# precipitation_df.columns

In [25]:
# turn objects to categories to save memory
precipitation_df['County'] = precipitation_df['County'].astype('category')
precipitation_df['Month, Year'] = precipitation_df['Month, Year'].astype('category')

In [26]:
precipitation_df.memory_usage(deep=True) / 1_000_000

Index                    9.855648
UID                    107.115228
County                   2.763161
County Code             76.381272
Month, Year              2.506600
Month, Year Code         9.855648
avg_daily_precip_mm      9.855648
min_daily_precip_mm      9.855648
max_daily_precip_mm      9.855648
dtype: float64

In [44]:
precipitation_df.head(2)

Unnamed: 0,UID,County,County Code,"Month, Year","Month, Year Code",avg_daily_precip_mm,min_daily_precip_mm,max_daily_precip_mm
29666,"Abbeville County, SC - Jan, 1979","Abbeville County, SC",45001,"Jan, 1979",1979-01-01,5.2,0.0,51.0
29667,"Abbeville County, SC - Feb, 1979","Abbeville County, SC",45001,"Feb, 1979",1979-02-01,6.61,0.0,42.9


In [440]:
# creating a Unique Identifier to join later with other climate dfs
# precip_compression_opts = dict(method='zip',
#                         archive_name='precipitation_monthlyByCounty_1979_2011.csv')

# precipitation_df.to_csv('./data/01_climate_data/01_climate_data_CLEAN/precipitation_monthlyByCounty_1979_2011.zip', compression=precip_compression_opts)

# Combining all Climate Dfs into One

Here we will combine all climate data into one master data frame (sans the heat wave data, since that is not monthly), and write out to a zipped csv. This is to save on file size for github file size limits.

In [27]:
#all dfs to merge: precipitation_df, fine_particulate_matter_df, air_temperature_df
# contextual - heat_wave_df
precip_airTemp = pd.merge(air_temperature_df, precipitation_df, left_on='UID', right_on='UID')
#combined_df = pd.merge(precip_airTemp, fine_particulate_matter_df, left_on='UID', right_on='UID', how='outer')

In [28]:
precip_airTemp.columns

Index(['UID', 'County_x', 'County Code_x', 'Month, Year_x',
       'Month, Year Code_x', 'avg_dailyMaxAirTemp_F', 'min_dailyMaxAirTemp_F',
       'max_dailyMaxAirTemp_F', 'County_y', 'County Code_y', 'Month, Year_y',
       'Month, Year Code_y', 'avg_daily_precip_mm', 'min_daily_precip_mm',
       'max_daily_precip_mm'],
      dtype='object')

In [29]:
precip_airTemp.drop(columns=['UID','County_y', 'County Code_y', 'Month, Year_y', 'Month, Year Code_y'], inplace=True)

In [30]:
precip_airTemp.rename(columns={'County_x': 'county_name', 'County Code_x': 'county_FIPS', 'Month, Year_x': 'month_year_long', 'Month, Year Code_x': 'month_year_short'}, inplace=True)

In [31]:
# precip_airTemp.memory_usage(deep=True) / 1_000_000

# precip_airTemp.info()

In [32]:
# turn objects to categories to save memory
precip_airTemp['county_name'] = precip_airTemp['county_name'].astype('category')
precip_airTemp['county_FIPS'] = precip_airTemp['county_FIPS'].astype('category')
precip_airTemp['month_year_long'] = precip_airTemp['month_year_long'].astype('category')

In [33]:
precip_airTemp.memory_usage(deep=True) / 1_000_000

Index                    9.855648
county_name              2.763161
county_FIPS              2.722882
month_year_long          2.506600
month_year_short         2.483632
avg_dailyMaxAirTemp_F    9.855648
min_dailyMaxAirTemp_F    9.855648
max_dailyMaxAirTemp_F    9.855648
avg_daily_precip_mm      9.855648
min_daily_precip_mm      9.855648
max_daily_precip_mm      9.855648
dtype: float64

In [34]:
# creating a Unique Identifier to join later with other climate dfs
final_compression_opts = dict(method='zip',
                        archive_name='precip_AirTemp_monthly_1979_2011.csv')

precip_airTemp.to_csv('../data/cleaned/precip_AirTemp_monthly_1979_2011.zip', compression=final_compression_opts)

In [35]:
#precip_airTemp

# Inspecting Health Data

In [79]:
# tx_county_inf_df = pd.read_csv('./data/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014_TEXAS_Y2018M03D27.csv')

# tx_county_diseases_df = tx_county_inf_df[tx_county_inf_df['location_name'] != 'Texas']

# travis_county = tx_county_diseases_df[tx_county_diseases_df['location_name'] == 'Travis County']

# travis_county.groupby(by='cause_name')[['mx']].mean()

# travis_county = travis_county[travis_county['sex'] == 'Both']

# travis_county.groupby(by=['cause_name', 'year_id'])[['mx']].mean()

# sns.histplot(travis_county.groupby(by=['cause_name', 'year_id'])[['mx']].mean())

In [2]:
cvd_files = os.listdir('../data/03_health_data/IHME_USA_COUNTY_CVD_MORTALITY_RATES_1980_2014/')
inf_files = os.listdir('../data/03_health_data/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014/')
resp_files = os.listdir('../data/03_health_data/IHME_USA_COUNTY_RESP_DISEASE_MORTALITY_1980_2014/')
cancer_files = os.listdir('../data/03_health_data/IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014/')
substance_injury_files = os.listdir('../data/03_health_data/IHME_USA_COUNTY_USE_INJ_MORTALITY_1980_2014/')

cvd_files.remove('.ipynb_checkpoints')
inf_files.remove('.ipynb_checkpoints')
substance_injury_files.remove('.ipynb_checkpoints')
cancer_files.remove('.ipynb_checkpoints')

In [3]:
# universal function to process data in all folders
def process_health_data(file):
    state = file.split('_')[-2].title()
    stat_type = file.split('_')[3]
    
    #set filepath based on stat_type
    if stat_type == 'CVD':
        folder = '../data/03_health_data/IHME_USA_COUNTY_CVD_MORTALITY_RATES_1980_2014/'
    elif stat_type == 'INFECT':
        folder = '../data/03_health_data/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014/'
    elif stat_type == 'RESP':
        folder = '../data/03_health_data/IHME_USA_COUNTY_RESP_DISEASE_MORTALITY_1980_2014/'
    elif stat_type == 'CANCER':
        folder = '../data/03_health_data/IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014/'
    elif stat_type == 'USE':
        folder = '../data/03_health_data/IHME_USA_COUNTY_USE_INJ_MORTALITY_1980_2014/'
    full_path = folder + file
    #print(full_path)
    
    #read in csv
    df = pd.read_csv(full_path, dtype={'FIPS': object})
    # drop rows that are sums of whole state
    df = df[(df['location_name'] != state) & (df['FIPS'].isna() == False)]
    
    #recase cols to save memory and so we can create a UID later
    # UID is formatted: 'FIPS-cause_id-sex_id-year_id'
    #df[['cause_id', 'sex', 'cause_name', 'location_name', 'FIPS']] = df[['location_name', 'FIPS', 'cause_id', 'sex', 'cause_name']].astype('category')
    #df['UID'] = df['FIPS'] + '-' + df['cause_id'] + '-' + df['sex_id'] + '-' + df['year_id']
    df.drop(columns=['sex_id', 'location_id'], inplace=True)
    
    #recast variables as category for file size handling
    #df[['location_name', 'FIPS', 'cause_id', 'cause_name', 'sex_id', 'sex', 'year_id', 'UID']] = df[['location_name', 'FIPS', 'cause_id', 'cause_name', 'sex_id', 'sex', 'year_id', 'UID']].astype('category')
    
    return df

In [4]:
# process cardiovascular mortality data
full_cvd_df = [process_health_data(file) for file in cvd_files]
full_cvd_df = pd.concat(full_cvd_df)

full_cvd_df['FIPS'] = full_cvd_df['FIPS'].str.zfill(5)

In [6]:
# process infectious disease mortality data
full_inf_df = [process_health_data(file) for file in inf_files]
full_inf_df = pd.concat(full_inf_df)

full_inf_df['FIPS'] = full_inf_df['FIPS'].str.zfill(5)

In [8]:
# process respiratory disease mortality data
full_resp_df = [process_health_data(file) for file in resp_files]
full_resp_df = pd.concat(full_resp_df)

full_resp_df['FIPS'] = full_resp_df['FIPS'].str.zfill(5)

In [10]:
process_health_data(substance_injury_files[0])

Unnamed: 0,measure_id,measure_name,location_name,FIPS,cause_id,cause_name,sex,age_id,age_name,year_id,metric,mx,lower,upper
105,1,Deaths,Adams County,17001,560,Alcohol use disorders,Male,27,Age-standardized,1980,Rate,1.952425,1.275259,2.813030
106,1,Deaths,Adams County,17001,560,Alcohol use disorders,Male,27,Age-standardized,1981,Rate,1.907006,1.288928,2.794510
107,1,Deaths,Adams County,17001,560,Alcohol use disorders,Male,27,Age-standardized,1982,Rate,1.852684,1.259262,2.666070
108,1,Deaths,Adams County,17001,560,Alcohol use disorders,Male,27,Age-standardized,1983,Rate,1.882635,1.284099,2.718167
109,1,Deaths,Adams County,17001,560,Alcohol use disorders,Male,27,Age-standardized,1984,Rate,1.869145,1.296593,2.642727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43255,1,Deaths,Woodford County,17203,724,Interpersonal violence,Both,27,Age-standardized,2010,Rate,1.790531,1.220157,2.508647
43256,1,Deaths,Woodford County,17203,724,Interpersonal violence,Both,27,Age-standardized,2011,Rate,1.847006,1.229327,2.641223
43257,1,Deaths,Woodford County,17203,724,Interpersonal violence,Both,27,Age-standardized,2012,Rate,1.821832,1.195120,2.586801
43258,1,Deaths,Woodford County,17203,724,Interpersonal violence,Both,27,Age-standardized,2013,Rate,1.886489,1.246973,2.792552


In [14]:
cancer_files

['IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_WASHINGTON_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_MARYLAND_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_WYOMING_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_MISSISSIPPI_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_OHIO_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_WISCONSIN_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_IOWA_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_UNITED STATES_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_OKLAHOMA_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_NEVADA_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_FLORIDA_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_CALIFORNIA_Y2017M01D24.CSV',
 'IHME_USA_COUNTY_CANCER_MORTALITY_RATES_1980_2014_COLORADO_Y20

In [9]:
# process substance abuse/self injury disease mortality data
full_subInj_df = [process_health_data(file) for file in substance_injury_files]
full_subInj_df = pd.concat(full_subInj_df)

IndexError: list index out of range

In [19]:
# process cancer disease mortality data
full_cancer_df = [process_health_data(file) for file in cancer_files]
full_cancer_df = pd.concat(full_cancer_df)

IndexError: list index out of range

In [None]:
full_resp_df['FIPS'] = full_resp_df['FIPS'].str.zfill(5)

In [54]:
# dropping all the unnecessary columns
full_cvd_df.drop(columns=['measure_id', 'measure_name', 'cause_id', 'age_id', 'age_name', 'metric', 'upper', 'lower'], inplace=True)
full_inf_df.drop(columns=['measure_id', 'measure_name', 'cause_id', 'age_id', 'age_name', 'metric', 'upper', 'lower'], inplace=True)
full_resp_df.drop(columns=['measure_id', 'measure_name', 'cause_id', 'age_id', 'age_name', 'metric', 'upper', 'lower'], inplace=True)
full_cancer_df.drop(columns=['cause_id', 'upper', 'lower'], inplace=True)
full_subInj_df.drop(columns=['measure_id', 'measure_name', 'cause_id', 'age_id', 'age_name', 'metric', 'upper', 'lower', 'measure_ID'], inplace=True)

In [61]:
full_cvd_df.memory_usage(deep=True) / 1_000_000

Index             34.321560
location_name    304.790850
FIPS             265.556655
cause_name       346.845765
sex              264.562025
year_id           34.321560
mx                34.321560
dtype: float64

In [62]:
full_cvd_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id']] = full_cvd_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id']].astype('category')

full_inf_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id']] = full_inf_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id']].astype('category')

full_resp_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id']] = full_resp_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id']].astype('category')

full_subInj_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id']] = full_subInj_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id']].astype('category')

full_cancer_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id']] = full_cancer_df[['location_name', 'FIPS', 'cause_name', 'sex', 'year_id']].astype('category')

In [72]:
full_subInj_df.memory_usage(deep=True).sum() / 1_000_000

30.922044

In [75]:
full_inf_df.memory_usage(deep=True).sum() / 1_000_000

46.151163

In [76]:
# creating a Unique Identifier to join later with other climate dfs
cvd_df_compression_opts = dict(method='zip',
                        archive_name='cvd_mortality.csv')

full_cvd_df.to_csv('../data/cleaned/cvd_mortality.zip', index=False, compression=cvd_df_compression_opts)

In [77]:
# creating a Unique Identifier to join later with other climate dfs
inf_df_compression_opts = dict(method='zip',
                        archive_name='inf_mortality.csv')

full_inf_df.to_csv('../data/cleaned/inf_mortality.zip', index=False, compression=inf_df_compression_opts)

In [78]:
# creating a Unique Identifier to join later with other climate dfs
resp_df_compression_opts = dict(method='zip',
                        archive_name='resp_mortality.csv')

full_resp_df.to_csv('../data/cleaned/resp_mortality.zip', index=False, compression=resp_df_compression_opts)

In [79]:
# creating a Unique Identifier to join later with other climate dfs
subInj_df_compression_opts = dict(method='zip',
                        archive_name='substanceAbuse_selfInjury_mortality.csv')

full_subInj_df.to_csv('../data/cleaned/resp_mortality.zip', index=False, compression=subInj_df_compression_opts)