In [5]:
import pandas as pd
import glob

In [7]:
path = r'00_source_data/cause_of_death' # point to correct folder
filenames = glob.glob(path + "/*.txt") # select all text files in folder

df = pd.DataFrame() # empty df - will store data from all txt files

for f in filenames:
    temp = pd.read_csv(f, index_col=None, header=0, sep='\t')
    # we're getting some extraneous notes at the bottom - let's just drop based on county as these will only be null for these useless notes columns
    temp.dropna(subset={'County'}, inplace=True)
    
    df = pd.concat([df, temp], axis=0, ignore_index=True)

In [9]:
# helper functions to separate county and state

def abtract_state(county):
    """
    Args:
        county (str): county name

    Returns:
        str: state
    """
    return county.split(", ")[1]



def abstract_county(county):
    """
    Args:
        county (str): county name

    Returns:
        str: county
    """
    return county.split(", ")[0]

# apply functions to our df
df["State"] = df.apply(lambda x: abtract_state(x["County"]), axis=1)
df["County"] = df.apply(lambda x: abstract_county(x["County"]), axis=1)

# do not need notes column, let's just drop it here
df.drop(columns={"Notes"}, inplace=True)

df_cause_of_death = df.copy() # keep a copy of this df for later filtering

In [13]:
# function to select only overdose records
# double check this

def select_overdose(record):
    """Simple function to select only overdose records"""

    if record == "All other non-drug and non-alcohol causes":
        return 0

    if record == "All other alcohol-induced causes":
        return 0

    if record == "All other drug-induced causes":
        return 0

    if record == "Alcohol poisonings (overdose) (X45, X65, Y15)":
        return 0

    if record == "Drug poisonings (overdose) Unintentional (X40-X44)":
        return 1

    if record == "Drug poisonings (overdose) Suicide (X60-X64)":
        return 1

    if record == "Drug poisonings (overdose) Undetermined (Y10-Y14)":
        return 1

    else:
        return "error"
    

# apply to cause of death
df_cause_of_death["overdose"] = df_cause_of_death["Drug/Alcohol Induced Cause"].apply(lambda x: select_overdose(x))

In [14]:
df_cause_of_death = df_cause_of_death[df_cause_of_death["overdose"] == 1]

In [23]:
df_cause_of_death[df_cause_of_death["Deaths"] == "Missing"].head()
# interesting case where death data is missing. These will either have to be imputed or dropped if we use AK as a comparison state

Unnamed: 0,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,State,overdose
52576,Prince of Wales-Outer Ketchikan Census Area,2201.0,2015.0,2015.0,Drug poisonings (overdose) Unintentional (X40-...,D1,Missing,AK,1
52577,Prince of Wales-Outer Ketchikan Census Area,2201.0,2015.0,2015.0,Drug poisonings (overdose) Suicide (X60-X64),D2,Missing,AK,1
52579,Prince of Wales-Outer Ketchikan Census Area,2201.0,2015.0,2015.0,Drug poisonings (overdose) Undetermined (Y10-Y14),D4,Missing,AK,1
52585,Skagway-Hoonah-Angoon Census Area,2232.0,2015.0,2015.0,Drug poisonings (overdose) Unintentional (X40-...,D1,Missing,AK,1
52586,Skagway-Hoonah-Angoon Census Area,2232.0,2015.0,2015.0,Drug poisonings (overdose) Suicide (X60-X64),D2,Missing,AK,1


In [24]:
# for now, let's drop them

df_cause_of_death = df_cause_of_death[df_cause_of_death["Deaths"] != "Missing"]

In [31]:
df_cause_of_death["Deaths"] = df_cause_of_death["Deaths"].apply(lambda x: float(x))
df_cause_of_death["DeaYearths"] = df_cause_of_death["Year"].apply(lambda x: int(x))

In [34]:
# set max display rows to 100
pd.set_option('display.max_rows', 100)

In [35]:
df_cause_of_death.groupby(["State", "Year"]).Deaths.sum().reset_index().sort_values("Deaths", ascending=False)

Unnamed: 0,State,Year,Deaths
64,CA,2015.0,4412.0
63,CA,2014.0,4292.0
62,CA,2013.0,4223.0
60,CA,2011.0,3948.0
58,CA,2009.0,3782.0
...,...,...,...
577,VT,2007.0,12.0
622,WY,2007.0,11.0
520,SD,2010.0,11.0
363,ND,2014.0,10.0
