In [26]:
import pandas as pd
from datetime import datetime
import numpy as np
import glob
import os


## Clean up overdose data

In [27]:
orig_cols_to_keep = ["REPORTER_DEA_NO", "REPORTER_STATE", "REPORTER_ZIP", "REPORTER_COUNTY", "DRUG_CODE", "TRANSACTION_CODE", "DRUG_NAME", "QUANTITY", "TRANSACTION_DATE", "CALC_BASE_WT_IN_GM", "DOSAGE_UNIT",
                "TRANSACTION_ID", "Product_Name", "Ingredient_Name", "Measure", "dos_str"]
# keeping more columns than is probably necessary
# just want to make sure we have everything we need, as well as have the opportunity to catch any further filtering we miss at first
pd.set_option('display.max_columns', None)


In [28]:
def find_year(TRANSACTION_DATE):
    """
    Args:
        TRANSACTION_DATE (str): date in format MMDDYYYY

    Returns:
        int: year
    """
    TRANSACTION_DATE = str(TRANSACTION_DATE)
    
    return int(TRANSACTION_DATE[-4:])

# quick test 
print(f"testing the find_year function: {find_year(12202019)}")


def find_month(TRANSACTION_DATE):
    """
    Args:
        TRANSACTION_DATE (str): date in format MMDDYYYY

    Returns:
        int: month
    """
    TRANSACTION_DATE = str(TRANSACTION_DATE)
    
    return int(TRANSACTION_DATE[:2])

# quick test 
print(f"testing the find_month function: {find_month(12202019)}")

testing the find_year function: 2019
testing the find_month function: 12


In [29]:
# to load in the data, we need to truncate the amount of columns we use as well as the states
cols_to_keep = ["REPORTER_DEA_NO", "REPORTER_STATE", "REPORTER_ZIP", "REPORTER_COUNTY", "DRUG_CODE", "TRANSACTION_CODE", "DRUG_NAME", "QUANTITY", "TRANSACTION_DATE", "Product_Name"]

# we know we need Florida, Texas, and Washington
states = ["FL", "TX", "WA"]
# since we are normalizing based on population, I think we should pick states that are regionally close to one another
# we can change this later as a group, but I have these selected below:
variable_states = ["GA", "OK", "OR"]

# append variable states to our original list
states.extend(variable_states)

# now, load in our data as an iterator so we can load in chunks
it = pd.read_csv("00_source_data/arcos_all_washpost.tsv", chunksize=500_000, sep='\t', usecols = cols_to_keep) # may have to change chunksize depending on your computer's memory

# init empty dataframe
df = pd.DataFrame()

for chunk in it:
    # extract year out of date column
    chunk["year"] = chunk["TRANSACTION_DATE"].apply(lambda x: find_year(x))
    chunk["month"] = chunk["TRANSACTION_DATE"].apply(lambda x: find_month(x))

    # ensure we're working in the correct date range
    filtered_chunk = chunk[chunk["year"] > 2002]
    filtered_chunk = filtered_chunk[filtered_chunk["year"] < 2016]

    # filter out the states we want
    filtered_chunk = filtered_chunk[filtered_chunk["REPORTER_STATE"].isin(states)]

    df = df.append(filtered_chunk)
    break
df

df_overdoses = df.copy() # keep a copy of this df for later filtering

In [30]:
df_overdoses.head()

Unnamed: 0,REPORTER_DEA_NO,REPORTER_STATE,REPORTER_ZIP,REPORTER_COUNTY,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,year,month
6224,PB0167127,GA,31793,TIFT,S,9193,HYDROCODONE,2.0,11232009,HYDROCODONE BIT/IBUPROFEN 7.5MG/200M,2009,11
6225,PB0167127,GA,31793,TIFT,S,9193,HYDROCODONE,2.0,12172012,HYDROCODONE BIT./ACETAMINOPHEN TABS.,2012,12
6226,PB0167127,GA,31793,TIFT,S,9193,HYDROCODONE,1.0,2192007,HYDROCODONE/IBUPROFEN 7.5MG/200MG TA,2007,21
6227,PB0167127,GA,31793,TIFT,S,9193,HYDROCODONE,1.0,6102011,HYDROCODONE BIT/ACETA 10MG/325MG USP,2011,61
6228,PB0167127,GA,31793,TIFT,S,9193,HYDROCODONE,1.0,6022008,HYDROCODONE BIT/ACETA 7.5MG/325MG US,2008,60


In [31]:
# now that we have our columns and states filtered, let's export this as a csv and store the (much smaller) result on github

df_overdoses.to_csv("05_cleaned_data/arcos_all_washpost_clean.csv", index=False)

## Clean up cause of death data

In [32]:
path = r'00_source_data/cause_of_death' # use your path
filenames = glob.glob(path + "/*.txt")

df = pd.DataFrame()

for f in filenames:
    temp = pd.read_csv(f, index_col=None, header=0, sep='\t')
    # we're getting some extraneous notes at the bottom - let's just drop for now
    temp.dropna(subset={'County'}, inplace=True)
    
    df = pd.concat([df, temp], axis=0, ignore_index=True)

In [33]:
df

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
0,,"Autauga County, AL",1001.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,397.0
1,,"Baldwin County, AL",1003.0,2003.0,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,10.0
2,,"Baldwin County, AL",1003.0,2003.0,2003.0,All other alcohol-induced causes,A9,14.0
3,,"Baldwin County, AL",1003.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,1479.0
4,,"Barbour County, AL",1005.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,287.0
...,...,...,...,...,...,...,...,...
57236,,"Sweetwater County, WY",56037.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,251
57237,,"Teton County, WY",56039.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,95
57238,,"Uinta County, WY",56041.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,142
57239,,"Washakie County, WY",56043.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,81


In [34]:
# helper functions to separate county and state

def abtract_state(county):
    """
    Args:
        county (str): county name

    Returns:
        str: state
    """
    return county.split(", ")[1]



def abstract_county(county):
    """
    Args:
        county (str): county name

    Returns:
        str: county
    """
    return county.split(", ")[0]


df["State"] = df.apply(lambda x: abtract_state(x["County"]), axis=1)
df["County"] = df.apply(lambda x: abstract_county(x["County"]), axis=1)

df.drop(columns={"Notes"}, inplace=True)

df_cause_of_death = df.copy() # keep a copy of this df for later filtering

In [None]:
df_cause_of_death.head()

Unnamed: 0,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,State
0,Autauga County,1001.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,397.0,AL
1,Baldwin County,1003.0,2003.0,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,10.0,AL
2,Baldwin County,1003.0,2003.0,2003.0,All other alcohol-induced causes,A9,14.0,AL
3,Baldwin County,1003.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,1479.0,AL
4,Barbour County,1005.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,287.0,AL


In [None]:
df_cause_of_death.to_csv("05_cleaned_data/cause_of_death_clean.csv", index=False)

## Final 3 datasets

We should have: (UNSURE IF WE SHOULD EXTEND DATE RANGES, CURRENTLY 3 YEARS BEFORE AND AFTER POLICY IMPLEMENTATION)

- Florida and Georgia 2007 - 2013
- Texas and Oklahoma 2004 - 2010
- Washington and Oregon 2009 - 2015

### Drug overdose - broken down by state

In [None]:
# Florida and Georgia

overdoses_fl_ga = df_overdoses.copy()
overdoses_tx_ok = df_overdoses.copy()
overdoses_wa_or = df_overdoses.copy()

overdoses_fl_ga = overdoses_fl_ga[(overdoses_fl_ga["REPORTER_STATE"] == "FL") | (overdoses_fl_ga["REPORTER_STATE"] == "GA")]
overdoses_tx_ok = overdoses_tx_ok[(overdoses_tx_ok["REPORTER_STATE"] == "TX") | (overdoses_tx_ok["REPORTER_STATE"] == "OK")]
overdoses_wa_or = overdoses_wa_or[(overdoses_wa_or["REPORTER_STATE"] == "WA") | (overdoses_wa_or["REPORTER_STATE"] == "OR")]



# filter appropriate years
fl_ga_start = 2007
fl_ga_end = 2013

tx_ok_start = 2004
tx_ok_end = 2010

wa_or_start = 2009
wa_or_end = 2015


overdoses_fl_ga = overdoses_fl_ga[(overdoses_fl_ga["year"] >= fl_ga_start) & (overdoses_fl_ga["year"] <= fl_ga_end)]
overdoses_tx_ok = overdoses_tx_ok[(overdoses_tx_ok["year"] >= tx_ok_start) & (overdoses_tx_ok["year"] <= tx_ok_end)]
overdoses_wa_or = overdoses_wa_or[(overdoses_wa_or["year"] >= wa_or_start) & (overdoses_wa_or["year"] <= wa_or_end)]


### Cause of death - broken down by state

In [None]:
deaths_fl_ga = df_cause_of_death.copy()
deaths_tx_ok = df_cause_of_death.copy()
deaths_wa_or = df_cause_of_death.copy()

deaths_fl_ga = deaths_fl_ga[(deaths_fl_ga["State"] == "Florida") | (deaths_fl_ga["State"] == "Georgia")]
deaths_tx_ok = deaths_tx_ok[(deaths_tx_ok["State"] == "Texas") | (deaths_tx_ok["State"] == "Oklahoma")]
deaths_wa_or = deaths_wa_or[(deaths_wa_or["State"] == "Washington") | (deaths_wa_or["State"] == "Oregon")]

deaths_fl_ga = deaths_fl_ga[(deaths_fl_ga["Year"] >= fl_ga_start) & (deaths_fl_ga["Year"] <= fl_ga_end)]
deaths_tx_ok = deaths_tx_ok[(deaths_tx_ok["Year"] >= tx_ok_start) & (deaths_tx_ok["Year"] <= tx_ok_end)]  
deaths_wa_or = deaths_wa_or[(deaths_wa_or["Year"] >= wa_or_start) & (deaths_wa_or["Year"] <= wa_or_end)]

### export all to csv

In [None]:
overdoses_fl_ga.to_csv("05_cleaned_data/overdoses_fl_ga.csv", index=False)
overdoses_tx_ok.to_csv("05_cleaned_data/overdoses_tx_ok.csv", index=False)
overdoses_wa_or.to_csv("05_cleaned_data/overdoses_wa_or.csv", index=False)

deaths_fl_ga.to_csv("05_cleaned_data/deaths_fl_ga.csv", index=False)
deaths_tx_ok.to_csv("05_cleaned_data/deaths_tx_ok.csv", index=False)
deaths_wa_or.to_csv("05_cleaned_data/deaths_wa_or.csv", index=False)

## Notes for the group

- need to normalize over population once we get this data
- may need to filter out a couple more columns - haven't done this yet as I don't want to accidentally delete something we need
- overdose data is only broken down by year unless i messed something up - not really sure how to handle this