In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import glob
import os
import xlrd
import re
import openpyxl

# custom file that maps state names to abbreviations
from abbreviation_conversion import abbrev_to_us_state

KeyboardInterrupt: 

## Clean up prescription data


In [None]:
def find_year(TRANSACTION_DATE):
    """
    Args:
        TRANSACTION_DATE (str): date in format MMDDYYYY

    Returns:
        int: year
    """
    TRANSACTION_DATE = str(TRANSACTION_DATE)
    
    return int(TRANSACTION_DATE[-4:])

# quick test 
print(f"testing the find_year function: {find_year(12202019)}")


def find_month(TRANSACTION_DATE):
    """
    Args:
        TRANSACTION_DATE (str): date in format MMDDYYYY

    Returns:
        int: month
    """
    TRANSACTION_DATE = str(TRANSACTION_DATE)

    if len(TRANSACTION_DATE) == 8:
        return int(TRANSACTION_DATE[:2])
    else:
        return int(TRANSACTION_DATE[:1])
    

# quick test 
print(f"testing the find_month function: {find_month(12202019)}")

testing the find_year function: 2019
testing the find_month function: 12


In [None]:
# to load in the data, we need to truncate the amount of columns we use as well as the states
cols_to_keep = ["REPORTER_DEA_NO", "BUYER_STATE", "BUYER_ZIP", "BUYER_COUNTY", "DRUG_CODE", "TRANSACTION_CODE", "DRUG_NAME", "QUANTITY", "TRANSACTION_DATE", "Product_Name"]

# we know we need Florida, Texas, and Washington
states = ["FL", "TX", "WA"]
# since we are normalizing based on population, I think we should pick states that are regionally close to our target states
# we can change this later as a group, but I have these selected below:

# Florida comparison states: Georgia, Alabama, Missisippi, South Carolina, Tennessee
fl_states = ["GA", "AL", "MS", "SC", "TN"]

# Texas comparison states: Oklahoma, Louisiana, New Mexico, Arkansas, Kansas
tx_states = ["OK", "LA", "NM", "AR", "KS"]

# Washington comparison states: Oregon, Idaho, Montana, Nevada, Wyoming
wa_states = ["OR", "ID", "MT", "NV", "WY"]

# create list of all states to use
variable_states = []
variable_states.extend(fl_states)
variable_states.extend(tx_states)
variable_states.extend(wa_states)

# append variable states to our original list
states.extend(variable_states)


In [None]:
# now, load in our data as an iterator so we can load in chunks
it = pd.read_csv("../00_source_data/arcos_all_washpost.tsv", chunksize=500_000, sep='\t', usecols = cols_to_keep) # may have to change chunksize depending on your computer's memory

# init empty dataframe
df = pd.DataFrame()

for chunk in it:
    # extract year out of date column
    chunk["year"] = chunk["TRANSACTION_DATE"].apply(lambda x: find_year(x))
    chunk["month"] = chunk["TRANSACTION_DATE"].apply(lambda x: find_month(x))

    # ensure we're working in the correct date range
    filtered_chunk = chunk[chunk["year"] > 2002]
    filtered_chunk = filtered_chunk[filtered_chunk["year"] < 2016]

    # filter out the states we want
    filtered_chunk = filtered_chunk[filtered_chunk["BUYER_STATE"].isin(states)]

    df = df.append(filtered_chunk)
    break
df

df_prescriptions = df.copy() # keep a copy of this df for later filtering

While doing analysis, we learned that a handful of county values in Nevada were missing. However, when we looked up its zip code (89303), we learned that we could fill this value in with Clark County.

In [None]:
# quick look at the null values (we checked, and these are all the values for which county is null)
df_prescriptions[df_prescriptions["BUYER_COUNTY"].isnull()]

Unnamed: 0,REPORTER_DEA_NO,BUYER_STATE,BUYER_ZIP,BUYER_COUNTY,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,year,month
106970,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,8222006,OXYCONTIN - 80MG OXYCODONE.HCL CONTR,2006,8
106971,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,10272006,OXYCONTIN - 80MG OXYCODONE.HCL CONTR,2006,10
106972,PF0000012,NV,89303,,S,9143,OXYCODONE,2.0,11012006,OXYCODONE HCL/ACETAMINOPHEN 7.5MG/32,2006,11
106973,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,12052006,OXYCODONE HCL 40MG TABS,2006,12
106974,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,1092007,OXYCONTIN (OXYCODONE.HCL) CONTROLLED,2007,1
106975,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,7202007,OXYCONTIN - 80MG OXYCODONE.HCL CONTR,2007,7
106976,PF0000012,NV,89303,,S,9143,OXYCODONE,10.0,8012007,OXYCODONE HCL/ACETAMINOPHEN 10MG/325,2007,8
106977,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,9262007,OXYCODONE HYDROCHLORIDE 30MG TABLET,2007,9
106978,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,11192007,OXYCODONE & ASPIRIN 4.5MG OXYCOD.HCL,2007,11
106979,PF0000012,NV,89303,,S,9143,OXYCODONE,2.0,12262007,OXYCODONE HYDRCHLORIDE 40MG EXTENDED,2007,12


In [None]:
# replace values accordingly
df_prescriptions.loc[df_prescriptions["BUYER_ZIP"] == 89303, "BUYER_COUNTY"] = "CLARK"

In [None]:
# check null values again
df_prescriptions[df_prescriptions["BUYER_COUNTY"].isnull()]

Unnamed: 0,REPORTER_DEA_NO,BUYER_STATE,BUYER_ZIP,BUYER_COUNTY,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,year,month


## Clean up cause of death data

In [None]:
path = r'../00_source_data/cause_of_death' # point to correct folder
filenames = glob.glob(path + "/*.txt")

df = pd.DataFrame()

for f in filenames:
    temp = pd.read_csv(f, index_col=None, header=0, sep='\t')
    # we're getting some extraneous notes at the bottom - let's just drop for now
    temp.dropna(subset={'County'}, inplace=True)
    
    df = pd.concat([df, temp], axis=0, ignore_index=True)

In [None]:
# helper functions to separate county and state

def abtract_state(county):
    """
    Args:
        county (str): county name

    Returns:
        str: state
    """
    return county.split(", ")[1]



def abstract_county(county):
    """
    Args:
        county (str): county name

    Returns:
        str: county
    """
    return county.split(", ")[0]


df["State"] = df.apply(lambda x: abtract_state(x["County"]), axis=1)
df["County"] = df.apply(lambda x: abstract_county(x["County"]), axis=1)

df.drop(columns={"Notes"}, inplace=True)

df_cause_of_death = df.copy() # keep a copy of this df for later filtering

In [None]:
df_cause_of_death.head()

Unnamed: 0,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,State
0,Autauga County,1001.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,397.0,AL
1,Baldwin County,1003.0,2003.0,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,10.0,AL
2,Baldwin County,1003.0,2003.0,2003.0,All other alcohol-induced causes,A9,14.0,AL
3,Baldwin County,1003.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,1479.0,AL
4,Barbour County,1005.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,287.0,AL


In [None]:
df_cause_of_death = df_cause_of_death[df_cause_of_death["State"].isin(states)]

# rename df_cause_of death state AK to AR
df_cause_of_death.loc[df_cause_of_death["State"] == "AK", "State"] = "AR"

## Adding in County Population data

[Census county pop. data, 2000-2010](https://www.census.gov/data/tables/time-series/demo/popest/intercensal-2000-2010-counties.html)<br>
[Census county pop. data, 2010-2019](https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html)<br>
For both, just select the appropriate states on the webpage. We will clean and merge as needed in this notebook.



#### Guide to cleaning - 2000s data

The way the 2000s excel files are formatted, we can clean the data in the following way

- load in with header=3
- drop null on any of the populations
    - notes at the bottom will be removed
- drop unnamed 1, 12, and 13
    - these contain redundant data about populations from specific dates
    - Unnamed 12 is 2010s pop - will be redundant as our next dataset has this as well. Using the newer data
- drop first row
    - state as a whole
- rename Unnamed: 0 to county


In [None]:
pops00 = pd.DataFrame()

# add every excel file in 00_source_data/county_pop/2000s to pops00

path = r"../00_source_data/county_pop/2000s/" # point to correct folder
filenames = glob.glob(path + "*.xls")

for f in filenames:

    # read in current file with header = 3
    temp = pd.read_excel(f, header = 3)

    # regex to pull out state from filename
    r = re.search("(2000s)(.)(\w+)", f)[3]
    temp["State"] = r[:2].upper()
    
    # drop null on any of the years
    temp.dropna(subset=[2000], inplace=True)

    #drop useless columns
    temp.drop(columns={"Unnamed: 1", "Unnamed: 12", "Unnamed: 13"}, inplace=True)

    # drop first row
    temp = temp.iloc[1:, :]

    # rename some cols
    temp.rename(columns={"Unnamed: 0": "County"}, inplace=True)

    # remove period at beginning of each county
    temp["County"] = temp["County"].apply(lambda x: x[1:])

    pops00 = pd.concat([pops00, temp], axis=0, ignore_index=True)

# quick peek at the data
pops00.head()


Unnamed: 0,County,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,State
0,Autauga County,44021.0,44889.0,45909.0,46800.0,48366.0,49676.0,51328.0,52405.0,53277.0,54135.0,AL
1,Baldwin County,141342.0,144875.0,147957.0,151509.0,156266.0,162183.0,168121.0,172404.0,175827.0,179406.0,AL
2,Barbour County,29015.0,28863.0,28653.0,28594.0,28287.0,28027.0,27861.0,27757.0,27808.0,27657.0,AL
3,Bibb County,19913.0,21028.0,21199.0,21399.0,21721.0,22042.0,22099.0,22438.0,22705.0,22941.0,AL
4,Blount County,51107.0,51845.0,52551.0,53457.0,54124.0,54624.0,55485.0,56240.0,57055.0,57341.0,AL


#### Guide to cleaning - 2010s data

The way the 2010s excel files are formatted, we can clean the data in the following way

- load in with header=3
- drop null on any of the populations
    - notes at the bottom will be removed
- drop census, estimates base
- drop first row
    - state as a whole
- rename Unnamed: 0 to county


In [None]:
pops10 = pd.DataFrame()

# add every excel file in 00_source_data/county_pop/2000s to pops00

path = r"../00_source_data/county_pop/2010s" # point to correct folder
filenames = glob.glob(path + "/*.xlsx")

for f in filenames:

    # read in current file with header = 3
    temp = pd.read_excel(f, header = 3)

    # regex to pull out state from filename
    r = re.search("(2010s)(.)(\w+)", f)[3]
    temp["State"] = r[:2].upper()
    
    # drop null on any of the years
    temp.dropna(subset=[2010], inplace=True)

    #drop useless columns
    temp.drop(columns={"Census", "Estimates Base"}, inplace=True)

    # drop first row
    temp = temp.iloc[1:, :]

    # rename some cols
    temp.rename(columns={"Unnamed: 0": "County"}, inplace=True)

    # remove period at beginning of each county
    temp["County"] = temp["County"].apply(lambda x: x[1:])

    # strip state from county
    temp["County"] = temp["County"].apply(lambda x: x.split(", ")[0])

    pops10 = pd.concat([pops10, temp], axis=0, ignore_index=True)

# quick peek at the data
pops10.head()


Unnamed: 0,County,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,State
0,Autauga County,54773.0,55227.0,54954.0,54727.0,54893.0,54864.0,55243.0,55390.0,55533.0,55869.0,AL
1,Baldwin County,183112.0,186558.0,190145.0,194885.0,199183.0,202939.0,207601.0,212521.0,217855.0,223234.0,AL
2,Barbour County,27327.0,27341.0,27169.0,26937.0,26755.0,26283.0,25806.0,25157.0,24872.0,24686.0,AL
3,Bibb County,22870.0,22745.0,22667.0,22521.0,22553.0,22566.0,22586.0,22550.0,22367.0,22394.0,AL
4,Blount County,57376.0,57560.0,57580.0,57619.0,57526.0,57526.0,57494.0,57787.0,57771.0,57826.0,AL


In [None]:
pops10.head()

Unnamed: 0,County,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,State
0,Autauga County,54773.0,55227.0,54954.0,54727.0,54893.0,54864.0,55243.0,55390.0,55533.0,55869.0,AL
1,Baldwin County,183112.0,186558.0,190145.0,194885.0,199183.0,202939.0,207601.0,212521.0,217855.0,223234.0,AL
2,Barbour County,27327.0,27341.0,27169.0,26937.0,26755.0,26283.0,25806.0,25157.0,24872.0,24686.0,AL
3,Bibb County,22870.0,22745.0,22667.0,22521.0,22553.0,22566.0,22586.0,22550.0,22367.0,22394.0,AL
4,Blount County,57376.0,57560.0,57580.0,57619.0,57526.0,57526.0,57494.0,57787.0,57771.0,57826.0,AL


In [None]:
# melt both dfs to get tidy format

pops00 = pops00.melt(["County", "State"])
pops10 = pops10.melt(["County", "State"])

# rename columns accordingly
pops00.rename(columns={"variable": "Year", "value": "Population"}, inplace=True)
pops10.rename(columns={"variable": "Year", "value": "Population"}, inplace=True)

# concatenate the two dfs
pops = pd.concat([pops00, pops10], ignore_index=True)


In [None]:
# check that we have the same number of counties between datasets
assert len(pops00["County"].unique()) == len(pops10["County"].unique())

# check that we have the same number of counties every year
# first, create a df with the number of counties per year
pops_county_check = pops.groupby(["State", "Year"])["County"].count().reset_index()

In [None]:
# group the sum of counties by year and state - will help us check if number of counties changes over the years
grouped_states = pops_county_check.groupby(["Year", "State"])["County"].sum().reset_index().rename(columns={"County": "county_count"})

# here's what this looks like
# we get a dataframe of states and years, with the number of counties in each state in each year
grouped_states.head()

Unnamed: 0,Year,State,county_count
0,2000,AL,67
1,2000,AR,75
2,2000,FL,67
3,2000,GA,159
4,2000,ID,44


In [None]:
# using the above query, we should be able to assert that the number of counties per year is the same
# below statement should always equal zero

assert (grouped_states.duplicated(subset=["Year", "State"]).sum() == 0)
#assert (grouped_states10.duplicated(subset=["Year", "State"]).sum() == 0)


# ensure no duplicate values
assert pops.duplicated().sum() == 0

# loop to check that every state has the same number of counties every year
for state in states:
    assert (pops[pops["State"] == state].Year.value_counts().nunique() == 1), f"error on {state}"

## trying to integrate fip numbers for a better merge

In [None]:
fips = pd.read_csv("https://github.com/ChuckConnell/articles/raw/master/fips2county.tsv", sep="\t")

In [None]:
def get_keys_from_value(d, val):
    return [k for k, v in d.items() if v == val]


keys = get_keys_from_value(abbrev_to_us_state, 'Alabama')
keys

['AL']

In [None]:
fips["state_abbrev"] = fips["StateName"].apply(lambda x: get_keys_from_value(abbrev_to_us_state, x)[0])

In [None]:
fips = fips[fips["state_abbrev"].isin(states)]

In [None]:
# helper function to get rid of the word county in pop df
def remove_county(x):

    if "County" in x:
        return x[:-7]
    else:
        return x


pops["county_test"] = pops["County"].apply(lambda x: remove_county(x))


# fix dona ana and la salle parish
pops["county_test"] = pops["county_test"].apply(lambda x: x.replace("Doña Ana", "Dona Ana"))
fips["CountyName"] = fips["CountyName"].apply(lambda x: x.replace("DoÃ±a Ana", "Dona Ana"))


#pops["county_test"] = pops["county_test"].apply(lambda x: x.replace("La Salle Parish", "La Salle"))


# rename county_test where state is texas and county is la salle to La Salle (TX)
pops.loc[(pops["State"] == "TX") & (pops["county_test"] == "La Salle"), "county_test"] = "La Salle County"
        

In [None]:
# change La Salle county name in fips to La Salle County
fips.loc[fips["CountyName"] == "La Salle", "CountyName"] = "La Salle County"
fips.loc[fips["CountyName"] == "LaSalle Parish", "CountyName"] = "La Salle Parish"
pops.loc[pops["county_test"] == "LaSalle Parish", "county_test"] = "La Salle Parish"



In [None]:
# final merge for population dataset & fip number dataset
pops_copy = pops.merge(fips[["state_abbrev", "CountyFIPS", "StateFIPS", "CountyName"]], left_on=["county_test", "State"], right_on=["CountyName", "state_abbrev"], how="outer", indicator=True)

In [None]:
# should never end up with anything left out of merge
assert len(pops_copy[pops_copy["_merge"] != "both"]) == 0


#TODO: clean up pops_copy dataset for redundant variables

In [None]:
# add fip numbers to df_prescriptions

# create copies of both dfs in case I mess up

prescriptions_copy = df_prescriptions.copy()
fips_copy = fips.copy()

In [None]:
# make buyer_county all lowercase
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.lower())

# do the same for fips
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: x.lower())

In [None]:
# remove county and parish from fips_copy

def remove_parish(x):

    if "parish" in x:
        return x[:-7]
    else:
        return x


# prescription dataset has similar format - match fips to this format
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: remove_county(x))
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: remove_parish(x))

def expand_saint(x):

    if "st." in x:
        return x.replace("st.", "saint")
    else:
        return x

# fix various other inconsistencies
# left only values first
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: expand_saint(x))

fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: x.replace("desoto", "de soto"))
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("desoto", "de soto"))
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("st john the baptist", "saint john the baptist"))

fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: x.replace("dekalb", "de kalb"))
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("dekalb", "de kalb"))

# fix right only values

prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("desoto", "de soto"))




In [None]:
prescriptions_fips = prescriptions_copy.merge(fips_copy, left_on=["BUYER_COUNTY", "BUYER_STATE"], right_on=["CountyName", "state_abbrev"], how="outer", indicator=True)

# capitalize year and month columns
prescriptions_fips.rename(columns={"year": "Year", "month": "Month"}, inplace=True)

In [None]:
assert prescriptions_fips[prescriptions_fips["_merge"] == "left_only"].shape[0] == 0

#### Not all counties joining back to prescription dataset

This could be okay, but I want to do a quick check that there are just not records for these counties. To do this, I'll take a small sample of counties in our FIPS dataset that did NOT merge properly to the prescriptions dataset, and search each one manually in the prescription dataset. I will search various different ways the counties could be transcribed, as well as google the county to ensure there are no secondary names for the same county.

In [None]:
# looks like we have some missing values from right_only that we can't find in df_prescriptions
# may need to look more into why this is - but as of now all we know is the given data is incomplete
# choose a sample, search other dataframe, make conclusion

prescriptions_fips[prescriptions_fips["_merge"] != "both"].sample(10, random_state=0)

Unnamed: 0,REPORTER_DEA_NO,BUYER_STATE,BUYER_ZIP,BUYER_COUNTY,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,...,Month,StateFIPS,CountyFIPS_3,CountyName,StateName,CountyFIPS,StateAbbr,STATE_COUNTY,state_abbrev,_merge
148422,,,,,,,,,,,...,,48,459,upshur,Texas,48459,TX,TX | UPSHUR,TX,right_only
147901,,,,,,,,,,,...,,13,73,columbia,Georgia,13073,GA,GA | COLUMBIA,GA,right_only
148377,,,,,,,,,,,...,,48,335,mitchell,Texas,48335,TX,TX | MITCHELL,TX,right_only
148192,,,,,,,,,,,...,,41,21,gilliam,Oregon,41021,OR,OR | GILLIAM,OR,right_only
147904,,,,,,,,,,,...,,13,85,dawson,Georgia,13085,GA,GA | DAWSON,GA,right_only
147911,,,,,,,,,,,...,,13,123,gilmer,Georgia,13123,GA,GA | GILMER,GA,right_only
148195,,,,,,,,,,,...,,45,1,abbeville,South Carolina,45001,SC,SC | ABBEVILLE,SC,right_only
148153,,,,,,,,,,,...,,40,63,hughes,Oklahoma,40063,OK,OK | HUGHES,OK,right_only
148055,,,,,,,,,,,...,,20,173,sedgwick,Kansas,20173,KS,KS | SEDGWICK,KS,right_only
148254,,,,,,,,,,,...,,48,35,bosque,Texas,48035,TX,TX | BOSQUE,TX,right_only


In [None]:
# set max number of columns, so I can look at all records in a separate file
#pd.set_option("display.max_rows", None)

# look at every county in Texas that is not in the prescription dataset
# by using value_counts(), we can look at every county in the dataset
#prescriptions_fips[prescriptions_fips["BUYER_STATE"] == "KS"]["BUYER_COUNTY"].value_counts()

The above is now commented out, but the same test can be repeated as follows:

1. uncomment lines 2, 6 above
2. for each county you want to look for in the dataset, change the logic in line 6 to the county's associated state
3. click on the "show more in text editor" and ctrl-f for the county along with potential other spellings/variations of the county
4. google the county to ensure we did not miss any variations

Given that we aren't able to find any records from our sample, we conclude that these values are simply missing from the prescription shipping dataset. Therefore, the right_only merge indicator is acceptable as we still have a comprehensive set of counties in our data.

In [None]:
# reset max rows
pd.reset_option("display.max_rows")


In [None]:
prescriptions_fips = prescriptions_fips[prescriptions_fips["_merge"] != "right_only"]

# may have to add explanation for this higher up
assert len(df_prescriptions) == len(prescriptions_fips)


### adding fips to our cause of death data

In [None]:
# create copies of both dfs

cause_of_death_copy = df_cause_of_death.copy()
fips_copy = fips.copy()

In [None]:
cause_of_death_copy["County"] = cause_of_death_copy["County"].apply(lambda x: remove_county(x))


In [None]:
# clean some values up

cause_of_death_copy["County"] = cause_of_death_copy["County"].apply(lambda x: x.replace("LaSalle Parish", "La Salle Parish"))
cause_of_death_copy["County"] = cause_of_death_copy["County"].apply(lambda x: x.replace("DeBaca", "De Baca"))
cause_of_death_copy["County"] = cause_of_death_copy["County"].apply(lambda x: x.replace("La Salle", "La Salle County"))
cause_of_death_copy["County"] = cause_of_death_copy["County"].apply(lambda x: x.replace("La Salle County Parish", "La Salle Parish"))



In [None]:
cause_of_death_fips = cause_of_death_copy.merge(fips_copy, left_on=["County", "State"], right_on=["CountyName", "state_abbrev"], how="outer", indicator=True)

#### Not all counties joining back to cause of death dataset

This could be okay, but I want to do a quick check that there are just not records for these counties. We have a smaller amount of counties not joining back (only four this time), so we'll just run our previous test on all of these counties.

In [None]:
cause_of_death_fips[cause_of_death_fips["_merge"] != "both"]

# only four missing values

Unnamed: 0,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,State,StateFIPS,CountyFIPS_3,CountyName,StateName,CountyFIPS,StateAbbr,STATE_COUNTY,state_abbrev,_merge
23285,,,,,,,,,30,69,Petroleum,Montana,30069,MT,MT | PETROLEUM,MT,right_only
23286,,,,,,,,,48,261,Kenedy,Texas,48261,TX,TX | KENEDY,TX,right_only
23287,,,,,,,,,48,269,King,Texas,48269,TX,TX | KING,TX,right_only
23288,,,,,,,,,48,301,Loving,Texas,48301,TX,TX | LOVING,TX,right_only


In [None]:
# repeating the test from prescriptions dataset
#pd.set_option("display.max_rows", None)
#cause_of_death_copy[cause_of_death_copy["State"] == "MT"]["County"].value_counts()

# reset max rows
pd.reset_option("display.max_rows")

Given that we aren't able to find any records from our only four missing records, we conclude that these values are simply missing from the cause of death dataset. Therefore, the right_only merge indicator is acceptable as we still have a comprehensive set of counties in our data.

### Adding Population to final DataFrames

For pop_fips, cause_of_death_fips, and prescription_fips. Steps needed:

- Create unique ID from county FIPS and state FIPS
- Merge population dataset based on this

In [None]:
cause_of_death_fips = cause_of_death_fips[cause_of_death_fips["_merge"] == "both"]
#pops_copy = pops_copy[cause_of_death_fips["_merge"] == "both"]
prescriptions_fips = prescriptions_fips[prescriptions_fips["_merge"] == "both"]


# drop merge columns
cause_of_death_fips.drop(columns=["_merge"], inplace=True)
prescriptions_fips.drop(columns=["_merge"], inplace=True)
pops_copy.drop(columns=["_merge",], inplace=True)

In [None]:
# create unique FIP from county and state fips

cause_of_death_fips["FIP_unique"] = cause_of_death_fips["CountyFIPS"].apply(lambda x: str(x)) + cause_of_death_fips["StateFIPS"].apply(lambda x: str(x))
prescriptions_fips["FIP_unique"] = prescriptions_fips["CountyFIPS"].apply(lambda x: str(x)) + prescriptions_fips["StateFIPS"].apply(lambda x: str(x))
pops_copy["FIP_unique"] = pops_copy["CountyFIPS"].apply(lambda x: str(x)) + pops_copy["StateFIPS"].apply(lambda x: str(x))


In [None]:
# TODO: add some sort of assert here. not sure what it should be yet


In [None]:
pops_copy

Unnamed: 0,County,State,Year,Population,county_test,state_abbrev,CountyFIPS,StateFIPS,CountyName,FIP_unique
0,Autauga County,AL,2000,44021.0,Autauga,AL,1001,1,Autauga,10011
1,Autauga County,AL,2001,44889.0,Autauga,AL,1001,1,Autauga,10011
2,Autauga County,AL,2002,45909.0,Autauga,AL,1001,1,Autauga,10011
3,Autauga County,AL,2003,46800.0,Autauga,AL,1001,1,Autauga,10011
4,Autauga County,AL,2004,48366.0,Autauga,AL,1001,1,Autauga,10011
...,...,...,...,...,...,...,...,...,...,...
26775,Weston County,WY,2015,7208.0,Weston,WY,56045,56,Weston,5604556
26776,Weston County,WY,2016,7220.0,Weston,WY,56045,56,Weston,5604556
26777,Weston County,WY,2017,6968.0,Weston,WY,56045,56,Weston,5604556
26778,Weston County,WY,2018,6924.0,Weston,WY,56045,56,Weston,5604556


In [None]:
# create final prescriptions dataset with populations
# can safely left join here, because we only need records in the prescriptions dataset
prescriptions = prescriptions_fips.merge(pops_copy, on=["FIP_unique", "Year"], how="left", indicator=True)

assert (prescriptions["_merge"] == "both").all()

In [None]:
# one more assert to check length
assert len(prescriptions) == len(prescriptions_fips)


In [None]:
# drop some useless columns
prescriptions.drop(columns=["_merge", "CountyName_y", "StateFIPS_y", "CountyFIPS_y","state_abbrev_y", "County", "CountyFIPS_3"], inplace=True)

# rename x columns
prescriptions.rename(columns={"CountyName_x": "CountyName", "StateFIPS_x": "StateFIPS", "CountyFIPS_x": "CountyFIPS", "state_abbrev_x": "state_abbrev"}, inplace=True)


In [None]:
# see what the data looks like
prescriptions.head()

Unnamed: 0,REPORTER_DEA_NO,BUYER_STATE,BUYER_ZIP,BUYER_COUNTY,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,...,CountyName,StateName,CountyFIPS,StateAbbr,STATE_COUNTY,state_abbrev,FIP_unique,State,Population,county_test
0,PB0034861,FL,33460.0,palm beach,S,9143.0,OXYCODONE,1.0,8182006.0,OXYCODONE HCL 40MG TABS,...,palm beach,Florida,12099,FL,FL | PALM BEACH,FL,1209912,FL,1284489.0,Palm Beach
1,PB0034861,FL,33460.0,palm beach,S,9143.0,OXYCODONE,8.0,11292006.0,ENDOCET - 10MG OXYCODONE.HCL/325MG A,...,palm beach,Florida,12099,FL,FL | PALM BEACH,FL,1209912,FL,1284489.0,Palm Beach
2,PB0034861,FL,33460.0,palm beach,S,9143.0,OXYCODONE,3.0,2062007.0,OXYCODONE HCL 80MG TABS,...,palm beach,Florida,12099,FL,FL | PALM BEACH,FL,1209912,FL,1286586.0,Palm Beach
3,PB0034861,FL,33460.0,palm beach,S,9143.0,OXYCODONE,3.0,3012007.0,OXYCODONE HCL 80MG TABS,...,palm beach,Florida,12099,FL,FL | PALM BEACH,FL,1209912,FL,1286586.0,Palm Beach
4,PB0034861,FL,33460.0,palm beach,S,9143.0,OXYCODONE,6.0,4162007.0,ENDOCET - 10MG OXYCODONE.HCL/325MG A,...,palm beach,Florida,12099,FL,FL | PALM BEACH,FL,1209912,FL,1286586.0,Palm Beach


In [None]:
# create final cause of death dataset with populations
# can safely left join here, because we only need records in the cause of death dataset
cause_of_death = cause_of_death_fips.merge(pops_copy, on=["FIP_unique", "Year"], how="left", indicator=True)

assert (cause_of_death["_merge"] == "both").all()

In [None]:
# quick look at the dataset
cause_of_death.head()


# drop some useless columns
cause_of_death.drop(columns=["_merge", "CountyName_y", "StateFIPS_y", "CountyFIPS_y","state_abbrev_y", "County_y", "CountyFIPS_3", "State_y"], inplace=True)

# rename x columns
cause_of_death.rename(columns={"County_x": "County", "Year_x": "Year", "State_x": "State", "StateFIPS_x": "StateFIPS", "CountyFIPS_x": "CountyFIPS", "state_abbrev_x": "state_abbrev", "CountyName_x": "CountyName"}, inplace=True)


In [None]:
# asserts to make sure we didn't lose any records from our original datasets

assert len(df_cause_of_death) == len(cause_of_death)
assert len(df_prescriptions) == len(prescriptions)

#### Export main, unjoined datasets in case we need them

In [None]:
cause_of_death.to_csv("../05_cleaned_data/cause_of_death_clean.csv", index=False)
prescriptions.to_csv("../05_cleaned_data/arcos_all_washpost_clean.csv", index=False)

## Final 3 datasets

We should have: (UNSURE IF WE SHOULD EXTEND DATE RANGES, CURRENTLY 3 YEARS BEFORE AND AFTER POLICY IMPLEMENTATION)

- Florida and Georgia 2007 - 2013
- Texas and Oklahoma 2004 - 2010
- Washington and Oregon 2009 - 2015

### Drug overdose - broken down by state

In [None]:
# Florida and Georgia

prescriptions_fl = prescriptions.copy()
prescriptions_tx = prescriptions.copy()
prescriptions_wa = prescriptions.copy()

prescriptions_fl = prescriptions_fl[(prescriptions_fl["BUYER_STATE"] == "FL") | (prescriptions_fl["BUYER_STATE"].isin(fl_states))]
prescriptions_tx = prescriptions_tx[(prescriptions_tx["BUYER_STATE"] == "TX") | (prescriptions_tx["BUYER_STATE"]).isin(tx_states)]
prescriptions_wa = prescriptions_wa[(prescriptions_wa["BUYER_STATE"] == "WA") | (prescriptions_wa["BUYER_STATE"]).isin(wa_states)]



# filter appropriate years
fl_start = 2007
fl_end = 2013

tx_start = 2004
tx_end = 2010

wa_start = 2009
wa_end = 2015


prescriptions_fl = prescriptions_fl[(prescriptions_fl["Year"] >= fl_start) & (prescriptions_fl["Year"] <= fl_end)]
prescriptions_tx = prescriptions_tx[(prescriptions_tx["Year"] >= tx_start) & (prescriptions_tx["Year"] <= tx_end)]
prescriptions_wa = prescriptions_wa[(prescriptions_wa["Year"] >= wa_start) & (prescriptions_wa["Year"] <= wa_end)]


### Cause of death - broken down by state

In [None]:
deaths_fl = cause_of_death.copy()
deaths_tx = cause_of_death.copy()
deaths_wa = cause_of_death.copy()

deaths_fl = deaths_fl[(deaths_fl["StateName"] == "Florida") | (deaths_fl["State"].isin(fl_states))]
deaths_tx = deaths_tx[(deaths_tx["StateName"] == "Texas") | (deaths_tx["State"].isin(tx_states))]
deaths_wa = deaths_wa[(deaths_wa["StateName"] == "Washington") | (deaths_wa["State"].isin(wa_states))]

deaths_fl = deaths_fl[(deaths_fl["Year"] >= fl_start) & (deaths_fl["Year"] <= fl_end)]
deaths_tx = deaths_tx[(deaths_tx["Year"] >= tx_start) & (deaths_tx["Year"] <= tx_end)]  
deaths_wa = deaths_wa[(deaths_wa["Year"] >= wa_start) & (deaths_wa["Year"] <= wa_end)]

### export all to csv

In [None]:
prescriptions_fl.to_csv("../05_cleaned_data/prescriptions_fl.csv", index=False)
prescriptions_tx.to_csv("../05_cleaned_data/prescriptions_tx.csv", index=False)
prescriptions_wa.to_csv("../05_cleaned_data/prescriptions_wa.csv", index=False)

deaths_fl.to_csv("../05_cleaned_data/deaths_fl.csv", index=False)
deaths_tx.to_csv("../05_cleaned_data/deaths_tx.csv", index=False)
deaths_wa.to_csv("../05_cleaned_data/deaths_wa.csv", index=False)

## Notes for the group

- may need to filter out a couple more columns - haven't done this yet as I don't want to accidentally delete something we need
- overdose data is only broken down by year unless i messed something up - overdose analysis will have to be less granular