In [60]:
import pandas as pd
from datetime import datetime
import numpy as np
import glob
import os
import xlrd
import re
import openpyxl

from abbreviation_conversion import abbrev_to_us_state

## Clean up prescription data


In [61]:
pd.set_option('display.max_columns', None)


In [62]:
def find_year(TRANSACTION_DATE):
    """
    Args:
        TRANSACTION_DATE (str): date in format MMDDYYYY

    Returns:
        int: year
    """
    TRANSACTION_DATE = str(TRANSACTION_DATE)
    
    return int(TRANSACTION_DATE[-4:])

# quick test 
print(f"testing the find_year function: {find_year(12202019)}")


def find_month(TRANSACTION_DATE):
    """
    Args:
        TRANSACTION_DATE (str): date in format MMDDYYYY

    Returns:
        int: month
    """
    TRANSACTION_DATE = str(TRANSACTION_DATE)

    if len(TRANSACTION_DATE) == 8:
        return int(TRANSACTION_DATE[:2])
    else:
        return int(TRANSACTION_DATE[:1])
    

# quick test 
print(f"testing the find_month function: {find_month(12202019)}")

testing the find_year function: 2019
testing the find_month function: 12


In [63]:
# to load in the data, we need to truncate the amount of columns we use as well as the states
cols_to_keep = ["REPORTER_DEA_NO", "BUYER_STATE", "BUYER_ZIP", "BUYER_COUNTY", "DRUG_CODE", "TRANSACTION_CODE", "DRUG_NAME", "QUANTITY", "TRANSACTION_DATE", "Product_Name"]

# we know we need Florida, Texas, and Washington
states = ["FL", "TX", "WA"]
# since we are normalizing based on population, I think we should pick states that are regionally close to our target states
# we can change this later as a group, but I have these selected below:

# Florida comparison states: Georgia, Alabama, Missisippi, South Carolina, Tennessee
fl_states = ["GA", "AL", "MS", "SC", "TN"]

# Texas comparison states: Oklahoma, Louisiana, New Mexico, Arkansas, Kansas
tx_states = ["OK", "LA", "NM", "AR", "KS"]

# Washington comparison states: Oregon, Idaho, Montana, Nevada, Wyoming
wa_states = ["OR", "ID", "MT", "NV", "WY"]

# create list of all states to use
variable_states = []
variable_states.extend(fl_states)
variable_states.extend(tx_states)
variable_states.extend(wa_states)

# append variable states to our original list
states.extend(variable_states)


In [64]:
# now, load in our data as an iterator so we can load in chunks
it = pd.read_csv("00_source_data/arcos_all_washpost.tsv", chunksize=500_000, sep='\t', usecols = cols_to_keep) # may have to change chunksize depending on your computer's memory

# init empty dataframe
df = pd.DataFrame()

for chunk in it:
    # extract year out of date column
    chunk["year"] = chunk["TRANSACTION_DATE"].apply(lambda x: find_year(x))
    chunk["month"] = chunk["TRANSACTION_DATE"].apply(lambda x: find_month(x))

    # ensure we're working in the correct date range
    filtered_chunk = chunk[chunk["year"] > 2002]
    filtered_chunk = filtered_chunk[filtered_chunk["year"] < 2016]

    # filter out the states we want
    filtered_chunk = filtered_chunk[filtered_chunk["BUYER_STATE"].isin(states)]

    df = df.append(filtered_chunk)
    break
df

df_prescriptions = df.copy() # keep a copy of this df for later filtering

While doing analysis, we learned that a handful of county values in Nevada were missing. However, when we looked up its zip code (89303), we learned that we could fill this value in with Clark County.

In [65]:
# quick look at the null values (we checked, and these are all the values for which county is null)
df_prescriptions[df_prescriptions["BUYER_COUNTY"].isnull()]

Unnamed: 0,REPORTER_DEA_NO,BUYER_STATE,BUYER_ZIP,BUYER_COUNTY,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,year,month
106970,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,8222006,OXYCONTIN - 80MG OXYCODONE.HCL CONTR,2006,8
106971,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,10272006,OXYCONTIN - 80MG OXYCODONE.HCL CONTR,2006,10
106972,PF0000012,NV,89303,,S,9143,OXYCODONE,2.0,11012006,OXYCODONE HCL/ACETAMINOPHEN 7.5MG/32,2006,11
106973,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,12052006,OXYCODONE HCL 40MG TABS,2006,12
106974,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,1092007,OXYCONTIN (OXYCODONE.HCL) CONTROLLED,2007,1
106975,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,7202007,OXYCONTIN - 80MG OXYCODONE.HCL CONTR,2007,7
106976,PF0000012,NV,89303,,S,9143,OXYCODONE,10.0,8012007,OXYCODONE HCL/ACETAMINOPHEN 10MG/325,2007,8
106977,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,9262007,OXYCODONE HYDROCHLORIDE 30MG TABLET,2007,9
106978,PF0000012,NV,89303,,S,9143,OXYCODONE,1.0,11192007,OXYCODONE & ASPIRIN 4.5MG OXYCOD.HCL,2007,11
106979,PF0000012,NV,89303,,S,9143,OXYCODONE,2.0,12262007,OXYCODONE HYDRCHLORIDE 40MG EXTENDED,2007,12


In [66]:
# replace values accordingly
df_prescriptions.loc[df_prescriptions["BUYER_ZIP"] == 89303, "BUYER_COUNTY"] = "CLARK"

In [67]:
# check null values again
df_prescriptions[df_prescriptions["BUYER_COUNTY"].isnull()]

Unnamed: 0,REPORTER_DEA_NO,BUYER_STATE,BUYER_ZIP,BUYER_COUNTY,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,year,month


In [68]:
# now that we have our columns and states filtered, let's export this as a csv and store the (much smaller) result on github

df_prescriptions.to_csv("05_cleaned_data/arcos_all_washpost_clean.csv", index=False)

## Clean up cause of death data

In [69]:
path = r'00_source_data/cause_of_death' # point to correct folder
filenames = glob.glob(path + "/*.txt")

df = pd.DataFrame()

for f in filenames:
    temp = pd.read_csv(f, index_col=None, header=0, sep='\t')
    # we're getting some extraneous notes at the bottom - let's just drop for now
    temp.dropna(subset={'County'}, inplace=True)
    
    df = pd.concat([df, temp], axis=0, ignore_index=True)

In [70]:
# helper functions to separate county and state

def abtract_state(county):
    """
    Args:
        county (str): county name

    Returns:
        str: state
    """
    return county.split(", ")[1]



def abstract_county(county):
    """
    Args:
        county (str): county name

    Returns:
        str: county
    """
    return county.split(", ")[0]


df["State"] = df.apply(lambda x: abtract_state(x["County"]), axis=1)
df["County"] = df.apply(lambda x: abstract_county(x["County"]), axis=1)

df.drop(columns={"Notes"}, inplace=True)

df_cause_of_death = df.copy() # keep a copy of this df for later filtering

In [71]:
df_cause_of_death.head()

Unnamed: 0,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,State
0,Autauga County,1001.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,397.0,AL
1,Baldwin County,1003.0,2003.0,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,10.0,AL
2,Baldwin County,1003.0,2003.0,2003.0,All other alcohol-induced causes,A9,14.0,AL
3,Baldwin County,1003.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,1479.0,AL
4,Barbour County,1005.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,287.0,AL


In [72]:
df_cause_of_death = df_cause_of_death[df_cause_of_death["State"].isin(states)]

# rename df_cause_of death state AK to AR
df_cause_of_death.loc[df_cause_of_death["State"] == "AK", "State"] = "AR"

In [73]:
df_cause_of_death.to_csv("05_cleaned_data/cause_of_death_clean.csv", index=False)

## Adding in County Population data

[Census county pop. data, 2000-2010](https://www.census.gov/data/tables/time-series/demo/popest/intercensal-2000-2010-counties.html)<br>
[Census county pop. data, 2010-2019](https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html)<br>
For both, just select the appropriate states on the webpage. We will clean and merge as needed in this notebook.



#### Guide to cleaning - 2000s data

The way the 2000s excel files are formatted, we can clean the data in the following way

- load in with header=3
- drop null on any of the populations
    - notes at the bottom will be removed
- drop unnamed 1, 12, and 13
    - these contain redundant data about populations from specific dates
    - Unnamed 12 is 2010s pop - will be redundant as our next dataset has this as well. Using the newer data
- drop first row
    - state as a whole
- rename Unnamed: 0 to county


In [74]:
pops00 = pd.DataFrame()

# add every excel file in 00_source_data/county_pop/2000s to pops00

path = r"00_source_data/county_pop/2000s/" # point to correct folder
filenames = glob.glob(path + "*.xls")

for f in filenames:

    # read in current file with header = 3
    temp = pd.read_excel(f, header = 3)

    # regex to pull out state from filename
    r = re.search("(2000s)(.)(\w+)", f)[3]
    temp["State"] = r[:2].upper()
    
    # drop null on any of the years
    temp.dropna(subset=[2000], inplace=True)

    #drop useless columns
    temp.drop(columns={"Unnamed: 1", "Unnamed: 12", "Unnamed: 13"}, inplace=True)

    # drop first row
    temp = temp.iloc[1:, :]

    # rename some cols
    temp.rename(columns={"Unnamed: 0": "County"}, inplace=True)

    # remove period at beginning of each county
    temp["County"] = temp["County"].apply(lambda x: x[1:])

    pops00 = pd.concat([pops00, temp], axis=0, ignore_index=True)

# quick peek at the data
pops00.head()


Unnamed: 0,County,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,State
0,Autauga County,44021.0,44889.0,45909.0,46800.0,48366.0,49676.0,51328.0,52405.0,53277.0,54135.0,AL
1,Baldwin County,141342.0,144875.0,147957.0,151509.0,156266.0,162183.0,168121.0,172404.0,175827.0,179406.0,AL
2,Barbour County,29015.0,28863.0,28653.0,28594.0,28287.0,28027.0,27861.0,27757.0,27808.0,27657.0,AL
3,Bibb County,19913.0,21028.0,21199.0,21399.0,21721.0,22042.0,22099.0,22438.0,22705.0,22941.0,AL
4,Blount County,51107.0,51845.0,52551.0,53457.0,54124.0,54624.0,55485.0,56240.0,57055.0,57341.0,AL


#### Guide to cleaning - 2010s data

The way the 2010s excel files are formatted, we can clean the data in the following way

- load in with header=3
- drop null on any of the populations
    - notes at the bottom will be removed
- drop census, estimates base
- drop first row
    - state as a whole
- rename Unnamed: 0 to county


In [75]:
pops10 = pd.DataFrame()

# add every excel file in 00_source_data/county_pop/2000s to pops00

path = r"00_source_data/county_pop/2010s" # point to correct folder
filenames = glob.glob(path + "/*.xlsx")

for f in filenames:

    # read in current file with header = 3
    temp = pd.read_excel(f, header = 3)

    # regex to pull out state from filename
    r = re.search("(2010s)(.)(\w+)", f)[3]
    temp["State"] = r[:2].upper()
    
    # drop null on any of the years
    temp.dropna(subset=[2010], inplace=True)

    #drop useless columns
    temp.drop(columns={"Census", "Estimates Base"}, inplace=True)

    # drop first row
    temp = temp.iloc[1:, :]

    # rename some cols
    temp.rename(columns={"Unnamed: 0": "County"}, inplace=True)

    # remove period at beginning of each county
    temp["County"] = temp["County"].apply(lambda x: x[1:])

    # strip state from county
    temp["County"] = temp["County"].apply(lambda x: x.split(", ")[0])

    pops10 = pd.concat([pops10, temp], axis=0, ignore_index=True)

# quick peek at the data
pops10.head()


Unnamed: 0,County,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,State
0,Autauga County,54773.0,55227.0,54954.0,54727.0,54893.0,54864.0,55243.0,55390.0,55533.0,55869.0,AL
1,Baldwin County,183112.0,186558.0,190145.0,194885.0,199183.0,202939.0,207601.0,212521.0,217855.0,223234.0,AL
2,Barbour County,27327.0,27341.0,27169.0,26937.0,26755.0,26283.0,25806.0,25157.0,24872.0,24686.0,AL
3,Bibb County,22870.0,22745.0,22667.0,22521.0,22553.0,22566.0,22586.0,22550.0,22367.0,22394.0,AL
4,Blount County,57376.0,57560.0,57580.0,57619.0,57526.0,57526.0,57494.0,57787.0,57771.0,57826.0,AL


In [76]:
pops10.head()

Unnamed: 0,County,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,State
0,Autauga County,54773.0,55227.0,54954.0,54727.0,54893.0,54864.0,55243.0,55390.0,55533.0,55869.0,AL
1,Baldwin County,183112.0,186558.0,190145.0,194885.0,199183.0,202939.0,207601.0,212521.0,217855.0,223234.0,AL
2,Barbour County,27327.0,27341.0,27169.0,26937.0,26755.0,26283.0,25806.0,25157.0,24872.0,24686.0,AL
3,Bibb County,22870.0,22745.0,22667.0,22521.0,22553.0,22566.0,22586.0,22550.0,22367.0,22394.0,AL
4,Blount County,57376.0,57560.0,57580.0,57619.0,57526.0,57526.0,57494.0,57787.0,57771.0,57826.0,AL


In [77]:
# melt both dfs to get tidy format

pops00 = pops00.melt(["County", "State"])
pops10 = pops10.melt(["County", "State"])

# rename columns accordingly
pops00.rename(columns={"variable": "Year", "value": "Population"}, inplace=True)
pops10.rename(columns={"variable": "Year", "value": "Population"}, inplace=True)

# concatenate the two dfs
pops = pd.concat([pops00, pops10], ignore_index=True)


In [78]:
# check that we have the same number of counties between datasets
assert len(pops00["County"].unique()) == len(pops10["County"].unique())

# check that we have the same number of counties every year
# first, create a df with the number of counties per year
pops_county_check = pops.groupby(["State", "Year"])["County"].count().reset_index()

In [79]:
# group the sum of counties by year and state - will help us check if number of counties changes over the years
grouped_states = pops_county_check.groupby(["Year", "State"])["County"].sum().reset_index().rename(columns={"County": "county_count"})

# here's what this looks like
# we get a dataframe of states and years, with the number of counties in each state in each year
grouped_states.head()

Unnamed: 0,Year,State,county_count
0,2000,AL,67
1,2000,AR,75
2,2000,FL,67
3,2000,GA,159
4,2000,ID,44


In [80]:
# using the above query, we should be able to assert that the number of counties per year is the same
# below statement should always equal zero

assert (grouped_states.duplicated(subset=["Year", "State"]).sum() == 0)
#assert (grouped_states10.duplicated(subset=["Year", "State"]).sum() == 0)


# ensure no duplicate values
assert pops.duplicated().sum() == 0

# loop to check that every state has the same number of counties every year
for state in states:
    assert (pops[pops["State"] == state].Year.value_counts().nunique() == 1), f"error on {state}"

In [81]:
# fip code addition

#pd.read_html("https://transition.fcc.gov/oet/info/maps/census/fips/fips.txt")

## trying to integrate fip numbers for a better merge

In [82]:
fips = pd.read_csv("https://github.com/ChuckConnell/articles/raw/master/fips2county.tsv", sep="\t")

In [83]:
def get_keys_from_value(d, val):
    return [k for k, v in d.items() if v == val]


keys = get_keys_from_value(abbrev_to_us_state, 'Alabama')
keys

['AL']

In [84]:
fips["state_abbrev"] = fips["StateName"].apply(lambda x: get_keys_from_value(abbrev_to_us_state, x)[0])

In [85]:
fips = fips[fips["state_abbrev"].isin(states)]

In [86]:
# helper function to get rid of the word county in pop df
def remove_county(x):

    if "County" in x:
        return x[:-7]
    else:
        return x


pops["county_test"] = pops["County"].apply(lambda x: remove_county(x))


# fix dona ana and la salle parish
pops["county_test"] = pops["county_test"].apply(lambda x: x.replace("Doña Ana", "Dona Ana"))
fips["CountyName"] = fips["CountyName"].apply(lambda x: x.replace("DoÃ±a Ana", "Dona Ana"))


#pops["county_test"] = pops["county_test"].apply(lambda x: x.replace("La Salle Parish", "La Salle"))


# rename county_test where state is texas and county is la salle to La Salle (TX)
pops.loc[(pops["State"] == "TX") & (pops["county_test"] == "La Salle"), "county_test"] = "La Salle County"
        

In [87]:
# change La Salle county name in fips to La Salle County
fips.loc[fips["CountyName"] == "La Salle", "CountyName"] = "La Salle County"
fips.loc[fips["CountyName"] == "LaSalle Parish", "CountyName"] = "La Salle Parish"
pops.loc[pops["county_test"] == "LaSalle Parish", "county_test"] = "La Salle Parish"



In [88]:
# final merge for population dataset & fip number dataset
pops_copy = pops.merge(fips[["state_abbrev", "CountyFIPS", "CountyName"]], left_on=["county_test", "State"], right_on=["CountyName", "state_abbrev"], how="outer", indicator=True)

In [89]:
# should never end up with anything left out of merge
assert len(pops_copy[pops_copy["_merge"] != "both"]) == 0


#TODO: clean up pops_copy dataset for redundant variables

In [90]:
# add fip numbers to df_prescriptions

# create copies of both dfs in case I mess up

prescriptions_copy = df_prescriptions.copy()
fips_copy = fips.copy()

In [91]:
# make buyer_county all lowercase
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.lower())

# do the same for fips
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: x.lower())

In [92]:
# remove county and parish from fips_copy

def remove_parish(x):

    if "parish" in x:
        return x[:-7]
    else:
        return x


# prescription dataset has similar format - match fips to this format
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: remove_county(x))
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: remove_parish(x))

def expand_saint(x):

    if "st." in x:
        return x.replace("st.", "saint")
    else:
        return x

# fix various other inconsistencies
# left only values first
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: expand_saint(x))

fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: x.replace("desoto", "de soto"))
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("desoto", "de soto"))
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("st john the baptist", "saint john the baptist"))

fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: x.replace("dekalb", "de kalb"))
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("dekalb", "de kalb"))

# fix right only values

prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("desoto", "de soto"))




In [93]:
prescriptions_fips = prescriptions_copy[["BUYER_COUNTY", "BUYER_STATE"]].merge(fips_copy, left_on=["BUYER_COUNTY", "BUYER_STATE"], right_on=["CountyName", "state_abbrev"], how="outer", indicator=True)

In [94]:
assert prescriptions_fips[prescriptions_fips["_merge"] == "left_only"].shape[0] == 0

#### Not all counties joining back to prescription dataset

This could be okay, but I want to do a quick check that there are just not records for these counties. To do this, I'll take a small sample of counties in our FIPS dataset that did NOT merge properly to the prescriptions dataset, and search each one manually in the prescription dataset. I will search various different ways the counties could be transcribed, as well as google the county to ensure there are no secondary names for the same county.

In [95]:
# looks like we have some missing values from right_only that we can't find in df_prescriptions
# may need to look more into why this is - but as of now all we know is the given data is incomplete
# choose a sample, search other dataframe, make conclusion

prescriptions_fips[prescriptions_fips["_merge"] != "both"].sample(10, random_state=0)

Unnamed: 0,BUYER_COUNTY,BUYER_STATE,StateFIPS,CountyFIPS_3,CountyName,StateName,CountyFIPS,StateAbbr,STATE_COUNTY,state_abbrev,_merge
148422,,,48,459,upshur,Texas,48459,TX,TX | UPSHUR,TX,right_only
147901,,,13,73,columbia,Georgia,13073,GA,GA | COLUMBIA,GA,right_only
148377,,,48,335,mitchell,Texas,48335,TX,TX | MITCHELL,TX,right_only
148192,,,41,21,gilliam,Oregon,41021,OR,OR | GILLIAM,OR,right_only
147904,,,13,85,dawson,Georgia,13085,GA,GA | DAWSON,GA,right_only
147911,,,13,123,gilmer,Georgia,13123,GA,GA | GILMER,GA,right_only
148195,,,45,1,abbeville,South Carolina,45001,SC,SC | ABBEVILLE,SC,right_only
148153,,,40,63,hughes,Oklahoma,40063,OK,OK | HUGHES,OK,right_only
148055,,,20,173,sedgwick,Kansas,20173,KS,KS | SEDGWICK,KS,right_only
148254,,,48,35,bosque,Texas,48035,TX,TX | BOSQUE,TX,right_only


In [129]:
# set max number of columns, so I can look at all records in a separate file
#pd.set_option("display.max_rows", None)

# look at every county in Texas that is not in the prescription dataset
# by using value_counts(), we can look at every county in the dataset
#prescriptions_fips[prescriptions_fips["BUYER_STATE"] == "KS"]["BUYER_COUNTY"].value_counts()

The above is now commented out, but the same test can be repeated as follows:

1. uncomment lines 2, 6 above
2. for each county you want to look for in the dataset, change the logic in line 6 to the county's associated state
3. click on the "show more in text editor" and ctrl-f for the county along with potential other spellings/variations of the county
4. google the county to ensure we did not miss any variations

Given that we aren't able to find any records from our sample, we conclude that these values are simply missing from the prescription shipping dataset. Therefore, the right_only merge indicator is acceptable as we still have a comprehensive set of counties in our data.

In [130]:
# reset max rows
pd.reset_option("display.max_rows")


In [97]:
# adding fips to our cause of death data

In [98]:
# create copies of both dfs

cause_of_death_copy = df_cause_of_death.copy()
fips_copy = fips.copy()

In [99]:
cause_of_death_copy["County"] = cause_of_death_copy["County"].apply(lambda x: remove_county(x))


In [100]:
temp = cause_of_death_copy.merge(fips_copy, left_on=["County", "State"], right_on=["CountyName", "state_abbrev"], how="outer", indicator=True)

In [101]:
fips[fips["StateAbbr"] == "AR"]

Unnamed: 0,StateFIPS,CountyFIPS_3,CountyName,StateName,CountyFIPS,StateAbbr,STATE_COUNTY,state_abbrev
112,5,1,Arkansas,Arkansas,5001,AR,AR | ARKANSAS,AR
113,5,3,Ashley,Arkansas,5003,AR,AR | ASHLEY,AR
114,5,5,Baxter,Arkansas,5005,AR,AR | BAXTER,AR
115,5,7,Benton,Arkansas,5007,AR,AR | BENTON,AR
116,5,9,Boone,Arkansas,5009,AR,AR | BOONE,AR
...,...,...,...,...,...,...,...,...
182,5,141,Van Buren,Arkansas,5141,AR,AR | VAN BUREN,AR
183,5,143,Washington,Arkansas,5143,AR,AR | WASHINGTON,AR
184,5,145,White,Arkansas,5145,AR,AR | WHITE,AR
185,5,147,Woodruff,Arkansas,5147,AR,AR | WOODRUFF,AR


In [102]:
temp[temp["_merge"] != "both"]

Unnamed: 0,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,State,StateFIPS,CountyFIPS_3,CountyName,StateName,CountyFIPS,StateAbbr,STATE_COUNTY,state_abbrev,_merge
9462,LaSalle Parish,22059.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,169.0,LA,,,,,,,,,left_only
9463,LaSalle Parish,22059.0,2004.0,2004.0,All other non-drug and non-alcohol causes,O9,178.0,LA,,,,,,,,,left_only
9464,LaSalle Parish,22059.0,2005.0,2005.0,All other non-drug and non-alcohol causes,O9,182.0,LA,,,,,,,,,left_only
9465,LaSalle Parish,22059.0,2006.0,2006.0,All other non-drug and non-alcohol causes,O9,157.0,LA,,,,,,,,,left_only
9466,LaSalle Parish,22059.0,2007.0,2007.0,All other non-drug and non-alcohol causes,O9,157.0,LA,,,,,,,,,left_only
9467,LaSalle Parish,22059.0,2008.0,2008.0,All other non-drug and non-alcohol causes,O9,152.0,LA,,,,,,,,,left_only
9468,LaSalle Parish,22059.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,164.0,LA,,,,,,,,,left_only
9469,LaSalle Parish,22059.0,2010.0,2010.0,All other non-drug and non-alcohol causes,O9,155.0,LA,,,,,,,,,left_only
9470,LaSalle Parish,22059.0,2011.0,2011.0,All other non-drug and non-alcohol causes,O9,168.0,LA,,,,,,,,,left_only
9471,LaSalle Parish,22059.0,2012.0,2012.0,All other non-drug and non-alcohol causes,O9,165.0,LA,,,,,,,,,left_only


### Add population data to our other two dataframes

In [103]:
# ensure columns are in the same format between our first two dfs

pops_copy1 = pops.copy()
df_prescriptions_copy = df_prescriptions.copy()

#pops_copy1["County"] = pops_copy1["County"].apply(lambda x: x.split(" ")[0])

In [104]:
# calculate length of each county name
pops_copy1["county name length"] = pops_copy1["County"].apply(lambda x: len(x.split(" ")))
df_prescriptions_copy["county name length"] = df_prescriptions_copy["BUYER_COUNTY"].apply(lambda x: len(x.split(" ")))


In [105]:
# fix mismatches in county names

df_prescriptions_copy.loc[df_prescriptions_copy["BUYER_COUNTY"] == "SAINT LANDRY", "BUYER_COUNTY"] = "St. Landry Parish"

In [106]:
df_prescriptions_copy["county name length"].value_counts()

1    133178
2     13170
3      1471
4        30
Name: county name length, dtype: int64

In [107]:
pops_copy1["county name length"].value_counts()

2    25190
3     1510
4       60
5       20
Name: county name length, dtype: int64

In [108]:
# function to remove the word county from dataset

def clean_county(county_name, county_name_len):
    if county_name_len == 2:
        return county_name.split(" ")[0]
    if county_name_len == 3:
        return county_name.split(" ")[0] + " " + county_name.split(" ")[1]
    if county_name_len == 4:
        return county_name.split(" ")[0] + " " + county_name.split(" ")[1] + " " + county_name.split(" ")[2]
    if county_name_len == 5:
        return county_name.split(" ")[0] + " " + county_name.split(" ")[1] + " " + county_name.split(" ")[2] + " " + county_name.split(" ")[3]
    else:
        return "error"

In [109]:
pops_copy1["County"] = pops_copy1.apply(lambda x: clean_county(x["County"], x["county name length"]), axis=1)

In [110]:
# set both dataframe counties to lowercase and merge them

pops_copy1["County"] = pops_copy1["County"].str.lower()
df_prescriptions_copy["BUYER_COUNTY"] = df_prescriptions_copy["BUYER_COUNTY"].str.lower()


In [111]:
# fix various names
df_prescriptions_copy.loc[df_prescriptions_copy["BUYER_COUNTY"] == "st. landry parish", "BUYER_COUNTY"] = "saint landry parish"
df_prescriptions_copy.loc[df_prescriptions_copy["BUYER_COUNTY"] == "st john the baptist", "BUYER_COUNTY"] = "saint john the baptist"


# create function that expands st. to saint

def expand_st(county_name):
    if "st." in county_name:
        return county_name.replace("st.", "saint")
    else:
        return county_name

# apply function
pops_copy1["County"] = pops_copy1["County"].apply(lambda x: expand_st(x))




In [112]:
# merge the two dfs
temp = df_prescriptions_copy.merge(pops_copy1, left_on=["BUYER_STATE", "BUYER_COUNTY"], right_on=["State", "County"], how = "outer", indicator=True)

In [113]:
#pd.set_option("display.max_rows", None)

In [114]:
temp[temp["_merge"] != "both"].BUYER_COUNTY.value_counts() # prescriptions dataset

saint landry parish    848
dona ana               292
carson city            164
de soto                 20
de kalb                 12
Name: BUYER_COUNTY, dtype: int64

In [115]:
df_prescriptions_copy[df_prescriptions_copy["BUYER_COUNTY"] == "jefferson"]

Unnamed: 0,REPORTER_DEA_NO,BUYER_STATE,BUYER_ZIP,BUYER_COUNTY,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,year,month,county name length
4964,PB0034861,TX,77706,jefferson,S,9193,HYDROCODONE,1.0,1042006,HYDROCODONE BIT 5MG/ACETAMINOPHEN 50,2006,1,1
4965,PB0034861,TX,77706,jefferson,S,9193,HYDROCODONE,4.0,11022006,"HYDROCODONE.BIT./ACET.,10MG & 325MG/",2006,11,1
4966,PB0034861,TX,77706,jefferson,S,9193,HYDROCODONE,1.0,1042007,"HYDROCODONE.BIT./ACET.,10MG & 325MG/",2007,1,1
4967,PB0034861,TX,77706,jefferson,S,9193,HYDROCODONE,1.0,2142007,HYDROCODONE BIT. 10MG/ACETAMINOPHEN,2007,2,1
4968,PB0034861,TX,77706,jefferson,S,9193,HYDROCODONE,3.0,3282007,"HYDROCODONE.BIT./ACET.,10MG & 325MG/",2007,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
493646,RC0221236,AL,35215,jefferson,S,9193,HYDROCODONE,1.0,9012011,LORCET HYD.BIT10MG/ACET650MG TAB,2011,9,1
493647,RC0221236,AL,35215,jefferson,S,9143,OXYCODONE,1.0,5222012,OXYCODONE HYDROCHLORIDE TABLETS 5MG,2012,5,1
493648,RC0221236,AL,35215,jefferson,S,9193,HYDROCODONE,1.0,7032012,"LORTAB 10MG/500MG/TAB,HYDROCOD.BIT.&",2012,7,1
493653,RC0221236,AL,35226,jefferson,S,9193,HYDROCODONE,1.0,12282012,LORTAB 7.5MG HYDROCODONE.BIT / 500MG,2012,12,1


In [116]:
temp[temp["_merge"] != "both"].County.reset_index()

Unnamed: 0,index,County
0,1137160,
1,1137161,
2,1137162,
3,1137163,
4,1137164,
...,...,...
13541,2943021,lasalle
13542,2943022,lasalle
13543,2943023,lasalle
13544,2943024,lasalle


In [117]:
# for pops.County values ==2, strip with a space and take the first value
pops_copy1.loc[pops_copy1["county name length"] == 2, "County"] = pops_copy1.loc[pops_copy1["county name length"] == 2, "County"].apply(lambda x: x.split(" ")[0])

# make all county values lowercase
pops_copy1["County"] = pops_copy1["County"].apply(lambda x: x.lower())
df_prescriptions_copy["REPORTER_COUNTY"] = df_prescriptions_copy["REPORTER_COUNTY"].apply(lambda x: x.lower())

KeyError: 'REPORTER_COUNTY'

In [None]:
# ensure columns will match up

df_prescriptions_copy.rename(columns={"REPORTER_STATE": "State", "year": "Year", "REPORTER_COUNTY": "County"}, inplace=True)

In [None]:
temp = pops_copy1.merge(df_prescriptions_copy, on=["County", "State"], how="outer", indicator=True)

In [None]:
temp[temp["_merge"] != "both"]

Unnamed: 0,County,State,Year_x,Population,county name length_x,REPORTER_DEA_NO,REPORTER_ZIP,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,Year_y,month,county name length_y,_merge
0,autauga,AL,2000,44021.0,2,,,,,,,,,,,,left_only
1,autauga,AL,2001,44889.0,2,,,,,,,,,,,,left_only
2,autauga,AL,2002,45909.0,2,,,,,,,,,,,,left_only
3,autauga,AL,2003,46800.0,2,,,,,,,,,,,,left_only
4,autauga,AL,2004,48366.0,2,,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2288275,lasalle,LA,2015,14979.0,2,,,,,,,,,,,,left_only
2288276,lasalle,LA,2016,15022.0,2,,,,,,,,,,,,left_only
2288277,lasalle,LA,2017,14887.0,2,,,,,,,,,,,,left_only
2288278,lasalle,LA,2018,14901.0,2,,,,,,,,,,,,left_only


In [None]:
temp.columns

Index(['County', 'State', 'Year_x', 'Population', 'county name length_x',
       'REPORTER_DEA_NO', 'REPORTER_ZIP', 'TRANSACTION_CODE', 'DRUG_CODE',
       'DRUG_NAME', 'QUANTITY', 'TRANSACTION_DATE', 'Product_Name', 'Year_y',
       'month', 'county name length_y', '_merge'],
      dtype='object')

In [None]:
df_prescriptions_copy.State.value_counts()

FL    39837
OR    31626
TN    27953
LA     7113
MS     4634
GA     1056
SC      867
Name: State, dtype: int64

In [None]:
df_prescriptions_copy[df_prescriptions_copy["State"] == "AL"]

Unnamed: 0,REPORTER_DEA_NO,State,REPORTER_ZIP,County,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,Year,month,county name length


In [None]:
pops.head(2)

Unnamed: 0,County,State,Year,Population
0,Autauga County,AL,2000,44021.0
1,Baldwin County,AL,2000,141342.0


In [None]:
df_prescriptions_copy.head(2)

Unnamed: 0,REPORTER_DEA_NO,REPORTER_STATE,REPORTER_ZIP,REPORTER_COUNTY,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,year,month
6224,PB0167127,GA,31793,TIFT,S,9193,HYDROCODONE,2.0,11232009,HYDROCODONE BIT/IBUPROFEN 7.5MG/200M,2009,11
6225,PB0167127,GA,31793,TIFT,S,9193,HYDROCODONE,2.0,12172012,HYDROCODONE BIT./ACETAMINOPHEN TABS.,2012,12


In [None]:
df_cause_of_death.head(2)

Unnamed: 0,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,State
0,Autauga County,1001.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,397.0,AL
1,Baldwin County,1003.0,2003.0,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,10.0,AL


## Final 3 datasets

We should have: (UNSURE IF WE SHOULD EXTEND DATE RANGES, CURRENTLY 3 YEARS BEFORE AND AFTER POLICY IMPLEMENTATION)

- Florida and Georgia 2007 - 2013
- Texas and Oklahoma 2004 - 2010
- Washington and Oregon 2009 - 2015

### Drug overdose - broken down by state

In [None]:
# Florida and Georgia

prescriptions_fl = df_prescriptions.copy()
prescriptions_tx = df_prescriptions.copy()
prescriptions_wa = df_prescriptions.copy()

prescriptions_fl = prescriptions_fl[(prescriptions_fl["REPORTER_STATE"] == "FL") | (prescriptions_fl["REPORTER_STATE"].isin(fl_states))]
prescriptions_tx = prescriptions_tx[(prescriptions_tx["REPORTER_STATE"] == "TX") | (prescriptions_tx["REPORTER_STATE"]).isin(tx_states)]
prescriptions_wa = prescriptions_wa[(prescriptions_wa["REPORTER_STATE"] == "WA") | (prescriptions_wa["REPORTER_STATE"]).isin(wa_states)]



# filter appropriate years
fl_start = 2007
fl_end = 2013

tx_start = 2004
tx_end = 2010

wa_start = 2009
wa_end = 2015


prescriptions_fl = prescriptions_fl[(prescriptions_fl["year"] >= fl_start) & (prescriptions_fl["year"] <= fl_end)]
prescriptions_tx = prescriptions_tx[(prescriptions_tx["year"] >= tx_start) & (prescriptions_tx["year"] <= tx_end)]
prescriptions_wa = prescriptions_wa[(prescriptions_wa["year"] >= wa_start) & (prescriptions_wa["year"] <= wa_end)]


### Cause of death - broken down by state

In [None]:
# ensure states are now in their full form to match the cause of death data
fl_states = ["Georgia", "Alabama", "Missisippi", "South Carolina", "Tennessee"]

tx_states = ["Oklahoma", "Louisiana", "New Mexico", "Arkansas", "Kansas"]

wa_states = ["Oregon", "Idaho", "Montana", "Nevada", "Wyoming"]

In [None]:
deaths_fl = df_cause_of_death.copy()
deaths_tx = df_cause_of_death.copy()
deaths_wa = df_cause_of_death.copy()

deaths_fl = deaths_fl[(deaths_fl["State"] == "Florida") | (deaths_fl["State"].isin(fl_states))]
deaths_tx = deaths_tx[(deaths_tx["State"] == "Texas") | (deaths_tx["State"].isin(tx_states))]
deaths_wa = deaths_wa[(deaths_wa["State"] == "Washington") | (deaths_wa["State"].isin(wa_states))]

deaths_fl = deaths_fl[(deaths_fl["Year"] >= fl_start) & (deaths_fl["Year"] <= fl_end)]
deaths_tx = deaths_tx[(deaths_tx["Year"] >= tx_start) & (deaths_tx["Year"] <= tx_end)]  
deaths_wa = deaths_wa[(deaths_wa["Year"] >= wa_start) & (deaths_wa["Year"] <= wa_end)]

### export all to csv

In [None]:
prescriptions_fl.to_csv("05_cleaned_data/prescriptions_fl.csv", index=False)
oveprescriptions_tx.to_csv("05_cleaned_data/prescriptions_tx.csv", index=False)
prescriptions_wa.to_csv("05_cleaned_data/prescriptions_wa.csv", index=False)

deaths_fl.to_csv("05_cleaned_data/deaths_fl.csv", index=False)
deaths_tx.to_csv("05_cleaned_data/deaths_tx.csv", index=False)
deaths_wa.to_csv("05_cleaned_data/deaths_wa.csv", index=False)

## Notes for the group

- may need to filter out a couple more columns - haven't done this yet as I don't want to accidentally delete something we need
- overdose data is only broken down by year unless i messed something up - overdose analysis will have to be less granular