In [17]:
import pandas as pd
from datetime import datetime
import numpy as np
import glob
import os
import xlrd
import re
import openpyxl

# custom file that maps state names to abbreviations
from abbreviation_conversion import abbrev_to_us_state

## Clean up prescription data


In [18]:
def find_year(TRANSACTION_DATE):
    """
    Args:
        TRANSACTION_DATE (str): date in format MMDDYYYY

    Returns:
        int: year
    """
    TRANSACTION_DATE = str(TRANSACTION_DATE)
    
    return int(TRANSACTION_DATE[-4:])

# quick test 
print(f"testing the find_year function: {find_year(12202019)}")


def find_month(TRANSACTION_DATE):
    """
    Args:
        TRANSACTION_DATE (str): date in format MMDDYYYY

    Returns:
        int: month
    """
    TRANSACTION_DATE = str(TRANSACTION_DATE)

    if len(TRANSACTION_DATE) == 8:
        return int(TRANSACTION_DATE[:2])
    else:
        return int(TRANSACTION_DATE[:1])
    

# quick test 
print(f"testing the find_month function: {find_month(12202019)}")

testing the find_year function: 2019
testing the find_month function: 12


In [19]:
# to load in the data, we need to truncate the amount of columns we use as well as the states
cols_to_keep = ["REPORTER_DEA_NO", "BUYER_STATE", "BUYER_ZIP", "BUYER_COUNTY", "DRUG_CODE", "TRANSACTION_CODE", "DRUG_NAME", "QUANTITY", "TRANSACTION_DATE", "Product_Name"]

# we know we need Florida, Texas, and Washington
states = ["FL", "TX", "WA"]
# since we are normalizing based on population, I think we should pick states that are regionally close to our target states
# we can change this later as a group, but I have these selected below:

# Florida comparison states
fl_states = ["PA", "MI", "NC"]

# Texas comparison states
tx_states = ["IL", "MA", "MI"]

# Washington comparison states
wa_states = ["NC", "CO", "MD"]

# create list of all states to use
variable_states = []
variable_states.extend(fl_states)
variable_states.extend(tx_states)
variable_states.extend(wa_states)

# append variable states to our original list
states.extend(variable_states)


In [20]:
# now, load in our data as an iterator so we can load in chunks
it = pd.read_csv("00_source_data/arcos_all_washpost.tsv", chunksize=500_000, sep='\t', usecols = cols_to_keep) # may have to change chunksize depending on your computer's memory

# init empty dataframe
df = pd.DataFrame()

for chunk in it:
    # extract year out of date column
    chunk["year"] = chunk["TRANSACTION_DATE"].apply(lambda x: find_year(x))
    chunk["month"] = chunk["TRANSACTION_DATE"].apply(lambda x: find_month(x))

    # ensure we're working in the correct date range
    filtered_chunk = chunk[chunk["year"] > 2002]
    filtered_chunk = filtered_chunk[filtered_chunk["year"] < 2016]

    # filter out the states we want
    filtered_chunk = filtered_chunk[filtered_chunk["BUYER_STATE"].isin(states)]

    df = df.append(filtered_chunk)
    break
df

df_prescriptions = df.copy() # keep a copy of this df for later filtering

  df = df.append(filtered_chunk)


While doing analysis, we learned that a handful of county values in MA were missing. However, when we looked up the associated zip codes (02401, 02174), we learned that we could fill these values with Plymouth and Middlesex Counties, respectively.

In [21]:
# quick look at the null values (we checked, and these are all the values for which county is null)
df_prescriptions[df_prescriptions["BUYER_COUNTY"].isnull()].head()

Unnamed: 0,REPORTER_DEA_NO,BUYER_STATE,BUYER_ZIP,BUYER_COUNTY,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,year,month
4035,PB0020139,MA,2401,,S,9143,OXYCODONE,2.0,5182006,OXYCODO.HCL 5.35MG/TAB,2006,5
4036,PB0020139,MA,2401,,S,9143,OXYCODONE,12.0,7312006,OXYCODONE HCL/ACETAMINOPHEN 5MG/325M,2006,7
141994,PG0149650,MA,2174,,S,9193,HYDROCODONE,2.0,1172006,"HYDROCOD.BIT.& APAP,10MG/660MG/TAB",2006,1


In [22]:
# replace values accordingly
df_prescriptions.loc[df_prescriptions["BUYER_ZIP"] == 2401, "BUYER_COUNTY"] = "PLYMOUTH"
df_prescriptions.loc[df_prescriptions["BUYER_ZIP"] == 2174, "BUYER_COUNTY"] = "MIDDLESEX"

In [23]:
# check null values again
df_prescriptions[df_prescriptions["BUYER_COUNTY"].isnull()]

assert len(df_prescriptions[df_prescriptions["BUYER_COUNTY"].isnull()]) == 0, "still have missing values for counties - double check code above"

## Clean up cause of death data

In [24]:
path = r'00_source_data/cause_of_death' # point to correct folder
filenames = glob.glob(path + "/*.txt") # select all text files in folder

df = pd.DataFrame() # empty df - will store data from all txt files

for f in filenames:
    temp = pd.read_csv(f, index_col=None, header=0, sep='\t')
    # we're getting some extraneous notes at the bottom - let's just drop based on county as these will only be null for these useless notes columns
    temp.dropna(subset={'County'}, inplace=True)
    
    df = pd.concat([df, temp], axis=0, ignore_index=True)

In [25]:
# helper functions to separate county and state

def abtract_state(county):
    """
    Args:
        county (str): county name

    Returns:
        str: state
    """
    return county.split(", ")[1]



def abstract_county(county):
    """
    Args:
        county (str): county name

    Returns:
        str: county
    """
    return county.split(", ")[0]

# apply functions to our df
df["State"] = df.apply(lambda x: abtract_state(x["County"]), axis=1)
df["County"] = df.apply(lambda x: abstract_county(x["County"]), axis=1)

# do not need notes column, let's just drop it here
df.drop(columns={"Notes"}, inplace=True)

df_cause_of_death = df.copy() # keep a copy of this df for later filtering

In [26]:
# now, let's filter our dataframe to be only the states we want
df_cause_of_death = df_cause_of_death[df_cause_of_death["State"].isin(states)]


## Adding in County Population data

[Census county pop. data, 2000-2010](https://www.census.gov/data/tables/time-series/demo/popest/intercensal-2000-2010-counties.html)<br>
[Census county pop. data, 2010-2019](https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html)<br>
For both, just select the appropriate states on the webpage. We will clean and merge as needed in this notebook.



#### Guide to cleaning - 2000s data

The way the 2000s excel files are formatted, we can clean the data in the following way

- load in with header=3
- drop null on any of the populations
    - notes at the bottom will be removed
- drop unnamed 1, 12, and 13
    - these contain redundant data about populations from specific dates
    - Unnamed 12 is 2010s pop - will be redundant as our next dataset has this as well. Using the newer data
- drop first row
    - state as a whole
- rename Unnamed: 0 to county


In [27]:
# init emmpty df for our population data
pops00 = pd.DataFrame()

# end goal - add every excel file in 00_source_data/county_pop/2000s to pops00

path = r"00_source_data/county_pop/2000s/" # point to correct folder
filenames = glob.glob(path + "*.xls")

for f in filenames:

    # read in current file with header = 3
    temp = pd.read_excel(f, header = 3)

    # regex to pull out state from filename
    r = re.search("(2000s)(.)(\w+)", f)[3]
    temp["State"] = r[:2].upper()
    
    # drop null on any of the years
    temp.dropna(subset=[2000], inplace=True)

    #drop useless columns
    temp.drop(columns={"Unnamed: 1", "Unnamed: 12", "Unnamed: 13"}, inplace=True)

    # drop first row
    temp = temp.iloc[1:, :]

    # rename some cols
    temp.rename(columns={"Unnamed: 0": "County"}, inplace=True)

    # remove period at beginning of each county
    temp["County"] = temp["County"].apply(lambda x: x[1:])

    pops00 = pd.concat([pops00, temp], axis=0, ignore_index=True)

# quick peek at the data
pops00.head()


Unnamed: 0,County,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,State
0,Adams County,350888.0,359816.0,370753.0,377464.0,384809.0,395146.0,406575.0,415746.0,424913.0,435700.0,CO
1,Alamosa County,14954.0,14956.0,15114.0,15067.0,15217.0,15236.0,15196.0,15180.0,15300.0,15289.0,CO
2,Arapahoe County,491482.0,502393.0,508936.0,513690.0,518971.0,524466.0,531619.0,542039.0,552461.0,563161.0,CO
3,Archuleta County,10020.0,10454.0,10885.0,11089.0,11266.0,11496.0,11937.0,12262.0,12250.0,12169.0,CO
4,Baca County,4501.0,4471.0,4336.0,4117.0,4064.0,3997.0,3933.0,3866.0,3806.0,3767.0,CO


#### Guide to cleaning - 2010s data

The way the 2010s excel files are formatted, we can clean the data in the following way

- load in with header=3
- drop null on any of the populations
    - notes at the bottom will be removed
- drop census, estimates base
- drop first row
    - state as a whole
- rename Unnamed: 0 to county


In [28]:
pops10 = pd.DataFrame()

# add every excel file in 00_source_data/county_pop/2000s to pops00

path = r"00_source_data/county_pop/2010s" # point to correct folder
filenames = glob.glob(path + "/*.xlsx")

for f in filenames:

    # read in current file with header = 3
    temp = pd.read_excel(f, header = 3)

    # regex to pull out state from filename
    r = re.search("(2010s)(.)(\w+)", f)[3]
    temp["State"] = r[:2].upper()
    
    # drop null on any of the years
    temp.dropna(subset=[2010], inplace=True)

    #drop useless columns
    temp.drop(columns={"Census", "Estimates Base"}, inplace=True)

    # drop first row
    temp = temp.iloc[1:, :]

    # rename some cols
    temp.rename(columns={"Unnamed: 0": "County"}, inplace=True)

    # remove period at beginning of each county
    temp["County"] = temp["County"].apply(lambda x: x[1:])

    # strip state from county
    temp["County"] = temp["County"].apply(lambda x: x.split(", ")[0])

    pops10 = pd.concat([pops10, temp], axis=0, ignore_index=True)

# quick peek at the data
pops10.head()


Unnamed: 0,County,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,State
0,Adams County,443691.0,452201.0,460558.0,469978.0,479946.0,490443.0,497734.0,503590.0,511354.0,517421.0,CO
1,Alamosa County,15515.0,15709.0,15680.0,15787.0,15803.0,15894.0,16053.0,16108.0,16248.0,16233.0,CO
2,Arapahoe County,574747.0,585968.0,596500.0,608467.0,619034.0,630984.0,638950.0,644478.0,651797.0,656590.0,CO
3,Archuleta County,12046.0,12021.0,12132.0,12216.0,12231.0,12387.0,12825.0,13295.0,13730.0,14029.0,CO
4,Baca County,3807.0,3778.0,3722.0,3656.0,3587.0,3555.0,3530.0,3554.0,3584.0,3581.0,CO


In [29]:
# melt both dfs to get tidy format
pops00 = pops00.melt(["County", "State"])
pops10 = pops10.melt(["County", "State"])

# rename columns accordingly
pops00.rename(columns={"variable": "Year", "value": "Population"}, inplace=True)
pops10.rename(columns={"variable": "Year", "value": "Population"}, inplace=True)

# concatenate the two dfs to get all our population data in one place
pops = pd.concat([pops00, pops10], ignore_index=True)


In [30]:
# check that we have the same number of counties between datasets
assert len(pops00["County"].unique()) == len(pops10["County"].unique())

# check that we have the same number of counties every year
# first, create a df with the number of counties per year
pops_county_check = pops.groupby(["State", "Year"])["County"].count().reset_index()

In [31]:
# group the sum of counties by year and state - will help us check if number of counties changes over the years
grouped_states = pops_county_check.groupby(["Year", "State"])["County"].sum().reset_index().rename(columns={"County": "county_count"})

# here's what this looks like
# we get a dataframe of states and years, with the number of counties in each state in each year
grouped_states.head()

Unnamed: 0,Year,State,county_count
0,2000,CO,64
1,2000,FL,67
2,2000,IL,102
3,2000,MA,14
4,2000,MD,24


In [32]:
# using the above query, we should be able to assert that the number of counties per year is the same
# below statement should always equal zero

assert (grouped_states.duplicated(subset=["Year", "State"]).sum() == 0)
#assert (grouped_states10.duplicated(subset=["Year", "State"]).sum() == 0)


# ensure no duplicate values
assert pops.duplicated().sum() == 0

# loop to check that every state has the same number of counties every year
for state in states:
    assert (pops[pops["State"] == state].Year.value_counts().nunique() == 1), f"error on {state}"

## trying to integrate fip numbers for a better merge

In [33]:
# load in fips data from external source
fips = pd.read_csv("https://github.com/ChuckConnell/articles/raw/master/fips2county.tsv", sep="\t")

In [34]:
# function to get key from value in our abbreviation dictionary
# will help us have consistent formatting across dataframes for merging purposes
def get_keys_from_value(d, val):
    return [k for k, v in d.items() if v == val]


keys = get_keys_from_value(abbrev_to_us_state, 'Alabama')
keys # quick peek to make sure it worked

['AL']

In [35]:
# apply the above to entire fips dataframe
fips["state_abbrev"] = fips["StateName"].apply(lambda x: get_keys_from_value(abbrev_to_us_state, x)[0])

# filter fips to appropriate states, now that it's in the correct format
fips = fips[fips["state_abbrev"].isin(states)]

#### Further cleaning of values before merge

In [36]:
# helper function to get rid of the word county in pop df
def remove_county(x):

    if "County" in x:
        return x[:-7]
    else:
        return x


pops["county_test"] = pops["County"].apply(lambda x: remove_county(x))


# fix dona ana and la salle parish
pops["county_test"] = pops["county_test"].apply(lambda x: x.replace("Doña Ana", "Dona Ana"))
fips["CountyName"] = fips["CountyName"].apply(lambda x: x.replace("DoÃ±a Ana", "Dona Ana"))


#pops["county_test"] = pops["county_test"].apply(lambda x: x.replace("La Salle Parish", "La Salle"))


# rename county_test where state is texas and county is la salle to La Salle (TX)
pops.loc[(pops["State"] == "TX") & (pops["county_test"] == "La Salle"), "county_test"] = "La Salle County"
        

In [37]:
# change La Salle county name in fips to La Salle County
fips.loc[fips["CountyName"] == "La Salle", "CountyName"] = "La Salle County"
fips.loc[fips["CountyName"] == "LaSalle Parish", "CountyName"] = "La Salle Parish"
pops.loc[pops["county_test"] == "LaSalle Parish", "county_test"] = "La Salle Parish"



In [38]:
# final merge for population dataset & fip number dataset
pops_copy = pops.merge(fips[["state_abbrev", "CountyFIPS", "StateFIPS", "CountyName"]], left_on=["county_test", "State"], right_on=["CountyName", "state_abbrev"], how="outer", indicator=True)

In [39]:
# should never end up with anything left out of merge
assert len(pops_copy[pops_copy["_merge"] != "both"]) == 0

In [65]:
# add fip numbers to df_prescriptions

# create copies of both dfs so we have a checkpoint to access our old dfs
prescriptions_copy = df_prescriptions.copy()
fips_copy = fips.copy()

In [66]:
# make buyer_county all lowercase
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.lower())

# do the same for fips
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: x.lower())

In [77]:
# remove county and parish from fips_copy

def remove_parish(x):

    if "parish" in x:
        return x[:-7]
    else:
        return x


# prescription dataset has similar format - match fips to this format
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: remove_county(x))
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: remove_parish(x))

def expand_saint(x):

    if "st." in x:
        return x.replace("st.", "saint")
    else:
        return x

# fix various other inconsistencies
# left only values first
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: expand_saint(x))

fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: x.replace("desoto", "de soto"))
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("desoto", "de soto"))
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("st john the baptist", "saint john the baptist"))

fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: x.replace("dekalb", "de kalb"))
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("dekalb", "de kalb"))

# fix right only values

prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("desoto", "de soto"))



# function to remove apostrophes from county names
def remove_apostrophe(x):
    
    if "'" in x:
        return x.replace("'", "")
    else:
        return x


# apply to fips
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: remove_apostrophe(x))

# replace lasalle with la salle in fips copy
fips_copy["CountyName"] = fips_copy["CountyName"].apply(lambda x: x.replace("lasalle", "la salle"))

# replace dewitt with de witt in prescriptions copy
prescriptions_copy["BUYER_COUNTY"] = prescriptions_copy["BUYER_COUNTY"].apply(lambda x: x.replace("dewitt", "de witt"))


In [78]:
prescriptions_fips = prescriptions_copy.merge(fips_copy, left_on=["BUYER_COUNTY", "BUYER_STATE"], right_on=["CountyName", "state_abbrev"], how="outer", indicator=True)

# capitalize year and month columns
prescriptions_fips.rename(columns={"year": "Year", "month": "Month"}, inplace=True)

#### Imputing missing values

Since we have plenty of values joined with right_only indicator status, we know that some counties in our FIPS dataset is not merging correctly to our prescriptions dataset. Let's take one example of missing data - San Juan County in Washington. When exploring the Washington Post's website on prescription data and selecting for this county individually, we can see that the data does, in fact, exist here. However, we see an exceptionally low rate of pills prescribed (32 pills per person per year, in this case). Upon looking at some other examples, we can see that the counties joining with right_only below are likely missing from the Washington Post data due to having such small numbers.

Since any given missing county does not have data available, we need to find a way to impute these values.

In [44]:
# let's look at an example of our missing data
wa = df_prescriptions[df_prescriptions["BUYER_STATE"] == 'WA']

wa["BUYER_COUNTY"].value_counts()

CLARK           1663
YAKIMA          1419
BENTON           966
COWLITZ          855
SPOKANE          647
WALLA WALLA      449
WHITMAN          276
ASOTIN           260
KITTITAS         170
STEVENS          135
FRANKLIN          97
GRANT             97
COLUMBIA          82
PEND OREILLE      63
LINCOLN           30
ADAMS             23
KLICKITAT         23
SKAMANIA          22
FERRY             20
PACIFIC           17
CLALLAM            3
PIERCE             2
DOUGLAS            2
WAHKIAKUM          2
MASON              1
KING               1
Name: BUYER_COUNTY, dtype: int64

In [47]:
# set max columns to None
pd.set_option('display.max_columns', None)

In [82]:
assert prescriptions_fips[prescriptions_fips["_merge"] == "left_only"].shape[0] == 0

#### Not all counties joining back to prescription dataset

This could be okay, but I want to do a quick check that there are just not records for these counties. To do this, I'll take a small sample of counties in our FIPS dataset that did NOT merge properly to the prescriptions dataset, and search each one manually in the prescription dataset. I will search various different ways the counties could be transcribed, as well as google the county to ensure there are no secondary names for the same county.

In [None]:
prescriptions_fips[prescriptions_fips["_merge"] != "both"].value_counts()

Series([], dtype: int64)

In [None]:
# looks like we have some missing values from right_only that we can't find in df_prescriptions
# may need to look more into why this is - but as of now all we know is the given data is incomplete
# choose a sample, search other dataframe, make conclusion

prescriptions_fips[prescriptions_fips["_merge"] != "both"].sample(10, random_state=0)

Unnamed: 0,REPORTER_DEA_NO,BUYER_STATE,BUYER_ZIP,BUYER_COUNTY,TRANSACTION_CODE,DRUG_CODE,DRUG_NAME,QUANTITY,TRANSACTION_DATE,Product_Name,...,Month,StateFIPS,CountyFIPS_3,CountyName,StateName,CountyFIPS,StateAbbr,STATE_COUNTY,state_abbrev,_merge
148422,,,,,,,,,,,...,,48,459,upshur,Texas,48459,TX,TX | UPSHUR,TX,right_only
147901,,,,,,,,,,,...,,13,73,columbia,Georgia,13073,GA,GA | COLUMBIA,GA,right_only
148377,,,,,,,,,,,...,,48,335,mitchell,Texas,48335,TX,TX | MITCHELL,TX,right_only
148192,,,,,,,,,,,...,,41,21,gilliam,Oregon,41021,OR,OR | GILLIAM,OR,right_only
147904,,,,,,,,,,,...,,13,85,dawson,Georgia,13085,GA,GA | DAWSON,GA,right_only
147911,,,,,,,,,,,...,,13,123,gilmer,Georgia,13123,GA,GA | GILMER,GA,right_only
148195,,,,,,,,,,,...,,45,1,abbeville,South Carolina,45001,SC,SC | ABBEVILLE,SC,right_only
148153,,,,,,,,,,,...,,40,63,hughes,Oklahoma,40063,OK,OK | HUGHES,OK,right_only
148055,,,,,,,,,,,...,,20,173,sedgwick,Kansas,20173,KS,KS | SEDGWICK,KS,right_only
148254,,,,,,,,,,,...,,48,35,bosque,Texas,48035,TX,TX | BOSQUE,TX,right_only


In [None]:
# set max number of columns, so I can look at all records in a separate file
#pd.set_option("display.max_rows", None)

# look at every county in Texas that is not in the prescription dataset
# by using value_counts(), we can look at every county in the dataset
#prescriptions_fips[prescriptions_fips["BUYER_STATE"] == "KS"]["BUYER_COUNTY"].value_counts()

The above is now commented out, but the same test can be repeated as follows:

1. uncomment lines 2, 6 above
2. for each county you want to look for in the dataset, change the logic in line 6 to the county's associated state
3. click on the "show more in text editor" and ctrl-f for the county along with potential other spellings/variations of the county
4. google the county to ensure we did not miss any variations

Given that we aren't able to find any records from our sample, we conclude that these values are simply missing from the prescription shipping dataset. Therefore, the right_only merge indicator is acceptable as we still have a comprehensive set of counties in our data.

In [None]:
#prescriptions_fips = prescriptions_fips[prescriptions_fips["_merge"] != "right_only"]

# may have to add explanation for this higher up
assert len(prescriptions_copy) == len(prescriptions_fips)


AssertionError: 

### adding fips to our cause of death data

In [None]:
# create copies of both dfs

cause_of_death_copy = df_cause_of_death.copy()
fips_copy = fips.copy()

In [None]:
# remove county once again
cause_of_death_copy["County"] = cause_of_death_copy["County"].apply(lambda x: remove_county(x))


# clean some other miscellaneous values up

cause_of_death_copy["County"] = cause_of_death_copy["County"].apply(lambda x: x.replace("LaSalle Parish", "La Salle Parish"))
cause_of_death_copy["County"] = cause_of_death_copy["County"].apply(lambda x: x.replace("DeBaca", "De Baca"))
cause_of_death_copy["County"] = cause_of_death_copy["County"].apply(lambda x: x.replace("La Salle", "La Salle County"))
cause_of_death_copy["County"] = cause_of_death_copy["County"].apply(lambda x: x.replace("La Salle County Parish", "La Salle Parish"))


In [None]:
cause_of_death_fips = cause_of_death_copy.merge(fips_copy, left_on=["County", "State"], right_on=["CountyName", "state_abbrev"], how="outer", indicator=True)

#### Not all counties joining back to cause of death dataset

This could be okay, but I want to do a quick check that there are just not records for these counties. We have a smaller amount of counties not joining back (only four this time), so we'll just run our previous test on all of these counties.

In [None]:
cause_of_death_fips[cause_of_death_fips["_merge"] != "both"]

# only four missing values

Unnamed: 0,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,State,StateFIPS,CountyFIPS_3,CountyName,StateName,CountyFIPS,StateAbbr,STATE_COUNTY,state_abbrev,_merge
23285,,,,,,,,,30,69,Petroleum,Montana,30069,MT,MT | PETROLEUM,MT,right_only
23286,,,,,,,,,48,261,Kenedy,Texas,48261,TX,TX | KENEDY,TX,right_only
23287,,,,,,,,,48,269,King,Texas,48269,TX,TX | KING,TX,right_only
23288,,,,,,,,,48,301,Loving,Texas,48301,TX,TX | LOVING,TX,right_only


In [None]:
# repeating the test from prescriptions dataset
#pd.set_option("display.max_rows", None)
#cause_of_death_copy[cause_of_death_copy["State"] == "MT"]["County"].value_counts()

# reset max rows
pd.reset_option("display.max_rows")

Given that we aren't able to find any records from our only four missing records, we conclude that these values are simply missing from the cause of death dataset. Therefore, the right_only merge indicator is acceptable as we still have a comprehensive set of counties in our data.

### Adding Population to final DataFrames

For pop_fips, cause_of_death_fips, and prescription_fips. Steps needed:

- Create unique ID from county FIPS and state FIPS
- Merge population dataset based on this

In [None]:
cause_of_death_fips = cause_of_death_fips[cause_of_death_fips["_merge"] == "both"]
#pops_copy = pops_copy[cause_of_death_fips["_merge"] == "both"]
prescriptions_fips = prescriptions_fips[prescriptions_fips["_merge"] == "both"]


# drop merge columns
cause_of_death_fips.drop(columns=["_merge"], inplace=True)
prescriptions_fips.drop(columns=["_merge"], inplace=True)
pops_copy.drop(columns=["_merge",], inplace=True)

In [None]:
# create unique FIP from county and state fips

cause_of_death_fips["FIP_unique"] = cause_of_death_fips["CountyFIPS"].apply(lambda x: str(x)) + cause_of_death_fips["StateFIPS"].apply(lambda x: str(x))
prescriptions_fips["FIP_unique"] = prescriptions_fips["CountyFIPS"].apply(lambda x: str(x)) + prescriptions_fips["StateFIPS"].apply(lambda x: str(x))
pops_copy["FIP_unique"] = pops_copy["CountyFIPS"].apply(lambda x: str(x)) + pops_copy["StateFIPS"].apply(lambda x: str(x))


In [None]:
# TODO: add some sort of assert here. not sure what it should be yet


In [None]:
# create final prescriptions dataset with populations
# can safely left join here, because we only need records in the prescriptions dataset
prescriptions = prescriptions_fips.merge(pops_copy, on=["FIP_unique", "Year"], how="left", indicator=True)

assert (prescriptions["_merge"] == "both").all()

In [None]:
# one more assert to check length
assert len(prescriptions) == len(prescriptions_fips)


In [None]:
# drop some useless columns
prescriptions.drop(columns=["_merge", "CountyName_y", "StateFIPS_y", "CountyFIPS_y","state_abbrev_y", "County", "CountyFIPS_3"], inplace=True)

# rename x columns
prescriptions.rename(columns={"CountyName_x": "CountyName", "StateFIPS_x": "StateFIPS", "CountyFIPS_x": "CountyFIPS", "state_abbrev_x": "state_abbrev"}, inplace=True)


In [None]:
# create final cause of death dataset with populations
# can safely left join here, because we only need records in the cause of death dataset
cause_of_death = cause_of_death_fips.merge(pops_copy, on=["FIP_unique", "Year"], how="left", indicator=True)

assert (cause_of_death["_merge"] == "both").all()

In [None]:
# drop some useless columns
cause_of_death.drop(columns=["_merge", "CountyName_y", "StateFIPS_y", "CountyFIPS_y","state_abbrev_y", "County_y", "CountyFIPS_3", "State_y"], inplace=True)

# rename x columns
cause_of_death.rename(columns={"County_x": "County", "Year_x": "Year", "State_x": "State", "StateFIPS_x": "StateFIPS", "CountyFIPS_x": "CountyFIPS", "state_abbrev_x": "state_abbrev", "CountyName_x": "CountyName"}, inplace=True)


In [None]:
# asserts to make sure we didn't lose any records from our original datasets

assert len(df_cause_of_death) == len(cause_of_death)
assert len(df_prescriptions) == len(prescriptions)

#### Export main, unjoined datasets in case we need them

In [None]:
cause_of_death.to_csv("05_cleaned_data/cause_of_death_clean.csv", index=False)
prescriptions.to_csv("05_cleaned_data/arcos_all_washpost_clean.csv", index=False)

## Final 3 datasets

We should have: (UNSURE IF WE SHOULD EXTEND DATE RANGES, CURRENTLY 3 YEARS BEFORE AND AFTER POLICY IMPLEMENTATION)

- Florida and Georgia 2007 - 2013
- Texas and Oklahoma 2004 - 2010
- Washington and Oregon 2009 - 2015

### Drug overdose - broken down by state

In [None]:
# Florida and Georgia

prescriptions_fl = prescriptions.copy()
prescriptions_tx = prescriptions.copy()
prescriptions_wa = prescriptions.copy()

prescriptions_fl = prescriptions_fl[(prescriptions_fl["BUYER_STATE"] == "FL") | (prescriptions_fl["BUYER_STATE"].isin(fl_states))]
prescriptions_tx = prescriptions_tx[(prescriptions_tx["BUYER_STATE"] == "TX") | (prescriptions_tx["BUYER_STATE"]).isin(tx_states)]
prescriptions_wa = prescriptions_wa[(prescriptions_wa["BUYER_STATE"] == "WA") | (prescriptions_wa["BUYER_STATE"]).isin(wa_states)]



# filter appropriate years
fl_start = 2007
fl_end = 2013

tx_start = 2004
tx_end = 2010

wa_start = 2009
wa_end = 2015


prescriptions_fl = prescriptions_fl[(prescriptions_fl["Year"] >= fl_start) & (prescriptions_fl["Year"] <= fl_end)]
prescriptions_tx = prescriptions_tx[(prescriptions_tx["Year"] >= tx_start) & (prescriptions_tx["Year"] <= tx_end)]
prescriptions_wa = prescriptions_wa[(prescriptions_wa["Year"] >= wa_start) & (prescriptions_wa["Year"] <= wa_end)]


### Cause of death - broken down by state

In [None]:
deaths_fl = cause_of_death.copy()
deaths_tx = cause_of_death.copy()
deaths_wa = cause_of_death.copy()

deaths_fl = deaths_fl[(deaths_fl["StateName"] == "Florida") | (deaths_fl["State"].isin(fl_states))]
deaths_tx = deaths_tx[(deaths_tx["StateName"] == "Texas") | (deaths_tx["State"].isin(tx_states))]
deaths_wa = deaths_wa[(deaths_wa["StateName"] == "Washington") | (deaths_wa["State"].isin(wa_states))]

deaths_fl = deaths_fl[(deaths_fl["Year"] >= fl_start) & (deaths_fl["Year"] <= fl_end)]
deaths_tx = deaths_tx[(deaths_tx["Year"] >= tx_start) & (deaths_tx["Year"] <= tx_end)]  
deaths_wa = deaths_wa[(deaths_wa["Year"] >= wa_start) & (deaths_wa["Year"] <= wa_end)]

### export all to csv

In [None]:
prescriptions_fl.to_csv("05_cleaned_data/prescriptions_fl.csv", index=False)
prescriptions_tx.to_csv("05_cleaned_data/prescriptions_tx.csv", index=False)
prescriptions_wa.to_csv("05_cleaned_data/prescriptions_wa.csv", index=False)

deaths_fl.to_csv("05_cleaned_data/deaths_fl.csv", index=False)
deaths_tx.to_csv("05_cleaned_data/deaths_tx.csv", index=False)
deaths_wa.to_csv("05_cleaned_data/deaths_wa.csv", index=False)

## Notes for the group

- may need to filter out a couple more columns - haven't done this yet as I don't want to accidentally delete something we need
- overdose data is only broken down by year unless i messed something up - overdose analysis will have to be less granular