# Data Preparation Notebook

#### This notebook is the test bed for the data-reading steps that ingest the raw files from a data folder on the local machine

#### The end outputs of this notebook are that all data structures will have a 'COUNTY NAME, ST' column, and where applicable a FIPS code as well. Merges will be performed elsewhere.

### Import and parameters

In [1]:
# import packages

import pandas as pd
import numpy as np
import os

# Two-letter abbreviations of the states covered by the analysis.
# Each dataset ingested below is filtered down to the states in this list.
states = [
    "FL", "AL", "GA", "MS", "SC",
    "TX", "OK", "AZ", "NM",
    "WA", "OR", "ID",
    "CA", "NY",
]

# LA and NV were dropped because of issues
# (Carson City appears to be an independent city in Nevada rather than a county).


In [2]:
# Load the state-name / abbreviation lookup into its own dataframe
state_names_path = "../00_source_data/03_state_names.rtf"
state_df = pd.read_table(state_names_path, sep=",")
state_df.columns = ["STATE", "ABBREV"]

# Upper-case the state name and keep only the first two characters of the
# abbreviation (this drops the trailing slash carried over from the file)
state_df = state_df.assign(
    STATE=state_df["STATE"].str.upper(),
    ABBREV=state_df["ABBREV"].str.slice(0, 2),
)


### WAPO Dataset

This section reads the WAPO dataset from the path set below and produces a dataframe of transaction quantities summed by county, state, year and month

***Ingest actions***

In [3]:
# path to the zipped WAPO transaction file; it is passed to pd.read_csv in the ingest cell below
wapo = "../00_source_data/prescription_data.zip"


In [None]:
# Ingest the WAPO file as chunks - takes roughly 15 minutes

chunks = 500000  # rows per chunk. Leave this, there will be 465 chunks
total_chunks = 465  # expected chunk count at the chunk size above; used only for the progress line

chunk_counter = 0

# Per-chunk aggregates are collected here and concatenated once after the loop.
# Calling pd.concat inside the loop copies the accumulated frame again on every pass.
chunk_aggregates = []

for chunk in pd.read_csv(
    wapo,
    compression="zip",
    chunksize=chunks,
    usecols=[
        "BUYER_COUNTY",
        "BUYER_STATE",
        "DRUG_NAME",
        "TRANSACTION_DATE",
        "QUANTITY",
        "UNIT",
    ],
    # Read the date as text: the "%m%d%Y" parse below expects zero-padded
    # digits, and integer type inference would strip a leading zero.
    dtype={"TRANSACTION_DATE": str},
):
    chunk_counter += 1
    percent_chunk = round(chunk_counter / total_chunks * 100, 2)
    print("Reading chunk: ", chunk_counter, "of", total_chunks, "(", percent_chunk, "%)")

    # filter the chunk to only include the states in the list
    chunk = chunk[chunk["BUYER_STATE"].isin(states)].copy()

    chunk["TRANSACTION_DATE"] = pd.to_datetime(
        chunk["TRANSACTION_DATE"], format="%m%d%Y"
    )

    chunk["YEAR"] = chunk["TRANSACTION_DATE"].dt.year
    chunk["MONTH"] = chunk["TRANSACTION_DATE"].dt.month

    # make the quantity numeric; values that cannot be parsed become NaN
    chunk["QUANTITY"] = pd.to_numeric(chunk["QUANTITY"], errors="coerce")

    # Reduce the chunk to one row per county / state / year / month.
    # A county can span several chunks, so the result is grouped again in the next cell.
    # NOTE(review): groupby leaves out rows whose key is NaN by default - confirm
    # that dropping transactions with a missing BUYER_COUNTY is intended.
    chunk = chunk.groupby(["BUYER_COUNTY", "BUYER_STATE", "YEAR", "MONTH"]).agg(
        {"QUANTITY": "sum"}
    )

    chunk_aggregates.append(chunk)

# Single concatenation; an input that yields no chunks still gives an empty frame
wapo_df = pd.concat(chunk_aggregates) if chunk_aggregates else pd.DataFrame()

print("Ingest complete")


In [5]:
# Reshape the per-chunk aggregates into the final WAPO table
print("Adding county name...")

# Turn the group keys held in the index back into ordinary columns
wapo_flat = wapo_df.reset_index()

# Build the "<COUNTY> COUNTY, <ST>" label
county_label = wapo_flat["BUYER_COUNTY"] + " COUNTY, " + wapo_flat["BUYER_STATE"]
wapo_flat["COUNTY_NAME"] = county_label

print("Grouping WAPO data...")

# Sum the quantities again, this time across chunks, then tidy the columns:
# BUYER_STATE becomes STATE, and YEAR / MONTH are stored as strings
group_keys = ["COUNTY_NAME", "BUYER_STATE", "YEAR", "MONTH"]
wapo_df = (
    wapo_flat.groupby(group_keys)["QUANTITY"]
    .sum()
    .reset_index()
    .rename(columns={"BUYER_STATE": "STATE"})
    .astype({"YEAR": str, "MONTH": str})
)

print("Operation complete")


Adding county name...
Grouping WAPO data...
Operation complete


***Print a sample to make sure you did it right***

In [6]:
# wapo_df.sample(10)


In [7]:
# the number of distinct states in the cleaned data must equal the number of states requested
assert wapo_df["STATE"].unique().size == len(states)


***Assert tests to verify that we have the right states, and some checks on the data***

In [8]:
# every row must carry a county name
assert not wapo_df["COUNTY_NAME"].isnull().any()
# the states present must be exactly the states requested
assert set(states) == set(wapo_df["STATE"].unique())


***Save the output file to a csv in the intermediate files directory***

In [9]:
# save the cleaned WAPO frame as wapo_clean.csv in the intermediate files directory (row index is not written)
wapo_df.to_csv("../20_intermediate_files/wapo_clean.csv", index=False)


### Vitality Data

This section reads the tab-delimited txt files in the vital statistics folder and produces a dataframe of drug-related deaths summed by county and year

***Ingest actions***

In [10]:
# directory that holds the vital statistics txt files; every file ending in .txt in it is read by the next cell
nick_path = "../00_source_data/US_VitalStatistics/"


In [11]:
# generate a df from the txt files in a folder path
print("Ingesting text files...")

# sorted() makes the row order reproducible; os.listdir returns names in arbitrary order
txt_files = sorted(file for file in os.listdir(nick_path) if file.endswith(".txt"))

# Read every file first and concatenate once, instead of growing a frame inside a loop.
# skipfooter=15 drops the last 15 lines of each file (bottom rows that are not needed);
# skipfooter is only supported by the python parser engine.
txt_tables = [
    pd.read_table(
        os.path.join(nick_path, file), sep="\t", skipfooter=15, engine="python"
    )
    for file in txt_files
]
nick_df = pd.concat(txt_tables, axis=0) if txt_tables else pd.DataFrame()

# subset to the columns we want
vital_df = nick_df[
    [
        "County",
        "County Code",
        "Year",
        "Drug/Alcohol Induced Cause",
        "Drug/Alcohol Induced Cause Code",
        "Deaths",
    ]
].copy()

print("Transforming vitality data...")

# change the year column to a string with a year only (first four characters)
vital_df["Year"] = vital_df["Year"].astype(str).str[0:4]

# change the County Code column to a zero-padded string of at least 6 characters
# NOTE(review): county FIPS codes are normally 5 digits - confirm that padding to 6
# matches the FIPS format used by the tables this is merged with later
vital_df["County Code"] = vital_df["County Code"].astype(str).str.zfill(6)

# add a state column with the last two characters from county
vital_df["State"] = vital_df["County"].str[-2:]

# convert NaN deaths to 0
vital_df["Deaths"] = vital_df["Deaths"].fillna(0)

# change the county name to all caps
vital_df["County"] = vital_df["County"].str.upper()

# change all the column names to all caps
vital_df.columns = vital_df.columns.str.upper()

# rename county code to FIPS and county to COUNTY_NAME
vital_df = vital_df.rename(columns={"COUNTY CODE": "FIPS"})
vital_df = vital_df.rename(columns={"COUNTY": "COUNTY_NAME"})

# filter the vital df to the states of interest
vital_df = vital_df[vital_df["STATE"].isin(states)].copy()

print("Operation complete")


Ingesting text files...
Transforming vitality data...
Operation complete


***Subset to rows with drug deaths, group and add***

In [12]:
cause_col = "DRUG/ALCOHOL INDUCED CAUSE"

# convert the cause labels to lower case so the text matches below do not depend on case
vital_df[cause_col] = vital_df[cause_col].str.lower()

# Flag rows whose cause mentions "drug".
# na=False: without it a missing cause yields NaN from str.contains, and np.where
# treats NaN as true, which would flag the row as a drug death.
vital_df["DRUG"] = np.where(vital_df[cause_col].str.contains("drug", na=False), 1, 0)

# "all other non-drug and non-alcohol causes" also contains the substring "drug",
# so the flag above marks it as 1; remove that label explicitly
vital_df = vital_df[
    vital_df[cause_col] != "all other non-drug and non-alcohol causes"
].copy()

# keep only the rows flagged as drug-related
vital_df = vital_df[vital_df["DRUG"] == 1].copy()


In [13]:
# count the rows remaining for each cause label after the drug filter above
vital_df["DRUG/ALCOHOL INDUCED CAUSE"].value_counts()


drug poisonings (overdose) unintentional (x40-x44)    3017
drug poisonings (overdose) suicide (x60-x64)           783
all other drug-induced causes                          326
drug poisonings (overdose) undetermined (y10-y14)      195
Name: DRUG/ALCOHOL INDUCED CAUSE, dtype: int64

In [14]:
# convert the death column to an integer so it can be summed in the grouping step
# NOTE(review): astype(int) raises on entries that are not numeric - confirm DEATHS only holds numbers here
vital_df["DEATHS"] = vital_df["DEATHS"].astype(int)


In [15]:
# Group the vital df and sum the deaths per county / FIPS / year / state / drug flag.
# DEATHS is aggregated explicitly: the frame still carries the text cause columns, and a
# bare .sum() relies on the pandas-version-dependent numeric_only default to leave them out.
new_vital_df = (
    vital_df.groupby(["COUNTY_NAME", "FIPS", "YEAR", "STATE", "DRUG"])
    .agg({"DEATHS": "sum"})
    .reset_index()
)


  .sum()


***Check a sample to inspect data***

In [16]:
# new_vital_df.sample(10)


***Assert tests to verify that we have the right states, and some checks on the data***

In [17]:
# Assert that no value is missing anywhere in the filtered frame.
# (DataFrame.all() returns booleans, which are never null, so chaining
# .isnull() onto it can never fail - count the nulls directly instead.)
assert vital_df.isnull().sum().sum() == 0
# Assert that states are in the list
assert set(vital_df["STATE"].unique()) == set(states)
# Still need to find a way to check if Vital is correctly chunked

# Grouping must not lose or duplicate any deaths
assert vital_df["DEATHS"].sum() == new_vital_df["DEATHS"].sum()


***Save the output dataframe to the intermediate files directory***

In [18]:
# save the grouped vital statistics frame as vital_clean.csv in the intermediate files directory (row index is not written)
new_vital_df.to_csv("../20_intermediate_files/vital_clean.csv", index=False)


### Add FIPS Codes to the available data

This chunk takes the path to FIPS codes and will ingest them into a dataframe

***Ingest data***

In [19]:
# path to the FIPS code text file; it is read twice below, once for the county rows and once for the state rows
fips_path = "../00_source_data/02_fcc_fips_codes.txt"


***start with counties***

In [20]:
# Read the county part of the FIPS file: the first 71 lines are skipped and each
# remaining line arrives as a single text column
raw_county_lines = pd.read_table(fips_path, sep="\t", skiprows=71, header="infer")
raw_county_lines.columns = ["FIPS"]
line_text = raw_county_lines["FIPS"]

# Slice each line by position: characters 4-9 hold the code, everything from
# position 10 onwards is the name (whitespace trimmed, upper-cased)
fips_county_df = pd.DataFrame(
    {
        "FIPS": line_text.str.slice(4, 10),
        "COUNTY_NAME": line_text.str.slice(10).str.strip().str.upper(),
    }
)

print("Operation complete")


Operation complete


In [21]:
# Ingest the state part of the fips file (the first 15 lines are skipped)
fips_state_df = pd.read_table(fips_path, sep="\t", skiprows=15, header="infer")

# keep the first 50 rows; .copy() makes this an independent frame so the column
# assignments below do not write into a slice of the frame that was read
# NOTE(review): confirm that 50 rows cover every state entry needed from the file
fips_state_df = fips_state_df.iloc[0:50, :].copy()

# name the column header
fips_state_df.columns = ["FIPS"]

# everything from position 10 onwards is the state name (whitespace trimmed)
fips_state_df["STATE"] = fips_state_df["FIPS"].str[10:]
fips_state_df["STATE"] = fips_state_df["STATE"].str.strip()

# characters 4-9 of each line hold the code
fips_state_df["FIPS"] = fips_state_df["FIPS"].str[4:10]

# merge to add the state abbreviations from the state_df
fips_state_df = fips_state_df.merge(state_df, on="STATE", how="left")

print("Operation complete")


Operation complete


***merge the two dataframes***

In [22]:
# Attach a state abbreviation and state FIPS code to every row of the county listing
fips_df = fips_county_df.copy()

# Use the row's own name as a temporary merge key. Only rows whose name equals a
# state name find a match; every other row gets NaN from the merge.
# (Presumably each state's own row precedes its counties in the file - confirm.)
fips_df["STATE"] = fips_df["COUNTY_NAME"]

# merge to get the state abbreviations; a left merge keeps the row order,
# which the forward fill below relies on
fips_df = fips_df.merge(fips_state_df, on="STATE", how="left")

# drop the temporary key column
fips_df = fips_df.drop(columns=["STATE"])

# rename the columns
fips_df = fips_df.rename(
    columns={"FIPS_x": "FIPS", "ABBREV": "STATE", "FIPS_y": "STATE_FIPS"}
)

# forward fill the state fips and state columns from the closest matched row above
# (.ffill() replaces the deprecated fillna(method="ffill") spelling)
fips_df["STATE_FIPS"] = fips_df["STATE_FIPS"].ffill()
fips_df["STATE"] = fips_df["STATE"].ffill()

# add state to the county name
fips_df["COUNTY_NAME"] = fips_df["COUNTY_NAME"] + ", " + fips_df["STATE"]

# filter the df to the states of interest
fips_df = fips_df[fips_df["STATE"].isin(states)]


***Sample the FIPS codes to see if we did it right***

In [23]:
# fips_df.sample(10)


***Add some assert tests***

In [24]:
# Assert that the FIPS table built above has no missing values.
# (This check previously looked at vital_df, and DataFrame.all() followed by
# .isnull() can never report a null - count the nulls in fips_df directly.)
assert fips_df.isnull().sum().sum() == 0
# check the first two digits of the FIPS code to make sure they match with STATE_FIPS
assert list(fips_df["FIPS"].str[:2]) == list(
    fips_df["STATE_FIPS"].astype(str).str.strip()
)
# fips_df["FIPS"].dtype


***Save the FIPS file cleaned to the 20_intermediate_files directory***

In [25]:
# save the fips_df as fips_df.csv in the intermediate files directory (row index is not written)
fips_df.to_csv("../20_intermediate_files/fips_df.csv", index=False)


### Census Data

This chunk will ingest filtered county population data and return a dataframe

***Ingest data***

In [26]:
# path to the raw census workbook; it is read with pd.read_excel in the next cell
census_path = "../00_source_data/01_census_data.xlsx"


In [27]:
# Read the first two columns of the census workbook, skipping the four rows above the header
census_df = pd.read_excel(census_path, header=0, skiprows=4, usecols="A:B")
census_df.columns = ["COUNTY_NAME", "POPULATION"]

# Drop the leading period, then split "<county>, <state>" once and reuse both parts
name_parts = census_df["COUNTY_NAME"].str[1:].str.split(",")
census_df["COUNTY_NAME"] = name_parts.str[0]
census_df["STATE"] = name_parts.str[1].str.strip().str.upper()

# Nullable integer dtype for the population
census_df["POPULATION"] = census_df["POPULATION"].astype("Int64")

# Bring in the two-letter abbreviation for each state name
census_df_merge = pd.merge(census_df, state_df, on="STATE", how="outer")

# Rebuild the county label as "<COUNTY>, <ST>" in upper case
county_with_abbrev = census_df_merge["COUNTY_NAME"] + ", " + census_df_merge["ABBREV"]
census_df_merge["COUNTY_NAME"] = county_with_abbrev.str.upper()

# The abbreviation replaces the full state name as the STATE column
census_df_merge = census_df_merge.drop(columns="STATE").rename(
    columns={"ABBREV": "STATE"}
)

# Keep only the states of interest
in_scope = census_df_merge["STATE"].isin(states)
census_df_merge = census_df_merge[in_scope].copy()


***Sample to see if we did it right***

In [33]:
# list the distinct state abbreviations that remain after filtering to the states of interest
census_df_merge["STATE"].unique()

array(['AL', 'AZ', 'CA', 'FL', 'GA', 'ID', 'MS', 'NM', 'NY', 'OK', 'OR',
       'SC', 'TX', 'WA'], dtype=object)

In [28]:
# census_df_merge.sample(10)


***Add assert tests***

In [36]:
# Expected county count for each selected state (figures taken from google and wikipedia)
counties = {
    "FL": 67,
    "AL": 67,
    "GA": 159,
    "MS": 82,
    "SC": 46,
    "TX": 254,
    "OK": 77,
    "AZ": 15,
    "NM": 33,
    "WA": 39,
    "OR": 36,
    "ID": 44,
    "CA": 58,
    "NY": 62,
}
# the states in the census table must be exactly the states listed in the dictionary
assert set(counties) == set(census_df_merge["STATE"].unique())

In [39]:
# Assert that no value is missing anywhere in the census table.
# (DataFrame.all() returns booleans, which are never null, so chaining
# .isnull() onto it can never fail - count the nulls directly instead.)
assert census_df_merge.isnull().sum().sum() == 0
# check last two characters of county name to make sure they match with state
assert list(census_df_merge["COUNTY_NAME"].str[-2:]) == list(census_df_merge["STATE"])
# the number of distinct county labels must equal the expected total across the selected states
assert census_df_merge["COUNTY_NAME"].nunique() == sum(counties.values())

***Save the output file to the intermediate files directory***

In [30]:
# save the census table as census_df.csv in the intermediate files directory (row index is not written)
census_df_merge.to_csv("../20_intermediate_files/census_df.csv", index=False)


In [41]:
# Is this exact spelling present among the county names?
# (`in` applied to a Series tests the index labels, not the values, so compare the values.)
census_df_merge["COUNTY_NAME"].eq("DEBACA COUNTY, NM").any()

False

In [42]:
# Is this exact spelling present among the county names?
# (`in` applied to a Series tests the index labels, not the values, so compare the values.)
census_df_merge["COUNTY_NAME"].eq("DONA ANA COUNTY, NM").any()

False

In [50]:
# number of census rows whose state is NM
int(census_df_merge["STATE"].eq("NM").sum())

33

In [49]:
# show the census rows for NM to inspect how the county names are spelled
census_df_merge[census_df_merge["STATE"]=="NM"]

Unnamed: 0,COUNTY_NAME,POPULATION,STATE
1795,"BERNALILLO COUNTY, NM",662564,NM
1796,"CATRON COUNTY, NM",3725,NM
1797,"CHAVES COUNTY, NM",65645,NM
1798,"CIBOLA COUNTY, NM",27213,NM
1799,"COLFAX COUNTY, NM",13750,NM
1800,"CURRY COUNTY, NM",48376,NM
1801,"DE BACA COUNTY, NM",2022,NM
1802,"DOÑA ANA COUNTY, NM",209233,NM
1803,"EDDY COUNTY, NM",53829,NM
1804,"GRANT COUNTY, NM",29514,NM
