In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from plotnine import *
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [103]:
# load in pre-cleaned deaths data for Washington and comparison states
deaths = pd.read_csv("../20_intermediate_files/deaths_wa.csv")


In [104]:
# Split the deaths data in two: the Washington rows and the rows of every
# other state (the comparison states).
washington = deaths.loc[deaths["StateName"].eq("Washington")]
comp = deaths.loc[deaths["StateName"].ne("Washington")]

### Helper functions 

In [105]:
# function to select only overdose records
# double check this

def select_overdose(record):
    """Flag whether a cause-of-death label denotes a drug overdose.

    Returns 1 for the three "Drug poisonings (overdose)" labels, 0 for the
    other four known labels, and the string "error" for any value that is
    not one of the seven recognised labels.
    """
    non_overdose_causes = (
        "All other non-drug and non-alcohol causes",
        "All other alcohol-induced causes",
        "All other drug-induced causes",
        "Alcohol poisonings (overdose) (X45, X65, Y15)",
    )
    overdose_causes = (
        "Drug poisonings (overdose) Unintentional (X40-X44)",
        "Drug poisonings (overdose) Suicide (X60-X64)",
        "Drug poisonings (overdose) Undetermined (Y10-Y14)",
    )

    if record in non_overdose_causes:
        return 0
    if record in overdose_causes:
        return 1

    # Unrecognised label: return a sentinel the caller can check for.
    return "error"
    

# Work on explicit copies so that adding a column below does not trigger
# pandas' "A value is trying to be set on a copy of a slice" warning.
wa_deaths = washington.copy()
comp_deaths = comp.copy()

# Tag every record: 1 = drug overdose, 0 = other known cause, "error" = unknown label.
wa_deaths["overdose"] = wa_deaths["Drug/Alcohol Induced Cause"].apply(select_overdose)
comp_deaths["overdose"] = comp_deaths["Drug/Alcohol Induced Cause"].apply(select_overdose)

# Every cause label must have been recognised by select_overdose.
assert not wa_deaths["overdose"].eq("error").any()
assert not comp_deaths["overdose"].eq("error").any()

# Keep only the overdose records (drop everything flagged 0).
wa_deaths = wa_deaths.loc[wa_deaths["overdose"].ne(0)]
comp_deaths = comp_deaths.loc[comp_deaths["overdose"].ne(0)]

In [106]:
# Express deaths as a rate: deaths per 100,000 residents.
wa_deaths["overdose_per_100k"] = wa_deaths["Deaths"].div(wa_deaths["Population"]).mul(100_000)
comp_deaths["overdose_per_100k"] = comp_deaths["Deaths"].div(comp_deaths["Population"]).mul(100_000)

In [107]:
# Total overdose rate per county and year: sum overdose_per_100k over all
# rows that share the same year and county.
wa_result = (
    wa_deaths
    .groupby(["Year", "County"])["overdose_per_100k"]
    .sum()
    .reset_index()
)
comp_result = (
    comp_deaths
    .groupby(["Year", "StateName", "CountyName"])["overdose_per_100k"]
    .sum()
    .reset_index()
)

In [108]:
# display the Washington overdose rates per year and county
wa_result

Unnamed: 0,Year,County,overdose_per_100k
0,2009.0,Benton,9.934433
1,2009.0,Clallam,18.290023
2,2009.0,Clark,14.006400
3,2009.0,Cowlitz,17.625286
4,2009.0,Grant,19.464163
...,...,...,...
105,2015.0,Snohomish,16.499978
106,2015.0,Spokane,18.828923
107,2015.0,Thurston,9.702289
108,2015.0,Whatcom,10.857456


In [109]:
# Summary statistics of the per-county, per-year overdose rate, one labelled
# single-column frame per group.
wa_res = (
    wa_result["overdose_per_100k"]
    .describe()
    .to_frame("Overdoses per 100k Residents - Washington")
)
comp_res = (
    comp_result["overdose_per_100k"]
    .describe()
    .to_frame("Overdoses per 100k Residents - Comp States")
)

In [110]:
# put the Washington and comparison-state summaries side by side (one column each)
stats = pd.concat([wa_res, comp_res], axis=1)
stats

Unnamed: 0,Overdoses per 100k Residents - Washington,Overdoses per 100k Residents - Comp States
count,110.0,432.0
mean,13.685771,15.081958
std,4.513948,8.240315
min,5.666846,1.845015
25%,9.990164,9.607172
50%,13.668037,13.405339
75%,16.473949,18.537336
max,26.39064,78.330841


In [111]:
# Collapse both results to one row per year: the mean of overdose_per_100k
# across that year's rows. This overwrites the county-level frames.
wa_result = (
    wa_result
    .groupby("Year")["overdose_per_100k"]
    .mean()
    .reset_index()
)
comp_result = (
    comp_result
    .groupby("Year")["overdose_per_100k"]
    .mean()
    .reset_index()
)

In [112]:
# Snapshot the year-level results (presumably as a restore point for later cells).
wa_res_checkpoint = wa_result.copy()
comp_res_checkpoint = comp_result.copy()

# Sanity-check the year-level aggregation. Comparing a frame's length with the
# length of its own fresh copy can never fail, so assert real invariants:
# there is data to plot, and each frame holds exactly one row per year
# (this fails if the yearly aggregation cell has not been run).
assert not wa_result.empty and not comp_result.empty, "no yearly overdose rates to plot"
assert wa_result["Year"].is_unique and comp_result["Year"].is_unique, "expected one row per year"

## Pre-post plot - overdoses per 100k through the years

In [113]:

# Split the Washington yearly series at 2012: years before vs. 2012 onwards.
wa_b4 = wa_result.loc[wa_result["Year"].lt(2012)]
wa_after = wa_result.loc[wa_result["Year"].ge(2012)]

In [114]:
# Split the comparison-state yearly series at 2012: years before vs. 2012 onwards.
comp_b4 = comp_result.loc[comp_result["Year"].lt(2012)]
comp_after = comp_result.loc[comp_result["Year"].ge(2012)]


In [115]:
# column names of the plotted variables
# NOTE(review): x and y are not referenced by any cell visible here — confirm they are still needed
x = "Year"
y = "overdose_per_100k"

In [116]:
def vertical_line(year):
    """Return an Altair rule chart drawing a black vertical line at x = ``year``.

    Intended to mark the year of policy implementation on a chart.
    """
    marker = pd.DataFrame({"Date": [year], "color": ["black"]})
    rule = alt.Chart(marker).mark_rule()
    return rule.encode(
        x="Date:Q",
        # scale=None makes Altair use the literal colour value from the data
        color=alt.Color("color:N", scale=None),
    )

In [117]:
def get_reg_fit(
    data,
    yvar,
    xvar,
    color,
    title,
    alpha=0.05,
    x_title="Year",
    y_title="Overdoses per 100k Residents",
):
    """Fit ``yvar ~ xvar`` by OLS and chart the fitted line with its confidence band.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``xvar`` and ``yvar``.
    yvar, xvar : str
        Names of the response and regressor columns. They are interpolated
        into the model formula, so they must be valid formula identifiers.
    color : str
        Colour used for both the fitted line and the band.
    title : str
        Title set on the line layer of the chart.
    alpha : float, default 0.05
        Significance level; the band is the ``1 - alpha`` confidence
        interval of the fitted mean.
    x_title, y_title : str
        Axis titles. The defaults reproduce the titles that were previously
        hard-coded on the line layer.

    Returns
    -------
    predictions : pandas.DataFrame
        Columns ``xvar`` (prediction grid), ``yvar`` (fitted mean),
        ``ci_low`` and ``ci_high``.
    chart : altair layered chart
        The confidence band with the fitted line drawn on top.
    """
    import statsmodels.formula.api as smf

    # Grid for predicted values: 101 evenly spaced points over the x-range of
    # the rows whose y is observed. linspace hits both end points exactly; a
    # float-step arange can overshoot xmax or return one extra point, and it
    # fails outright when xmin == xmax (step of zero).
    observed_x = data.loc[pd.notnull(data[yvar]), xvar]
    grid = np.linspace(observed_x.min(), observed_x.max(), 101)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions. The grid is passed as a DataFrame so the
    # formula's variable is looked up by column name.
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[[xvar]])
    predictions[yvar] = model_predict.summary_frame()["mean"].to_numpy()
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart. Both layers carry the same axis titles: when layered charts
    # give a shared axis different titles, the rendered axis shows them joined.
    reg = (
        alt.Chart(predictions)
        .mark_line()
        .encode(
            x=alt.X(xvar, scale=alt.Scale(zero=False), title=x_title),
            y=alt.Y(yvar, scale=alt.Scale(zero=False), title=y_title),
            color=alt.value(color),
        )
        .properties(title=title)
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband()
        .encode(
            x=alt.X(xvar, title=x_title),
            y=alt.Y("ci_low", title=y_title),
            y2="ci_high",
            color=alt.value(color),
        )
    )
    chart = ci + reg
    return predictions, chart


In [118]:
# vertical marker at 2012, the year at which the series are split into before/after
line = vertical_line(2012)

In [119]:
# Separate linear fits for Washington before 2012 and from 2012 onwards.
fit_wa_b4, reg_chart_wa_b4 = get_reg_fit(
    wa_b4,
    yvar="overdose_per_100k",
    xvar="Year",
    color="blue",
    title="Overdoses per 100k Residents - Washington",
    alpha=0.05,
)

# Store the post-period predictions under their own name; they used to be
# assigned to fit_wa_b4, silently replacing the pre-period predictions.
fit_wa_after, reg_chart_wa_after = get_reg_fit(
    wa_after,
    yvar="overdose_per_100k",
    xvar="Year",
    color="blue",
    title="Overdoses per 100k Residents - Washington",
    alpha=0.05,
)

# Pre-post plot: both fitted segments plus the 2012 marker.
(reg_chart_wa_b4 + reg_chart_wa_after + line).properties(width=700, height=500)



# Diff-in-Diff Plot

In [120]:
# Re-fit the Washington segments with a title that suits the combined
# diff-in-diff chart.
fit_wa_b4, reg_chart_wa_b4 = get_reg_fit(
    wa_b4,
    yvar="overdose_per_100k",
    xvar="Year",
    color="blue",
    title="Overdoses per 100k Residents - Washington (blue) vs. Comp States (green)",
    alpha=0.05,
)

# Store the post-period predictions under their own name; they used to be
# assigned to fit_wa_b4, silently replacing the pre-period predictions.
fit_wa_after, reg_chart_wa_after = get_reg_fit(
    wa_after,
    yvar="overdose_per_100k",
    xvar="Year",
    color="blue",
    title="Overdoses per 100k Residents - Washington (blue) vs. Comp States (green)",
    alpha=0.05,
)

# Washington above
################################################################
# comparison states below

fit_comp_b4, reg_chart_comp_b4 = get_reg_fit(
    comp_b4,
    yvar="overdose_per_100k",
    xvar="Year",
    color="green",
    title="Overdoses per 100k Residents - Comp States",
    alpha=0.05,
)

fit_comp_after, reg_chart_comp_after = get_reg_fit(
    comp_after,
    yvar="overdose_per_100k",
    xvar="Year",
    color="green",
    title="Overdoses per 100k Residents - Comp States",
    alpha=0.05,
)


In [121]:
# diff-in-diff plot: Washington (blue) and comparison-state (green) fits on either side of 2012, plus the 2012 marker
(reg_chart_wa_b4 + reg_chart_wa_after + reg_chart_comp_b4 + reg_chart_comp_after + line).properties(width=700, height=500)