In [1]:
import numpy as np
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# Load the intermediate deaths file (relative path, resolved against the current working directory).
deaths = pd.read_csv("../20_intermediate_files/deaths_wa.csv")

In [3]:
# Sanity check: Washington and the three comparison states should all be present.
deaths["StateName"].value_counts()

North Carolina    1142
Colorado           642
Washington         570
Maryland           363
Name: StateName, dtype: int64

In [4]:
# List the available columns.
deaths.columns

Index(['County', 'County Code', 'Year', 'Year Code',
       'Drug/Alcohol Induced Cause', 'Drug/Alcohol Induced Cause Code',
       'Deaths', 'State', 'StateFIPS', 'CountyName', 'StateName', 'CountyFIPS',
       'StateAbbr', 'STATE_COUNTY', 'state_abbrev', 'FIP_unique', 'Population',
       'county_test'],
      dtype='object')

In [5]:
# List the distinct cause categories; the overdose-related ones are collected in `overdose_list`.
deaths["Drug/Alcohol Induced Cause"].unique()

array(['Drug poisonings (overdose) Unintentional (X40-X44)',
       'Drug poisonings (overdose) Suicide (X60-X64)',
       'All other alcohol-induced causes',
       'All other non-drug and non-alcohol causes',
       'Alcohol poisonings (overdose) (X45, X65, Y15)',
       'Drug poisonings (overdose) Undetermined (Y10-Y14)',
       'All other drug-induced causes'], dtype=object)

In [6]:
# Partition the records: Washington rows vs. rows from every other state (the comparison group).
washington = deaths.loc[deaths["StateName"] == "Washington"]
comp = deaths.loc[deaths["StateName"] != "Washington"]

In [7]:
# Work on independent copies so that columns added to these frames do not touch `deaths`.
wa_deaths = washington.copy()
comp_deaths = comp.copy()

In [8]:
# Cause categories counted as overdoses (the code ranges in parentheses are part of the label text).
# NOTE(review): alcohol poisonings are included alongside drug poisonings - confirm this is intended.
overdose_list = [
    "Drug poisonings (overdose) Unintentional (X40-X44)",
    "Drug poisonings (overdose) Suicide (X60-X64)",
    "Drug poisonings (overdose) Undetermined (Y10-Y14)",
    "Alcohol poisonings (overdose) (X45, X65, Y15)",
]

In [9]:
# Add an `overdose` indicator column, initialised to 0 for every row.
wa_deaths["overdose"] = 0
comp_deaths["overdose"] = 0

In [10]:
# Set the indicator to 1 on rows whose cause is one of the overdose categories.
for frame in (wa_deaths, comp_deaths):
    frame.loc[frame["Drug/Alcohol Induced Cause"].isin(overdose_list), "overdose"] = 1

In [11]:
# Keep only the overdose rows. The explicit .copy() gives each result its own data, so adding
# columns to these frames afterwards does not raise SettingWithCopyWarning.
wa_deaths = wa_deaths[wa_deaths["overdose"] == 1].copy()
comp_deaths = comp_deaths[comp_deaths["overdose"] == 1].copy()

In [12]:
# Express each row's death count as a rate per 100,000 residents.
for frame in (wa_deaths, comp_deaths):
    frame["overdose_per_100k"] = frame["Deaths"] / frame["Population"] * 100_000

In [13]:
# Sum the per-row rates within each county-year; the comparison group is also keyed by state.
wa_result = (
    wa_deaths
    .groupby(["Year", "County"])["overdose_per_100k"]
    .agg("sum")
    .reset_index()
)
comp_result = (
    comp_deaths
    .groupby(["Year", "StateName", "CountyName"])["overdose_per_100k"]
    .agg("sum")
    .reset_index()
)

In [14]:
# Yearly mean rate for each group; these frames feed the plots.
# NOTE(review): the mean is taken over the row-level rates in wa_deaths / comp_deaths, not over
# the per-county totals in wa_result / comp_result - confirm which is intended.
wa_stats = (
    wa_deaths
    .groupby(["Year"])["overdose_per_100k"]
    .agg("mean")
    .reset_index()
)
comp_stats = (
    comp_deaths
    .groupby(["Year"])["overdose_per_100k"]
    .agg("mean")
    .reset_index()
)

In [15]:
# Summary statistics of the county-year totals, one labelled column per group, shown side by side.
wa_res = wa_result.describe()[["overdose_per_100k"]].rename(
    columns={"overdose_per_100k": "Overdoses per 100k Residents - Washington"}
)
comp_res = comp_result.describe()[["overdose_per_100k"]].rename(
    columns={"overdose_per_100k": "Overdoses per 100k Residents - Comp States"}
)
stats = pd.concat([wa_res, comp_res], axis=1)
stats

Unnamed: 0,Overdoses per 100k Residents - Washington,Overdoses per 100k Residents - Comp States
count,110.0,432.0
mean,13.76018,15.142326
std,4.515969,8.280981
min,5.666846,1.845015
25%,9.990164,9.649632
50%,14.01108,13.405339
75%,16.473949,18.800017
max,26.39064,78.330841


In [16]:
# wa_result = wa_result.groupby("Year")["overdose_per_100k"].mean().reset_index()
# comp_result = comp_result.groupby(["Year"])["overdose_per_100k"].mean().reset_index()

In [17]:
# Snapshot the yearly means before later cells modify wa_stats.
wa_res_checkpoint = wa_stats.copy()
comp_res_checkpoint = comp_stats.copy()

# Each snapshot must have the same number of rows as its source frame.
assert len(wa_res_checkpoint) == len(wa_stats)
assert len(comp_res_checkpoint) == len(comp_stats)

In [18]:
# Cast Year to an integer dtype, then display the dtypes to confirm.
# NOTE(review): only wa_stats is cast here; comp_stats keeps its original Year dtype.
wa_stats["Year"] = wa_stats["Year"].astype("int")
wa_stats.dtypes

Year                   int64
overdose_per_100k    float64
dtype: object

In [19]:
# Partition the yearly series at the policy year: strictly before 2012 vs. 2012 onward.
wa_b4 = wa_stats.loc[wa_stats["Year"] < 2012]
wa_after = wa_stats.loc[wa_stats["Year"] >= 2012]

In [20]:
# Shorthand column names: the year column and the rate column.
x = "Year"
y = "overdose_per_100k"

In [21]:

# function for adding a vertical line
def vertical_line(year):
    """Return an Altair rule chart: a black vertical line at x = ``year``."""
    # Single-row frame holding the x position of the rule and its literal colour.
    marker = pd.DataFrame({"Date": [year], "color": ["black"]})
    x_position = "Date:Q"
    # scale=None disables the colour scale so the value in `color` is used directly.
    rule_color = alt.Color("color:N", scale=None)
    return alt.Chart(marker).mark_rule().encode(x=x_position, color=rule_color)

In [22]:
# function for adding confidence intervals 
def get_reg_fit(data, yvar, xvar, color, title, alpha=0.05):
    """Fit ``yvar ~ xvar`` by OLS and chart the fitted line with its confidence band.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns named by ``yvar`` and ``xvar``.
    yvar, xvar : str
        Response and predictor column names; they are interpolated into the
        model formula as ``f"{yvar} ~ {xvar}"``.
    color : str
        Colour applied to both the fitted line and the band.
    title : str
        Title set on the fitted-line chart.
    alpha : float, default 0.05
        Significance level passed to ``conf_int`` for the band.

    Returns
    -------
    predictions : pandas.DataFrame
        101 evenly spaced ``xvar`` values with the fitted ``yvar`` and the
        ``ci_low`` / ``ci_high`` bounds.
    chart : Altair layered chart
        The error band layered under the fitted line.
    """
    # Prediction grid over the x range of the rows that have a response value.
    # np.linspace fixes both endpoints and the number of points; np.arange with a
    # fractional step can overshoot xmax by one step and fails when xmin == xmax.
    observed_x = data.loc[pd.notnull(data[yvar]), xvar]
    grid = np.linspace(observed_x.min(), observed_x.max(), 101)
    predictions = pd.DataFrame({xvar: grid})

    # Fit the model (smf is imported at the top of the notebook) and predict on the grid.
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Fitted line; neither axis is forced to include zero.
    reg = (
        alt.Chart(predictions)
        .mark_line()
        .encode(
            x=alt.X(xvar, scale=alt.Scale(zero=False), title="Year"),
            y=alt.Y(
                yvar, scale=alt.Scale(zero=False), title="Overdoses per 100k Residents"
            ),
            color=alt.value(color),
        )
        .properties(title=title)
    )
    # Band between the lower and upper bounds; its own y title is left empty.
    ci = (
        alt.Chart(predictions)
        .mark_errorband()
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
            color=alt.value(color),
        )
    )
    chart = ci + reg
    return predictions, chart

In [23]:
# Vertical rule marking 2012, the cutoff year when the policy was implemented.
line = vertical_line(2012)

In [24]:
# plotting - need to make x axis discrete
# Fit separate trends before and after the policy year and overlay them with the cutoff rule.
fit_wa_b4, reg_chart_wa_b4 = get_reg_fit(
    wa_b4,
    yvar="overdose_per_100k",
    xvar="Year",
    color="blue",
    title="Overdoses per 100k Residents - Washington",
    alpha=0.05,
)

# The post-policy predictions get their own name; they were previously assigned to
# fit_wa_b4, which silently replaced the pre-policy predictions.
fit_wa_after, reg_chart_wa_after = get_reg_fit(
    wa_after,
    yvar="overdose_per_100k",
    xvar="Year",
    color="blue",
    title="Overdoses per 100k Residents - Washington",
    alpha=0.05,
)
(reg_chart_wa_b4 + reg_chart_wa_after + line).properties(width=700, height=500)

In [None]:
########################################################################################################################
# ends here for final report     

In [None]:
def get_reg_fit(data, yvar, xvar, alpha=0.05):
    """Fit ``yvar ~ xvar`` by OLS and chart the fitted line with its confidence band.

    Note: defining this function rebinds the name ``get_reg_fit``, so a
    definition with a different signature made earlier in the session is no
    longer reachable afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns named by ``yvar`` and ``xvar``.
    yvar, xvar : str
        Response and predictor column names; they are interpolated into the
        model formula as ``f"{yvar} ~ {xvar}"``.
    alpha : float, default 0.05
        Significance level passed to ``conf_int`` for the band.

    Returns
    -------
    predictions : pandas.DataFrame
        101 evenly spaced ``xvar`` values with the fitted ``yvar`` and the
        ``ci_low`` / ``ci_high`` bounds.
    chart : Altair layered chart
        The error band layered under the fitted line.
    """
    # Prediction grid over the x range of the rows that have a response value.
    # np.linspace fixes both endpoints and the number of points; np.arange with a
    # fractional step can overshoot xmax by one step and fails when xmin == xmax.
    observed_x = data.loc[pd.notnull(data[yvar]), xvar]
    grid = np.linspace(observed_x.min(), observed_x.max(), 101)
    predictions = pd.DataFrame({xvar: grid})

    # Fit the model (smf is imported at the top of the notebook) and predict on the grid.
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Fitted line plus the band between the lower and upper bounds.
    reg = alt.Chart(predictions).mark_line().encode(x=xvar, y=yvar)
    ci = (
        alt.Chart(predictions)
        .mark_errorband()
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=yvar),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [None]:
# Yearly mean and standard deviation of the row-level rates. String aggregator names are used
# because passing np.mean / np.std to .agg is deprecated in recent pandas releases.
# Note: this result is neither assigned nor the last expression of the cell, so it is not displayed.
wa_deaths.groupby("Year")["overdose_per_100k"].agg(["mean", "std"])

# Row-level split at the policy year: strictly before 2012 vs. 2012 onward.
wa_deaths_b4 = wa_deaths[wa_deaths["Year"] < 2012]
wa_deaths_after = wa_deaths[wa_deaths["Year"] >= 2012]

In [None]:
# Points showing the yearly mean rate for the pre-2012 rows; the mean is computed
# inside the chart by the aggregate transform, grouped by Year.
source_data = wa_deaths_b4

plot_b4 = (
    alt.Chart(source_data)
    .transform_aggregate(
        mean_overdose="mean(overdose_per_100k)",
        groupby=["Year"],
    )
    .mark_point()
    .encode(
        alt.X("Year:O", scale=alt.Scale(zero=False)),
        alt.Y("mean_overdose:Q", scale=alt.Scale(zero=False)),
    )
)


In [None]:
# Fit the trend on `source_data` (wa_deaths_b4 when the cells are run in order)
# and show the prediction table.
fit_b4, reg_chart_b4 = get_reg_fit(
    source_data, yvar="overdose_per_100k", xvar="Year", alpha=0.05
)
fit_b4

In [None]:
# Overlay the yearly-mean points with the fitted line and band.
# NOTE(review): the points encode Year as ordinal (:O) while the regression layers leave the
# type of Year to be inferred - confirm the layers share an x axis as intended.
plot_b4 + reg_chart_b4

In [None]:

# Same points-plus-fit construction as for the pre-2012 rows, now on the rows from 2012 onward.
source_data = wa_deaths_after

# Yearly mean rate as points; the mean is computed by the aggregate transform, grouped by Year.
plot_after = (
    alt.Chart(source_data)
    .mark_point()
    .encode(
        x=alt.X("Year:O", scale=alt.Scale(zero=False)),
        y=alt.Y("mean_overdose:Q", scale=alt.Scale(zero=False)),
    ).transform_aggregate(
        mean_overdose='mean(overdose_per_100k)',
        groupby=["Year"]
    )
)
# Note: a bare expression that is not the last statement of a cell is not displayed by default.
plot_after
fit_after, reg_chart_after = get_reg_fit(
    source_data, yvar="overdose_per_100k", xvar="Year", alpha=0.05
)
# Note: also not displayed by default, for the same reason.
fit_after
# Only this final expression is rendered as the cell output.
plot_after + reg_chart_after

In [None]:
# Layer the pre- and post-2012 points and fitted trends into one chart.
plot_b4 + reg_chart_b4 + plot_after + reg_chart_after

In [None]:
################################

In [None]:
# # saved original code for plotting
# wa_deaths.groupby("Year")["overdose_per_100k"].agg([np.mean, np.std])

# wa_deaths_b4 = wa_deaths[wa_deaths["Year"] < 2012]
# wa_deaths_after = wa_deaths[wa_deaths["Year"] >= 2012]

# source_data = wa_deaths_b4

# plot_wa_b4 = alt.Chart(source_data).mark_point().encode(
#     y=alt.Y("mean_overdose:Q", scale=alt.Scale(zero=False)),
#     x=alt.X("Year:O", scale=alt.Scale(zero=False))
# ).transform_aggregate(
#     mean_overdose='mean(overdose_per_100k)',
#     groupby=["Year"]
# )

# plot_wa_b4

# fit_wa_b4 = plot_wa_b4.transform_regression('Year', 'mean_overdose',method="linear"
# ).mark_line()

# fit_wa_b4

# source_data = wa_deaths_after

# plot_wa_after = alt.Chart(source_data).mark_point().encode(
#     y=alt.Y("mean_overdose:Q", scale=alt.Scale(zero=False)),
#     x=alt.X("Year:O", scale=alt.Scale(zero=False))
# ).transform_aggregate(
#     mean_overdose='mean(overdose_per_100k)',
#     groupby=["Year"]
# )

# plot_wa_after

# fit_wa_after = plot_wa_after.transform_regression('Year', 'mean_overdose',method="linear"
# ).mark_line()

# fit_wa_after

# plot_wa_b4 + fit_wa_b4 + plot_wa_after + fit_wa_after

In [None]:
# Express a calendar year as an offset from the policy year (2012 by default).

def scale_years(year, policy_year=2012, window=3):
    """Return ``year - policy_year`` as an int, or None when it is out of range.

    Parameters
    ----------
    year : number
        Calendar year to convert.
    policy_year : int, default 2012
        Year that maps to offset 0.
    window : int, default 3
        Largest absolute offset that is mapped; with the defaults the years
        2009..2015 map to -3..3.

    Returns
    -------
    int or None
        The whole-number offset, or None for years outside the window,
        fractional or NaN years, and values that do not support subtraction.
    """
    try:
        offset = year - policy_year
    except TypeError:
        # Non-numeric input (e.g. a string or None) has no offset.
        return None
    # Membership in the range rejects NaN, fractional offsets and anything beyond +/- window.
    if offset in range(-window, window + 1):
        return int(offset)
    return None

# Attach the policy-relative year to the Washington yearly series.
wa_stats["year relative to policy"] = wa_stats["Year"].apply(scale_years)
# comp_result["year relative to policy"] = comp_result["Year"].apply(lambda x: scale_years(x))

# scale_years returns None for years it does not map, so a null here means an unexpected year.
assert wa_stats["year relative to policy"].notnull().all()
# assert (comp_result["year relative to policy"].isnull().sum() == 0)

# doing this in case the float == int comparison causes issues

In [None]:
# Split at the 2012 policy year: strictly before vs. strictly after.
# NOTE(review): 2012 itself falls in neither half here, whereas the earlier split of wa_stats
# uses `>= 2012` for the post-policy half - confirm which is intended.

wa_b4 = wa_stats[wa_stats["Year"] < 2012]
wa_after = wa_stats[wa_stats["Year"] > 2012]


# wa_after = wa_after[wa_after["Year"] != 2007] # may need to handle this differently

In [None]:
from sklearn.linear_model import LinearRegression

# One independent linear model per period.
regressor_b4 = LinearRegression()
regressor_after = LinearRegression()

# scikit-learn expects 2-D inputs, hence the reshape to a single column.
X_b4 = np.array(wa_b4["year relative to policy"]).reshape(-1, 1)
y_b4 = np.array(wa_b4["overdose_per_100k"]).reshape(-1, 1)

X_after = np.array(wa_after["year relative to policy"]).reshape(-1, 1)
y_after = np.array(wa_after["overdose_per_100k"]).reshape(-1, 1)

regressor_b4.fit(X_b4, y_b4)
regressor_after.fit(X_after, y_after)

# Each model predicts on its own period. The post-policy model was previously evaluated
# on X_b4, so y_pred_after did not correspond to the X_after values it is plotted against.
y_pred_b4 = regressor_b4.predict(X_b4)
y_pred_after = regressor_after.predict(X_after)

In [None]:
# Inspect the predictions produced by the pre-policy regressor.
y_pred_b4

In [None]:
# Inspect the predictions produced by the post-policy regressor.
y_pred_after

In [None]:
# Draw both sets of predictions in black on one axis limited to offsets -3..3.
plt.xlim(-3, 3)
#plt.ylim(0, 500)

plt.title("Overdose Deaths in Washington")

plt.plot(X_b4, y_pred_b4,color='k')
plt.plot(X_after, y_pred_after,color='k')

# TODO: also plot the average value in each year

In [None]:
# Column names passed to get_preds as predictor (x) and response (y).
x = "Year"
y = "overdose_per_100k"

In [None]:
def vertical_line(year):
    """Build a one-row Altair chart that draws a black vertical rule at ``year``."""
    rule_data = pd.DataFrame({"Date": [year], "color": ["black"]})
    rule = alt.Chart(rule_data).mark_rule()
    return rule.encode(
        x="Date:Q",  # Q = quantitative, per the Altair docs
        color=alt.Color("color:N", scale=None),
    )

In [None]:
# Smoke-test the helper with a sample year.
# NOTE(review): 2004 differs from the 2012 cutoff used elsewhere, and this rebinds `line`.
line = vertical_line(2004)

In [None]:
def get_charts(b4, after, title_b4, title_after):
    """Build point charts of the overdose rate for the pre- and post-policy frames.

    Baseline only: not used in the final plot.

    Each frame needs the columns "overdose_per_100k" and
    "year relative to policy". Returns ``(before_chart, after_chart)``.
    """

    def _scatter(frame, title):
        # Neither axis is forced to include zero.
        return (
            alt.Chart(frame)
            .mark_point()
            .encode(
                x=alt.X("year relative to policy", scale=alt.Scale(zero=False)),
                y=alt.Y("overdose_per_100k", scale=alt.Scale(zero=False)),
            )
            .properties(title=title)
        )

    return _scatter(b4, title_b4), _scatter(after, title_after)

In [None]:
# Try the helper on the Washington split and layer the two charts.
# The title parameters may be dropped later since this chart is not part of the final analysis;
# they stay for now in case titles cannot be attached to the fit/regression line charts.
base_before, base_after = get_charts(
    b4=wa_b4,
    after=wa_after,
    title_b4="deaths before policy",
    title_after="deaths after policy",
)
base_before + base_after

In [None]:
# starting here in final report
# no longer calculating base chart above - just adding in regression line at same time
def get_preds(df, x, y, alpha=0.05):
    """Fit ``y ~ x`` by OLS on ``df`` and return fitted values with confidence bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named by ``x`` and ``y`` plus "Year" and
        "year relative to policy", which are copied into the result.
    x, y : str
        Predictor and response column names, interpolated into the model
        formula as ``f"{y} ~ {x}"``.
    alpha : float, default 0.05
        Significance level passed to ``conf_int``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``y`` (fitted values), "ci_low", "ci_high", "Year" and
        "year relative to policy".
    """
    # Fit the model and evaluate it at the observed x values.
    model = smf.ols(f"{y} ~ {x}", data=df).fit()
    model_predict = model.get_prediction(df[x])

    predictions = pd.DataFrame()
    # The fitted column is named after the response that was modelled; it used to be
    # hard-coded to "overdose_per_100k" whatever `y` was.
    predictions[y] = model.predict(df[x])
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Carry both year columns over; the assignments align on the index shared with df.
    predictions["Year"] = df["Year"]
    predictions["year relative to policy"] = df["year relative to policy"]
    return predictions

In [None]:
# Fitted values and confidence bounds for each period, using the x / y column names set above.
wa_b4_preds = get_preds(wa_b4, x, y)
wa_after_preds = get_preds(wa_after, x, y)

In [None]:
def get_charts(b4, after, title_b4, title_after, color):
    """Build regression-line charts for the pre- and post-policy frames.

    For each frame, Altair's regression transform is applied to
    "overdose_per_100k" against "year relative to policy" and the result is
    drawn as a line in ``color``. Returns ``(before_chart, after_chart)``.

    Note: defining this function rebinds the name ``get_charts``, so a
    definition with a different signature made earlier in the session is no
    longer reachable afterwards.
    """

    def _trend(frame, title):
        return (
            alt.Chart(frame)
            # mark_point is superseded by the mark_line call further down the chain.
            .mark_point()
            .encode(
                y=alt.Y("overdose_per_100k", scale=alt.Scale(zero=False)),
                x=alt.X("year relative to policy", scale=alt.Scale(zero=False)),
            )
            .properties(title=title)
            .transform_regression("year relative to policy", "overdose_per_100k")
            .mark_line()
            .encode(color=alt.value(color))
        )

    return _trend(b4, title_b4), _trend(after, title_after)

In [None]:
# Try the helper on the prediction frames and layer the two regression lines.
# The title parameters may be dropped later since this chart is not part of the final analysis;
# they stay for now in case titles cannot be attached to the fit/regression line charts.
base_before, base_after = get_charts(
    b4=wa_b4_preds,
    after=wa_after_preds,
    title_b4="Deaths Before Policy Implementation",
    title_after="Deaths After Policy Implementation",
    color="red",
)
base_before + base_after