In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from plotnine import *
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# Pre-cleaned deaths records covering Texas and the comparison states.
deaths_csv_path = "05_cleaned_data/deaths_tx.csv"
deaths = pd.read_csv(deaths_csv_path)


In [3]:
# Partition the records on StateName: Texas rows vs. every other (comparison) row.

state_name = deaths["StateName"]
texas = deaths.loc[state_name.eq("Texas")]
comp = deaths.loc[state_name.ne("Texas")]

### Helper functions 

In [4]:
# function to select only overdose records
# double check this

def select_overdose(record):
    """Flag whether a cause-of-death label is a drug overdose.

    Returns 1 for the three "Drug poisonings (overdose)" labels, 0 for the
    four other recognised labels, and the string "error" for anything else.
    """
    not_overdose = (
        "All other non-drug and non-alcohol causes",
        "All other alcohol-induced causes",
        "All other drug-induced causes",
        "Alcohol poisonings (overdose) (X45, X65, Y15)",
    )
    drug_overdose = (
        "Drug poisonings (overdose) Unintentional (X40-X44)",
        "Drug poisonings (overdose) Suicide (X60-X64)",
        "Drug poisonings (overdose) Undetermined (Y10-Y14)",
    )

    for label in not_overdose:
        if record == label:
            return 0

    for label in drug_overdose:
        if record == label:
            return 1

    # label is not one of the seven known categories
    return "error"
    

# Work on explicit copies so that adding a column does not trigger pandas'
# "A value is trying to be set on a copy of a slice" warning.
tx_deaths = texas.copy()
comp_deaths = comp.copy()

# Flag every record: 1 = drug overdose, 0 = other known cause,
# "error" = label not recognised by select_overdose.
tx_deaths["overdose"] = tx_deaths["Drug/Alcohol Induced Cause"].apply(select_overdose)
comp_deaths["overdose"] = comp_deaths["Drug/Alcohol Induced Cause"].apply(select_overdose)

# Keep only rows positively flagged as overdoses. Filtering on `!= 0` also kept
# the rows flagged "error" (unrecognised or missing cause labels), which silently
# counted them as overdose deaths.
tx_deaths = tx_deaths[tx_deaths["overdose"] == 1]
comp_deaths = comp_deaths[comp_deaths["overdose"] == 1]

In [5]:
# Express the death counts as a rate per 100_000 residents, for both groups.

for deaths_frame in (tx_deaths, comp_deaths):
    deaths_frame["overdose_per_100k"] = (
        deaths_frame["Deaths"].div(deaths_frame["Population"]).mul(100_000)
    )

In [6]:
# Sum the per-100k rates within each group so there is one row per
# year/county (Texas) and per year/state/county (comparison states).
# NOTE(review): Texas is keyed on "County" while the comparison frame is keyed
# on "CountyName" - confirm both columns identify counties the same way.
rate_column = "overdose_per_100k"
tx_group_keys = ["Year", "County"]
comp_group_keys = ["Year", "StateName", "CountyName"]

tx_result = tx_deaths.groupby(tx_group_keys)[rate_column].sum().reset_index()
comp_result = comp_deaths.groupby(comp_group_keys)[rate_column].sum().reset_index()

# keep validating num counties


# recent change - added county to both results above, removed state from comp_result

In [7]:
# yearly mean of the county-level rates - Arkansas
(
    comp_result.loc[comp_result["StateName"].eq("Arkansas")]
    .groupby("Year")["overdose_per_100k"]
    .mean()
)

Year
2004.0     9.546982
2005.0     9.061194
2006.0     8.093736
2007.0    12.526991
2008.0    11.997588
2009.0    13.608821
2010.0    12.148777
Name: overdose_per_100k, dtype: float64

In [None]:
# yearly mean of the county-level rates - Kansas
(
    comp_result.loc[comp_result["StateName"].eq("Kansas")]
    .groupby("Year")["overdose_per_100k"]
    .mean()
)

In [None]:
# yearly mean of the county-level rates - Louisiana
(
    comp_result.loc[comp_result["StateName"].eq("Louisiana")]
    .groupby("Year")["overdose_per_100k"]
    .mean()
)

In [None]:
# yearly mean of the county-level rates - New Mexico
(
    comp_result.loc[comp_result["StateName"].eq("New Mexico")]
    .groupby("Year")["overdose_per_100k"]
    .mean()
)

In [None]:
# yearly mean of the county-level rates - Oklahoma
(
    comp_result.loc[comp_result["StateName"].eq("Oklahoma")]
    .groupby("Year")["overdose_per_100k"]
    .mean()
)

In [None]:
# yearly mean of the Texas county-level rates
tx_result.groupby("Year")["overdose_per_100k"].agg("mean")

In [None]:
# yearly mean of the county-level rates across all comparison states
comp_result.groupby("Year")["overdose_per_100k"].agg("mean")

In [None]:
# idea for counties -> histogram of population, drop the outliers?

# plt.hist(tx_deaths["Population"], bins=20)

In [None]:
# Collapse each result to one row per Year holding the mean rate.
# as_index=False keeps Year as a regular column, so no reset_index() is needed.
tx_result = tx_result.groupby("Year", as_index=False)["overdose_per_100k"].mean()
comp_result = comp_result.groupby("Year", as_index=False)["overdose_per_100k"].mean()

In [None]:
# idea for counties -> histogram of population, drop the outliers?

In [None]:
comp_result

In [None]:
#tx_result = tx_deaths.groupby(["Year", "County"])["overdose_per_100k"].sum().reset_index()
#comp_result = comp_deaths.groupby(["Year", "County"])["overdose_per_100k"].sum().reset_index()
#
#tx_result = tx_result.groupby("Year")["overdose_per_100k"].mean().reset_index()
#comp_result = comp_result.groupby(["Year"])["overdose_per_100k"].mean().reset_index()

In [None]:
# collapse state column with a mean aggregation
# gives us the average overdoses of our comparison states over the years
#comp_result = comp_result.groupby(["Year"])["overdose_per_100k"].mean().reset_index()

In [None]:
# create a scale for number of years before and after 2007 (target year)

def scale_years(year):
    """Map a calendar year in 2004-2010 to its offset from 2007 (-3 .. 3).

    Any other value yields None.
    """
    # pair each offset with its calendar year: (-3, 2004), (-2, 2005), ..., (3, 2010)
    for offset, calendar_year in enumerate(range(2004, 2011), start=-3):
        if year == calendar_year:
            return offset
    return None

relative_year_col = "year relative to policy"
tx_result[relative_year_col] = tx_result["Year"].map(scale_years)
comp_result[relative_year_col] = comp_result["Year"].map(scale_years)

# scale_years returns None for any year it does not map, so a null here
# means an unexpected Year value slipped through

assert tx_result[relative_year_col].notna().all()
assert comp_result[relative_year_col].notna().all()

# doing this in case the float == int comparison causes issues

## Pre-post plot - overdoses per 100k through the years

In [None]:
# Texas rows before the 2007 policy year ...
tx_b4 = tx_result.loc[tx_result["Year"].lt(2007)]

# ... and strictly after it; 2007 itself lands in neither frame
# (may need to handle this differently)
tx_after = tx_result.loc[tx_result["Year"].gt(2007)]

In [None]:
from sklearn.linear_model import LinearRegression

# Fit one straight line through the pre-policy Texas points and another through
# the post-policy points (x = year relative to policy, y = overdoses per 100k).
regressor_b4 = LinearRegression()
regressor_after = LinearRegression()

# reshape(-1, 1) turns each column into a single-column 2-D array for scikit-learn
X_b4 = np.array(tx_b4["year relative to policy"]).reshape(-1, 1)
y_b4 = np.array(tx_b4["overdose_per_100k"]).reshape(-1, 1)

X_after = np.array(tx_after["year relative to policy"]).reshape(-1, 1)
y_after = np.array(tx_after["overdose_per_100k"]).reshape(-1, 1)

regressor_b4.fit(X_b4, y_b4)
regressor_after.fit(X_after, y_after)

# Fitted values for each period. The post-policy model has to be evaluated on
# the post-policy x values: it was previously evaluated on X_b4, so the line
# later drawn over X_after showed predictions for the wrong years.
y_pred_b4 = regressor_b4.predict(X_b4)
y_pred_after = regressor_after.predict(X_after)



In [None]:
y_pred_b4

In [None]:
y_pred_after

In [None]:
# x axis spans the full window of years relative to the policy
plt.xlim(-3, 3)
#plt.ylim(0, 500)

plt.title("Overdose Deaths in Texas")
# label the axes so the figure can be read on its own
plt.xlabel("Year relative to policy")
plt.ylabel("Overdose deaths per 100k")

# fitted pre-policy and post-policy lines
plt.plot(X_b4, y_pred_b4, color='k')
plt.plot(X_after, y_pred_after, color='k')

# plot avg value in each year


In [None]:
# diff in diff

In [None]:
# comparison-state rows before the 2007 policy year ...
comp_b4 = comp_result.loc[comp_result["Year"].lt(2007)]

# ... and strictly after it; 2007 itself lands in neither frame
# (may need to handle this differently)
comp_after = comp_result.loc[comp_result["Year"].gt(2007)]


In [None]:
# Same pre/post straight-line fits as for Texas, now on the comparison states.
# This rebinds regressor_*, X_*, y_* and y_pred_* to the comparison-state values.
regressor_b4 = LinearRegression()
regressor_after = LinearRegression()

# reshape(-1, 1) turns each column into a single-column 2-D array for scikit-learn
X_b4 = np.array(comp_b4["year relative to policy"]).reshape(-1, 1)
y_b4 = np.array(comp_b4["overdose_per_100k"]).reshape(-1, 1)

X_after = np.array(comp_after["year relative to policy"]).reshape(-1, 1)
y_after = np.array(comp_after["overdose_per_100k"]).reshape(-1, 1)

regressor_b4.fit(X_b4, y_b4)
regressor_after.fit(X_after, y_after)

# Fitted values for each period. The post-policy model has to be evaluated on
# the post-policy x values: it was previously evaluated on X_b4, so the line
# later drawn over X_after showed predictions for the wrong years.
y_pred_b4 = regressor_b4.predict(X_b4)
y_pred_after = regressor_after.predict(X_after)



In [None]:
y_pred_b4

In [None]:
y_pred_after

In [None]:
# x axis spans the full window of years relative to the policy
plt.xlim(-3, 3)
#plt.ylim(0, 500)

# title and axis labels added so this figure matches the Texas plot and can be read on its own
plt.title("Overdose Deaths in Comparison States")
plt.xlabel("Year relative to policy")
plt.ylabel("Overdose deaths per 100k")

# fitted pre-policy and post-policy lines
plt.plot(X_b4, y_pred_b4, color='k')
plt.plot(X_after, y_pred_after, color='k')


In [None]:
# new method after working more with reshaping exercises
# using statsmodels now, wrapping stuff in functions
# thinking it could be easier to break it down by task

## Helper functions

1) vertical_line()
    - takes in a year and returns a chart with a vertical line at that year
2) get_charts()
    - takes in two dataframes (one for before policy, one for after) and a title for each
    - returns a scatter chart for each
    - not much utility by itself - its outputs are passed as parameters to our get_fits() function

In [None]:
def vertical_line(year):
    """Build a chart holding one black vertical rule at x = year (the policy year)."""
    rule_data = pd.DataFrame({"Date": [year], "color": ["black"]})

    x_position = "Date:Q"  # Q for "quantitative" - as per altair docs
    rule_color = alt.Color("color:N", scale=None)

    return alt.Chart(rule_data).mark_rule().encode(x=x_position, color=rule_color)

In [None]:
# test function
vertical_line(2004)

In [None]:
def get_charts(b4, after, title_b4, title_after):
    """
    Build the pre-policy and post-policy scatter charts.

    Each chart plots "overdose_per_100k" against "year relative to policy".
    These are not part of the final plot - they are the base charts that the
    regression-line charts are derived from later.

    Returns a (chart_before, chart_after) tuple.
    """

    def _scatter(frame, title):
        # zero=False on both scales: neither axis is forced to include zero
        return (
            alt.Chart(frame)
            .mark_point()
            .encode(
                y=alt.Y("overdose_per_100k", scale=alt.Scale(zero=False)),
                x=alt.X("year relative to policy", scale=alt.Scale(zero=False)),
            )
            .properties(title=title)
        )

    return _scatter(b4, title_b4), _scatter(after, title_after)

In [None]:
# test the function
# may remove title parameters later - not really necessary as we aren't plotting this part in our final analysis
# however, if we can't add a title to our fit/regression line charts, we may need to add them here
base_before, base_after = get_charts(
    tx_b4,
    tx_after,
    "deaths before policy",
    "deaths after policy",
)
base_before + base_after

In [None]:
def get_fits(chart_b4, chart_after):
    """
    Derive a red regression line from each of the pre and post charts.

    Regresses "overdose_per_100k" on "year relative to policy" for each chart
    and returns a (fit_before, fit_after) tuple.
    """

    def _red_regression_line(chart):
        fitted = chart.transform_regression(
            "year relative to policy", "overdose_per_100k"
        )
        return fitted.mark_line().encode(color=alt.value("red"))

    return _red_regression_line(chart_b4), _red_regression_line(chart_after)

In [None]:
# test function
fit_b4, fit_after = get_fits(base_before, base_after)
fit_b4 + fit_after

In [None]:
# 'ci' error band of overdose_per_100k by Year - one chart per period
band_b4, band_after = (
    alt.Chart(period_frame)
    .mark_errorband(extent='ci')
    .encode(x=alt.X('Year'), y=alt.Y('overdose_per_100k'))
    for period_frame in (tx_b4, tx_after)
)

In [None]:
fit_b4 + fit_after + error_bars

In [None]:
error_bars = base_before.mark_rule().encode(
    x='ci0(overdose_per_100k):Q',
    x2='ci1(overdose_per_100k):Q',
)


In [None]:
error_bars

In [None]:
# Shaded area intended as a band between a 'lower' and an 'upper' field.
# NOTE(review): base_before is the Chart returned by get_charts(), not a
# DataFrame - confirm that passing a Chart as the data source is intended.
# NOTE(review): no 'lower' / 'upper' columns are created anywhere in the
# visible notebook - verify these fields exist before relying on this chart.
band = alt.Chart(base_before).mark_area(
    opacity=0.5
).encode(
    x='overdose_per_100k:Q',
    y='lower:N',
    y2='upper:N'
)

In [None]:
band

In [None]:
# The rate column is named "overdose_per_100k"; the previous
# "overdoses_per_100k" spelling pointed at a column that does not exist.

# black circles for the pre-policy frame
mean_points = alt.Chart(tx_b4).mark_circle(color='black').encode(
    alt.X("overdose_per_100k:Q", bin=True),
    y='year relative to policy:Q',
)

# 'ci' error bars for the post-policy frame
error_bars = alt.Chart(tx_after).mark_errorbar(extent='ci').encode(
    alt.X("overdose_per_100k:Q", bin=True),
    y='year relative to policy:Q',
)

In [None]:
mean_points + error_bars

In [None]:
fit_b4 + fit_after + error_bars

In [None]:
# attempt 1 - line of the yearly mean with a 'ci' error band
# https://www.pythoncharts.com/python/line-chart-with-confidence-interval/#altair
tx_deaths_chart = alt.Chart(tx_deaths)

line = tx_deaths_chart.mark_line().encode(x='Year', y='mean(overdose_per_100k)')

band = tx_deaths_chart.mark_errorband(extent='ci').encode(
    x=alt.X('Year', title='Year'),
    y=alt.Y('overdose_per_100k', title='Overdose Deaths per 100k'),
)

# band is listed first, line second
chart = alt.layer(band, line).properties(
    title='Pre-post Model for Texas',
    width=600,
    height=400,
)

chart

In [None]:
# attempt 2 - let altair fit a linear regression of the rate on Year
# https://cmdlinetips.com/2019/12/barplots-scatter-plots-boxplots-with-altair-4-0/
line = (
    alt.Chart(tx_deaths)
    .mark_line()
    .encode(x=alt.X('Year'), y=alt.Y('overdose_per_100k'))
)

line_regress = line.transform_regression(
    'Year', 'overdose_per_100k', method="linear"
).mark_line()

In [None]:
# 'ci' error band of the rate by Year, to layer with the regression line
band = (
    alt.Chart(tx_deaths)
    .mark_errorband(extent='ci')
    .encode(
        x=alt.X('Year', title='Year'),
        y=alt.Y('overdose_per_100k', title='Overdose Deaths per 100k'),
    )
)

In [None]:
line_regress + band