In [1]:
import numpy as np
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
prescriptions = pd.read_csv("05_cleaned_data/prescriptions_wa.csv", low_memory=False)

In [3]:
prescriptions.columns

Index(['REPORTER_DEA_NO', 'REPORTER_BUS_ACT', 'REPORTER_NAME',
       'REPORTER_ADDL_CO_INFO', 'REPORTER_ADDRESS1', 'REPORTER_ADDRESS2',
       'REPORTER_CITY', 'REPORTER_STATE', 'REPORTER_ZIP', 'REPORTER_COUNTY',
       'BUYER_DEA_NO', 'BUYER_BUS_ACT', 'BUYER_NAME', 'BUYER_ADDL_CO_INFO',
       'BUYER_ADDRESS1', 'BUYER_ADDRESS2', 'BUYER_CITY', 'BUYER_STATE',
       'BUYER_ZIP', 'BUYER_COUNTY', 'TRANSACTION_CODE', 'DRUG_CODE', 'NDC_NO',
       'DRUG_NAME', 'QUANTITY', 'UNIT', 'ACTION_INDICATOR', 'ORDER_FORM_NO',
       'CORRECTION_NO', 'STRENGTH', 'TRANSACTION_DATE', 'CALC_BASE_WT_IN_GM',
       'DOSAGE_UNIT', 'TRANSACTION_ID', 'Product_Name', 'Ingredient_Name',
       'Measure', 'MME_Conversion_Factor', 'Combined_Labeler_Name',
       'Revised_Company_Name', 'Reporter_family', 'dos_str', 'Year', 'Month',
       'StateFIPS', 'CountyName', 'StateName', 'CountyFIPS', 'StateAbbr',
       'STATE_COUNTY', 'state_abbrev', 'FIP_unique', 'State', 'Population',
       'county_test'],
      dtyp

In [4]:
# reducing the number of columns in the drug prescriptions dataset by including only the relevant attributes to create a new dataset

prescriptions_reduced = prescriptions[
    [
        "DRUG_CODE",
        "DRUG_NAME",
        "QUANTITY",
        "UNIT",
        "STRENGTH",
        "CALC_BASE_WT_IN_GM",
        "DOSAGE_UNIT",
        "Product_Name",
        "Ingredient_Name",
        "Measure",
        "MME_Conversion_Factor",
        "dos_str",
        "Year",
        "Month",
        "StateFIPS",
        "StateName",
        "CountyFIPS",
        "state_abbrev",
        "FIP_unique",
        "Population",
        "county_test",
    ]
]

In [5]:
# creating a copy of reduced dataset of prescriptions and converting some of the attributes to appropriate data type

prescriptions_reduced_copy = prescriptions_reduced.copy()

prescriptions_reduced_copy["Year"] = prescriptions_reduced_copy["Year"].astype("int64")
prescriptions_reduced_copy["DRUG_CODE"] = prescriptions_reduced_copy[
    "DRUG_CODE"
].astype("int64")
prescriptions_reduced_copy["Month"] = prescriptions_reduced_copy["Month"].astype(
    "int64"
)
prescriptions_reduced_copy["Population"] = prescriptions_reduced_copy[
    "Population"
].astype("int64")

prescriptions_reduced_copy.rename(
    columns={
        "DRUG_CODE": "Drug Code",
        "DRUG_NAME": "Drug Name",
        "QUANTITY": "Quantity",
        "UNIT": "Unit",
        "STRENGTH": "Strength",
        "CALC_BASE_WT_IN_GM": "Calc Base Weight (In Gm)",
        "DOSAGE_UNIT": "Dosage Unit",
        "dos_str": "Dosage Strength",
        "StateFIPS": "State FIPS",
        "StateName": "State",
        "CountyFIPS": "County FIPS",
        "state_abbrev": "State Abbreviation",
        "FIP_unique": "FIPS_Unique",
        "county_test": "County",
    },
    inplace=True,
)

In [6]:
prescriptions_reduced_copy.head(3)

Unnamed: 0,Drug Code,Drug Name,Quantity,Unit,Strength,Calc Base Weight (In Gm),Dosage Unit,Product_Name,Ingredient_Name,Measure,...,Dosage Strength,Year,Month,State FIPS,State,County FIPS,State Abbreviation,FIPS_Unique,Population,County
0,9193,HYDROCODONE,1.0,,,2.27025,500.0,HYDROCODONE BITARTRATE & ACETA 7.5MG,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,...,7.5,2012,6,53,Washington,53009,WA,5300953,71766,Clallam
1,9193,HYDROCODONE,1.0,,,0.3027,100.0,HYDROCODONE BITARTRATE AND ACETA 5MG,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,...,5.0,2010,4,30,Montana,30013,MT,3001330,81507,Cascade
2,9143,OXYCODONE,2.0,,,1.793,200.0,ENDOCET - 10MG OXYCODONE.HCL/325MG A,OXYCODONE HYDROCHLORIDE,TAB,...,10.0,2010,4,30,Montana,30013,MT,3001330,81507,Cascade


In [24]:
# creating a dataset that has all the drug prescriptions in the state of Washington

washington_prescriptions = prescriptions_reduced_copy[
    prescriptions_reduced_copy["State"] == "Washington"
]

washington_prescriptions.head()

Unnamed: 0,Drug Code,Drug Name,Quantity,Unit,Strength,Calc Base Weight (In Gm),Dosage Unit,Product_Name,Ingredient_Name,Measure,...,Dosage Strength,Year,Month,State FIPS,State,County FIPS,State Abbreviation,FIPS_Unique,Population,County
0,9193,HYDROCODONE,1.0,,,2.27025,500.0,HYDROCODONE BITARTRATE & ACETA 7.5MG,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,...,7.5,2012,6,53,Washington,53009,WA,5300953,71766,Clallam
257,9193,HYDROCODONE,3.0,,,0.9081,300.0,HYDROCODONE BIT/ACETA 5MG/325MG USP,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,...,5.0,2011,4,53,Washington,53063,WA,5306353,473436,Spokane
258,9193,HYDROCODONE,2.0,,,0.6054,200.0,HYDROCODONE BIT/ACETA 5MG/325MG USP,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,...,5.0,2011,9,53,Washington,53063,WA,5306353,473436,Spokane
259,9193,HYDROCODONE,1.0,,,1.5135,500.0,HYDROCODONE BIT./ACETAMINOPHEN TABS.,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,...,5.0,2011,11,53,Washington,53063,WA,5306353,473436,Spokane
260,9193,HYDROCODONE,2.0,,,4.5405,1000.0,HYDROCODONE BIT/ACETA 7.5MG/325MG US,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,...,7.5,2011,12,53,Washington,53063,WA,5306353,473436,Spokane


In [25]:
washington_prescriptions_copy = washington_prescriptions.copy()
washington_prescriptions_copy["shipment_per_100k"] = washington_prescriptions_copy["Quantity"] / washington_prescriptions_copy["Population"] * 100_000

In [9]:
# washington_prescriptions_copy = washington_prescriptions.copy()

# washington_prescriptions_copy["shipment_per_100k"] = (
#     (
#         washington_prescriptions_copy["Dosage Strength"]
#         * washington_prescriptions_copy["Dosage Unit"]
#         * washington_prescriptions_copy["MME_Conversion_Factor"]
#     )
#     / (washington_prescriptions_copy["Population"])
#     * 100000
# )

# washington_prescriptions_copy.head(3)

Unnamed: 0,Drug Code,Drug Name,Quantity,Unit,Strength,Calc Base Weight (In Gm),Dosage Unit,Product_Name,Ingredient_Name,Measure,...,Year,Month,State FIPS,State,County FIPS,State Abbreviation,FIPS_Unique,Population,County,shipment_per_100k
0,9193,HYDROCODONE,1.0,,,2.27025,500.0,HYDROCODONE BITARTRATE & ACETA 7.5MG,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,...,2012,6,53,Washington,53009,WA,5300953,71766,Clallam,5225.315609
257,9193,HYDROCODONE,3.0,,,0.9081,300.0,HYDROCODONE BIT/ACETA 5MG/325MG USP,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,...,2011,4,53,Washington,53063,WA,5306353,473436,Spokane,316.832687
258,9193,HYDROCODONE,2.0,,,0.6054,200.0,HYDROCODONE BIT/ACETA 5MG/325MG USP,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,...,2011,9,53,Washington,53063,WA,5306353,473436,Spokane,211.221791


In [26]:
wa_result = washington_prescriptions_copy.groupby(["Year", "County"])["shipment_per_100k"].sum().reset_index()

In [28]:
wa_stats = washington_prescriptions_copy.groupby(["Year"])["shipment_per_100k"].sum().reset_index()

In [None]:
# wa_res = pd.DataFrame(wa_stats.describe()["overdose_per_100k"]).rename(columns={"overdose_per_100k": "Overdoses per 100k Residents - Washington"})
# comp_res = pd.DataFrame(comp_stats.describe()["overdose_per_100k"]).rename(columns={"overdose_per_100k": "Overdoses per 100k Residents - Comp States"})
# stats = pd.concat([wa_res, comp_res], axis=1)
# stats

In [56]:
# create a scale for number of years before and after 2007 (target year)

def scale_years(year):
    if year == 2009:
        return -2
    if year == 2010:
        return -1
    if year == 2011:
        return 0
    if year == 2012:
        return 1
    # if year == 2013:
    #     return 1
    # if year == 2014:
    #     return 2
    # if year == 2015:
    #     return 3

wa_stats["year relative to policy"] = wa_stats["Year"].apply(lambda x: scale_years(x))
# comp_result["year relative to policy"] = comp_result["Year"].apply(lambda x: scale_years(x))

# double check no nulls in "year relative to policy"

assert (wa_stats["year relative to policy"].isnull().sum() == 0)
# assert (comp_result["year relative to policy"].isnull().sum() == 0)

# doing this in case the float == int comparison causes issues

In [62]:
# split into before 2007 and after 2007

wa_b4 = wa_stats[wa_stats["Year"] <= 2011]
wa_after = wa_stats[wa_stats["Year"] >= 2011]


# wa_after = wa_after[wa_after["Year"] != 2007] # may need to handle this differently

In [63]:
from sklearn.linear_model import LinearRegression 

regressor_b4 = LinearRegression() 
regressor_after = LinearRegression()


X_b4 = np.array(wa_b4["year relative to policy"]).reshape(-1, 1)
y_b4 = np.array(wa_b4["shipment_per_100k"]).reshape(-1, 1)

X_after = np.array(wa_after["year relative to policy"]).reshape(-1, 1)
y_after = np.array(wa_after["shipment_per_100k"]).reshape(-1, 1)


regressor_b4.fit(X_b4,y_b4) 
regressor_after.fit(X_after,y_after) 


y_pred_b4 = regressor_b4.predict(X_b4)
y_pred_after = regressor_after.predict(X_b4)

In [64]:
y_pred_b4

array([[1648.74132281],
       [3004.09019273],
       [4359.43906265]])

In [65]:
y_pred_after

array([[5125.93859275],
       [4907.82680007],
       [4689.71500739]])

In [61]:
# plt.xlim(-3, 3)
# #plt.ylim(0, 500)

# plt.title("Opioid Shipments in Washington")

# plt.plot(X_b4, y_pred_b4,color='k')
# plt.plot(X_after, y_pred_after,color='k')

# # plot avg value in each year

In [66]:
x = "Year"
y = "shipment_per_100k"

In [67]:
def vertical_line(year):
    """Function to plot a vertical line at year of policy implementation"""
    line = alt.Chart(pd.DataFrame({
    'Date': [year],
    'color': ["black"]
    })).mark_rule().encode(
    x='Date:Q', # use q for "quantitative" - as per altair docs
    color=alt.Color('color:N', scale=None)
    )

    return line

In [68]:
# test function
line = vertical_line(2004)

In [69]:
def get_charts(b4, after, title_b4, title_after):
    """
    Function to plot the pre and post charts.
    Will not use in final plot - used as a baseline for our fit charts later.

    """

    base_before = (
        alt.Chart(b4)
        .mark_point()
        .encode(
            y=alt.Y("shipment_per_100k", scale=alt.Scale(zero=False)),
            x=alt.X("year relative to policy", scale=alt.Scale(zero=False)),
        )
        .properties(title=title_b4)
        
    )

    base_after = (
        alt.Chart(after)
        .mark_point()
        .encode(
            y=alt.Y("shipment_per_100k", scale=alt.Scale(zero=False)),
            x=alt.X("year relative to policy", scale=alt.Scale(zero=False)),
        )
        .properties(title=title_after)
    )

    return base_before, base_after

In [70]:
# test the function
# may remove title parameters later - not really necessary as we aren't plotting this part in our final analysis
# however, if we can't add a title to our fit/regression line charts, we may need to add them here
base_before, base_after = get_charts(b4 = wa_b4, after = wa_after, title_b4 = "shipments before policy", title_after="shipments after policy")
base_before + base_after

In [71]:
# starting here in final report
# no longer calculating base chart above - just adding in regression line at same time
def get_preds(df, x, y):

    # init new empty df for our predictions
    predictions = pd.DataFrame()

    # fit our model and predict values
    model = smf.ols(f"{y} ~ {x}", data=df).fit()
    model_predict = model.get_prediction(df[x])

    # save predictions back to df, calculate confidence intervals
    predictions["shipment_per_100k"] = model.predict(df[x])
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=0.05)

    # save original year columns to new predictions df
    predictions["Year"] = df["Year"]
    predictions["year relative to policy"] = df["year relative to policy"]
    return predictions

In [72]:
wa_b4_preds = get_preds(wa_b4, x, y)
wa_after_preds = get_preds(wa_after, x, y)

  return np.dot(wresid, wresid) / self.df_resid


In [73]:
def get_charts(b4, after, title_b4, title_after, color):
    """
    Function to plot the pre and post charts.
    Will not use in final plot - used as a baseline for our fit charts later.

    """

    base_before = (
        alt.Chart(b4)
        .mark_point()
        .encode(
            y=alt.Y("shipment_per_100k", scale=alt.Scale(zero=False)),
            x=alt.X("year relative to policy", scale=alt.Scale(zero=False)),
        )
        .properties(title=title_b4).transform_regression("year relative to policy", "shipment_per_100k")
    .mark_line()
    .encode(color=alt.value(color))
        
    )

    base_after = (
        alt.Chart(after)
        .mark_point()
        .encode(
            y=alt.Y("shipment_per_100k", scale=alt.Scale(zero=False)),
            x=alt.X("year relative to policy", scale=alt.Scale(zero=False)),
            
        )
        .properties(title=title_after).transform_regression("year relative to policy", "shipment_per_100k")
    .mark_line()
    .encode(color=alt.value(color))
    )

    return base_before, base_after

In [74]:
# test the function
# may remove title parameters later - not really necessary as we aren't plotting this part in our final analysis
# however, if we can't add a title to our fit/regression line charts, we may need to add them here
base_before, base_after = get_charts(b4 = wa_b4_preds, after = wa_after_preds, title_b4 = "Opioid Shipments Before and After Policy Implementation in Washington", title_after="Shipments After Policy Implementation", color="red")
base_before + base_after

In [None]:
#####################################
# updated code for final report ends here 

In [None]:
prescriptions["QUANTITY"].describe()

In [None]:
prescriptions_reduced = prescriptions[['QUANTITY', 'Year', 'StateFIPS', 'StateName', 'CountyFIPS', 'FIP_unique', 'Population','county_test']]

In [None]:
prescriptions_reduced["StateName"].value_counts()

In [None]:
washington = prescriptions_reduced[prescriptions_reduced["StateName"] == "Washington"]
comp = prescriptions_reduced[prescriptions_reduced["StateName"] != "Washington"]

In [None]:
wa_prescriptions = washington.copy()
comp_prescriptions = comp.copy()

In [None]:
wa_prescriptions["quantity_sum"] = wa_prescriptions.groupby(["Year", "FIP_unique"])["QUANTITY"].transform('sum')
comp_prescriptions["quantity_sum"] = comp_prescriptions.groupby(["Year", "FIP_unique"])["QUANTITY"].transform('sum')

In [None]:
wa_prescriptions = wa_prescriptions.drop('QUANTITY', axis=1)
comp_prescriptions = comp_prescriptions.drop('QUANTITY', axis=1)

In [None]:
wa_result = wa_prescriptions.drop_duplicates()
comp_result = comp_prescriptions.drop_duplicates()

In [None]:
wa_result["shipment_per_100k"] = wa_result["quantity_sum"] / wa_result["Population"] * 100_000
comp_result["shipment_per_100k"] = comp_result["quantity_sum"] / comp_result["Population"] * 100_000

In [None]:
wa_result.groupby("Year")["shipment_per_100k"].agg([np.mean, np.std])

In [None]:
wa_result_b4 = wa_result[wa_result["Year"] < 2012]
wa_result_after = wa_result[wa_result["Year"] >= 2012]

In [None]:

# washington

source_data = wa_result_b4

plot_wa_b4 = alt.Chart(source_data).mark_point().encode(
    y=alt.Y("mean_shipment:Q", scale=alt.Scale(zero=False)),
    x=alt.X("Year:O", scale=alt.Scale(zero=False))
).transform_aggregate(
    mean_shipment='mean(shipment_per_100k)',
    groupby=["Year"]
)

plot_wa_b4

In [None]:
fit_wa_b4 = plot_wa_b4.transform_regression('Year', 'mean_shipment',method="linear"
).mark_line()

fit_wa_b4

In [None]:
source_data = wa_result_after

plot_wa_after = alt.Chart(source_data).mark_point().encode(
    y=alt.Y("mean_shipment:Q", scale=alt.Scale(zero=False)),
    x=alt.X("Year:O", scale=alt.Scale(zero=False))
).transform_aggregate(
    mean_shipment='mean(shipment_per_100k)',
    groupby=["Year"]
)

plot_wa_after

In [None]:
fit_wa_after = plot_wa_after.transform_regression('Year', 'mean_overdose',method="linear"
).mark_line()

fit_wa_after

In [None]:
plot_wa_b4 + fit_wa_b4 + plot_wa_after + fit_wa_after