In [3]:
## import libraries
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import altair as alt

## Load data from the FL mortality cleansed files
death_data_load_FL = pd.read_csv('../20_intermediate_files/florida_death_cleaned.csv')
death_data_load_FL_copy = death_data_load_FL.copy()
death_data_load_FL_copy['Death_Rate_Percentage'] = death_data_load_FL_copy['Death Rate (%)'] * 1000
death_data_FL = death_data_load_FL_copy.loc[death_data_load_FL_copy['STNAME']=='Florida']
death_data_FL_reference = death_data_load_FL_copy.loc[death_data_load_FL_copy['STNAME']!='Florida']
death_data_FL

Unnamed: 0,YEAR,STNAME,CTYNAME,Death Rate (%),POPULATION,Indicator,Death_Rate_Percentage
832,2003,Florida,Alachua County,0.004870,225862,Treatment,4.870230
833,2003,Florida,Baker County,0.011757,23285,Treatment,11.757325
834,2003,Florida,Bay County,0.013572,154726,Treatment,13.572380
835,2003,Florida,Bradford County,0.011757,27097,Treatment,11.757325
836,2003,Florida,Brevard County,0.019285,502985,Treatment,19.284869
...,...,...,...,...,...,...,...
1698,2015,Florida,Union County,0.016128,15268,Treatment,16.128257
1699,2015,Florida,Volusia County,0.018950,517144,Treatment,18.950234
1700,2015,Florida,Wakulla County,0.016128,31529,Treatment,16.128257
1701,2015,Florida,Walton County,0.016128,63145,Treatment,16.128257


In [4]:
## Transform and Groupby Death Rate by State and Year for FL
death_data_FL[
    "average_deaths_state"
] = death_data_FL.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_FL_subset = death_data_FL[["STNAME", "YEAR", "average_deaths_state"]]
death_data_FL_subset_grouped = death_data_FL_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_FL_subset_grouped_pre = death_data_FL_subset_grouped.loc[death_data_FL_subset_grouped["YEAR"] < 2010]
death_data_FL_subset_grouped_post = death_data_FL_subset_grouped.loc[death_data_FL_subset_grouped["YEAR"] >= 2010]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_FL[


In [5]:
## Function to create confidence interval for FL
def get_reg_fit_FL(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "teal").encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "teal")
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [6]:
## Generate Pre-Post Graphs for FL
fit, reg_chart_pre_FL = get_reg_fit_FL(
    death_data_FL_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)
reg_chart_pre_FL

fit, reg_chart_post_FL = get_reg_fit_FL(
    death_data_FL_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2010 = alt.Chart(pd.DataFrame({'x': [2010]})).mark_rule(strokeDash=[5, 5]).encode(x='x')

## Generate final pre-post graph for FL
pre_post_FL = reg_chart_pre_FL + reg_chart_post_FL + line_2010
pre_post_FL.properties(title="Pre-Post Florida Mortality Rate Analysis")

  for col_name, dtype in df.dtypes.iteritems():


In [7]:
## Include indicator for reference states for aggregation
death_data_FL_reference["Reference_State_Indicator"] = 1
death_data_FL_reference

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_FL_reference["Reference_State_Indicator"] = 1


Unnamed: 0,YEAR,STNAME,CTYNAME,Death Rate (%),POPULATION,Indicator,Death_Rate_Percentage,Reference_State_Indicator
0,2003,Colorado,Adams County,0.011088,378785,Control,11.088084,1
1,2003,Colorado,Alamosa County,0.009527,15125,Control,9.527349,1
2,2003,Colorado,Arapahoe County,0.008748,514406,Control,8.747954,1
3,2003,Colorado,Archuleta County,0.009527,11167,Control,9.527349,1
4,2003,Colorado,Baca County,0.009527,4106,Control,9.527349,1
...,...,...,...,...,...,...,...,...
2751,2015,Nevada,Pershing County,0.029728,6620,Control,29.727663,1
2752,2015,Nevada,Storey County,0.029728,3881,Control,29.727663,1
2753,2015,Nevada,Washoe County,0.018978,442617,Control,18.978033,1
2754,2015,Nevada,White Pine County,0.029728,9875,Control,29.727663,1


In [8]:
## Transform and Groupby Death Rate by State and Year for FL Reference states
death_data_FL_reference[
    "average_deaths_state"
] = death_data_FL_reference.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_FL__ref_subset = death_data_FL_reference[["STNAME", "YEAR", "average_deaths_state"]]
death_data_FL_ref_subset_grouped = death_data_FL__ref_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_FL_ref_subset_grouped_pre = death_data_FL_ref_subset_grouped.loc[death_data_FL_ref_subset_grouped["YEAR"] < 2010]
death_data_FL_ref_subset_grouped_post = death_data_FL_ref_subset_grouped.loc[death_data_FL_ref_subset_grouped["YEAR"] >= 2010]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_FL_reference[


In [9]:
## Function to create confidence interval for FL reference states
def get_reg_fit_FL_ref(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "teal", opacity=0.2).encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "teal", opacity=0.2)
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [10]:
## Generate Pre-Post Graphs for FL reference states
fit, reg_chart_pre_FL_ref = get_reg_fit_FL_ref(
    death_data_FL_ref_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

fit, reg_chart_post_FL_ref = get_reg_fit_FL_ref(
    death_data_FL_ref_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2010 = alt.Chart(pd.DataFrame({'x': [2010]})).mark_rule(strokeDash=[5, 5]).encode(x='x')

## Generate final pre-post graph for FL reference states
pre_post_FL_ref = reg_chart_pre_FL_ref + reg_chart_post_FL_ref + line_2010
pre_post_FL_ref.properties(title="Pre-Post Florida Reference States Mortality Rate Analysis")

  for col_name, dtype in df.dtypes.iteritems():


In [11]:
## Combine pre-post graphs to create diff-in-diff graph for FL and FL reference states
diff_in_diff_FL = pre_post_FL + pre_post_FL_ref
diff_in_diff_FL.properties(title="Diff-in-Diff Florida Mortality Rate Analysis")

In [12]:
## Load data from the WA mortality cleansed files
death_data_load_WA = pd.read_csv('../20_intermediate_files/washington_death_cleaned.csv')
death_data_load_WA_copy = death_data_load_WA.copy()
death_data_load_WA_copy['Death_Rate_Percentage'] = death_data_load_WA_copy['Death Rate (%)'] * 1000
death_data_WA = death_data_load_WA_copy.loc[death_data_load_WA_copy['STNAME']=='Washington']
death_data_WA_reference = death_data_load_WA_copy.loc[death_data_load_WA_copy['STNAME']!='Washington']
death_data_WA

Unnamed: 0,YEAR,STNAME,CTYNAME,Death Rate (%),POPULATION,Indicator,Death_Rate_Percentage
1534,2003,Washington,Adams County,0.009967,16425,Treatment,9.967028
1535,2003,Washington,Asotin County,0.009967,20499,Treatment,9.967028
1536,2003,Washington,Benton County,0.006554,152586,Treatment,6.553681
1537,2003,Washington,Chelan County,0.009967,67377,Treatment,9.967028
1538,2003,Washington,Clallam County,0.009967,66349,Treatment,9.967028
...,...,...,...,...,...,...,...
2036,2015,Washington,Wahkiakum County,0.014345,3989,Treatment,14.345274
2037,2015,Washington,Walla Walla County,0.014345,59970,Treatment,14.345274
2038,2015,Washington,Whatcom County,0.010852,211942,Treatment,10.852026
2039,2015,Washington,Whitman County,0.014345,48224,Treatment,14.345274


In [13]:
## Transform and Groupby Death Rate by State and Year for WA
death_data_WA[
    "average_deaths_state"
] = death_data_WA.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_WA_subset = death_data_WA[["STNAME", "YEAR", "average_deaths_state"]]
death_data_WA_subset_grouped = death_data_WA_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_WA_subset_grouped_pre = death_data_WA_subset_grouped.loc[death_data_WA_subset_grouped["YEAR"] < 2012]
death_data_WA_subset_grouped_post = death_data_WA_subset_grouped.loc[death_data_WA_subset_grouped["YEAR"] >= 2012]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_WA[


In [14]:
## Function to create confidence interval for WA
def get_reg_fit_WA(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "purple").encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "purple")
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [15]:
## Generate Pre-Post Graphs for WA
fit, reg_chart_pre_WA = get_reg_fit_WA(
    death_data_WA_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)
reg_chart_pre_WA

fit, reg_chart_post_WA = get_reg_fit_WA(
    death_data_WA_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2012 = alt.Chart(pd.DataFrame({'x': [2012]})).mark_rule(strokeDash=[5, 5]).encode(x='x')

## Generate final pre-post graph for WA
pre_post_WA = reg_chart_pre_WA + reg_chart_post_WA + line_2012
pre_post_WA.properties(title="Pre-Post Washington Mortality Rate Analysis")

  for col_name, dtype in df.dtypes.iteritems():


In [16]:
## Include indicator for reference states for aggregation
death_data_WA_reference["Reference_State_Indicator"] = 1
death_data_WA_reference

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_WA_reference["Reference_State_Indicator"] = 1


Unnamed: 0,YEAR,STNAME,CTYNAME,Death Rate (%),POPULATION,Indicator,Death_Rate_Percentage,Reference_State_Indicator
0,2003,Hawaii,Hawaii County,0.007036,156340,Control,7.035947,1
1,2003,Hawaii,Honolulu County,0.006982,888026,Control,6.981778,1
2,2003,Hawaii,Kalawao County,0.007009,129,Control,7.008862,1
3,2003,Hawaii,Kauai County,0.007009,60061,Control,7.008862,1
4,2003,Hawaii,Maui County,0.007009,134742,Control,7.008862,1
...,...,...,...,...,...,...,...,...
1529,2015,Oregon,Wallowa County,0.011214,6810,Control,11.214197,1
1530,2015,Oregon,Wasco County,0.011214,25469,Control,11.214197,1
1531,2015,Oregon,Washington County,0.007156,572955,Control,7.155885,1
1532,2015,Oregon,Wheeler County,0.011214,1327,Control,11.214197,1


In [17]:
## Transform and Groupby Death Rate by State and Year for WA Reference states
death_data_WA_reference[
    "average_deaths_state"
] = death_data_WA_reference.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_WA_ref_subset = death_data_WA_reference[["STNAME", "YEAR", "average_deaths_state"]]
death_data_WA_ref_subset_grouped = death_data_WA_ref_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_WA_ref_subset_grouped_pre = death_data_WA_ref_subset_grouped.loc[death_data_WA_ref_subset_grouped["YEAR"] < 2012]
death_data_WA_ref_subset_grouped_post = death_data_WA_ref_subset_grouped.loc[death_data_WA_ref_subset_grouped["YEAR"] >= 2012]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_WA_reference[


In [18]:
## Function to create confidence interval for WA reference states
def get_reg_fit_WA_ref(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "purple", opacity=0.2).encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "purple", opacity=0.2)
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [19]:
## Generate Pre-Post Graphs for WA reference states
fit, reg_chart_pre_WA_ref = get_reg_fit_WA_ref(
    death_data_WA_ref_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

fit, reg_chart_post_WA_ref = get_reg_fit_WA_ref(
    death_data_WA_ref_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2012 = alt.Chart(pd.DataFrame({'x': [2012]})).mark_rule(strokeDash=[5, 5]).encode(x='x')

## Generate final pre-post graph for WA reference states
pre_post_WA_ref = reg_chart_pre_WA_ref + reg_chart_post_WA_ref + line_2012
pre_post_WA_ref.properties(title="Pre-Post Washington Reference States Mortality Rate Analysis")

  for col_name, dtype in df.dtypes.iteritems():


In [20]:
## Combine pre-post graphs to create diff-in-diff graph for WA and WA reference states
diff_in_diff_WA = pre_post_WA + pre_post_WA_ref
diff_in_diff_WA.properties(title="Diff-in-Diff Washington Mortality Rate Analysis")

In [21]:
## Load data from the TX mortality cleansed files
death_data_load_TX = pd.read_csv('../20_intermediate_files/texas_death_cleaned.csv')
death_data_load_TX_copy = death_data_load_TX.copy()
death_data_load_TX_copy['Death_Rate_Percentage'] = death_data_load_TX_copy['Death Rate (%)'] * 1000
death_data_TX = death_data_load_TX_copy.loc[death_data_load_TX_copy['STNAME']=='Texas']
death_data_TX_reference = death_data_load_TX_copy.loc[death_data_load_TX_copy['STNAME']!='Texas']
death_data_TX

Unnamed: 0,YEAR,STNAME,CTYNAME,Death Rate (%),POPULATION,Indicator,Death_Rate_Percentage
1274,2003,Texas,Anderson County,0.009064,55573,Treatment,9.063686
1275,2003,Texas,Andrews County,0.009064,12816,Treatment,9.063686
1276,2003,Texas,Angelina County,0.009064,80662,Treatment,9.063686
1277,2003,Texas,Aransas County,0.009064,23384,Treatment,9.063686
1278,2003,Texas,Archer County,0.009064,8936,Treatment,9.063686
...,...,...,...,...,...,...,...
4571,2015,Texas,Wood County,0.009841,43142,Treatment,9.841293
4572,2015,Texas,Yoakum County,0.009841,8635,Treatment,9.841293
4573,2015,Texas,Young County,0.009841,18154,Treatment,9.841293
4574,2015,Texas,Zapata County,0.009841,14474,Treatment,9.841293


In [22]:
## Transform and Groupby Death Rate by State and Year for TX
death_data_TX[
    "average_deaths_state"
] = death_data_TX.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_TX_subset = death_data_TX[["STNAME", "YEAR", "average_deaths_state"]]
death_data_TX_subset_grouped = death_data_TX_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_TX_subset_grouped_pre = death_data_TX_subset_grouped.loc[death_data_TX_subset_grouped["YEAR"] < 2007]
death_data_TX_subset_grouped_post = death_data_TX_subset_grouped.loc[death_data_TX_subset_grouped["YEAR"] >= 2007]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_TX[


In [23]:
## Function to create confidence interval for TX
def get_reg_fit_TX(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "orange").encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "orange")
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [24]:
## Generate Pre-Post Graphs for TX
fit, reg_chart_pre_TX = get_reg_fit_TX(
    death_data_TX_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)
reg_chart_pre_TX

fit, reg_chart_post_TX = get_reg_fit_TX(
    death_data_TX_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2007 = alt.Chart(pd.DataFrame({'x': [2007]})).mark_rule(strokeDash=[5, 5]).encode(x='x')

## Generate final pre-post graph for TX
pre_post_TX = reg_chart_pre_TX + reg_chart_post_TX + line_2007
pre_post_TX.properties(title="Pre-Post Texas Mortality Rate Analysis")

  for col_name, dtype in df.dtypes.iteritems():


In [25]:
## Include indicator for reference states for aggregation
death_data_TX_reference["Reference_State_Indicator"] = 1
death_data_TX_reference

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_TX_reference["Reference_State_Indicator"] = 1


Unnamed: 0,YEAR,STNAME,CTYNAME,Death Rate (%),POPULATION,Indicator,Death_Rate_Percentage,Reference_State_Indicator
0,2003,New York,Albany County,0.005506,298491,Control,5.506399,1
1,2003,New York,Allegany County,0.005506,50220,Control,5.506399,1
2,2003,New York,Bronx County,0.009677,1353712,Control,9.677095,1
3,2003,New York,Broome County,0.005506,198364,Control,5.506399,1
4,2003,New York,Cattaraugus County,0.005506,82968,Control,5.506399,1
...,...,...,...,...,...,...,...,...
5507,2015,Wisconsin,Waukesha County,0.008578,396377,Control,8.577692,1
5508,2015,Wisconsin,Waupaca County,0.014446,51684,Control,14.445883,1
5509,2015,Wisconsin,Waushara County,0.014446,23966,Control,14.445883,1
5510,2015,Wisconsin,Winnebago County,0.012988,169383,Control,12.988316,1


In [26]:
## Transform and Groupby Death Rate by State and Year for TX Reference states
death_data_TX_reference[
    "average_deaths_state"
] = death_data_TX_reference.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_TX__ref_subset = death_data_TX_reference[["STNAME", "YEAR", "average_deaths_state"]]
death_data_TX_ref_subset_grouped = death_data_TX__ref_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_TX_ref_subset_grouped_pre = death_data_TX_ref_subset_grouped.loc[death_data_TX_ref_subset_grouped["YEAR"] < 2007]
death_data_TX_ref_subset_grouped_post = death_data_TX_ref_subset_grouped.loc[death_data_TX_ref_subset_grouped["YEAR"] >= 2007]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_TX_reference[


In [27]:
## Function to create confidence interval for TX reference states
def get_reg_fit_TX_ref(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "orange", opacity=0.2).encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "orange", opacity=0.2)
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [28]:
## Generate Pre-Post Graphs for TX reference states
fit, reg_chart_pre_TX_ref = get_reg_fit_TX_ref(
    death_data_TX_ref_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

fit, reg_chart_post_TX_ref = get_reg_fit_TX_ref(
    death_data_TX_ref_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2007 = alt.Chart(pd.DataFrame({'x': [2007]})).mark_rule(strokeDash=[5, 5]).encode(x='x')

## Generate final pre-post graph for FL reference states
pre_post_TX_ref = reg_chart_pre_TX_ref + reg_chart_post_TX_ref + line_2007
pre_post_TX_ref.properties(title="Pre-Post Texas Reference States Mortality Rate Analysis")

  for col_name, dtype in df.dtypes.iteritems():


In [29]:
## Combine pre-post graphs to create diff-in-diff graph for FL and FL reference states
diff_in_diff_TX = pre_post_TX + pre_post_TX_ref
diff_in_diff_TX.properties(title="Diff-in-Diff Texas Mortality Rate Analysis")