In [1]:
## import libraries
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import altair as alt

## Load data from the FL mortality cleansed files
death_data_load_FL = pd.read_csv('/Users/sukhpreetsahota/Desktop/Duke/Fall 2022/IDS 720.01.F22/Class Project/pds-2022-yellow-team/20_intermediate_files/florida_death_cleaned.csv')
death_data_load_FL_copy = death_data_load_FL.copy()
death_data_load_FL_copy['Death_Rate_Percentage'] = death_data_load_FL_copy['Death Rate (%)'] * 1000
death_data_FL = death_data_load_FL_copy.loc[death_data_load_FL_copy['STNAME']=='Florida']
death_data_FL_reference = death_data_load_FL_copy.loc[death_data_load_FL_copy['STNAME']!='Florida']
death_data_FL

Unnamed: 0,YEAR,Death Rate (%),POPULATION,STNAME,CTYNAME,Indicator,Death_Rate_Percentage
832,2003.0,0.004870,225862.0,Florida,Alachua County,Treatment,4.870230
833,2003.0,0.027644,23285.0,Florida,Baker County,Treatment,27.644263
834,2003.0,0.013572,154726.0,Florida,Bay County,Treatment,13.572380
835,2003.0,0.027644,27097.0,Florida,Bradford County,Treatment,27.644263
836,2003.0,0.019285,502985.0,Florida,Brevard County,Treatment,19.284869
...,...,...,...,...,...,...,...
1698,2015.0,0.027644,15268.0,Florida,Union County,Treatment,27.644263
1699,2015.0,0.018950,517144.0,Florida,Volusia County,Treatment,18.950234
1700,2015.0,0.027644,31529.0,Florida,Wakulla County,Treatment,27.644263
1701,2015.0,0.020885,63145.0,Florida,Walton County,Treatment,20.885425


In [2]:
## Transform and Groupby Death Rate by State and Year for FL
death_data_FL[
    "average_deaths_state"
] = death_data_FL.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_FL_subset = death_data_FL[["STNAME", "YEAR", "average_deaths_state"]]
death_data_FL_subset_grouped = death_data_FL_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_FL_subset_grouped_pre = death_data_FL_subset_grouped.loc[death_data_FL_subset_grouped["YEAR"] < 2010]
death_data_FL_subset_grouped_post = death_data_FL_subset_grouped.loc[death_data_FL_subset_grouped["YEAR"] >= 2010]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_FL[


In [3]:
## Function to create confidence interval for FL
def get_reg_fit_FL(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "teal").encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "teal")
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [4]:
## Generate Pre-Post Graphs for FL
fit, reg_chart_pre_FL = get_reg_fit_FL(
    death_data_FL_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)
reg_chart_pre_FL

fit, reg_chart_post_FL = get_reg_fit_FL(
    death_data_FL_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2010 = alt.Chart(pd.DataFrame({'x': [2010]})).mark_rule(strokeDash=[10, 7], color = "red", strokeWidth=3).encode(x='x')

## Generate final pre-post graph for FL
pre_post_FL = reg_chart_pre_FL + reg_chart_post_FL + line_2010
pre_post_FL.properties(title="Pre-Post Florida Mortality Rate Analysis")

In [5]:
## Include indicator for reference states for aggregation
death_data_FL_reference["Reference_State_Indicator"] = 1
death_data_FL_reference

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_FL_reference["Reference_State_Indicator"] = 1


Unnamed: 0,YEAR,Death Rate (%),POPULATION,STNAME,CTYNAME,Indicator,Death_Rate_Percentage,Reference_State_Indicator
0,2003.0,0.011088,378785.0,Colorado,Adams County,Control,11.088084,1
1,2003.0,0.031644,15125.0,Colorado,Alamosa County,Control,31.644289,1
2,2003.0,0.008748,514406.0,Colorado,Arapahoe County,Control,8.747954,1
3,2003.0,0.031644,11167.0,Colorado,Archuleta County,Control,31.644289,1
4,2003.0,0.031644,4106.0,Colorado,Baca County,Control,31.644289,1
...,...,...,...,...,...,...,...,...
2751,2015.0,0.029162,6620.0,Nevada,Pershing County,Control,29.162446,1
2752,2015.0,0.029162,3881.0,Nevada,Storey County,Control,29.162446,1
2753,2015.0,0.018978,442617.0,Nevada,Washoe County,Control,18.978033,1
2754,2015.0,0.029162,9875.0,Nevada,White Pine County,Control,29.162446,1


In [6]:
## Transform and Groupby Death Rate by State and Year for FL Reference states
death_data_FL_reference[
    "average_deaths_state"
] = death_data_FL_reference.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_FL__ref_subset = death_data_FL_reference[["STNAME", "YEAR", "average_deaths_state"]]
death_data_FL_ref_subset_grouped = death_data_FL__ref_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_FL_ref_subset_grouped_pre = death_data_FL_ref_subset_grouped.loc[death_data_FL_ref_subset_grouped["YEAR"] < 2010]
death_data_FL_ref_subset_grouped_post = death_data_FL_ref_subset_grouped.loc[death_data_FL_ref_subset_grouped["YEAR"] >= 2010]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_FL_reference[


In [7]:
## Function to create confidence interval for FL reference states
def get_reg_fit_FL_ref(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "black", opacity=0.2).encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "black", opacity=0.2)
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [8]:
## Generate Pre-Post Graphs for FL reference states
fit, reg_chart_pre_FL_ref = get_reg_fit_FL_ref(
    death_data_FL_ref_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

fit, reg_chart_post_FL_ref = get_reg_fit_FL_ref(
    death_data_FL_ref_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2010 = alt.Chart(pd.DataFrame({'x': [2010]})).mark_rule(strokeDash=[10, 7], color = "red", strokeWidth=3).encode(x='x')

## Generate final pre-post graph for FL reference states
pre_post_FL_ref = reg_chart_pre_FL_ref + reg_chart_post_FL_ref + line_2010
pre_post_FL_ref.properties(title="Pre-Post Florida Reference States Mortality Rate Analysis")

In [9]:
## Combine pre-post graphs to create diff-in-diff graph for FL and FL reference states
diff_in_diff_FL = pre_post_FL + pre_post_FL_ref
diff_in_diff_FL.properties(title="Diff-in-Diff Mortality Rate Analysis of Florida vs Reference States")

In [10]:
## Load data from the WA mortality cleansed files
death_data_load_WA = pd.read_csv('/Users/sukhpreetsahota/Desktop/Duke/Fall 2022/IDS 720.01.F22/Class Project/pds-2022-yellow-team/20_intermediate_files/washington_death_cleaned.csv')
death_data_load_WA_copy = death_data_load_WA.copy()
death_data_load_WA_copy['Death_Rate_Percentage'] = death_data_load_WA_copy['Death Rate (%)'] * 1000
death_data_WA = death_data_load_WA_copy.loc[death_data_load_WA_copy['STNAME']=='Washington']
death_data_WA_reference = death_data_load_WA_copy.loc[death_data_load_WA_copy['STNAME']!='Washington']
death_data_WA

Unnamed: 0,YEAR,Death Rate (%),POPULATION,STNAME,CTYNAME,Indicator,Death_Rate_Percentage
1534,2003.0,0.026438,16425.0,Washington,Adams County,Treatment,26.438340
1535,2003.0,0.026438,20499.0,Washington,Asotin County,Treatment,26.438340
1536,2003.0,0.006554,152586.0,Washington,Benton County,Treatment,6.553681
1537,2003.0,0.018423,67377.0,Washington,Chelan County,Treatment,18.422790
1538,2003.0,0.018423,66349.0,Washington,Clallam County,Treatment,18.422790
...,...,...,...,...,...,...,...
2036,2015.0,0.026438,3989.0,Washington,Wahkiakum County,Treatment,26.438340
2037,2015.0,0.020478,59970.0,Washington,Walla Walla County,Treatment,20.477983
2038,2015.0,0.010852,211942.0,Washington,Whatcom County,Treatment,10.852026
2039,2015.0,0.026438,48224.0,Washington,Whitman County,Treatment,26.438340


In [11]:
## Transform and Groupby Death Rate by State and Year for WA
death_data_WA[
    "average_deaths_state"
] = death_data_WA.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_WA_subset = death_data_WA[["STNAME", "YEAR", "average_deaths_state"]]
death_data_WA_subset_grouped = death_data_WA_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_WA_subset_grouped_pre = death_data_WA_subset_grouped.loc[death_data_WA_subset_grouped["YEAR"] < 2012]
death_data_WA_subset_grouped_post = death_data_WA_subset_grouped.loc[death_data_WA_subset_grouped["YEAR"] >= 2012]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_WA[


In [12]:
## Function to create confidence interval for WA
def get_reg_fit_WA(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "purple").encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "purple")
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [13]:
## Generate Pre-Post Graphs for WA
fit, reg_chart_pre_WA = get_reg_fit_WA(
    death_data_WA_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)
reg_chart_pre_WA

fit, reg_chart_post_WA = get_reg_fit_WA(
    death_data_WA_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2012 = alt.Chart(pd.DataFrame({'x': [2012]})).mark_rule(strokeDash=[10, 7], color = "red", strokeWidth=3).encode(x='x')

## Generate final pre-post graph for WA
pre_post_WA = reg_chart_pre_WA + reg_chart_post_WA + line_2012
pre_post_WA.properties(title="Pre-Post Washington Mortality Rate Analysis")

In [14]:
## Include indicator for reference states for aggregation
death_data_WA_reference["Reference_State_Indicator"] = 1
death_data_WA_reference

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_WA_reference["Reference_State_Indicator"] = 1


Unnamed: 0,YEAR,Death Rate (%),POPULATION,STNAME,CTYNAME,Indicator,Death_Rate_Percentage,Reference_State_Indicator
0,2003.0,0.007036,156340.0,Hawaii,Hawaii County,Control,7.035947,1
1,2003.0,0.006982,888026.0,Hawaii,Honolulu County,Control,6.981778,1
2,2003.0,0.012706,129.0,Hawaii,Kalawao County,Control,12.706176,1
3,2003.0,0.012706,60061.0,Hawaii,Kauai County,Control,12.706176,1
4,2003.0,0.012706,134742.0,Hawaii,Maui County,Control,12.706176,1
...,...,...,...,...,...,...,...,...
1529,2015.0,0.023540,6810.0,Oregon,Wallowa County,Control,23.539954,1
1530,2015.0,0.023540,25469.0,Oregon,Wasco County,Control,23.539954,1
1531,2015.0,0.007156,572955.0,Oregon,Washington County,Control,7.155885,1
1532,2015.0,0.023540,1327.0,Oregon,Wheeler County,Control,23.539954,1


In [15]:
## Transform and Groupby Death Rate by State and Year for WA Reference states
death_data_WA_reference[
    "average_deaths_state"
] = death_data_WA_reference.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_WA_ref_subset = death_data_WA_reference[["STNAME", "YEAR", "average_deaths_state"]]
death_data_WA_ref_subset_grouped = death_data_WA_ref_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_WA_ref_subset_grouped_pre = death_data_WA_ref_subset_grouped.loc[death_data_WA_ref_subset_grouped["YEAR"] < 2012]
death_data_WA_ref_subset_grouped_post = death_data_WA_ref_subset_grouped.loc[death_data_WA_ref_subset_grouped["YEAR"] >= 2012]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_WA_reference[


In [16]:
## Function to create confidence interval for WA reference states
def get_reg_fit_WA_ref(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "red", opacity=0.2).encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "red", opacity=0.2)
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [17]:
## Generate Pre-Post Graphs for WA reference states
fit, reg_chart_pre_WA_ref = get_reg_fit_WA_ref(
    death_data_WA_ref_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

fit, reg_chart_post_WA_ref = get_reg_fit_WA_ref(
    death_data_WA_ref_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2012 = alt.Chart(pd.DataFrame({'x': [2012]})).mark_rule(strokeDash=[10, 7], color = "red", strokeWidth=3).encode(x='x')

## Generate final pre-post graph for WA reference states
pre_post_WA_ref = reg_chart_pre_WA_ref + reg_chart_post_WA_ref + line_2012
pre_post_WA_ref.properties(title="Pre-Post Washington Reference States Mortality Rate Analysis")

In [18]:
## Combine pre-post graphs to create diff-in-diff graph for WA and WA reference states
diff_in_diff_WA = pre_post_WA_ref + pre_post_WA
diff_in_diff_WA.properties(title="Diff-in-Diff Mortality Rate Analysis of Washington vs Reference States")

In [19]:
## Load data from the TX mortality cleansed files
death_data_load_TX = pd.read_csv('/Users/sukhpreetsahota/Desktop/Duke/Fall 2022/IDS 720.01.F22/Class Project/pds-2022-yellow-team/20_intermediate_files/texas_death_cleaned.csv')
death_data_load_TX_copy = death_data_load_TX.copy()
death_data_load_TX_copy['Death_Rate_Percentage'] = death_data_load_TX_copy['Death Rate (%)'] * 1000
death_data_TX = death_data_load_TX_copy.loc[death_data_load_TX_copy['STNAME']=='Texas']
death_data_TX_reference = death_data_load_TX_copy.loc[death_data_load_TX_copy['STNAME']!='Texas']
death_data_TX

Unnamed: 0,YEAR,Death Rate (%),POPULATION,STNAME,CTYNAME,Indicator,Death_Rate_Percentage
1274,2003.0,0.019253,55573.0,Texas,Anderson County,Treatment,19.253270
1275,2003.0,0.031808,12816.0,Texas,Andrews County,Treatment,31.808442
1276,2003.0,0.023133,80662.0,Texas,Angelina County,Treatment,23.132537
1277,2003.0,0.031808,23384.0,Texas,Aransas County,Treatment,31.808442
1278,2003.0,0.031808,8936.0,Texas,Archer County,Treatment,31.808442
...,...,...,...,...,...,...,...
4571,2015.0,0.024595,43142.0,Texas,Wood County,Treatment,24.595148
4572,2015.0,0.031808,8635.0,Texas,Yoakum County,Treatment,31.808442
4573,2015.0,0.031808,18154.0,Texas,Young County,Treatment,31.808442
4574,2015.0,0.031808,14474.0,Texas,Zapata County,Treatment,31.808442


In [20]:
## Transform and Groupby Death Rate by State and Year for TX
death_data_TX[
    "average_deaths_state"
] = death_data_TX.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_TX_subset = death_data_TX[["STNAME", "YEAR", "average_deaths_state"]]
death_data_TX_subset_grouped = death_data_TX_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_TX_subset_grouped_pre = death_data_TX_subset_grouped.loc[death_data_TX_subset_grouped["YEAR"] < 2007]
death_data_TX_subset_grouped_post = death_data_TX_subset_grouped.loc[death_data_TX_subset_grouped["YEAR"] >= 2007]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_TX[


In [21]:
## Function to create confidence interval for TX
def get_reg_fit_TX(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "orange").encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "orange")
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [22]:
## Generate Pre-Post Graphs for TX
fit, reg_chart_pre_TX = get_reg_fit_TX(
    death_data_TX_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)
reg_chart_pre_TX

fit, reg_chart_post_TX = get_reg_fit_TX(
    death_data_TX_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2007 = alt.Chart(pd.DataFrame({'x': [2007]})).mark_rule(strokeDash=[10, 7], color = "red", strokeWidth=3).encode(x='x')

## Generate final pre-post graph for TX
pre_post_TX = reg_chart_pre_TX + reg_chart_post_TX + line_2007
pre_post_TX.properties(title="Pre-Post Texas Mortality Rate Analysis")

In [23]:
## Include indicator for reference states for aggregation
death_data_TX_reference["Reference_State_Indicator"] = 1
death_data_TX_reference

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_TX_reference["Reference_State_Indicator"] = 1


Unnamed: 0,YEAR,Death Rate (%),POPULATION,STNAME,CTYNAME,Indicator,Death_Rate_Percentage,Reference_State_Indicator
0,2003.0,0.005809,298491.0,New York,Albany County,Control,5.808991,1
1,2003.0,0.020627,50220.0,New York,Allegany County,Control,20.627037,1
2,2003.0,0.009677,1353712.0,New York,Bronx County,Control,9.677095,1
3,2003.0,0.010073,198364.0,New York,Broome County,Control,10.073125,1
4,2003.0,0.012510,82968.0,New York,Cattaraugus County,Control,12.509555,1
...,...,...,...,...,...,...,...,...
5507,2015.0,0.008578,396377.0,Wisconsin,Waukesha County,Control,8.577692,1
5508,2015.0,0.018892,51684.0,Wisconsin,Waupaca County,Control,18.892058,1
5509,2015.0,0.018892,23966.0,Wisconsin,Waushara County,Control,18.892058,1
5510,2015.0,0.012988,169383.0,Wisconsin,Winnebago County,Control,12.988316,1


In [24]:
## Transform and Groupby Death Rate by State and Year for TX Reference states
death_data_TX_reference[
    "average_deaths_state"
] = death_data_TX_reference.groupby(["STNAME", "YEAR"])[
    "Death_Rate_Percentage"
].transform(
    "mean"
)
death_data_TX__ref_subset = death_data_TX_reference[["STNAME", "YEAR", "average_deaths_state"]]
death_data_TX_ref_subset_grouped = death_data_TX__ref_subset.groupby(["STNAME", "YEAR"], as_index = False).mean()
death_data_TX_ref_subset_grouped_pre = death_data_TX_ref_subset_grouped.loc[death_data_TX_ref_subset_grouped["YEAR"] < 2007]
death_data_TX_ref_subset_grouped_post = death_data_TX_ref_subset_grouped.loc[death_data_TX_ref_subset_grouped["YEAR"] >= 2007]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_data_TX_reference[


In [25]:
## Function to create confidence interval for TX reference states
def get_reg_fit_TX_ref(data, yvar, xvar, alpha):
    # Grid for predicted values
    x = data.loc[pd.notnull(data[yvar]), xvar]
    xmin = x.min()
    xmax = x.max()
    step = (xmax - xmin) / 100
    grid = np.arange(xmin, xmax + step, step)
    predictions = pd.DataFrame({xvar: grid})

    # Fit model, get predictions
    model = smf.ols(f"{yvar} ~ {xvar}", data=data).fit()
    model_predict = model.get_prediction(predictions[xvar])
    predictions[yvar] = model_predict.summary_frame()["mean"]
    predictions[["ci_low", "ci_high"]] = model_predict.conf_int(alpha=alpha)

    # Build chart
    reg = alt.Chart(predictions).mark_line(color = "blue", opacity=0.2).encode(
        x=alt.X(
            xvar, 
            scale=alt.Scale(zero=False), 
            axis = alt.Axis(format="T", 
            title = "Year")), 
        y = alt.Y(
            yvar, 
            scale=alt.Scale(zero=False),
            title = "Mortality Rate (per 100,000 people)")
    )
    ci = (
        alt.Chart(predictions)
        .mark_errorband(color = "blue", opacity=0.2)
        .encode(
            x=xvar,
            y=alt.Y("ci_low", title=""),
            y2="ci_high",
        )
    )
    chart = ci + reg
    return predictions, chart

In [26]:
## Generate Pre-Post Graphs for TX reference states
fit, reg_chart_pre_TX_ref = get_reg_fit_TX_ref(
    death_data_TX_ref_subset_grouped_pre, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

fit, reg_chart_post_TX_ref = get_reg_fit_TX_ref(
    death_data_TX_ref_subset_grouped_post, 
    yvar="average_deaths_state", 
    xvar="YEAR", 
    alpha=0.05
)

## Create line post-policy implementation
line_2007 = alt.Chart(pd.DataFrame({'x': [2007]})).mark_rule(strokeDash=[10, 7], color = "red", strokeWidth=3).encode(x='x')

## Generate final pre-post graph for FL reference states
pre_post_TX_ref = reg_chart_pre_TX_ref + reg_chart_post_TX_ref + line_2007
pre_post_TX_ref.properties(title="Pre-Post Texas Reference States Mortality Rate Analysis")

In [27]:
## Combine pre-post graphs to create diff-in-diff graph for FL and FL reference states
diff_in_diff_TX = pre_post_TX + pre_post_TX_ref
diff_in_diff_TX.properties(title="Diff-in-Diff Mortality Rate Analysis of Texas vs Reference States")