In [27]:
from linearmodels import PanelOLS
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter("ignore")

In [28]:
useable_counties_school_data = pd.read_csv("/Users/mohammadanas/Desktop/Nicks Project/New project/useable_counties_school_data.csv")

In [57]:
useable_counties_school_data = useable_counties_school_data.loc[
    ~(
        useable_counties_school_data["TeachingMethod"].isin(
            ["Unknown", "Other", "Pending"]
        )
    )
].copy()



In [58]:
# One major limitation of our model is that we have imbalanced 
# treatment variable. The data for remote teaching is very less 
# as compared to hybrid and online teaching. Therefore, to make 
# results robust we create three variations of treatment variable 
# than run three different models.
# the purpose of the model is the same but there is variation in 
# the treatment variable


# create one variation of treatment
# 0- remote
# 1- hybrid
# 2- in person

useable_counties_school_data["Teaching_ind"] = 0

useable_counties_school_data.loc[
    useable_counties_school_data["TeachingMethod"].isin(["Hybrid", "Hybrid/Partial"]),
    "Teaching_ind",
] = 1

useable_counties_school_data.loc[
    useable_counties_school_data["TeachingMethod"].isin(
        ["Full In-Person", "On Premises"]
    ),
    "Teaching_ind",
] = 2


In [31]:
# second variation
# 0 - remote
# 1 _ all else
# sever imbalance

useable_counties_school_data["Teaching_ind2"] = 0

useable_counties_school_data.loc[
    useable_counties_school_data["TeachingMethod"].isin(
        ["Full In-Person", "On Premises", "Hybrid", "Hybrid/Partial"]
    ),
    "Teaching_ind2",
] = 1

In [32]:
# another variation
# exactly same as variation 1 but WILL 
# slightly different when we aggregate 

useable_counties_school_data["Teaching_ind3"] = 0
useable_counties_school_data.loc[
    useable_counties_school_data["TeachingMethod"].isin(["Hybrid", "Hybrid/Partial"]),
    "Teaching_ind3",
] = 1
useable_counties_school_data.loc[
    useable_counties_school_data["TeachingMethod"].isin(
        ["Full In-Person", "On Premises"]
    ),
    "Teaching_ind3",
] = 2

In [33]:
# Now we concate year and term columns to maintain
# consistence with other datasets
useable_counties_school_data["Term"] = (
    useable_counties_school_data["Term"]
    + " "
    + useable_counties_school_data["SchoolYear"]
)

In [34]:
# Select only the required columns
final_school_data = useable_counties_school_data[
    ["Term", "county_fips", "Teaching_ind", "Teaching_ind2", "Teaching_ind3"]
].copy()

# create a copy
final_school_data_1 = final_school_data.copy()

# Create two continuous variable by taking mean 
# and aggregating on counties 

# Last one is different it is not the mean
# but rather we choose counties teaching method 
# based on mode of teaching method 
# in that county
final_school_data = final_school_data.groupby(
    ["Term", "county_fips"], as_index=False
).agg({"Teaching_ind": "mean", "Teaching_ind2": "mean"})

In [39]:
import scipy.stats

final_school_data["Teaching_ind3"] = final_school_data_1.groupby(
    ["Term", "county_fips"], as_index=False
)["Teaching_ind3"].agg(lambda x: scipy.stats.mode(x)[0])["Teaching_ind3"]


In [43]:
# Load panel data
covid_data = pd.read_csv(
    "/Users/mohammadanas/Desktop/Nicks Project/New project/county_pop_covid_panel.csv"
)

In [44]:
# List all the data fips counties 
counties_to_use = list(np.unique(useable_counties_school_data["county_fips"]))
# get only relevant counties from panel data
covid_data = covid_data.loc[covid_data["fips"].isin(counties_to_use)].copy()
# get covid rate
covid_data["Covid_Rate"] = covid_data["New_Cases"] / covid_data["TOT_POP"]
# filter out relevant columns
final_covid_data = covid_data[["Term", "fips", "Covid_Rate"]]


In [45]:
# rename to fips to merge
final_school_data.rename(columns={"county_fips": "fips"}, inplace=True)

In [46]:
# we finally merge school and covid data
master_data = pd.merge(
    final_covid_data, final_school_data, on=["Term", "fips"], how="left", indicator=True
)


In [47]:
# encode term to be processed by Panel OLS
master_data["Final_term"] = 1
master_data.loc[master_data["Term"] == "Spring 2020-2021", "Final_term"] = 2
master_data.loc[master_data["Term"] == "Fall 2021-2022", "Final_term"] = 3
master_data.loc[master_data["Term"] == "Spring 2021-2022", "Final_term"] = 4


In [48]:
# For variation 3 of treatment we do get dummies to treat variables as
# categorical 
master_data = pd.get_dummies(master_data, columns=['Teaching_ind3'])

In [49]:
# set indexes
master_data_v2 = master_data.set_index(["fips", "Final_term"])

In [50]:
# run regression on variation 1
lm1 = PanelOLS.from_formula(
    "Covid_Rate ~ Teaching_ind + EntityEffects + TimeEffects", data=master_data_v2
)
lm1.fit(cov_type="clustered", cluster_entity=True)

0,1,2,3
Dep. Variable:,Covid_Rate,R-squared:,0.0036
Estimator:,PanelOLS,R-squared (Between):,0.2549
No. Observations:,2171,R-squared (Within):,0.0386
Date:,"Tue, Apr 05 2022",R-squared (Overall):,0.2273
Time:,21:14:21,Log-likelihood,5688.2
Cov. Estimator:,Clustered,,
,,F-statistic:,5.9483
Entities:,543,P-value,0.0148
Avg Obs:,3.9982,Distribution:,"F(1,1624)"
Min Obs:,3.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Teaching_ind,0.0051,0.0027,1.8618,0.0628,-0.0003,0.0105


In [51]:
# Panel OLS on panel data for variation 2
lm1 = PanelOLS.from_formula(
    "Covid_Rate ~ Teaching_ind2 + EntityEffects + TimeEffects", data=master_data_v2
)
lm1.fit(cov_type="clustered", cluster_entity=True)

0,1,2,3
Dep. Variable:,Covid_Rate,R-squared:,0.0066
Estimator:,PanelOLS,R-squared (Between):,0.3729
No. Observations:,2171,R-squared (Within):,0.0072
Date:,"Tue, Apr 05 2022",R-squared (Overall):,0.3262
Time:,21:15:04,Log-likelihood,5691.4
Cov. Estimator:,Clustered,,
,,F-statistic:,10.725
Entities:,543,P-value,0.0011
Avg Obs:,3.9982,Distribution:,"F(1,1624)"
Min Obs:,3.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Teaching_ind2,0.0134,0.0056,2.4045,0.0163,0.0025,0.0244


In [56]:
# OLS regression on variation 3
# baseline hybrid
lm1 = PanelOLS.from_formula(
    "Covid_Rate ~ 	Teaching_ind3_0.0 + Teaching_ind3_2.0 + EntityEffects + TimeEffects", data=master_data_v2
)
lm1.fit(cov_type="clustered", cluster_entity=True)

0,1,2,3
Dep. Variable:,Covid_Rate,R-squared:,0.0033
Estimator:,PanelOLS,R-squared (Between):,0.0026
No. Observations:,2172,R-squared (Within):,0.0061
Date:,"Tue, Apr 05 2022",R-squared (Overall):,0.0030
Time:,21:17:37,Log-likelihood,5690.8
Cov. Estimator:,Clustered,,
,,F-statistic:,2.7254
Entities:,543,P-value,0.0658
Avg Obs:,4.0000,Distribution:,"F(2,1624)"
Min Obs:,4.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Teaching_ind3_0.0,-0.0063,0.0040,-1.5704,0.1165,-0.0142,0.0016
Teaching_ind3_2.0,0.0005,0.0020,0.2473,0.8047,-0.0035,0.0045
