# Data simulation


In [1]:
from numpy.random import normal, seed
from numpy import cos, mean, pi, arange, hstack, array, floor
import statsmodels.formula.api as smf
import pandas as pd
from doubleml import DoubleMLData
from doubleml import DoubleMLDID
from sklearn.linear_model import LogisticRegression, LinearRegression

seed(1234)

impact = 200
n_group = 10

time_points_base = arange(0, 2)
n = len(time_points_base)
seasonality_base = -cos(time_points_base / (2 * pi))

time_points = hstack((time_points_base,) * 2 * n_group)
seasonality = hstack((seasonality_base,) * 2 * n_group)
D = array([0] * (2 * n_group) + [1] * (2 * n_group))
id = list(range(1, 2 * n_group + 1)) * 2
id.sort()

Y_0 = 500 + 200 * D + 100 * seasonality + normal(size=4 * n_group, scale=100)
Y_1 = Y_0 + D * time_points * impact
Y = D * Y_1 + (1 - D) * Y_0

df_observed = pd.DataFrame({"id": id, "time_points": time_points, "D": D, "Y": Y})

# DiD ATET estimation with the direct method


In [2]:
diff_1 = mean(
    df_observed.loc[(df_observed.time_points == 1) & (df_observed.D == 1), "Y"]
) - mean(df_observed.loc[(df_observed.time_points == 0) & (df_observed.D == 1), "Y"])

diff_0 = mean(
    df_observed.loc[(df_observed.time_points == 1) & (df_observed.D == 0), "Y"]
) - mean(df_observed.loc[(df_observed.time_points == 0) & (df_observed.D == 0), "Y"])

diff_1 - diff_0

290.5375026309532

# DiD ATET with linear models


In [3]:
model = smf.ols(formula="Y~D*time_points", data=df_observed).fit()
model.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.806
Model:,OLS,Adj. R-squared:,0.79
Method:,Least Squares,F-statistic:,50.0
Date:,"Tue, 19 Dec 2023",Prob (F-statistic):,6.41e-13
Time:,16:50:04,Log-Likelihood:,-236.42
No. Observations:,40,AIC:,480.8
Df Residuals:,36,BIC:,487.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,455.5473,29.751,15.312,0.000,395.209,515.886
D,187.2648,42.075,4.451,0.000,101.933,272.597
time_points,-112.0903,42.075,-2.664,0.011,-197.422,-26.758
D:time_points,290.5375,59.503,4.883,0.000,169.860,411.215

0,1,2,3
Omnibus:,0.038,Durbin-Watson:,1.764
Prob(Omnibus):,0.981,Jarque-Bera (JB):,0.045
Skew:,0.005,Prob(JB):,0.978
Kurtosis:,2.835,Cond. No.,6.85


# DiD ATET with linear models with additional covariates


In [4]:
n_a = int(floor(n_group / 4))
A = [1] * (n_a * 2) + [0] * (2 * (n_group - n_a))
n_a = int(floor(n_group / 2))
A = array(A + [1] * (n_a * 2) + [0] * (2 * (n_group - n_a)))

D = array([0] * (2 * n_group) + [1] * (2 * n_group))

Y_0_age = Y_0 - 50 * A
Y_1_age = Y_1 - 50 * A
Y_age = D * Y_1_age + (1 - D) * Y_0_age

df_observed_covariates = pd.DataFrame(
    {"id": id, "time_points": time_points, "D": D, "Y": Y, "A": A}
)

model = smf.ols(formula="Y~D*time_points + A", data=df_observed_covariates).fit()
model.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.81
Model:,OLS,Adj. R-squared:,0.788
Method:,Least Squares,F-statistic:,37.33
Date:,"Tue, 19 Dec 2023",Prob (F-statistic):,3.58e-12
Time:,16:50:04,Log-Likelihood:,-236.04
No. Observations:,40,AIC:,482.1
Df Residuals:,35,BIC:,490.5
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,460.9698,30.607,15.061,0.000,398.834,523.105
D,195.3985,43.410,4.501,0.000,107.271,283.526
time_points,-112.0903,42.266,-2.652,0.012,-197.895,-26.285
D:time_points,290.5375,59.773,4.861,0.000,169.191,411.884
A,-27.1122,33.004,-0.821,0.417,-94.115,39.890

0,1,2,3
Omnibus:,0.036,Durbin-Watson:,1.864
Prob(Omnibus):,0.982,Jarque-Bera (JB):,0.209
Skew:,-0.048,Prob(JB):,0.901
Kurtosis:,2.66,Cond. No.,7.15


# DiD ATET with the did package


In [5]:
dml_data = DoubleMLData(
    df_observed_covariates, y_col="Y", d_cols="D", x_cols="A", t_col="time_points"
)

ml_g = LinearRegression()
ml_m = LogisticRegression(penalty=None)

dml_did = DoubleMLDID(
    dml_data,
    ml_g=ml_g,
    ml_m=ml_m,
    score="observational",
    in_sample_normalization=True,
    n_folds=5,
)

dml_did.fit()
print(dml_did)


------------------ Data summary      ------------------
Outcome variable: Y
Treatment variable(s): ['D']
Covariates: ['A']
Instrument variable(s): None
Time variable: time_points
No. Observations: 40

------------------ Score & algorithm ------------------
Score function: observational
DML algorithm: dml2

------------------ Machine learner   ------------------
Learner ml_g: LinearRegression()
Learner ml_m: LogisticRegression(penalty=None)
Out-of-sample Performance:
Learner ml_g0 RMSE: [[115.07635894]]
Learner ml_g1 RMSE: [[126.63685367]]
Learner ml_m RMSE: [[0.50651136]]

------------------ Resampling        ------------------
No. folds: 5
No. repeated sample splits: 1
Apply cross-fitting: True

------------------ Fit summary       ------------------
         coef    std err         t         P>|t|       2.5 %      97.5 %
D  300.258194  52.311374  5.739826  9.477379e-09  197.729785  402.786604
