# Homework Reflections Week 9 - Week 12

In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import statsmodels.formula.api as smf



# Week 9

Question 1

In [10]:
def simulate_se(true_effect=1, var_x=1, sample_size=1000, n_sims=1000, seed=0):
    """Compare the sampling spread of the OLS slope with the SE that OLS reports.

    Every replication draws ``x ~ N(0, var_x)``, draws noise whose standard
    deviation equals ``|x|`` (so the error variance depends on the regressor),
    builds ``y = true_effect * x + noise`` and regresses ``y`` on ``x`` plus an
    intercept.

    Parameters
    ----------
    true_effect : slope used to generate ``y``.
    var_x : variance of the regressor.
    sample_size : observations per replication.
    n_sims : number of replications.
    seed : value passed to ``np.random.seed``; note this reseeds NumPy's
        global generator.

    Returns
    -------
    None. Two lines are printed: the standard deviation of the fitted slopes
    across replications, and the mean of the slope standard errors reported
    by the fits (both rounded to 6 decimals).
    """
    np.random.seed(seed)

    # The regressor's standard deviation does not change between replications.
    x_sd = np.sqrt(var_x)
    slopes = np.empty(n_sims)
    reported_se = np.empty(n_sims)

    for i in range(n_sims):
        # Draw order matters for reproducibility: regressor first, then noise.
        x = np.random.normal(0, x_sd, sample_size)
        noise = np.random.normal(0, np.abs(x), sample_size)
        y = true_effect * x + noise

        result = sm.OLS(y, sm.add_constant(x)).fit()
        # Index 1 is the slope; index 0 is the added constant.
        slopes[i] = result.params[1]
        reported_se[i] = result.bse[1]

    empirical_std = float(slopes.std())
    mean_ols_se = float(reported_se.mean())

    print(round(empirical_std, 6), "empirical std of slope")
    print(round(mean_ols_se, 6), "mean OLS-reported standard error")

# Run with the defaults: true_effect=1, var_x=1, sample_size=1000, n_sims=1000, seed=0.
simulate_se()

0.053725 empirical std of slope
0.03155 mean OLS-reported standard error


Question 2

In [12]:
# Seed NumPy's global RNG once for this cell; make_data draws its errors from it on every call.
np.random.seed(0)

def make_data(n, rho):
    """Generate a linear series whose errors are correlated across observations.

    The error covariance between observations ``i`` and ``j`` is
    ``rho ** |i - j|``, and the response is ``y = 2 * x + e`` on an evenly
    spaced grid ``x`` over [0, 10]. Errors are drawn from NumPy's global RNG.

    Parameters
    ----------
    n : number of observations.
    rho : base of the covariance decay between observation indices.

    Returns
    -------
    (x, y) : two 1-D arrays of length ``n``.
    """
    # Matrix of absolute index distances |i - j|.
    idx = np.arange(n)
    lag = np.abs(idx[:, None] - idx[None, :])
    cov = rho ** lag

    noise = np.random.multivariate_normal(np.zeros(n), cov)

    x = np.linspace(0, 10, n)
    return x, 2 * x + noise

def run_sim(n=100, rho=0.9, sims=200, boots=100):
    """Compare three measures of slope uncertainty on data from ``make_data``.

    For each of ``sims`` simulated datasets an OLS line (with intercept) is
    fitted and its slope and reported slope standard error are stored. A
    residual bootstrap is then run on that fit: ``boots`` times the residuals
    are passed through ``resample``, added back to the fitted values, and the
    regression is refitted; the standard deviation of those slopes is stored.

    Parameters
    ----------
    n : observations per dataset (forwarded to ``make_data``).
    rho : error-correlation parameter (forwarded to ``make_data``).
    sims : number of simulated datasets.
    boots : bootstrap refits per dataset.

    Returns
    -------
    None. Prints ``rho``, the standard deviation of the slopes across
    datasets ("True SD"), the mean OLS-reported SE, the mean bootstrap SE,
    and a blank line.
    """
    slopes, ols_se, boot_se = [], [], []

    for _ in range(sims):
        x, y = make_data(n, rho)
        X = sm.add_constant(x)
        model = sm.OLS(y, X).fit()
        # Index 1 is the slope; index 0 is the added constant.
        slopes.append(model.params[1])
        ols_se.append(model.bse[1])

        # Residual bootstrap: the design matrix stays fixed, only the
        # residuals attached to the fitted values are reshuffled.
        fitted, resid = model.fittedvalues, model.resid
        boot_slopes = [
            sm.OLS(fitted + resample(resid), X).fit().params[1]
            for _ in range(boots)
        ]
        boot_se.append(np.std(boot_slopes))

    print("rho =", rho)
    print("True SD:", np.std(slopes))
    print("OLS SE:", np.mean(ols_se))
    print("Bootstrap SE:", np.mean(boot_se))
    print()

# Repeat the simulation for three values of the error-correlation parameter;
# all other run_sim arguments stay at their defaults.
for rho in [0, 0.5, 0.9]:
    run_sim(rho=rho)

rho = 0
True SD: 0.034961095241031345
OLS SE: 0.034365599495386354
Bootstrap SE: 0.03397123700874447

rho = 0.5
True SD: 0.05480789361052515
OLS SE: 0.0335361108459322
Bootstrap SE: 0.03282796958384208

rho = 0.9
True SD: 0.11967134331095981
OLS SE: 0.02809543976701594
Bootstrap SE: 0.02756967309808162



# Week 11

Question 1

In [2]:
# Make reproducible data: seed NumPy's global RNG used for the noise below
np.random.seed(42)

# Create time axis: 500 evenly spaced points on [0, 20]
time_points = np.linspace(0, 20, 500)

# Event happens at t = 10
event_time = 10

# Mask of the samples at or after the event. It is computed once and reused
# for every jump and for the labels, so they cannot drift apart.
is_post_event = time_points >= event_time

# Make a baseline trend: linear drift plus a slow sine wave
baseline_value = 0.2 * time_points + 2 * np.sin(time_points * 0.3)

# Sizes of the jumps applied at the event
jump_value = 5
jump_derivative = 2
jump_second_derivative = 1.2

# Synthetic "value": the baseline over the whole axis, shifted up by
# jump_value from the event onwards
value_after_event = baseline_value + jump_value * is_post_event

# Synthetic derivative: numerical gradient of the (noise-free) value plus its
# own jump.
# NOTE(review): np.gradient is called without coordinates, so these are
# per-sample differences (unit spacing), not d/d(time_points) — confirm that
# this scaling is intended.
value_derivative = np.gradient(value_after_event) + jump_derivative * is_post_event

# Synthetic second derivative: gradient of the derivative plus its own jump
value_second_derivative = (
    np.gradient(value_derivative) + jump_second_derivative * is_post_event
)

# Add noise to make it realistic (draw order: value, derivative, second derivative)
value_after_event = value_after_event + np.random.normal(0, 0.3, len(time_points))
value_derivative = value_derivative + np.random.normal(0, 0.2, len(time_points))
value_second_derivative = value_second_derivative + np.random.normal(0, 0.2, len(time_points))

# Build labels: 1 if event has passed, 0 otherwise
labels = is_post_event.astype(int)

# Put features into 2-D arrays (one row per time point)
features_value_only = np.column_stack([value_after_event])
features_all_three = np.column_stack([
    value_after_event,
    value_derivative,
    value_second_derivative
])

# Split into train and test. Both feature sets and the labels go through ONE
# call so they are guaranteed to share the same rows; previously two separate
# calls stayed aligned only because they repeated the same random_state.
(
    train_value, test_value,
    train_three, test_three,
    train_labels, test_labels,
) = train_test_split(
    features_value_only, features_all_three, labels,
    test_size=0.3, random_state=42
)

# Both models share one set of labels; the *_2 names are kept as aliases so
# anything that still refers to them keeps working.
train_labels_2, test_labels_2 = train_labels, test_labels

# Train models
model_value_only = LogisticRegression()
model_value_only.fit(train_value, train_labels)

model_three = LogisticRegression()
model_three.fit(train_three, train_labels_2)

# Evaluate both models on the same held-out rows
print("Model A: Value Only")
print(classification_report(test_labels, model_value_only.predict(test_value)))

print("Model B: Value + Derivative + Second Derivative")
print(classification_report(test_labels_2, model_three.predict(test_three)))


Model A: Value Only
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        76
           1       1.00      1.00      1.00        74

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150

Model B: Value + Derivative + Second Derivative
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        76
           1       1.00      1.00      1.00        74

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150



Question 2

In [5]:
np.random.seed(42)

# Panel layout: three groups, each observed at integer times 0..39
groups = ["A", "B", "C"]
time_points = np.arange(0, 40)

# Per-group starting level and the slope shared by all groups
baseline_levels = {"A": 5, "B": 15, "C": 25}
trend_slope = 0.4

# From event_time onwards every group's value is shifted up by jump_amount
event_time = 20
jump_amount = 10

# Build one record per (group, time). One noise draw per record, taken in
# group-major order, so the sequence of random numbers is fixed by the seed.
rows = []
for group in groups:
    baseline_value = baseline_levels[group]
    for t in time_points:
        post_event = int(t >= event_time)
        trend_value = baseline_value + trend_slope * t
        jump_value = jump_amount if post_event else 0
        noisy_value = trend_value + jump_value + np.random.normal(0, 1)
        rows.append((group, t, noisy_value, post_event))

data = pd.DataFrame(rows, columns=["group", "time", "value", "post_event"])

# Regress value on the post-event indicator with one dummy per group.
# NOTE(review): the formula has no time term although the generated data
# contain a common linear trend — confirm this omission is intended.
model = smf.ols(formula="value ~ post_event + C(group)", data=data).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                  value   R-squared:                       0.963
Model:                            OLS   Adj. R-squared:                  0.962
Method:                 Least Squares   F-statistic:                     1004.
Date:                Thu, 20 Nov 2025   Prob (F-statistic):           8.59e-83
Time:                        11:52:18   Log-Likelihood:                -275.13
No. Observations:                 120   AIC:                             558.3
Df Residuals:                     116   BIC:                             569.4
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         8.5866      0.445     19.299

# Week 12

Question 1

In [6]:
np.random.seed(42)

# Integer time steps 0..39; the event takes effect from t = 20
time_points = np.arange(0, 40)
event_time = 20
true_jump = 5

# Two series are generated side by side:
#   * treatment: steady upward trend, plus true_jump from the event onwards
#   * control:   same slope until t = 10, then a flatter slope — i.e. its trend
#                already departs from the treatment trend before the event
rows = []
for t in time_points:
    post_event = int(t >= event_time)

    # Treatment observation (its noise draw comes first at every step)
    treatment_trend = 2 + 0.5 * t
    treatment_jump = true_jump if post_event else 0
    treatment_value = treatment_trend + treatment_jump + np.random.normal(0, 0.8)
    rows.append(("treat", t, treatment_value, post_event, 1))

    # Control observation: piecewise-linear trend with a kink at t = 10
    control_trend = (
        5 + 0.5 * t if t < 10 else 5 + 0.5 * 10 + 0.1 * (t - 10)
    )
    control_value = control_trend + np.random.normal(0, 0.8)
    rows.append(("control", t, control_value, post_event, 0))

data = pd.DataFrame(rows, columns=["group", "time", "value", "post_event", "treated"])

# Differences-in-differences: main effects of treated and post_event plus
# their interaction
model = smf.ols(formula="value ~ treated * post_event", data=data).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  value   R-squared:                       0.865
Model:                            OLS   Adj. R-squared:                  0.860
Method:                 Least Squares   F-statistic:                     162.1
Date:                Thu, 20 Nov 2025   Prob (F-statistic):           6.10e-33
Time:                        11:56:13   Log-Likelihood:                -179.30
No. Observations:                  80   AIC:                             366.6
Df Residuals:                      76   BIC:                             376.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              8.6411      0