# Homework Reflections Week 9 - Week 12

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import statsmodels.formula.api as smf



# Week 9

Question 1

In [3]:
def simulate_standard_error(true_effect=1, variance_x=1, sample_size=1000, simulations=1000, random_state=0):
    """Compare the empirical spread of OLS slopes with the standard error OLS reports.

    For every simulation, ``x`` is drawn from a zero-mean normal with variance
    ``variance_x`` and ``y = true_effect * x + noise`` is built, where the noise
    standard deviation of each observation equals ``|x|`` (so the error variance
    depends on the regressor). An OLS model with an intercept is fitted and the
    slope and its reported standard error are recorded.

    Parameters
    ----------
    true_effect : float
        Slope used to generate ``y``.
    variance_x : float
        Variance of the simulated regressor.
    sample_size : int
        Observations per simulated data set.
    simulations : int
        Number of simulated data sets.
    random_state : int
        Seed passed to ``np.random.seed``; this resets NumPy's global RNG.

    Returns
    -------
    None
        The two summary numbers are printed, not returned.
    """
    np.random.seed(random_state)
    x_scale = np.sqrt(variance_x)

    slope_estimates = np.empty(simulations)
    reported_errors = np.empty(simulations)

    for run in range(simulations):
        # Draw order matters for reproducibility: regressor first, then noise.
        x = np.random.normal(loc=0, scale=x_scale, size=sample_size)
        noise = np.random.normal(loc=0, scale=np.abs(x), size=sample_size)
        y = true_effect * x + noise

        fit = sm.OLS(y, sm.add_constant(x)).fit()
        # Index 1 is the slope; index 0 is the constant added above.
        slope_estimates[run] = fit.params[1]
        reported_errors[run] = fit.bse[1]

    # Spread of the slope across simulations vs. the average SE OLS claims.
    empirical_sd = np.std(slope_estimates)
    mean_reported_se = np.mean(reported_errors)

    print(empirical_sd, "standard deviation of slope")
    print(mean_reported_se, "mean OLS standard error")

# Run the simulation with its default settings; the function prints both numbers.
simulate_standard_error()

0.053724526685283824 standard deviation of slope
0.031550173011736495 mean OLS standard error


Question 2

In [4]:
# Seed NumPy's global RNG so the random draws made in this cell are reproducible.
np.random.seed(0)

def make_data(points, correlation_strength):
    """Generate a linear series ``y = 2 * x + error`` with correlated errors.

    ``x`` is ``points`` evenly spaced values on [0, 10]. The errors are one draw
    from a zero-mean multivariate normal whose covariance between observations
    ``i`` and ``j`` is ``correlation_strength ** |i - j|``.

    Parameters
    ----------
    points : int
        Number of observations.
    correlation_strength : float
        Base of the covariance decay; ``0`` yields an identity covariance.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_values, y_values)``, each of length ``points``.
    """
    positions = np.arange(points)
    # |i - j| for every pair of observations, built by broadcasting.
    lags = np.abs(positions[:, np.newaxis] - positions[np.newaxis, :])
    covariance = correlation_strength ** lags

    errors = np.random.multivariate_normal(mean=np.zeros(points), cov=covariance)

    x_values = np.linspace(0, 10, points)
    return x_values, 2 * x_values + errors

def run_sim(
    points=100,
    correlation_strength=0.9,
    simulations=200,
    bootstrap_samples=100
):
    """Compare three estimates of slope uncertainty when errors are correlated.

    For each simulated data set from ``make_data`` an OLS line with intercept is
    fitted. The function records the slope, the OLS-reported slope standard
    error, and a residual-bootstrap standard error: residuals are passed through
    ``resample``, added back to the fitted values, and the model is refitted
    ``bootstrap_samples`` times.

    Parameters
    ----------
    points : int
        Observations per simulated data set.
    correlation_strength : float
        Error-correlation parameter forwarded to ``make_data``.
    simulations : int
        Number of simulated data sets.
    bootstrap_samples : int
        Bootstrap refits per simulated data set.

    Returns
    -------
    None
        Prints the correlation setting, the standard deviation of the slopes
        across simulations ("True SD"), the mean OLS standard error and the
        mean bootstrap standard error, followed by a blank line.
    """
    slope_draws, reported_ses, bootstrap_ses = [], [], []

    for _ in range(simulations):
        x_values, y_values = make_data(points, correlation_strength)
        design_matrix = sm.add_constant(x_values)
        fitted_model = sm.OLS(y_values, design_matrix).fit()

        # Index 1 is the slope; index 0 is the added constant.
        slope_draws.append(fitted_model.params[1])
        reported_ses.append(fitted_model.bse[1])

        fitted = fitted_model.fittedvalues
        residuals = fitted_model.resid
        # Residual bootstrap: rebuild y from fitted values plus resampled residuals.
        refit_slopes = [
            sm.OLS(fitted + resample(residuals), design_matrix).fit().params[1]
            for _ in range(bootstrap_samples)
        ]
        bootstrap_ses.append(np.std(refit_slopes))

    print("correlation =", correlation_strength)
    print("True SD:", np.std(slope_draws))
    print("OLS SE:", np.mean(reported_ses))
    print("Bootstrap SE:", np.mean(bootstrap_ses))
    print()

# Repeat the comparison for uncorrelated (0), moderate (0.5) and strong (0.9) error correlation.
for correlation_strength in [0, 0.5, 0.9]:
    run_sim(correlation_strength=correlation_strength)


correlation = 0
True SD: 0.034961095241031345
OLS SE: 0.034365599495386354
Bootstrap SE: 0.03397123700874447

correlation = 0.5
True SD: 0.05480789361052515
OLS SE: 0.0335361108459322
Bootstrap SE: 0.03282796958384208

correlation = 0.9
True SD: 0.11967134331095981
OLS SE: 0.02809543976701594
Bootstrap SE: 0.02756967309808162



# Week 11

Question 1

In [7]:
# Week 11, Question 1: simulate a series with a discontinuity at `event_time`
# and check whether adding derivative features helps a logistic regression tell
# pre-event from post-event observations.
np.random.seed(42)

event_time = 10
time = np.linspace(0, 20, 500)
after_event_mask = time >= event_time

# Smooth underlying signal: linear drift plus a slow sinusoid.
baseline_value = 0.2 * time + 2 * np.sin(time * 0.3)

# Sizes of the shifts injected into the level and the two derivative features.
jump_value = 2
jump_derivative = 0.2
jump_second_derivative = 0.4

# Noise-free features. np.gradient is called without a spacing argument, so the
# derivatives are per-sample differences rather than derivatives in `time` units.
# Each derivative is taken before noise is added to the series it is derived from.
value_after_event = baseline_value + jump_value * after_event_mask
value_derivative = np.gradient(value_after_event) + jump_derivative * after_event_mask
value_second_derivative = (
    np.gradient(value_derivative) + jump_second_derivative * after_event_mask
)

# Add independent Gaussian noise; the draw order (value, first, second) is fixed
# so the seeded results stay reproducible.
n_points = len(time)
value_after_event += np.random.normal(0, 0.3, n_points)
value_derivative += np.random.normal(0, 0.2, n_points)
value_second_derivative += np.random.normal(0, 0.2, n_points)

# Target: 1 for observations at or after the event, 0 before.
labels = after_event_mask.astype(int)

features_value_only = value_after_event.reshape(-1, 1)
features_all_three = np.stack(
    [value_after_event, value_derivative, value_second_derivative],
    axis=1,
)

# Same test size and seed for both splits, so both models see the same rows.
split_options = {"test_size": 0.3, "random_state": 42}

train_value, test_value, train_labels, test_labels = train_test_split(
    features_value_only, labels, **split_options
)
train_three, test_three, train_labels_2, test_labels_2 = train_test_split(
    features_all_three, labels, **split_options
)

model_value_only = LogisticRegression().fit(train_value, train_labels)
model_three = LogisticRegression().fit(train_three, train_labels_2)

print("Model A: Value Only")
print(classification_report(test_labels, model_value_only.predict(test_value)))

print("Model B: Value + Derivative + Second Derivative")
print(classification_report(test_labels_2, model_three.predict(test_three)))


Model A: Value Only
              precision    recall  f1-score   support

           0       0.81      0.82      0.81        76
           1       0.81      0.80      0.80        74

    accuracy                           0.81       150
   macro avg       0.81      0.81      0.81       150
weighted avg       0.81      0.81      0.81       150

Model B: Value + Derivative + Second Derivative
              precision    recall  f1-score   support

           0       0.96      0.92      0.94        76
           1       0.92      0.96      0.94        74

    accuracy                           0.94       150
   macro avg       0.94      0.94      0.94       150
weighted avg       0.94      0.94      0.94       150



Question 2

In [8]:
# Week 11, Question 2: simulate three groups that share a linear time trend and
# a common jump at `event_time` but start from different baselines, then regress
# the value on a post-event indicator plus group dummies (no time term).
np.random.seed(42)

time = np.arange(0, 40)
groups = ["A", "B", "C"]
baseline_levels = {"A": 5, "B": 15, "C": 25}

trend_slope = 0.4
event_time = 20
jump_amount = 10

rows = []
for group in groups:
    # The baseline is constant within a group, so look it up once per group.
    baseline_value = baseline_levels[group]
    for t in time:
        post_event_flag = int(t >= event_time)
        trend_value = baseline_value + trend_slope * t
        # jump_amount at or after the event, 0 before it.
        jump_value = jump_amount * post_event_flag
        noisy_value = trend_value + jump_value + np.random.normal(0, 1)
        rows.append([group, t, noisy_value, post_event_flag])

data = pd.DataFrame(rows, columns=["group", "time", "value", "post_event"])

model = smf.ols("value ~ post_event + C(group)", data=data).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                  value   R-squared:                       0.963
Model:                            OLS   Adj. R-squared:                  0.962
Method:                 Least Squares   F-statistic:                     1004.
Date:                Tue, 02 Dec 2025   Prob (F-statistic):           8.59e-83
Time:                        09:13:12   Log-Likelihood:                -275.13
No. Observations:                 120   AIC:                             558.3
Df Residuals:                     116   BIC:                             569.4
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         8.5866      0.445     19.299

# Week 12

Question 1

In [9]:
# Week 12, Question 1: difference-in-differences on simulated data. The treated
# series keeps a slope of 0.5 throughout and jumps by `true_jump` at `event_time`;
# the control series has the same slope until t = 10 and a slope of 0.1 afterwards.
np.random.seed(42)

time = np.arange(0, 40)
event_time = 20
true_jump = 5

rows = []
for t in time:
    post_event_flag = int(t >= event_time)

    # Treated unit: linear trend plus the jump once the event has happened.
    treatment_trend = 2 + 0.5 * t
    treatment_jump = true_jump if post_event_flag else 0
    # Noise for the treated unit is drawn before the control unit's noise.
    treatment_value = treatment_trend + treatment_jump + np.random.normal(0, 0.8)
    rows.append(["treat", t, treatment_value, post_event_flag, 1])

    # Control unit: piecewise-linear trend whose slope changes at t = 10.
    control_trend = 5 + 0.5 * t if t < 10 else 5 + 0.5 * 10 + 0.1 * (t - 10)
    control_value = control_trend + np.random.normal(0, 0.8)
    rows.append(["control", t, control_value, post_event_flag, 0])

data = pd.DataFrame(rows, columns=["group", "time", "value", "post_event", "treated"])

# `treated * post_event` expands to both main effects plus their interaction.
model = smf.ols("value ~ treated * post_event", data=data).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  value   R-squared:                       0.865
Model:                            OLS   Adj. R-squared:                  0.860
Method:                 Least Squares   F-statistic:                     162.1
Date:                Tue, 02 Dec 2025   Prob (F-statistic):           6.10e-33
Time:                        09:19:11   Log-Likelihood:                -179.30
No. Observations:                  80   AIC:                             366.6
Df Residuals:                      76   BIC:                             376.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              8.6411      0