In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the dataset. The path is relative to the working directory, so
# lalonde.csv must be uploaded to Colab (or otherwise made available there).
df = pd.read_csv('lalonde.csv')

# Columns the rest of this notebook reads: treatment flag, outcome, covariates.
REQUIRED_COLUMNS = ['treat', 're78', 'age', 'educ', 'black', 'hispan',
                    'married', 'nodegree', 're74', 're75']

# Fail fast with a clear message if an expected column is absent, instead of
# surfacing a bare KeyError several cells further down.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"lalonde.csv is missing expected columns: {missing_columns}")

In [4]:
# Naive comparison: gap in mean `re78` between treat == 1 and treat == 0 rows,
# with no adjustment for any covariate.
treated_rows_mask = df['treat'] == 1
control_rows_mask = df['treat'] == 0

treated_mean_re78 = df.loc[treated_rows_mask, 're78'].mean()
control_mean_re78 = df.loc[control_rows_mask, 're78'].mean()

naive_diff = treated_mean_re78 - control_mean_re78
print(f"Naive Difference in Means: ${naive_diff:,.2f}")

Naive Difference in Means: $-635.03


In [5]:
# Covariates for the propensity model, treated here as confounders (variables
# that influence both treatment selection and earnings).
covariate_columns = ['age', 'educ', 'black', 'hispan', 'married', 'nodegree', 're74', 're75']
X = df.loc[:, covariate_columns]
y = df['treat']

# Propensity model: logistic regression of the treatment flag on the covariates.
# NOTE(review): scikit-learn's LogisticRegression is L2-regularized by default
# and the covariates are passed unscaled - confirm this is intended.
logit = LogisticRegression(solver='liblinear').fit(X, y)

# Keep the second predict_proba column as the score; with a 0/1 `treat` flag
# that is the predicted probability of treat == 1.
class_probabilities = logit.predict_proba(X)
df['pscore'] = class_probabilities[:, 1]

# Score distribution per treatment arm.
pscore_summary = df.groupby('treat')['pscore'].describe()
print(pscore_summary)

       count      mean       std       min       25%       50%       75%  \
treat                                                                      
0      429.0  0.196656  0.226224  0.009029  0.053966  0.107163  0.168812   
1      185.0  0.561869  0.211763  0.040171  0.475299  0.669336  0.706804   

            max  
treat            
0      0.778098  
1      0.755915  


In [6]:
# Split the scored data into the two treatment arms.
treated = df.loc[df['treat'] == 1]
control = df.loc[df['treat'] == 0]

# Index the control units by propensity score (1-D, so euclidean distance is
# just the absolute score difference).
nbrs = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(control[['pscore']])

# Look up the single closest control for every treated unit. Each lookup is
# independent, so the same control row may be selected more than once.
distances, indices = nbrs.kneighbors(treated[['pscore']])
matched_positions = indices.flatten()
matched_control = control.iloc[matched_positions]

# Stack treated rows on top of their matched controls.
matched_df = pd.concat([treated, matched_control], axis=0)

print(f"Treated units:         {len(treated)}")
print(f"Matched control units: {len(matched_control)}")
print(f"Total matched df size: {len(matched_df)}")

Treated units:         185
Matched control units: 185
Total matched df size: 370


In [8]:
# Balance check (SMD per covariate) and difference-in-means tests on `re78`,
# before and after matching.
covariates = ['age', 'educ', 'black', 'hispan', 'married', 'nodegree', 're74', 're75']

matched_treated_df = matched_df[matched_df['treat'] == 1]
matched_control_df = matched_df[matched_df['treat'] == 0]


def standardized_mean_diff(treated_values, control_values):
    """Return the standardized mean difference (SMD) between two samples.

    SMD = (mean_treated - mean_control) / sqrt((sd_treated**2 + sd_control**2) / 2),
    using each group's sample standard deviation.

    Parameters
    ----------
    treated_values, control_values : pandas.Series
        One covariate's values in the treated and control group respectively.

    Returns
    -------
    float
        The SMD. When both groups have zero variance the ratio is undefined;
        0.0 is returned if the means also agree (perfect balance) and a signed
        infinity otherwise, rather than NaN.
    """
    mean_gap = treated_values.mean() - control_values.mean()
    pooled_sd = np.sqrt((treated_values.std()**2 + control_values.std()**2) / 2)
    if pooled_sd == 0:
        # 0/0 would give NaN, which then fails the `< 0.1` check below even
        # though the covariate is identical in both groups.
        return 0.0 if mean_gap == 0 else mean_gap * np.inf
    return mean_gap / pooled_sd


print("Covariate Balance (SMD < 0.1 = good balance)")
print(f"{'Covariate':<12} {'SMD Before':>12} {'SMD After':>11}")
print("-" * 38)

for col in covariates:
    smd_before = standardized_mean_diff(treated[col], control[col])
    smd_after = standardized_mean_diff(matched_treated_df[col], matched_control_df[col])

    flag = "✓" if abs(smd_after) < 0.1 else "✗"
    print(f"{col:<12} {smd_before:>12.3f} {smd_after:>10.3f} {flag}")

# ── T-test on Raw (Unmatched) Data ─────────────────────────────────────────────
diff = treated['re78'].mean() - control['re78'].mean()
# Raw and matched results get their own names so neither overwrites the other.
raw_t_stat, raw_p_val = stats.ttest_ind(treated['re78'], control['re78'])

print(f"\nRaw Effect (Difference):            ${diff:,.2f}")
print(f"P-value:                             {raw_p_val:.4f}")

# ── T-test on Matched Data ─────────────────────────────────────────────────────
# Distinct names for the outcome Series so the `matched_control` DataFrame
# built in the matching cell is not rebound to a Series here.
matched_treated_re78 = matched_treated_df['re78']
matched_control_re78 = matched_control_df['re78']

matched_diff = matched_treated_re78.mean() - matched_control_re78.mean()
# NOTE(review): ttest_ind treats every row as an independent observation, but a
# control matched to several treated units appears as repeated rows in
# matched_df - treat this p-value as approximate.
matched_t_stat, matched_p_val = stats.ttest_ind(matched_treated_re78, matched_control_re78)

print(f"\nRecovered Effect (Matched):         ${matched_diff:,.2f}")
print(f"P-value:                             {matched_p_val:.4f}")

Covariate Balance (SMD < 0.1 = good balance)
Covariate      SMD Before   SMD After
--------------------------------------
age                -0.242      0.261 ✗
educ                0.045      0.027 ✓
black               1.668      0.015 ✓
hispan             -0.277      0.129 ✗
married            -0.719     -0.131 ✗
nodegree            0.235      0.206 ✗
re74               -0.596      0.049 ✓
re75               -0.287      0.080 ✓

Raw Effect (Difference):            $-635.03
P-value:                             0.3342

Recovered Effect (Matched):         $583.04
P-value:                             0.4438
