In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# Read the homework dataset from the working directory into `df`.
df = pd.read_csv('homework_6.1.csv')

# Print a quick overview (dimensions, column names, leading rows) in a
# single call; sep="\n" puts each item on its own line.
print(
    "Dataset Info:",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    "\nFirst few rows:",
    df.head(),
    sep="\n",
)

Dataset Info:
Shape: (1000, 4)
Columns: ['Unnamed: 0', 'Z', 'X', 'Y']

First few rows:
   Unnamed: 0         Z  X         Y
0           0  0.548814  0 -0.823220
1           1  0.715189  1  0.842405
2           2  0.602763  1  0.898618
3           3  0.544883  0 -0.817325
4           4  0.423655  0 -0.635482


In [3]:
# Tabulate how many rows carry each value of the treatment indicator X,
# ordered by the indicator value rather than by frequency.
print("\nTreatment distribution:")
print(df['X'].value_counts().sort_index())

# Split the data by treatment status (X == 1 treated, X == 0 untreated).
# .copy() gives each group its own frame, detached from `df`.
treated = df.loc[df['X'].eq(1)].copy()
untreated = df.loc[df['X'].eq(0)].copy()

print(f"\nTreated group size: {treated.shape[0]}")
print(f"Untreated group size: {untreated.shape[0]}")


Treatment distribution:
X
0    509
1    491
Name: count, dtype: int64

Treated group size: 491
Untreated group size: 509


# 1. Average Treatment Effect (ATE)
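# Match each observation to its nearest neighbor in the opposite treatment group on the confounder Z (1-nearest-neighbor matching, with replacement), then average the matched outcome differences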


In [5]:
print("\n" + "="*60)
print("1. AVERAGE TREATMENT EFFECT (ATE)")
print("="*60)

# ATE via 1-nearest-neighbour matching on the confounder Z:
#   * each treated unit takes its untreated counterfactual Y from the
#     closest untreated unit, and
#   * each untreated unit takes its treated counterfactual Y from the
#     closest treated unit.
# Every query is answered independently, so one donor may be matched to
# several units (matching with replacement).

# One nearest-neighbour index per group, fitted on numpy arrays (.values)
# to avoid warnings.
nn_untreated = NearestNeighbors(n_neighbors=1, metric='euclidean')
nn_untreated.fit(untreated[['Z']].values)

nn_treated = NearestNeighbors(n_neighbors=1, metric='euclidean')
nn_treated.fit(treated[['Z']].values)

# Treated -> untreated matches (per-unit effects that make up the ATT).
# A single batched query replaces one kneighbors() call per row. The result
# has one column (n_neighbors=1) holding the *position* of the match within
# the array the index was fitted on, i.e. a row position in `untreated`.
untreated_match_pos = nn_untreated.kneighbors(
    treated[['Z']].values, return_distance=False
)[:, 0]
# effect = Y_treated - Y_untreated_counterfactual
treated_effects = (
    treated['Y'].to_numpy() - untreated['Y'].to_numpy()[untreated_match_pos]
).tolist()

# Untreated -> treated matches (per-unit effects that make up the ATU).
treated_match_pos = nn_treated.kneighbors(
    untreated[['Z']].values, return_distance=False
)[:, 0]
# effect = Y_treated_counterfactual - Y_untreated
untreated_effects = (
    treated['Y'].to_numpy()[treated_match_pos] - untreated['Y'].to_numpy()
).tolist()

# The ATE averages over every unit: treated effects first, then untreated,
# in the row order of their respective groups.
all_effects = treated_effects + untreated_effects

ate = np.mean(all_effects)
print(f"Average Treatment Effect (ATE): {ate:.6f}")
print(f"  - Based on {len(treated_effects)} treated observations and {len(untreated_effects)} untreated observations")



1. AVERAGE TREATMENT EFFECT (ATE)
Average Treatment Effect (ATE): 1.695270
  - Based on 491 treated observations and 509 untreated observations


In [6]:
# 2. Average Treatment Effect on the Treated (ATT)
print(
    "\n" + "=" * 60,
    "2. AVERAGE TREATMENT EFFECT ON THE TREATED (ATT)",
    "=" * 60,
    sep="\n",
)

# The ATT is the mean of the per-unit effects collected for treated rows
# (treated Y minus the matched untreated Y).
att = np.asarray(treated_effects).mean()
print("Average Treatment Effect on the Treated (ATT): {:.6f}".format(att))


2. AVERAGE TREATMENT EFFECT ON THE TREATED (ATT)
Average Treatment Effect on the Treated (ATT): 1.846409


In [7]:
# 3. Average Treatment Effect on the Untreated (ATU)
# NOTE(review): the banner below spells "Effect" in mixed case, unlike the
# other section banners; it is kept verbatim so the printed text is unchanged.
print(
    "\n" + "=" * 60,
    "3. AVERAGE TREATMENT Effect ON THE UNTREATED (ATU)",
    "=" * 60,
    sep="\n",
)

# The ATU is the mean of the per-unit effects collected for untreated rows
# (matched treated Y minus the untreated Y).
atu = np.asarray(untreated_effects).mean()
print("Average Treatment Effect on the Untreated (ATU): {:.6f}".format(atu))


3. AVERAGE TREATMENT Effect ON THE UNTREATED (ATU)
Average Treatment Effect on the Untreated (ATU): 1.549477


In [8]:
# 4. Marginal Treatment Effect (MTE)
print("\n" + "="*60)
print("4. MARGINAL TREATMENT EFFECT (MTE)")
print("="*60)

# This notebook takes the MTE to be the largest matched treatment effect
# among the untreated units: each untreated unit's counterfactual is its
# nearest treated neighbour on Z, and the effect is
#   Y_treated_counterfactual - Y_untreated.
# NOTE(review): this is the working definition used here, not the general
# econometric definition of a marginal treatment effect - confirm it matches
# the assignment.

# One batched query instead of one kneighbors() call per row. Column 0 holds
# the row position of the match inside `treated` (the index was fitted on
# treated[['Z']].values).
treated_match_pos = nn_treated.kneighbors(
    untreated[['Z']].values, return_distance=False
)[:, 0]
matched_treated = treated.iloc[treated_match_pos]

# One record per untreated unit, in the row order of `untreated`.
mte_frame = pd.DataFrame({
    'untreated_idx': untreated.index.to_numpy(),
    'untreated_Z': untreated['Z'].to_numpy(),
    'untreated_Y': untreated['Y'].to_numpy(),
    'treated_counterfactual_Z': matched_treated['Z'].to_numpy(),
    'treated_counterfactual_Y': matched_treated['Y'].to_numpy(),
})
mte_frame['effect'] = mte_frame['treated_counterfactual_Y'] - mte_frame['untreated_Y']

# Keep the list-based names so later code that uses them keeps working.
mte_effects = mte_frame['effect'].tolist()
mte_details = mte_frame.to_dict('records')

mte = np.max(mte_effects)
# argmax returns the first position of the maximum if there are ties.
max_effect_idx = np.argmax(mte_effects)
max_effect_details = mte_details[max_effect_idx]

print(f"Marginal Treatment Effect (MTE): {mte:.6f}")
print(f"MTE comes from untreated observation {max_effect_details['untreated_idx']}:")
print(f"  Untreated: Z={max_effect_details['untreated_Z']:.4f}, Y={max_effect_details['untreated_Y']:.4f}")
print(f"  Counterfactual: Z={max_effect_details['treated_counterfactual_Z']:.4f}, Y={max_effect_details['treated_counterfactual_Y']:.4f}")



4. MARGINAL TREATMENT EFFECT (MTE)
Marginal Treatment Effect (MTE): 2.172470
MTE comes from untreated observation 298:
  Untreated: Z=0.9729, Y=-1.4594
  Counterfactual: Z=0.9738, Y=0.7131


In [9]:
# Summary of results: one line per estimand, values aligned in one column.
print(
    "\n" + "=" * 60,
    "SUMMARY OF RESULTS",
    "=" * 60,
    sep="\n",
)

# Labels are left-justified to 45 characters so every value starts in the
# same column.
for label, value in (
    ("Average Treatment Effect (ATE):", ate),
    ("Average Treatment Effect on Treated (ATT):", att),
    ("Average Treatment Effect on Untreated (ATU):", atu),
    ("Marginal Treatment Effect (MTE):", mte),
):
    print(f"{label:<45}{value:.6f}")



SUMMARY OF RESULTS
Average Treatment Effect (ATE):              1.695270
Average Treatment Effect on Treated (ATT):   1.846409
Average Treatment Effect on Untreated (ATU): 1.549477
Marginal Treatment Effect (MTE):             2.172470
