In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from scipy.spatial.distance import mahalanobis

In [4]:
# Read both homework CSV files into DataFrames and report their sizes.
# data1 feeds Questions 1-2 (inverse probability weighting);
# data2 feeds Questions 3-4 (Mahalanobis matching).
data1 = pd.read_csv('homework_8.1.csv')
data2 = pd.read_csv('homework_8.2.csv')
for dataset_number, dataset in ((1, data1), (2, data2)):
    print(f"Dataset {dataset_number} has {len(dataset)} rows")

Dataset 1 has 1000 rows
Dataset 2 has 1000 rows


QUESTION 1: Using homework_8.1.csv, find the Average treatment effect with inverse probability weighting. Then, include your code and a written explanation of your work (mentioning any choices or strategies you made in writing the code) in your homework reflection.  



Here are some steps to follow: 



* Estimate the propensity scores using logistic regression. Fit the model so that the $Z$ values predict $X$.

* Use the model to predict the propensity scores (e.g., using predict_proba if you are using sklearn). 

* Calculate inverse probability weights ($\frac{1}{P}$ for $X = 1$ and $\frac{1}{1 - P}$ for $X = 0$).

* Estimate the average treatment effect (the difference in $Y$ between $X = 1$ and $X = 0$, using the appropriate weights for each group).



Then, the ATE is closest to:


In [9]:
# Question 1: average treatment effect (ATE) by inverse probability weighting.
# Outline: fit P(X=1 | Z) with logistic regression, weight every row by the
# inverse probability of the arm it actually received, then take the difference
# between the weighted mean of Y in the treated and control groups.

# Step 1: Look at our data
print("Data:")
print(data1.head())
print(f"\nColumns: {list(data1.columns)}")

# Boolean mask of treated rows; every row that is not X == 1 is handled as
# control in the weighting below.
is_treated = (data1['X'] == 1).to_numpy()

# Count how many people got treatment vs control
treated_people = data1[data1['X'] == 1]
control_people = data1[data1['X'] == 0]
print(f"\nTreated people (X=1): {len(treated_people)}")
print(f"Control people (X=0): {len(control_people)}")

# Step 2: Estimate propensity scores using logistic regression
print("\nStep 2: Estimating the propensity scores")
# We need to predict X (treatment) from Z (the confounder)
Z_values = data1[['Z']]  # Features (just Z), kept 2-D as sklearn expects
X_values = data1['X']    # Target: treatment assignment

# NOTE(review): LogisticRegression() is L2-regularised by default (C=1.0), so
# the fitted scores are slightly shrunk relative to an unpenalised fit.
# Confirm whether the assignment expects the default settings.
logistic_model = LogisticRegression()
logistic_model.fit(Z_values, X_values)

# Column 1 of predict_proba is the probability of the second class (X=1 here)
propensity_scores = logistic_model.predict_proba(Z_values)[:, 1]

print(f"Propensity scores calculated!")
print(f"Minimum propensity score: {propensity_scores.min():.4f}")
print(f"Maximum propensity score: {propensity_scores.max():.4f}")
print(f"Average propensity score: {np.mean(propensity_scores):.4f}")

# Step 3: Calculate inverse probability weights
print("\nStep 3: Calculate the weights")
# Probability of the arm each row actually received: P for treated rows,
# 1 - P for control rows. The weight is its reciprocal.
probability_of_received_arm = np.where(
    is_treated, propensity_scores, 1 - propensity_scores
)
weights = 1 / probability_of_received_arm
print(f"Weights calculated!")
print(f"Minimum weight: {weights.min():.4f}")
print(f"Maximum weight: {weights.max():.4f}")

# Step 4: Calculate weighted average treatment effect
print("\nStep 4: Calculating the ATE")

# Split outcomes and weights by arm using the mask (row order is preserved)
outcomes = data1['Y'].to_numpy()
treated_outcomes = outcomes[is_treated]
treated_weights = weights[is_treated]
control_outcomes = outcomes[~is_treated]
control_weights = weights[~is_treated]

# np.average divides by the sum of the weights, so each group mean is a
# normalised weighted mean.
weighted_treated_mean = np.average(treated_outcomes, weights=treated_weights)
weighted_control_mean = np.average(control_outcomes, weights=control_weights)

# The ATE is the difference
ate = weighted_treated_mean - weighted_control_mean

print(f"Weighted average outcome for treated group: {weighted_treated_mean:.4f}")
print(f"Weighted average outcome for control group: {weighted_control_mean:.4f}")
print(f"Average Treatment Effect (ATE): {ate:.4f}")
print(f"ATE rounded to 1 decimal: {ate:.1f}")


Data:
   Unnamed: 0  X         Y         Z
0           0  1  4.109218  1.764052
1           1  0  2.259504  0.400157
2           2  0 -0.647584  0.978738
3           3  0  2.106071  2.240893
4           4  1  3.583464  1.867558

Columns: ['Unnamed: 0', 'X', 'Y', 'Z']

Treated people (X=1): 481
Control people (X=0): 519

Step 2: Estimating the propensity scores
Propensity scores calculated!
Minimum propensity score: 0.0480
Maximum propensity score: 0.9322
Average propensity score: 0.4810

Step 3: Calculate the weights
Weights calculated!
Minimum weight: 1.0505
Maximum weight: 12.7167

Step 4: Calculating the ATE
Weighted average outcome for treated group: 2.2367
Weighted average outcome for control group: -0.0376
Average Treatment Effect (ATE): 2.2743
ATE rounded to 1 decimal: 2.3


QUESTION 2: The propensity scores of the first three items in the dataframe are closest to: 

In [10]:
# Question 2: report the propensity scores of the first three rows,
# rounded to two decimals, first one per line and then as a single list.
first_three_scores = propensity_scores[:3]
print("The first three propensity scores are:")
for position in (0, 1, 2):
    print(f"Person {position + 1}: {first_three_scores[position]:.2f}")

answer_text = ", ".join(f"{first_three_scores[k]:.2f}" for k in range(3))
print(f"\nAnswer: [{answer_text}]")

The first three propensity scores are:
Person 1: 0.84
Person 2: 0.58
Person 3: 0.71

Answer: [0.84, 0.58, 0.71]


QUESTION 3: Using homework_8.2.csv, match all treated items to the single nearest untreated item using the Mahalanobis distance. (Do this with replacement — the same untreated item can be used again.) 



* Use the `mahalanobis` function from `scipy.spatial.distance`.

* For the inverse covariance matrix, use all $Z_1$ values and all $Z_2$ values, make them into a $2 \times N$ matrix, find its $2 \times 2$ covariance, and invert.



Then, the ATE is closest to:

In [13]:
# Question 3: nearest-neighbour matching (with replacement) on Mahalanobis
# distance. Every treated row is paired with the single closest untreated row
# in (Z1, Z2) space, and the matched differences in Y are averaged.

# Step 1: Look at the second dataset
print("Dataset 2:")
print(data2.head())
print(f"Columns: {list(data2.columns)}")

# Separate treated and untreated people
treated_data = data2[data2['X'] == 1]
untreated_data = data2[data2['X'] == 0]
print(f"\nTreated people: {len(treated_data)}")
print(f"Untreated people: {len(untreated_data)}")

# Step 2: Calculate covariance matrix for Z1 and Z2
print("\nStep 2: Covariance matrix")

# Get all Z1 and Z2 values (treated and untreated together)
all_z1 = data2['Z1'].values
all_z2 = data2['Z2'].values

# N x 2 matrix with Z1 and Z2 as columns
Z_matrix = np.column_stack([all_z1, all_z2])

# np.cov treats rows as variables, so pass the 2 x N transpose
cov_matrix = np.cov(Z_matrix.T)
print("Covariance matrix:")
print(cov_matrix)

# Calculate inverse covariance matrix
inv_cov_matrix = np.linalg.inv(cov_matrix)
print("\nInverse covariance matrix:")
print(inv_cov_matrix)

# Step 3: Match each treated person to nearest untreated person
print("\nStep 3: Finding matches using Mahalanobis distance")

# Pull the columns out as plain NumPy arrays once. Iterating over arrays avoids
# building a pandas Series for every (treated, untreated) pair, which is what
# nested DataFrame.iterrows() loops would do.
treated_z_values = treated_data[['Z1', 'Z2']].to_numpy()
untreated_z_values = untreated_data[['Z1', 'Z2']].to_numpy()
treated_y_values = treated_data['Y'].to_numpy()
untreated_y_values = untreated_data['Y'].to_numpy()

treatment_effects = []
for treated_z, treated_y in zip(treated_z_values, treated_y_values):
    # Distance from this treated row to every untreated row
    distances = [
        mahalanobis(treated_z, untreated_z, inv_cov_matrix)
        for untreated_z in untreated_z_values
    ]
    # argmin returns the first minimum, so ties go to the earliest untreated row
    nearest_position = int(np.argmin(distances))
    # Treatment effect for this matched pair
    treatment_effects.append(treated_y - untreated_y_values[nearest_position])

# Step 4: Calculate average treatment effect
# Only treated rows are matched, so this is the mean effect over the treated
# group (an ATT-style estimate); the assignment labels it "ATE".
ate_mahalanobis = np.mean(treatment_effects)

print(f"Matched {len(treatment_effects)} treated people to untreated people")
print(f"Average Treatment Effect: {ate_mahalanobis:.4f}")
print(f"ATE rounded to 1 decimal: {ate_mahalanobis:.1f}")

Dataset 2:
   Unnamed: 0  X         Y        Z1        Z2
0           0  1  4.652085  1.764052  2.320015
1           1  1  2.590221  0.400157  1.292631
2           2  1  3.898981  0.978738  0.556423
3           3  1  5.857179  2.240893  2.345607
4           4  1  3.647489  1.867558  2.095611
Columns: ['Unnamed: 0', 'X', 'Y', 'Z1', 'Z2']

Treated people: 483
Untreated people: 517

Step 2: Covariance matrix
Covariance matrix:
[[0.97520967 0.94507003]
 [0.94507003 1.85320242]]

Inverse covariance matrix:
[[ 2.02734407 -1.03387633]
 [-1.03387633  1.06684813]]

Step 3: Finding matches using Mahalanobis distance
Matched 483 treated people to untreated people
Average Treatment Effect: 3.4377
ATE rounded to 1 decimal: 3.4


QUESTION 4: Find the $Z_1$ and $Z_2$ values of the treated item with the least common support, i.e. the treated item whose nearest untreated item is farthest away in Mahalanobis distance. Choose the closest answer.

In [15]:
# Question 4: find the treated row with the least common support, i.e. the
# treated row whose nearest untreated row (Mahalanobis distance on Z1, Z2)
# is farther away than any other treated row's nearest untreated row.

# Work on plain NumPy arrays so the pairwise loop does not build a pandas
# Series for every (treated, untreated) pair.
treated_z_values = treated_data[['Z1', 'Z2']].to_numpy()
untreated_z_values = untreated_data[['Z1', 'Z2']].to_numpy()

if len(treated_z_values) == 0:
    raise ValueError("No treated rows (X == 1) to evaluate for common support")

# For every treated row, the distance to its closest untreated row.
# default=inf means "no untreated rows at all" reads as infinitely far away.
nearest_untreated_distances = np.array([
    min(
        (mahalanobis(treated_z, untreated_z, inv_cov_matrix)
         for untreated_z in untreated_z_values),
        default=float('inf'),
    )
    for treated_z in treated_z_values
])

# argmax returns the first maximum, so ties go to the earliest treated row.
# Selecting by argmax (rather than comparing against a running maximum that
# starts at 0) also yields a result when every treated row has an exact match
# at distance 0.
worst_position = int(np.argmax(nearest_untreated_distances))

worst_support_distance = nearest_untreated_distances[worst_position]
worst_support_person = treated_data.iloc[worst_position]
worst_support_z1 = treated_z_values[worst_position, 0]
worst_support_z2 = treated_z_values[worst_position, 1]

print(f"Treated person with least common support")
print(f"Their Z1 value: {worst_support_z1:.1f}")
print(f"Their Z2 value: {worst_support_z2:.1f}")
print(f"Distance to nearest untreated person: {worst_support_distance:.4f}")


Treated person with least common support
Their Z1 value: 2.7
Their Z2 value: 0.5
Distance to nearest untreated person: 1.3830


In [None]:
# Recap of the four answers, each rounded the way its question asks.
print(f"Question 1 - ATE closest to: {ate:.1f}")

scores_text = ", ".join(f"{first_three_scores[k]:.2f}" for k in range(3))
print(f"Question 2 - First three propensity scores: [{scores_text}]")

print(f"Question 3 - ATE closest to: {ate_mahalanobis:.1f}")

z_pair_text = f"{worst_support_z1:.1f}, {worst_support_z2:.1f}"
print(f"Question 4 - Z1, Z2 values: ({z_pair_text})")

Question 1 - ATE closest to: 2.3
Question 2 - First three propensity scores: [0.84, 0.58, 0.71]
Question 3 - ATE closest to: 3.4
Question 4 - Z1, Z2 values: (2.7, 0.5)
