In [None]:
from econml.dml import CausalForestDML
import scipy 
import numpy as np
import pandas as pd

In [None]:
def run_cfdml(y, T, X, random_state=None):
    """Fit a Causal Forest DML estimator with a discrete treatment.

    Parameters
    ----------
    y : array-like
        Outcome for each observation.
    T : array-like
        Treatment assignment for each observation; the estimator is
        configured with ``discrete_treatment=True``.
    X : array-like
        Features passed to the estimator as ``X``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``CausalForestDML``. Pass a fixed integer to make
        repeated runs comparable; ``None`` keeps the library's default
        (unseeded) behaviour, which is what this function did before the
        parameter existed.

    Returns
    -------
    CausalForestDML
        The fitted estimator.
    """
    est = CausalForestDML(discrete_treatment=True, random_state=random_state)
    # No separate control set is supplied: W is explicitly None.
    est.fit(y, T, X=X, W=None)
    return est

In [None]:
# Load the data
all_data_full = pd.read_csv('all_data_full.csv')

# Explicit copies keep every derived frame independent of its parent, so
# columns added to them later cannot trigger pandas' SettingWithCopyWarning.
all_data_full = all_data_full[all_data_full["apparel_model_brand"] != "New Balance"].copy()

# Treatment indicator: 1 when shoe_type is exactly "Super Shoe", otherwise 0
# (missing values compare unequal and therefore map to 0).
all_data_full["shoe_type_b"] = (all_data_full["shoe_type"] == "Super Shoe").astype(int)
# Gender indicator: 0 when gender is exactly "male", otherwise 1
# (missing values compare unequal and therefore map to 1).
all_data_full["gender_b"] = (all_data_full["gender"] != "male").astype(int)

# One frame per value of the `quartile` column.
all_data_full_q1 = all_data_full[all_data_full['quartile'] == 1].copy()
all_data_full_q2 = all_data_full[all_data_full['quartile'] == 2].copy()
all_data_full_q3 = all_data_full[all_data_full['quartile'] == 3].copy()
all_data_full_q4 = all_data_full[all_data_full['quartile'] == 4].copy()

In [None]:
def run_effect_modification(data, dataset_name, modifier_col, modifier_bins=None):
    """Estimate the treatment effect on speed within each level of a modifier.

    The rows of ``data`` are grouped by ``modifier_col`` (optionally binned),
    and for every group with enough observations a Causal Forest DML model is
    fitted via ``run_cfdml`` with ``speed`` as outcome and ``shoe_type_b`` as
    treatment. ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``speed``, ``shoe_type_b``, ``modifier_col`` and the
        feature columns listed in the body.
    dataset_name : str
        Label copied into the ``dataset`` column and the progress messages.
    modifier_col : str
        Column whose levels define the groups.
    modifier_bins : sequence or None, default None
        Bin edges handed to ``pandas.cut``. When ``None`` the raw values of
        ``modifier_col`` are used as groups.

    Returns
    -------
    pandas.DataFrame
        One row per fitted group with columns ``dataset``, ``modifier_col``,
        ``modifier_group``, ``ATE``, ``ATE_percentage``, ``ATE_ci`` and
        ``ATE_ci_percentage``. Note that the ``*_ci`` columns hold the
        estimator's ``ate_stderr_`` (a standard error), not interval bounds;
        the names are kept for compatibility with existing outputs. The
        frame is empty when every group is skipped.
    """
    # Keep the grouping in a local Series instead of writing a "modifier_bin"
    # column into `data`, so the caller's frame is left untouched.
    if modifier_bins is not None:
        modifier_groups = pd.cut(data[modifier_col], bins=modifier_bins)
    else:
        modifier_groups = data[modifier_col]

    results = []

    # Rows with a missing modifier, or a value outside every bin, belong to no
    # group. Dropping NaN avoids iterating over a "nan" group that can never
    # match any row.
    for group in modifier_groups.dropna().unique():
        subset_data = data[modifier_groups == group]
        if len(subset_data) < 10:
            print(f"Skipping group {group} in {dataset_name} due to insufficient data")
            continue

        # Require both treatment arms to be present with at least 5 rows each.
        treatment_counts = subset_data["shoe_type_b"].value_counts()
        if len(treatment_counts) < 2 or treatment_counts.min() < 5:
            print(f"Skipping group {group} in {dataset_name} due to insufficient treatment variation")
            continue

        y = subset_data['speed']
        T = subset_data["shoe_type_b"]
        X = subset_data[["weight_kg", "critical_power", 'grade', 'env_temperature',
                         'env_humidity', 'env_wind_speed', 'env_wind_gust', 'env_wind_bearing',
                         "elevation", "age", "gender_b"]]

        est = run_cfdml(y, T, X)

        # Express the effect and its standard error relative to the group's
        # mean speed, in percent.
        mean_speed = np.mean(y)
        ate = est.ate_
        ate_percentage = ate / mean_speed * 100
        ate_ci = est.ate_stderr_
        ate_ci_percentage = (ate_ci / mean_speed) * 100

        results.append({
            "dataset": dataset_name,
            "modifier_col": modifier_col,
            "modifier_group": group,
            "ATE": ate,
            "ATE_percentage": ate_percentage,
            "ATE_ci": ate_ci,
            "ATE_ci_percentage": ate_ci_percentage
        })

        print(f"ATE for {modifier_col} group {group} in {dataset_name}: {ate_percentage}%")

    return pd.DataFrame(results)

# List of datasets to analyze
datasets = [
    ("all_data_full", all_data_full),
    ("all_data_full_q1", all_data_full_q1),
    ("all_data_full_q2", all_data_full_q2),
    ("all_data_full_q3", all_data_full_q3),
    ("all_data_full_q4", all_data_full_q4),
    # Currently disabled:
    # ("adidas_data_full", adidas_data_full),
    # ("adidas_data_full_q1", adidas_data_full_q1),
    # ("adidas_data_full_q2", adidas_data_full_q2),
    # ("adidas_data_full_q3", adidas_data_full_q3),
    # ("adidas_data_full_q4", adidas_data_full_q4),
]

# Effect modifiers to stratify by, as (column, bin edges) pairs. Bin edges of
# None mean the column's raw values are used as groups. Adjust the edges here
# as needed; the order determines the row order of the output file.
effect_modifiers = [
    ("gender_b", None),
    ("weight_kg", [40, 50, 60, 70, 80, 90]),
    ("age", [15, 20, 30, 40, 50, 60]),
    ("normalised_shoe_size", [5, 7, 9, 11, 13, 15, 17, 19]),
    ("intensity", [0.4, 0.7, 1.0, 1.3]),
    ("total_power", [100, 200, 300, 400]),
    ("heart_rate", [100, 130, 160, 190]),
    ("cadence", [140, 160, 180, 200]),
    ("stride_length", [0.5, 1, 1.5, 2]),
    ("peak_vertical_grf", [1000, 1500, 2000, 2500]),
    ("leg_spring", [5, 8, 12, 16]),
    ("flight_time_ms", [0, 30, 60, 100]),
    ("ground_time", [150, 200, 250, 300, 400]),
]

# Collect every per-modifier result frame and concatenate once at the end,
# rather than growing a DataFrame inside the loop.
result_frames = []

# Loop through each dataset and run effect modification for every modifier
for dataset_name, dataset_data in datasets:
    print(f"\nRunning effect modification for {dataset_name}:")

    for modifier_col, modifier_bins in effect_modifiers:
        result_frames.append(
            run_effect_modification(
                dataset_data,
                dataset_name,
                modifier_col=modifier_col,
                modifier_bins=modifier_bins,
            )
        )

# Frames for which every group was skipped are empty and carry no columns;
# leave them out so they cannot influence the combined frame's dtypes.
non_empty_frames = [frame for frame in result_frames if not frame.empty]
if non_empty_frames:
    all_results = pd.concat(non_empty_frames, ignore_index=True)
else:
    all_results = pd.DataFrame()

# Save all results to a CSV file
all_results.to_csv("effect_modification_results_all_data_full_cfdml.csv", index=False)