# did_multiplegt_dyn: Cross-Language Comparison

This notebook compares the results and runtime performance of `did_multiplegt_dyn` across:
- **Stata** (original implementation)
- **R** (DIDmultiplegtDYN package)
- **Python** (py-did-multiplegt-dyn package)

## Prerequisites
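The notebook reads each platform's `runtime_<Platform>.csv` and `coefficients_<Platform>.csv` from the results directory (`RESULTS_PATH`). Before running it, execute the following scripts to generate those results: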
1. `test_did_multiplegt_dyn_comprehensive.do` (Stata)
2. `test_did_multiplegt_dyn_comprehensive.R` (R)
3. `test_did_multiplegt_dyn_comprehensive.py` (Python)

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Display options: show the wide comparison tables without truncation
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 200)

# Directory that holds the runtime_*.csv / coefficients_*.csv inputs and receives
# the figures and combined CSVs written by this notebook. Set the DID_RESULTS_PATH
# environment variable to point it elsewhere; the fallback is the original location.
DEFAULT_RESULTS_PATH = "/Users/anzony.quisperojas/Documents/GitHub/diff_diff_test/CX"
RESULTS_PATH = Path(os.environ.get("DID_RESULTS_PATH", DEFAULT_RESULTS_PATH))

if not RESULTS_PATH.is_dir():
    # Flag a bad path up front instead of leaving it to per-file warnings later on.
    print(f"Warning: results directory {RESULTS_PATH} does not exist")

print("Notebook initialized successfully!")

## 1. Load Runtime Results

In [None]:
# Load the per-platform runtime tables from RESULTS_PATH
runtime_files = {
    'Stata': 'runtime_Stata.csv',
    'R': 'runtime_R.csv',
    'Python': 'runtime_Python.csv'
}

runtime_dfs = {}
for platform, filename in runtime_files.items():
    filepath = RESULTS_PATH / filename
    if not filepath.exists():
        print(f"Warning: {filepath} not found")
        continue

    platform_runtime = pd.read_csv(filepath)
    # Later cells pivot and filter on a 'Platform' column. Tag the rows here when
    # the CSV does not carry one, so those cells do not fail with a KeyError.
    if 'Platform' not in platform_runtime.columns:
        platform_runtime['Platform'] = platform

    runtime_dfs[platform] = platform_runtime
    print(f"Loaded {platform}: {len(platform_runtime)} tests")

# Stack all platforms into one long table. `runtime_all` is only defined when at
# least one file was loaded; later cells test for it with `'runtime_all' in dir()`.
if runtime_dfs:
    runtime_all = pd.concat(runtime_dfs.values(), ignore_index=True)
    print(f"\nTotal tests loaded: {len(runtime_all)}")

## 2. Runtime Comparison

In [None]:
# Wide runtime table: one row per (Example, Model), one column per platform
if 'runtime_all' in dir():
    runtime_pivot = (
        runtime_all
        .pivot_table(
            index=['Example', 'Model'],
            columns='Platform',
            values='Runtime_sec',
            aggfunc='mean',
        )
        .reset_index()
    )

    # Key columns first, then whichever platforms are present, Stata leading as reference
    platform_order = [name for name in ('Stata', 'R', 'Python') if name in runtime_pivot.columns]
    runtime_pivot = runtime_pivot[['Example', 'Model'] + platform_order]

    print("Runtime Comparison (seconds):")
    print("=" * 80)
    display(runtime_pivot)

In [None]:
# Speedup of each port relative to the Stata reference implementation
if 'runtime_pivot' in dir() and 'Stata' in runtime_pivot.columns:
    speedup_df = runtime_pivot.copy()

    for platform in ('R', 'Python'):
        if platform not in speedup_df.columns:
            continue
        # Stata time divided by platform time: above 1 means the platform was faster
        speedup_df[f'{platform}_vs_Stata'] = speedup_df['Stata'].div(speedup_df[platform])

    print("\nSpeedup relative to Stata (>1 means faster than Stata):")
    print("=" * 80)
    display(speedup_df)

In [None]:
# Per-platform descriptive statistics of the runtimes, rounded to 4 decimals
if 'runtime_all' in dir():
    # Aggregation name -> column label shown in the table (order is preserved)
    stat_labels = {
        'count': 'N_Tests',
        'sum': 'Total_Time',
        'mean': 'Mean_Time',
        'std': 'Std_Time',
        'min': 'Min_Time',
        'max': 'Max_Time',
    }
    summary = (
        runtime_all
        .groupby('Platform')['Runtime_sec']
        .agg(list(stat_labels.keys()))
        .round(4)
    )
    summary.columns = list(stat_labels.values())

    print("\nRuntime Summary by Platform:")
    print("=" * 80)
    display(summary)

## 3. Visualize Runtime Performance

In [None]:
# Bar plot of runtime by model and platform (Wagepan tests only, for a clearer figure)
if 'runtime_all' in dir():
    wagepan_runtime = runtime_all[runtime_all['Example'] == 'Wagepan'].copy()

    # Create the figure only when there is something to draw, so an empty
    # Wagepan subset does not leave a blank figure behind.
    if len(wagepan_runtime) > 0:
        # pivot_table averages repeated (Model, Platform) rows, exactly like the
        # runtime comparison table; DataFrame.pivot would raise ValueError on them.
        pivot_plot = wagepan_runtime.pivot_table(
            index='Model',
            columns='Platform',
            values='Runtime_sec',
            aggfunc='mean',
        )

        fig, ax = plt.subplots(figsize=(14, 8))
        pivot_plot.plot(kind='bar', ax=ax, width=0.8)

        ax.set_title('Runtime Comparison: Wagepan Tests', fontsize=14, fontweight='bold')
        ax.set_xlabel('Model Specification', fontsize=12)
        ax.set_ylabel('Runtime (seconds)', fontsize=12)
        ax.legend(title='Platform', loc='upper right')
        ax.tick_params(axis='x', rotation=45)

        # Build the output path once so the saved file and the message cannot drift apart
        wagepan_plot_path = RESULTS_PATH / 'runtime_comparison_wagepan.png'
        plt.tight_layout()
        plt.savefig(wagepan_plot_path, dpi=150, bbox_inches='tight')
        plt.show()
        print(f"\nPlot saved to: {wagepan_plot_path}")

In [None]:
# Horizontal bars of the summed runtime per platform, shortest total first
if 'runtime_all' in dir():
    fig, ax = plt.subplots(figsize=(8, 6))

    total_by_platform = (
        runtime_all
        .groupby('Platform')['Runtime_sec']
        .sum()
        .sort_values()
    )
    # Colors follow the sorted order: green for the smallest total, then blue, then red
    palette = ['#2ecc71', '#3498db', '#e74c3c']
    colors = palette[:len(total_by_platform)]

    bars = ax.barh(total_by_platform.index, total_by_platform.values, color=colors)

    # Write each total just right of its bar, vertically centered on the bar
    for rect, seconds in zip(bars, total_by_platform.values):
        label_y = rect.get_y() + rect.get_height()/2
        ax.text(seconds + 0.5, label_y, f'{seconds:.1f}s', va='center', fontsize=11)

    ax.set_title('Total Runtime by Platform', fontsize=14, fontweight='bold')
    ax.set_xlabel('Total Runtime (seconds)', fontsize=12)

    plt.tight_layout()
    plt.savefig(RESULTS_PATH / 'total_runtime_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()

## 4. Load and Compare Coefficients

In [None]:
# Read each platform's coefficient table and tag its rows with the platform name
coef_files = {
    'Stata': 'coefficients_Stata.csv',
    'R': 'coefficients_R.csv',
    'Python': 'coefficients_Python.csv'
}

coef_dfs = {}
for platform, filename in coef_files.items():
    filepath = RESULTS_PATH / filename
    if not filepath.exists():
        print(f"Warning: {filepath} not found")
        continue
    # assign() sets (or overwrites) the Platform column with the dictionary key
    coef_dfs[platform] = pd.read_csv(filepath).assign(Platform=platform)
    print(f"Loaded {platform}: {len(coef_dfs[platform])} coefficients")

# One long table over all platforms; left undefined when no file was found
if coef_dfs:
    coef_all = pd.concat(coef_dfs.values(), ignore_index=True)
    print(f"\nTotal coefficients loaded: {len(coef_all)}")

In [None]:
# Wide estimate table: one row per (Example, Model, Type, Index), one column per platform
if 'coef_all' in dir():
    coef_keys = ['Example', 'Model', 'Type', 'Index']
    coef_wide = coef_all.pivot_table(
        index=coef_keys,
        columns='Platform',
        values='Estimate',
        aggfunc='mean',
    )
    coef_pivot = coef_wide.reset_index()

    print("\nCoefficient Comparison (Estimates):")
    print("=" * 80)
    # Only the first 30 rows are shown
    display(coef_pivot.head(30))

In [None]:
# Differences of the R / Python estimates relative to the Stata reference
if 'coef_pivot' in dir():
    diff_df = coef_pivot.copy()

    if 'Stata' in diff_df.columns:
        # A percentage difference is undefined for a zero Stata estimate. Mask the
        # zero denominators to NaN so the result is NaN instead of +/-inf.
        stata_scale = diff_df['Stata'].abs().replace(0, np.nan)

        for platform in ['R', 'Python']:
            if platform in diff_df.columns:
                raw_diff = diff_df[platform] - diff_df['Stata']
                diff_df[f'{platform}_diff'] = raw_diff
                diff_df[f'{platform}_pct_diff'] = raw_diff / stata_scale * 100

    print("\nDifferences relative to Stata:")
    print("=" * 80)

    # Level (non-percentage) difference columns only
    diff_cols = [c for c in diff_df.columns if '_diff' in c and '_pct_diff' not in c]
    if diff_cols:
        print("\nAbsolute differences summary:")
        for col in diff_cols:
            # Mean and Std use the signed differences; Max is the largest |difference|
            print(f"\n{col}:")
            print(f"  Mean: {diff_df[col].mean():.6f}")
            print(f"  Max:  {diff_df[col].abs().max():.6f}")
            print(f"  Std:  {diff_df[col].std():.6f}")
    else:
        # Without a Stata column (or any port) there is nothing to compare
        print("No platform differences could be computed.")

## 5. Estimate Comparison Visualization

In [None]:
# Scatter plots: R vs Stata and Python vs Stata estimates
def _plot_estimates_vs_stata(ax, estimates, platform, color=None):
    """Scatter one platform's estimates against the Stata estimates on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Panel to draw on.
    estimates : pandas.DataFrame
        Wide table with a 'Stata' column and, when available, a ``platform`` column.
    platform : str
        Column compared with 'Stata'; also used in the axis label and title.
    color : optional
        Marker color forwarded to ``Axes.scatter``; ``None`` keeps matplotlib's default.

    Rows missing either estimate are dropped. When ``platform`` is not a column,
    a placeholder message is drawn instead of a scatter.
    """
    if platform not in estimates.columns:
        ax.text(0.5, 0.5, f'{platform} data not available',
                ha='center', va='center', transform=ax.transAxes)
        return

    valid_data = estimates.dropna(subset=['Stata', platform])
    ax.scatter(valid_data['Stata'], valid_data[platform],
               alpha=0.6, edgecolors='k', linewidth=0.5, color=color)

    # 45-degree line over a common range so both axes share the same scale
    lims = [min(ax.get_xlim()[0], ax.get_ylim()[0]), max(ax.get_xlim()[1], ax.get_ylim()[1])]
    ax.plot(lims, lims, 'r--', alpha=0.75, zorder=0, label='Perfect agreement')
    ax.set_xlim(lims)
    ax.set_ylim(lims)

    ax.set_xlabel('Stata Estimates', fontsize=12)
    ax.set_ylabel(f'{platform} Estimates', fontsize=12)
    ax.set_title(f'{platform} vs Stata Estimates', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)


# Draw when the Stata reference and at least one port are available. A missing
# port gets a placeholder panel instead of suppressing the whole figure.
if ('coef_pivot' in dir() and 'Stata' in coef_pivot.columns
        and ('R' in coef_pivot.columns or 'Python' in coef_pivot.columns)):
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    _plot_estimates_vs_stata(axes[0], coef_pivot, 'R')
    _plot_estimates_vs_stata(axes[1], coef_pivot, 'Python', color='green')

    plt.tight_layout()
    plt.savefig(RESULTS_PATH / 'estimate_comparison_scatter.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Plot the 'Effect' estimates of one model/example, one error-bar series per platform
if 'coef_all' in dir():
    model_to_compare = 'Placebos'
    example_to_compare = 'Wagepan'

    # Boolean mask selecting the effect rows of the chosen model and example
    is_selected = (
        (coef_all['Model'] == model_to_compare)
        & (coef_all['Example'] == example_to_compare)
        & (coef_all['Type'] == 'Effect')
    )
    subset = coef_all[is_selected].copy()

    if len(subset) > 0:
        fig, ax = plt.subplots(figsize=(10, 6))

        for platform in ['Stata', 'R', 'Python']:
            pdata = subset[subset['Platform'] == platform].sort_values('Index')
            if len(pdata) == 0:
                continue
            # Error bars extend 1.96 * SE on each side of the estimate
            half_width = pdata['SE']*1.96
            ax.errorbar(
                pdata['Index'], pdata['Estimate'], yerr=half_width,
                label=platform, marker='o', capsize=3, capthick=1,
            )

        ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
        ax.set_xlabel('Effect Period', fontsize=12)
        ax.set_ylabel('Estimate', fontsize=12)
        ax.set_title(
            f'Effect Estimates Comparison: {example_to_compare} - {model_to_compare}',
            fontsize=14,
            fontweight='bold',
        )
        ax.legend(title='Platform')
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(RESULTS_PATH / 'effect_estimates_comparison.png', dpi=150, bbox_inches='tight')
        plt.show()

## 6. Summary Report

In [None]:
# Plain-text report: runtime, coefficient agreement and test coverage
report_rule = "=" * 80
section_rule = "-" * 40

print(report_rule)
print("SUMMARY REPORT: did_multiplegt_dyn Cross-Language Comparison")
print(report_rule)

# Section 1: number of tests, total and mean runtime for each platform with data
if 'runtime_all' in dir():
    print("\n1. RUNTIME PERFORMANCE")
    print(section_rule)

    for platform in ('Stata', 'R', 'Python'):
        seconds = runtime_all.loc[runtime_all['Platform'] == platform, 'Runtime_sec']
        if len(seconds) == 0:
            continue
        print(f"   {platform:10s}: {len(seconds):2d} tests | Total: {seconds.sum():7.2f}s | Mean: {seconds.mean():6.3f}s")

# Section 2: agreement of each port with Stata over rows where both have an estimate
if 'coef_pivot' in dir():
    print("\n2. COEFFICIENT AGREEMENT")
    print(section_rule)

    if 'Stata' in coef_pivot.columns:
        for platform in ('R', 'Python'):
            if platform not in coef_pivot.columns:
                continue
            valid = coef_pivot.dropna(subset=['Stata', platform])
            if len(valid) == 0:
                continue

            abs_gap = (valid[platform] - valid['Stata']).abs()
            # Off-diagonal entry of the 2x2 correlation matrix
            corr = valid[['Stata', platform]].corr().iloc[0, 1]

            print(f"   {platform} vs Stata:")
            print(f"      Correlation:  {corr:.6f}")
            print(f"      Mean |diff|:  {abs_gap.mean():.6f}")
            print(f"      Max |diff|:   {abs_gap.max():.6f}")
            print(f"      N compared:   {len(valid)}")

# Section 3: distinct model specifications per platform and overall
print("\n3. TEST COVERAGE")
print(section_rule)

all_models = set()
if 'runtime_all' in dir():
    for platform in ('Stata', 'R', 'Python'):
        models = set(runtime_all.loc[runtime_all['Platform'] == platform, 'Model'].unique())
        all_models |= models
        print(f"   {platform:10s}: {len(models)} unique model specifications")

print(f"\n   Total unique specifications: {len(all_models)}")

print("\n" + report_rule)
print("END OF REPORT")
print(report_rule)

In [None]:
# Write the combined tables next to the inputs; each export is skipped when its
# table was never created.
if 'runtime_all' in dir():
    runtime_csv = RESULTS_PATH / 'runtime_all_platforms.csv'
    runtime_all.to_csv(runtime_csv, index=False)
    print(f"Combined runtime saved to: {runtime_csv}")

if 'coef_all' in dir():
    coef_long_csv = RESULTS_PATH / 'coefficients_all_platforms.csv'
    coef_all.to_csv(coef_long_csv, index=False)
    print(f"Combined coefficients saved to: {coef_long_csv}")

if 'coef_pivot' in dir():
    coef_wide_csv = RESULTS_PATH / 'coefficients_comparison_wide.csv'
    coef_pivot.to_csv(coef_wide_csv, index=False)
    print(f"Coefficient comparison saved to: {coef_wide_csv}")

## 7. Detailed Model-by-Model Comparison

In [None]:
# Per-model detail for the Wagepan example: runtimes, then effect estimates by platform
if 'coef_all' in dir() and 'runtime_all' in dir():
    # Filter both tables once; these selections do not depend on the model
    wagepan_rt = runtime_all[runtime_all['Example'] == 'Wagepan']
    wagepan_effects = coef_all[
        (coef_all['Example'] == 'Wagepan') &
        (coef_all['Type'] == 'Effect')
    ]

    for model in wagepan_rt['Model'].unique():
        print(f"\n{'=' * 60}")
        print(f"Model: {model}")
        print('=' * 60)

        # Runtime comparison: one line per runtime row of this model
        rt = wagepan_rt[wagepan_rt['Model'] == model]
        print("\nRuntime (seconds):")
        for platform, seconds in zip(rt['Platform'], rt['Runtime_sec']):
            print(f"  {platform:10s}: {seconds:.4f}")

        # Effect estimates: rows are effect indices, columns are platforms
        effects = wagepan_effects[wagepan_effects['Model'] == model]
        if len(effects) > 0:
            print("\nEffect Estimates:")
            eff_pivot = effects.pivot_table(
                index='Index',
                columns='Platform',
                values='Estimate',
                aggfunc='mean',
            )
            display(eff_pivot.round(6))

## 8. Conclusions

Use the analysis above to assess three things:

1. **Runtime Performance**: Compare which implementation is fastest for different test configurations.

2. **Numerical Agreement**: Verify that all three implementations produce consistent estimates.

3. **Feature Coverage**: Identify any features not yet implemented in the R or Python versions.

### Notes:
- Bootstrap option is only available in Stata
- Some minor numerical differences may exist due to floating-point precision
- Larger differences should be investigated as potential implementation discrepancies