In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
from pandas.errors import PerformanceWarning
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import joblib
from sklearn.model_selection import KFold
# Add imports for advanced statistics
from scipy.stats import entropy, hmean, gmean
from scipy.signal import find_peaks

In [3]:
import sys
import os
# Make the project's `src` package importable before importing from it.
# NOTE: '../' is a relative entry, so it is resolved against the kernel's
# current working directory — this assumes the notebook is started from a
# folder exactly one level below the project root.
sys.path.append('../')  # Add project root to Python path
from src.pipeline import PipelineRunner

  from .autonotebook import tqdm as notebook_tqdm


# 1. Feature specifications

1. Only basic features

2. Add logs of numerical features

3. Add group level aggregations


In [None]:
# Load the combined dataset into memory.
# NOTE(review): the PipelineRunner cells that follow are handed the pickle path
# rather than this DataFrame, so this load looks like it is for inspection only.
df_combined = pd.read_pickle('../data/processed/df_combined.pkl')

In [None]:
# 1. Basic features
# Baseline run: log features and group-level aggregations are both switched off.
# NOTE(review): return_shap / return_ci presumably ask the pipeline for SHAP
# values and confidence intervals — confirm in src.pipeline.
runner = PipelineRunner(
    data_dir='../data',
    models_dir='../models',
    run_name='basic',
    df_combined_path='../data/processed/df_combined.pkl',
    variables_type_path='../data/variables/table_variables_type.xlsx',
    add_logs=False,
    add_group_aggregations=False,
    return_shap=True,
    return_ci=True,
)

# The pipeline's return value is deliberately discarded.
_ = runner.run_full_pipeline()

In [None]:
# 2. Add logs features
# Log features are switched on (add_logs=True); group-level aggregations stay off.
runner = PipelineRunner(
    data_dir='../data',
    models_dir='../models',
    run_name='logs',
    df_combined_path='../data/processed/df_combined.pkl',
    variables_type_path='../data/variables/table_variables_type.xlsx',
    add_logs=True,
    add_group_aggregations=False,
)

# The pipeline's return value is deliberately discarded.
_ = runner.run_full_pipeline()

In [None]:
# 3. Add group level aggregations
# Both optional feature families are enabled: logs and group-level aggregations.
runner = PipelineRunner(
    data_dir='../data',
    models_dir='../models',
    run_name='group_agg',
    df_combined_path='../data/processed/df_combined.pkl',
    variables_type_path='../data/variables/table_variables_type.xlsx',
    add_logs=True,
    add_group_aggregations=True,
)

# The pipeline's return value is deliberately discarded.
_ = runner.run_full_pipeline()

# 2. Different Cross-validation

1. Random split

2. Leave-one-country-out

3. Leave-one-country-out with partial holdout

4. Only one country (change dataset) with random split

In [None]:
# Countries to hold out, and the split modes to try for each of them
list_countries = ['mexico', 'us', 'uk', 'india']
holdout_modes = ['partial_holdout', 'full_holdout']

# One pipeline run per (country, holdout mode) pair; countries vary slowest,
# so both modes of a country run back to back.
for country, holdout_mode in [(c, m) for c in list_countries for m in holdout_modes]:
    run_name = f'{country}_{holdout_mode}'
    print(f"\n=== Running: {run_name} ===")

    # The run name doubles as the identifier passed to the runner.
    runner = PipelineRunner(data_dir='../data', models_dir='../models', run_name=run_name,
                            df_combined_path='../data/processed/df_combined.pkl',
                            variables_type_path='../data/variables/table_variables_type.xlsx',
                            country_holdout=country,
                            holdout_split_mode=holdout_mode)

    predictions = runner.run_full_pipeline()
    print(f"✅ Completed: {run_name}")

print("\n🎉 All pipelines completed!")

In [None]:
# Split the combined dataset into one pickle per distinct value of 'country'
df_combined = pd.read_pickle('../data/processed/df_combined.pkl')

for country in df_combined['country'].unique():
    # Boolean row mask selecting only this country's rows
    country_mask = df_combined['country'] == country
    df_country = df_combined.loc[country_mask]

    # The country value is embedded in the output file name
    df_country.to_pickle(f'../data/processed/df_combined_{country}.pkl')
    print(f'Saved {country} dataset shape: {df_country.shape}')

In [None]:
# Train one pipeline per country, each on that country's own dataset file
for country in ['mexico', 'us', 'uk', 'india']:
    print(f' === Running {country} pipeline ===')

    runner = PipelineRunner(
        data_dir='../data',
        models_dir='../models',
        run_name=f'{country}',
        df_combined_path=f'../data/processed/df_combined_{country}.pkl',
        variables_type_path='../data/variables/table_variables_type.xlsx',
    )

    # The pipeline's return value is deliberately discarded.
    _ = runner.run_full_pipeline()

# 3. Robustness checks

1. Remove a random 10%, 30%, 50%, 70%, or 90% of features
2. Remove a random 10%, 30%, 50%, 70%, or 90% of observations

In [None]:
# Reload the full combined dataset; the robustness cells below derive their
# feature-reduced and observation-reduced datasets from this frame.
df_combined = pd.read_pickle('../data/processed/df_combined.pkl')
# Columns that are never dropped by the feature-removal and wave-removal cells.
# Every entry needs its own comma: Python concatenates adjacent string
# literals, so a missing comma silently fuses two names into one non-existent
# column and leaves both real columns unprotected.
keep_cols = ['country', 'fgcp', 'age_group',
             'r1agey', 'r2agey', 'r3agey', 'r4agey', 'r5agey']

In [None]:
# Every column outside keep_cols is a candidate for removal
removable_features = [col for col in df_combined.columns if col not in keep_cols]
print(f"Total features: {len(df_combined.columns)}")
print(f"Removable features: {len(removable_features)}")

# Share of removable columns dropped in each experiment
removal_percentages = [10, 30, 50, 70, 90]

# Write one reduced copy of the dataset per removal percentage
for pct in removal_percentages:
    print(f"\nCreating dataset with {pct}% features removed...")

    filename = f'../data/processed/df_combined_remove_{pct}pct_features.pkl'

    # Truncate (not round) the requested share to a whole number of columns
    n_features_to_remove = int(len(removable_features) * pct / 100)

    # Re-seed on every pass so each percentage draws from the same random state
    np.random.seed(42)
    features_to_remove = np.random.choice(
        removable_features,
        size=n_features_to_remove,
        replace=False,
    )

    # Drop the sampled columns and persist the result
    df_reduced = df_combined.drop(columns=features_to_remove)
    df_reduced.to_pickle(filename)

    print(f"  Removed {n_features_to_remove} features")
    print(f"  New dataset shape: {df_reduced.shape}")
    print(f"  Saved as: df_combined_remove_{pct}pct_features.pkl")

print(f"\n✅ All feature-reduced datasets created!")

In [None]:
print(f"Original dataset shape: {df_combined.shape}")

# Share of rows dropped in each experiment
removal_percentages = [10, 30, 50, 70, 90]

# Write one row-subsampled copy of the dataset per removal percentage
for pct in removal_percentages:
    print(f"\nCreating dataset with {pct}% observations removed...")

    filename = f'../data/processed/df_combined_remove_{pct}pct_obs.pkl'

    # Rows that remain after removing pct% (truncated to a whole number)
    n_obs_to_keep = int(len(df_combined) * (100 - pct) / 100)

    # Re-seed on every pass so each percentage draws from the same random state
    np.random.seed(42)
    keep_indices = np.random.choice(
        df_combined.index,
        size=n_obs_to_keep,
        replace=False,
    )

    # Select the sampled index labels, in sampled order, as an independent copy
    df_reduced = df_combined.loc[keep_indices].copy()
    df_reduced.to_pickle(filename)

    print(f"  Kept {n_obs_to_keep} observations (removed {len(df_combined) - n_obs_to_keep})")
    print(f"  New dataset shape: {df_reduced.shape}")
    print(f"  Saved as: df_combined_remove_{pct}pct_obs.pkl")

print(f"\n✅ All observation-reduced datasets created!")

In [None]:
print("=== Running pipelines for feature-reduced datasets ===")

# Must list the same percentages that were used when the reduced files were written
removal_percentages = [10, 30, 50, 70, 90]

for pct in removal_percentages:
    # Run name and input file are both keyed by the removal percentage
    run_name = f'reduced_features_{pct}pct'
    dataset_path = f'../data/processed/df_combined_remove_{pct}pct_features.pkl'

    print(f"\n--- Running: {run_name} ---")

    runner = PipelineRunner(data_dir='../data', models_dir='../models', run_name=run_name,
                            df_combined_path=dataset_path,
                            variables_type_path='../data/variables/table_variables_type.xlsx')

    predictions = runner.run_full_pipeline()
    print(f"✅ Completed: {run_name}")

# Visual separator between this batch and the next one
print("\n" + "=" * 60)

In [None]:
print("=== Running pipelines for observation-reduced datasets ===")

# NOTE: removal_percentages is not defined in this cell; it is taken from
# whichever earlier cell assigned it last.
for pct in removal_percentages:
    # Run name and input file are both keyed by the removal percentage
    run_name = f'reduced_obs_{pct}pct'
    dataset_path = f'../data/processed/df_combined_remove_{pct}pct_obs.pkl'

    print(f"\n--- Running: {run_name} ---")

    runner = PipelineRunner(data_dir='../data', models_dir='../models', run_name=run_name,
                            df_combined_path=dataset_path,
                            variables_type_path='../data/variables/table_variables_type.xlsx')

    predictions = runner.run_full_pipeline()
    print(f"✅ Completed: {run_name}")

print("\n🎉 All robustness check pipelines completed!")

# 4. Remove waves

1. Dataset without India as baseline
2. Dataset without India and without wave 5
3. Dataset without India and without waves 4 and 5
4. Dataset without India and without waves 3, 4 and 5

In [None]:
# Build the baseline for the wave experiments: every row whose country is not 'india'
df_combined = pd.read_pickle('../data/processed/df_combined.pkl')

not_india_mask = df_combined['country'] != 'india'
df_combined_no_india = df_combined.loc[not_india_mask]

# Persist the filtered frame so the pipeline can read it from disk
df_combined_no_india.to_pickle('../data/processed/df_combined_no_india.pkl')

In [None]:
# Baseline run on the dataset with India's rows filtered out
runner = PipelineRunner(data_dir='../data', models_dir='../models', run_name='no_india',
                        df_combined_path='../data/processed/df_combined_no_india.pkl',
                        variables_type_path='../data/variables/table_variables_type.xlsx')

predictions = runner.run_full_pipeline()

In [None]:
# Column-name prefixes to strip for each dataset variant. The variants are
# cumulative: each one removes the prefixes of the previous variant plus one
# more pair.
wave_patterns = {
    'no_wave5': ['r5', 'h5'],
    'no_wave45': ['r5', 'h5', 'r4', 'h4'],
    'no_wave345': ['r5', 'h5', 'r4', 'h4', 'r3', 'h3'],
}

# Write one reduced copy of the no-India dataset per variant
for name, patterns in wave_patterns.items():
    # str.startswith accepts a tuple and matches if any prefix matches
    prefixes = tuple(patterns)

    # Columns carrying one of the prefixes, except those protected by keep_cols
    cols_to_remove = [
        col
        for col in df_combined.columns
        if col.startswith(prefixes) and col not in keep_cols
    ]

    # Drop from the no-India frame and persist under the variant's name
    filename = f'../data/processed/df_combined_{name}.pkl'
    df_reduced = df_combined_no_india.drop(columns=cols_to_remove)
    df_reduced.to_pickle(filename)

    print(f"{name}: {df_reduced.shape} (removed {len(cols_to_remove)} columns)")

print("✅ All wave-reduced datasets created!")

In [None]:
# One pipeline run per wave-reduced dataset; the dataset name is also the run name
wave_datasets = ['no_wave5', 'no_wave45', 'no_wave345']

for dataset_name in wave_datasets:
    dataset_path = f'../data/processed/df_combined_{dataset_name}.pkl'
    run_name = dataset_name

    print(f"\n=== Running: {run_name} ===")

    runner = PipelineRunner(data_dir='../data', models_dir='../models', run_name=run_name,
                            df_combined_path=dataset_path,
                            variables_type_path='../data/variables/table_variables_type.xlsx')

    predictions = runner.run_full_pipeline()
    print(f"✅ Completed: {run_name}")

print("\n🎉 All wave-reduced pipelines completed!")

# 5. Bias correction

In [None]:
# One run per column handed to the runner as fair_col; the column name is
# embedded in the run name.
# NOTE(review): fair_col presumably selects the attribute used for bias
# correction — confirm in src.pipeline.
for col in ['country', 'ragender_2_0', 'raeducl']:
    runner = PipelineRunner(
        data_dir='../data',
        models_dir='../models',
        run_name=f'fair_{col}',
        df_combined_path='../data/processed/df_combined.pkl',
        variables_type_path='../data/variables/table_variables_type.xlsx',
        fair_col=col,
    )

    # The pipeline's return value is deliberately discarded.
    _ = runner.run_full_pipeline()