<a href="https://colab.research.google.com/github/avionerman/machine_learning_2025/blob/main/competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [158]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import zipfile
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from scipy.optimize import minimize

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print("All libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

All libraries imported successfully!
TensorFlow version: 2.19.0
GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Class Config

In [159]:

class Config:
    """Central configuration for all parameters."""

    # File paths (UPDATE THESE TO YOUR PATHS)
    TRAIN_FEATURES_PATH = '/content/train_hh_features.csv'
    TRAIN_LABELS_PATH = '/content/train_hh_gt.csv'
    TEST_FEATURES_PATH = '/content/test_hh_features.csv'

    # Target column
    TARGET_COL = 'cons_ppp17'

    # ID columns
    ID_COLS = ['survey_id', 'hhid']

    # Random state for reproducibility
    RANDOM_STATE = 42

    # Validation split
    VAL_SIZE = 0.2

    # Poverty thresholds for competition metric
    POVERTY_THRESHOLDS = [
        3.17, 3.94, 4.60, 5.26, 5.88, 6.47, 7.06, 7.70,
        8.40, 9.13, 9.87, 10.70, 11.62, 12.69, 14.03,
        15.64, 17.76, 20.99, 27.37
    ]

    # Percentile ranks for weighting
    PERCENTILE_RANKS = [
        0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40,
        0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80,
        0.85, 0.90, 0.95
    ]

    # Test survey IDs
    TEST_SURVEY_IDS = [400000, 500000, 600000]

    # Binary mappings (COMPLETE)
    BINARY_MAPPINGS = {
        'Yes': 1, 'No': 0,
        'Male': 1, 'Female': 0,
        'Access': 1, 'No access': 0,
        'Urban': 1, 'Rural': 0,
        'Owner': 1, 'Renter': 0, 'Not owner': 0,
        'Employed': 1, 'Not employed': 0,
        1: 1, 0: 0,
        1.0: 1, 0.0: 0
    }



# Class for Data process

In [160]:

class DataProcessor:
    """Handles all data loading, cleaning, and preprocessing."""

    def __init__(self, config):
        self.config = config
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.feature_cols = None

    def load_data(self):
        """Load training and test data."""
        print("Loading data...")

        train_features = pd.read_csv(self.config.TRAIN_FEATURES_PATH)
        train_labels = pd.read_csv(self.config.TRAIN_LABELS_PATH)
        test_features = pd.read_csv(self.config.TEST_FEATURES_PATH)

        train = train_features.merge(train_labels, on=self.config.ID_COLS)

        print(f"Training data shape: {train.shape}")
        print(f"Test data shape: {test_features.shape}")

        return train, test_features

    def handle_missing_values(self, train, test):
        """Fill missing values in both datasets."""
        print("Handling missing values...")

        train['sector1d'] = train['sector1d'].fillna('Not employed')
        test['sector1d'] = test['sector1d'].fillna('Not employed')

        mode_dweltyp = train['dweltyp'].mode()[0]
        train['dweltyp'] = train['dweltyp'].fillna(mode_dweltyp)
        test['dweltyp'] = test['dweltyp'].fillna(mode_dweltyp)

        median_utl = train['utl_exp_ppp17'].median()
        train['utl_exp_ppp17'] = train['utl_exp_ppp17'].fillna(median_utl)
        test['utl_exp_ppp17'] = test['utl_exp_ppp17'].fillna(median_utl)

        cols_to_fill = ['employed', 'share_secondary', 'educ_max']
        for col in cols_to_fill:
            if col in train.columns and train[col].isnull().sum() > 0:
                mode_val = train[col].mode()[0]
                train[col] = train[col].fillna(mode_val)
                test[col] = test[col].fillna(mode_val)

        consumed_cols = [col for col in train.columns if col.startswith('consumed')]
        for col in consumed_cols:
            if train[col].isnull().sum() > 0:
                mode_val = train[col].mode()[0]
                train[col] = train[col].fillna(mode_val)
                test[col] = test[col].fillna(mode_val)

        print(f"Missing values after filling: {train.isnull().sum().sum()}")
        return train, test

    def encode_binary_columns(self, train, test):
        """Encode binary columns with robust handling."""
        print("Encoding binary columns...")

        consumed_cols = [col for col in train.columns if col.startswith('consumed')]
        binary_cols = ['male', 'owner', 'water', 'toilet', 'sewer', 'elect',
                       'employed', 'any_nonagric', 'urban'] + consumed_cols

        for col in binary_cols:
            if col in train.columns:
                unique_train = train[col].unique()
                unique_test = test[col].unique()

                if train[col].dtype in ['int64', 'float64', 'int32', 'float32']:
                    train[col] = train[col].astype(int)
                    test[col] = test[col].astype(int)
                else:
                    train[col] = train[col].map(self.config.BINARY_MAPPINGS)
                    test[col] = test[col].map(self.config.BINARY_MAPPINGS)

                    if train[col].isnull().any():
                        print(f"  Warning: Unmapped values in {col} (train). Unique before: {unique_train}")
                        train[col] = train[col].fillna(0)
                    if test[col].isnull().any():
                        print(f"  Warning: Unmapped values in {col} (test). Unique before: {unique_test}")
                        test[col] = test[col].fillna(0)

                    train[col] = train[col].astype(int)
                    test[col] = test[col].astype(int)

        print(f"Binary columns encoded: {len(binary_cols)}")
        return train, test, binary_cols

    def encode_multiclass_columns(self, train, test):
        """Encode multiclass columns using LabelEncoder."""
        print("Encoding multiclass columns...")

        multiclass_cols = ['water_source', 'sanitation_source', 'dweltyp', 'educ_max', 'sector1d']

        for col in multiclass_cols:
            if col in train.columns:
                le = LabelEncoder()
                train[col] = train[col].astype(str)
                test[col] = test[col].astype(str)

                combined = pd.concat([train[col], test[col]], axis=0)
                le.fit(combined)
                train[col] = le.transform(train[col])
                test[col] = le.transform(test[col])
                self.label_encoders[col] = le

        print(f"Multiclass columns encoded: {len(multiclass_cols)}")
        return train, test, multiclass_cols

    def define_feature_columns(self, train, binary_cols, multiclass_cols):
        """Define all feature columns."""
        print("Defining feature columns...")

        numerical_cols = [
            'weight', 'utl_exp_ppp17', 'hsize', 'num_children5', 'num_children10',
            'num_children18', 'age', 'num_adult_female', 'num_adult_male',
            'num_elderly', 'sworkershh', 'share_secondary', 'sfworkershh'
        ]

        region_cols = ['region1', 'region2', 'region3', 'region4', 'region5', 'region6', 'region7']

        feature_cols = binary_cols + multiclass_cols + numerical_cols + region_cols
        feature_cols = [col for col in feature_cols if col in train.columns and col != self.config.TARGET_COL]
        feature_cols = list(dict.fromkeys(feature_cols))

        self.feature_cols = feature_cols
        print(f"Total feature columns: {len(feature_cols)}")

        return feature_cols

    def prepare_data(self, train, test, feature_cols):
        """Prepare feature matrices and target."""
        print("Preparing feature matrices...")

        X = train[feature_cols].copy()
        y = train[self.config.TARGET_COL].copy()
        X_test = test[feature_cols].copy()

        sample_weights = train['weight'].values
        survey_ids = train['survey_id'].values

        y_log = np.log1p(y)

        print(f"X shape: {X.shape}")
        print(f"y shape: {y.shape}")
        print(f"X_test shape: {X_test.shape}")

        return X, y_log, X_test, sample_weights, survey_ids

    def create_splits(self, X, y_log, sample_weights, survey_ids):
        """Create train/validation splits with survey IDs."""
        print("Creating train/validation splits...")

        x_train, x_val, y_train, y_val, w_train, w_val, sid_train, sid_val = train_test_split(
            X, y_log, sample_weights, survey_ids,
            test_size=self.config.VAL_SIZE,
            random_state=self.config.RANDOM_STATE
        )

        print(f"Training set: {x_train.shape[0]} samples")
        print(f"Validation set: {x_val.shape[0]} samples")
        print(f"Surveys in validation: {np.unique(sid_val)}")

        return x_train, x_val, y_train, y_val, w_train, w_val, sid_train, sid_val

    def scale_features(self, x_train, x_val, X, X_test):
        """Scale features for neural network."""
        print("Scaling features...")

        x_train_scaled = self.scaler.fit_transform(x_train)
        x_val_scaled = self.scaler.transform(x_val)
        X_scaled = self.scaler.fit_transform(X)
        X_test_scaled = self.scaler.transform(X_test)

        return x_train_scaled, x_val_scaled, X_scaled, X_test_scaled

    def process_all(self):
        """Run the complete data processing pipeline."""
        print("=" * 60)
        print("STARTING DATA PROCESSING PIPELINE")
        print("=" * 60)

        train, test = self.load_data()
        train, test = self.handle_missing_values(train, test)
        train, test, binary_cols = self.encode_binary_columns(train, test)
        train, test, multiclass_cols = self.encode_multiclass_columns(train, test)
        feature_cols = self.define_feature_columns(train, binary_cols, multiclass_cols)
        X, y_log, X_test, sample_weights, survey_ids = self.prepare_data(train, test, feature_cols)
        x_train, x_val, y_train, y_val, w_train, w_val, sid_train, sid_val = self.create_splits(
            X, y_log, sample_weights, survey_ids
        )
        x_train_scaled, x_val_scaled, X_scaled, X_test_scaled = self.scale_features(x_train, x_val, X, X_test)

        print("\n" + "=" * 60)
        print("DATA PROCESSING COMPLETE")
        print("=" * 60)

        return {
            'train': train,
            'test': test,
            'X': X,
            'y_log': y_log,
            'X_test': X_test,
            'sample_weights': sample_weights,
            'survey_ids': survey_ids,
            'x_train': x_train,
            'x_val': x_val,
            'y_train': y_train,
            'y_val': y_val,
            'w_train': w_train,
            'w_val': w_val,
            'sid_train': sid_train,
            'sid_val': sid_val,
            'x_train_scaled': x_train_scaled,
            'x_val_scaled': x_val_scaled,
            'X_scaled': X_scaled,
            'X_test_scaled': X_test_scaled,
            'feature_cols': feature_cols
        }



# Class for Competition's Metric calc.

In [161]:

class CompetitionMetric:
    """
    Calculates the competition metric exactly as defined:
    metric = (1/S) × Σs [ (90/Σw) × Σt(wt × |rt - r̂t|/rt) + (10/H) × Σh(|ch - ĉh|/ch) ]

    Key: Calculate per survey, then average across surveys.
    """

    def __init__(self, config):
        self.config = config
        self.threshold_weights = [1 - abs(0.4 - p) for p in config.PERCENTILE_RANKS]
        self.sum_weights = sum(self.threshold_weights)

    def calculate(self, y_pred_log, y_true_log, survey_ids=None, return_details=False):
        """
        Calculate competition metric for predictions in log scale.
        """
        y_pred = np.expm1(y_pred_log)
        y_true = np.expm1(y_true_log)

        return self.calculate_from_original_scale(y_pred, y_true, survey_ids, return_details)

    def calculate_from_original_scale(self, y_pred, y_true, survey_ids=None, return_details=False):
        """
        Calculate competition metric when predictions are already in original scale.
        """
        if survey_ids is None:
            survey_ids = np.zeros(len(y_true), dtype=int)

        unique_surveys = np.unique(survey_ids)

        survey_scores = []
        survey_poverty_mapes = []
        survey_consumption_mapes = []

        for survey_id in unique_surveys:
            mask = survey_ids == survey_id
            y_true_survey = y_true[mask]
            y_pred_survey = y_pred[mask]

            # Calculate weighted poverty rate MAPE for this survey
            poverty_errors = []
            for threshold, tw in zip(self.config.POVERTY_THRESHOLDS, self.threshold_weights):
                actual_rate = (y_true_survey < threshold).mean()
                pred_rate = (y_pred_survey < threshold).mean()
                if actual_rate > 0:
                    error = tw * abs(actual_rate - pred_rate) / actual_rate
                    poverty_errors.append(error)

            poverty_mape = sum(poverty_errors) / self.sum_weights if poverty_errors else 0

            # Calculate consumption MAPE for this survey
            consumption_mape = np.mean(np.abs(y_true_survey - y_pred_survey) / y_true_survey)

            # Survey score: 90% poverty + 10% consumption
            survey_score = 0.9 * poverty_mape + 0.1 * consumption_mape

            survey_scores.append(survey_score)
            survey_poverty_mapes.append(poverty_mape)
            survey_consumption_mapes.append(consumption_mape)

        final_score = np.mean(survey_scores)
        avg_poverty_mape = np.mean(survey_poverty_mapes)
        avg_consumption_mape = np.mean(survey_consumption_mapes)

        if return_details:
            return final_score, avg_poverty_mape, avg_consumption_mape
        return final_score



# Class for Models training

In [162]:

class ModelTrainer:
    """Trains XGBoost model - our best performer."""

    def __init__(self, config, metric):
        self.config = config
        self.metric = metric
        self.model = None
        self.val_predictions = None
        self.test_predictions = None

    def train_xgboost(self, X, y_log, x_val, y_val, sid_val):
        """Train XGBoost model."""
        print("\n" + "=" * 60)
        print("Training XGBoost...")
        print("=" * 60)

        model = xgb.XGBRegressor(
            n_estimators=300,
            max_depth=8,
            learning_rate=0.05,
            subsample=0.7,
            colsample_bytree=0.8,
            min_child_weight=3,
            random_state=self.config.RANDOM_STATE,
            tree_method='hist'
        )

        start_time = time.time()
        model.fit(X, y_log)
        train_time = time.time() - start_time

        print(f"Training time: {train_time:.2f} seconds")

        # Get validation predictions
        y_pred_val = model.predict(x_val)

        # Calculate score BEFORE calibration
        score, pov_mape, cons_mape = self.metric.calculate(
            y_pred_val, y_val.values, sid_val, return_details=True
        )
        print(f"Validation Score (before calibration): {score*100:.2f}%")
        print(f"  Poverty MAPE:    {pov_mape*100:.2f}%")
        print(f"  Consumption MAPE: {cons_mape*100:.2f}%")

        self.model = model
        self.val_predictions = y_pred_val

        return model, y_pred_val

    def generate_test_predictions(self, X_test):
        """Generate predictions on test set."""
        print("\nGenerating test predictions...")
        self.test_predictions = self.model.predict(X_test)
        print(f"Test predictions generated: {len(self.test_predictions)}")
        return self.test_predictions



# Class Predictions

In [163]:

class PredictionCalibrator:
    """
    Post-processing calibration to directly optimize the competition metric.

    The key insight: Our model minimizes RMSE, but the competition scores on
    poverty rate MAPE. By calibrating predictions with y_cal = scale * y + shift,
    we can directly optimize what the competition measures.

    This is a standard technique used by competition winners.
    """

    def __init__(self, config, metric):
        self.config = config
        self.metric = metric
        self.best_params = None
        self.best_score = None

    def calibrate(self, y_pred_log, y_true_log, survey_ids):
        """
        Find optimal calibration parameters (scale, shift) that minimize
        the competition metric.

        Args:
            y_pred_log: Raw predictions in log scale
            y_true_log: True values in log scale
            survey_ids: Survey IDs for each sample

        Returns:
            Calibrated predictions in log scale
        """
        print("\n" + "=" * 60)
        print("CALIBRATING PREDICTIONS")
        print("=" * 60)

        # Convert to original scale for calibration
        y_pred_orig = np.expm1(y_pred_log)
        y_true_orig = np.expm1(y_true_log)

        # Score before calibration
        score_before = self.metric.calculate_from_original_scale(
            y_pred_orig, y_true_orig, survey_ids
        )
        print(f"\nScore BEFORE calibration: {score_before*100:.2f}%")

        # Define objective function to minimize
        def objective(params):
            scale, shift = params
            y_calibrated = scale * y_pred_orig + shift
            # Ensure no negative values
            y_calibrated = np.maximum(y_calibrated, 0.01)
            return self.metric.calculate_from_original_scale(
                y_calibrated, y_true_orig, survey_ids
            )

        # Try multiple starting points to avoid local minima
        best_result = None
        best_score = float('inf')

        starting_points = [
            [1.0, 0.0],      # No change
            [1.0, 0.5],      # Shift up
            [1.0, -0.5],     # Shift down
            [1.05, 0.0],     # Scale up
            [0.95, 0.0],     # Scale down
            [1.02, 0.2],     # Scale up + shift up
            [0.98, -0.2],    # Scale down + shift down
            [1.1, 0.0],      # Larger scale up
            [0.9, 0.0],      # Larger scale down
        ]

        print("\nSearching for optimal calibration parameters...")

        for start in starting_points:
            result = minimize(
                objective,
                start,
                method='Nelder-Mead',
                options={'maxiter': 1000, 'xatol': 1e-6, 'fatol': 1e-6}
            )

            if result.fun < best_score:
                best_score = result.fun
                best_result = result

        # Also try bounded optimization
        result_bounded = minimize(
            objective,
            [1.0, 0.0],
            method='L-BFGS-B',
            bounds=[(0.8, 1.2), (-2.0, 2.0)],
            options={'maxiter': 1000}
        )

        if result_bounded.fun < best_score:
            best_score = result_bounded.fun
            best_result = result_bounded

        self.best_params = best_result.x
        self.best_score = best_score

        scale, shift = self.best_params

        print(f"\nOptimal calibration parameters:")
        print(f"  Scale: {scale:.4f}")
        print(f"  Shift: {shift:.4f}")
        print(f"\nScore AFTER calibration: {best_score*100:.2f}%")
        print(f"Improvement: {(score_before - best_score)*100:.2f} percentage points")

        # Apply calibration
        y_calibrated_orig = scale * y_pred_orig + shift
        y_calibrated_orig = np.maximum(y_calibrated_orig, 0.01)

        # Convert back to log scale
        y_calibrated_log = np.log1p(y_calibrated_orig)

        # Verify improvement per survey
        print("\nPer-survey improvement:")
        for sid in np.unique(survey_ids):
            mask = survey_ids == sid
            score_before_s = self.metric.calculate_from_original_scale(
                y_pred_orig[mask], y_true_orig[mask], None
            )
            score_after_s = self.metric.calculate_from_original_scale(
                y_calibrated_orig[mask], y_true_orig[mask], None
            )
            print(f"  Survey {sid}: {score_before_s*100:.2f}% → {score_after_s*100:.2f}%")

        return y_calibrated_log

    def apply_calibration(self, y_pred_log):
        """
        Apply the learned calibration parameters to new predictions.

        Args:
            y_pred_log: Raw predictions in log scale

        Returns:
            Calibrated predictions in log scale
        """
        if self.best_params is None:
            raise ValueError("Must call calibrate() first to learn parameters")

        scale, shift = self.best_params

        # Convert to original scale
        y_pred_orig = np.expm1(y_pred_log)

        # Apply calibration
        y_calibrated_orig = scale * y_pred_orig + shift
        y_calibrated_orig = np.maximum(y_calibrated_orig, 0.01)

        # Convert back to log scale
        y_calibrated_log = np.log1p(y_calibrated_orig)

        return y_calibrated_log



# Class Submission

In [164]:

class SubmissionGenerator:
    """Generates submission files for DrivenData competition."""

    def __init__(self, config):
        self.config = config

    def generate(self, test_features, y_test_pred):
        """Generate submission files."""
        print("\n" + "=" * 60)
        print("GENERATING SUBMISSION FILES")
        print("=" * 60)

        # 1. Household consumption predictions
        submission_hh = pd.DataFrame({
            'survey_id': test_features['survey_id'],
            'hhid': test_features['hhid'],
            'cons_ppp17': y_test_pred
        })

        # 2. Poverty rates predictions
        poverty_rates = []
        for survey_id in self.config.TEST_SURVEY_IDS:
            survey_preds = submission_hh[submission_hh['survey_id'] == survey_id]['cons_ppp17']
            row = {'survey_id': survey_id}
            for threshold in self.config.POVERTY_THRESHOLDS:
                col_name = f'pct_hh_below_{threshold:.2f}'
                pct_below = (survey_preds < threshold).mean()
                row[col_name] = pct_below
            poverty_rates.append(row)

        submission_rates = pd.DataFrame(poverty_rates)

        # Save files
        submission_hh.to_csv('predicted_household_consumption.csv', index=False)
        submission_rates.to_csv('predicted_poverty_distribution.csv', index=False)

        # Create zip file
        with zipfile.ZipFile('submission.zip', 'w') as zipf:
            zipf.write('predicted_household_consumption.csv')
            zipf.write('predicted_poverty_distribution.csv')

        print("\nFiles created:")
        print(f"  1. predicted_household_consumption.csv - Shape: {submission_hh.shape}")
        print(f"  2. predicted_poverty_distribution.csv - Shape: {submission_rates.shape}")
        print(f"  3. submission.zip")

        # Show prediction statistics
        print(f"\nPrediction Statistics:")
        print(f"  Mean:   ${y_test_pred.mean():.2f}/day")
        print(f"  Median: ${np.median(y_test_pred):.2f}/day")
        print(f"  Min:    ${y_test_pred.min():.2f}/day")
        print(f"  Max:    ${y_test_pred.max():.2f}/day")

        # Show poverty rates
        print(f"\nPredicted Poverty Rates (at $5.26/day threshold):")
        for _, row in submission_rates.iterrows():
            print(f"  Survey {int(row['survey_id'])}: {row['pct_hh_below_5.26']*100:.1f}%")

        print("\n✓ Ready to submit to DrivenData!")

        return submission_hh, submission_rates

    def download(self):
        """Download the submission zip file."""
        from google.colab import files
        files.download('submission.zip')
        print("Download started!")



# Main

In [165]:

def main():
    """
    Main pipeline with post-processing calibration:
    1. Process data
    2. Train XGBoost model
    3. Calibrate predictions to optimize competition metric
    4. Apply calibration to test predictions
    5. Generate submission
    """
    print("=" * 60)
    print("POVERTY PREDICTION PIPELINE")
    print("With Post-Processing Calibration")
    print("=" * 60)

    # Step 1: Initialize configuration
    print("\n[Step 1/6] Initializing configuration...")
    config = Config()

    # Step 2: Process data
    print("\n[Step 2/6] Processing data...")
    processor = DataProcessor(config)
    data = processor.process_all()

    # Step 3: Initialize metric calculator
    print("\n[Step 3/6] Initializing competition metric...")
    metric = CompetitionMetric(config)

    # Step 4: Train XGBoost model
    print("\n[Step 4/6] Training XGBoost model...")
    trainer = ModelTrainer(config, metric)
    model, val_predictions = trainer.train_xgboost(
        data['X'], data['y_log'],
        data['x_val'], data['y_val'], data['sid_val']
    )

    # Step 5: Calibrate predictions
    print("\n[Step 5/6] Calibrating predictions...")
    calibrator = PredictionCalibrator(config, metric)

    # Learn calibration parameters on validation set
    val_predictions_calibrated = calibrator.calibrate(
        val_predictions,
        data['y_val'].values,
        data['sid_val']
    )

    # Generate test predictions and apply calibration
    test_predictions_raw = trainer.generate_test_predictions(data['X_test'])
    test_predictions_calibrated = calibrator.apply_calibration(test_predictions_raw)

    # Convert to original scale
    y_test_pred = np.expm1(test_predictions_calibrated)

    print(f"\nCalibrated Test Predictions:")
    print(f"  Mean:   ${y_test_pred.mean():.2f}/day")
    print(f"  Median: ${np.median(y_test_pred):.2f}/day")
    print(f"  Min:    ${y_test_pred.min():.2f}/day")
    print(f"  Max:    ${y_test_pred.max():.2f}/day")

    # Step 6: Generate submission
    print("\n[Step 6/6] Generating submission...")
    submission = SubmissionGenerator(config)
    submission_hh, submission_rates = submission.generate(data['test'], y_test_pred)

    print("\n" + "=" * 60)
    print("PIPELINE COMPLETE!")
    print("=" * 60)

    return {
        'config': config,
        'data': data,
        'metric': metric,
        'trainer': trainer,
        'calibrator': calibrator,
        'submission': submission,
        'y_test_pred': y_test_pred,
        'calibration_params': calibrator.best_params
    }



# Runner

In [166]:

# Run the complete pipeline with calibration
results = main()

# Show calibration parameters used
print("\n" + "=" * 60)
print("CALIBRATION SUMMARY")
print("=" * 60)
print(f"Scale: {results['calibration_params'][0]:.4f}")
print(f"Shift: {results['calibration_params'][1]:.4f}")
print(f"Formula: y_calibrated = {results['calibration_params'][0]:.4f} × y_pred + {results['calibration_params'][1]:.4f}")

# Download submission file
results['submission'].download()

POVERTY PREDICTION PIPELINE
With Post-Processing Calibration

[Step 1/6] Initializing configuration...

[Step 2/6] Processing data...
STARTING DATA PROCESSING PIPELINE
Loading data...
Training data shape: (104234, 89)
Test data shape: (103023, 88)
Handling missing values...
Missing values after filling: 0
Encoding binary columns...
Binary columns encoded: 59
Encoding multiclass columns...
Multiclass columns encoded: 5
Defining feature columns...
Total feature columns: 84
Preparing feature matrices...
X shape: (104234, 84)
y shape: (104234,)
X_test shape: (103023, 84)
Creating train/validation splits...
Training set: 83387 samples
Validation set: 20847 samples
Surveys in validation: [100000 200000 300000]
Scaling features...

DATA PROCESSING COMPLETE

[Step 3/6] Initializing competition metric...

[Step 4/6] Training XGBoost model...

Training XGBoost...
Training time: 2.74 seconds
Validation Score (before calibration): 8.70%
  Poverty MAPE:    7.08%
  Consumption MAPE: 23.28%

[Step 5/

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download started!
