In [1]:
# Core libraries
import pandas as pd
import numpy as np

# CV utilities
from sklearn.model_selection import StratifiedKFold

# Pipelines
from sklearn.pipeline import Pipeline

# Models
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Custom preprocessing
import sys
sys.path.append('../')
from feature import PreProcessor

In [2]:
# Load training data
train_df = pd.read_csv('../data/train.csv')
train_df.drop(columns=['PassengerId'], inplace=True)

TARGET = 'Survived'
X = train_df.drop(columns=[TARGET])
y = train_df[TARGET].values

In [3]:
# Stratified K-Fold for OOF
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

In [4]:
# XGBoost (anchor model)
xgb_pipe = Pipeline([
    ('preprocessor', PreProcessor(scaling=False)),
    ('model', XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        min_child_weight=5,
        gamma=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    ))
])

In [5]:
# Random Forest
rf_pipe = Pipeline([
    ('preprocessor', PreProcessor(scaling=False)),
    ('model', RandomForestClassifier(
        n_estimators=400,
        min_samples_leaf=2,
        min_samples_split=5,
        max_features='sqrt',
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

In [6]:
# KNN (needs scaling)
knn_pipe = Pipeline([
    ('preprocessor', PreProcessor(scaling=True)),
    ('model', KNeighborsClassifier(n_neighbors=19))
])

In [7]:
# LightGBM
lgbm_pipe = Pipeline([
    ('preprocessor', PreProcessor(scaling=False)),
    ('model', LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=5,
        num_leaves=7,
        min_child_samples=50,
        subsample=0.7,
        colsample_bytree=0.9,
        reg_alpha=0.5,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1
    ))
])

In [8]:
# Logistic Regression
lr_pipe = Pipeline([
    ('preprocessor', PreProcessor(scaling=True)),
    ('model', LogisticRegression(
        C=0.5,
        solver='liblinear',
        class_weight='balanced',
        random_state=42
    ))
])

In [9]:
# Decision Tree
dt_pipe = Pipeline([
    ('preprocessor', PreProcessor(scaling=False)),
    ('model', DecisionTreeClassifier(
        max_depth=4,
        min_samples_leaf=20,
        min_samples_split=40,
        class_weight='balanced',
        random_state=42
    ))
])

In [10]:
# Container for OOF probabilities
oof_probs = {
    'xgb': np.zeros(len(X)),
    'rf': np.zeros(len(X)),
    'knn': np.zeros(len(X)),
    'lgbm': np.zeros(len(X)),
    'lr': np.zeros(len(X)),
    'dt': np.zeros(len(X))
}

In [11]:
# Loop over folds
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr = y[train_idx]

    # Fit and predict for each model
    for name, model in [
        ('xgb', xgb_pipe),
        ('rf', rf_pipe),
        ('knn', knn_pipe),
        ('lgbm', lgbm_pipe),
        ('lr', lr_pipe),
        ('dt', dt_pipe)
    ]:
        model.fit(X_tr, y_tr)
        oof_probs[name][val_idx] = model.predict_proba(X_va)[:, 1]

[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000212 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028




[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371




[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 166
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371




[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 168
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371




[LightGBM] [Info] Number of positive: 273, number of negative: 440
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 170
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382889 -> initscore=-0.477303
[LightGBM] [Info] Start training from score -0.477303




In [12]:
# Create probability DataFrame
prob_df = pd.DataFrame({
    'y_true': y,
    'xgb_prob': oof_probs['xgb'],
    'rf_prob': oof_probs['rf'],
    'knn_prob': oof_probs['knn'],
    'lgbm_prob': oof_probs['lgbm'],
    'lr_prob': oof_probs['lr'],
    'dt_prob': oof_probs['dt']
})

prob_df.head()

Unnamed: 0,y_true,xgb_prob,rf_prob,knn_prob,lgbm_prob,lr_prob,dt_prob
0,0,0.042072,0.136333,0.052632,0.045622,0.12716,0.148176
1,1,0.978834,0.990206,1.0,0.988931,0.968198,0.966933
2,1,0.501403,0.557324,0.421053,0.444122,0.707292,0.746487
3,1,0.992789,0.997611,0.947368,0.997931,0.958728,0.961882
4,0,0.138045,0.10496,0.210526,0.108427,0.119763,0.131006


In [13]:
# Default threshold
THRESHOLD = 0.5

for model in ['xgb', 'rf', 'knn', 'lgbm', 'lr', 'dt']:
    prob_df[f'{model}_correct'] = (
        (prob_df[f'{model}_prob'] >= THRESHOLD).astype(int)
        == prob_df['y_true']
    ).astype(int)

prob_df.head()

Unnamed: 0,y_true,xgb_prob,rf_prob,knn_prob,lgbm_prob,lr_prob,dt_prob,xgb_correct,rf_correct,knn_correct,lgbm_correct,lr_correct,dt_correct
0,0,0.042072,0.136333,0.052632,0.045622,0.12716,0.148176,1,1,1,1,1,1
1,1,0.978834,0.990206,1.0,0.988931,0.968198,0.966933,1,1,1,1,1,1
2,1,0.501403,0.557324,0.421053,0.444122,0.707292,0.746487,1,1,0,0,1,1
3,1,0.992789,0.997611,0.947368,0.997931,0.958728,0.961882,1,1,1,1,1,1
4,0,0.138045,0.10496,0.210526,0.108427,0.119763,0.131006,1,1,1,1,1,1


In [14]:
# Rows where XGB is wrong (OOF-based)
xgb_wrong_df = prob_df[prob_df['xgb_correct'] == 0]

# Size check
len(xgb_wrong_df), len(prob_df)

(143, 891)

In [15]:
# Models to evaluate as potential correctors
candidate_models = ['rf', 'knn', 'lgbm', 'lr', 'dt']

# Compute P(model correct | XGB wrong)
level1_correction = {
    model: xgb_wrong_df[f'{model}_correct'].mean()
    for model in candidate_models
}

level1_correction_df = (
    pd.Series(level1_correction, name='correction_rate')
      .sort_values(ascending=False)
)

level1_correction_df

knn     0.244755
lr      0.216783
dt      0.195804
rf      0.181818
lgbm    0.083916
Name: correction_rate, dtype: float64

In [16]:
# Absolute number of corrections (stability check)
level1_counts = {
    model: xgb_wrong_df[f'{model}_correct'].sum()
    for model in candidate_models
}

pd.DataFrame({
    'correction_rate': level1_correction_df,
    'num_corrections': pd.Series(level1_counts)
})

Unnamed: 0,correction_rate,num_corrections
dt,0.195804,28
knn,0.244755,35
lgbm,0.083916,12
lr,0.216783,31
rf,0.181818,26


In [17]:
# Rows where both XGB and KNN are wrong
xgb_knn_wrong_df = prob_df[
    (prob_df['xgb_correct'] == 0) &
    (prob_df['knn_correct'] == 0)
]

# Size check
len(xgb_knn_wrong_df), len(prob_df)

(108, 891)

In [18]:
# Remaining candidates (excluding XGB and KNN)
level2_candidates = ['rf', 'lgbm', 'lr', 'dt']

level2_correction = {
    model: xgb_knn_wrong_df[f'{model}_correct'].mean()
    for model in level2_candidates
}

level2_correction_df = (
    pd.Series(level2_correction, name='correction_rate')
      .sort_values(ascending=False)
)

level2_correction_df

dt      0.138889
rf      0.129630
lr      0.129630
lgbm    0.064815
Name: correction_rate, dtype: float64

In [19]:
level2_counts = {
    model: xgb_knn_wrong_df[f'{model}_correct'].sum()
    for model in level2_candidates
}

pd.DataFrame({
    'correction_rate': level2_correction_df,
    'num_corrections': pd.Series(level2_counts)
})

Unnamed: 0,correction_rate,num_corrections
dt,0.138889,15
lgbm,0.064815,7
lr,0.12963,14
rf,0.12963,14
