In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import joblib
import gc
import warnings
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Suppress warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# category is silenced for the rest of the session.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# Seeds NumPy's global RNG only.
SEED = 42
np.random.seed(SEED)

# Configuration Parameters

In [None]:
# Shared configuration: target column, per-library hyper-parameters, output folders
TARGET = "y"

# Hyper-parameters for XGBoost
XGB_PARAMS = dict(
    objective='binary:logistic',
    eval_metric='auc',
    eta=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    max_depth=6,
    n_estimators=2000,
    random_state=SEED,
    tree_method='hist',
    verbosity=0,
)

# Hyper-parameters for LightGBM
LGB_PARAMS = dict(
    objective='binary',
    metric='auc',
    learning_rate=0.03,
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    n_estimators=2000,
    seed=SEED,
    verbose=-1,
)

# Hyper-parameters for CatBoost
CAT_PARAMS = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    depth=8,
    learning_rate=0.03,
    l2_leaf_reg=3,
    random_seed=SEED,
    iterations=2000,
    verbose=False,
)

# Output folders (relative to the working directory): fitted models,
# out-of-fold predictions and submission files. Each is created if missing.
SAVE_DIR, SAVE_PRED, OUT_DIR = (
    Path(folder) for folder in ("models", "oof_preds", "submissions")
)
for _folder in (SAVE_DIR, SAVE_PRED, OUT_DIR):
    _folder.mkdir(exist_ok=True)


# Data Loading

In [None]:
def load_data(path: str) -> pd.DataFrame:
    """Read the parquet file at ``path`` and return it as a DataFrame."""
    frame = pd.read_parquet(path)
    return frame

def get_folds(df: pd.DataFrame, n_splits: int = 5) -> pd.DataFrame:
    """Return a copy of ``df`` with a stratified ``fold`` column.

    Args:
        df: Frame containing the ``TARGET`` column used for stratification.
        n_splits: Number of folds to create.

    Returns:
        A copy of ``df`` (index untouched) with an integer ``fold`` column
        holding the validation-fold id (``0 .. n_splits - 1``) of each row.
    """
    df = df.copy()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    # StratifiedKFold.split yields *positional* row indices. Collect them in a
    # positional array instead of writing through label-based ``.loc``, which
    # mislabels rows (or appends new ones) when the index is not 0..n-1,
    # e.g. a sampled or filtered frame.
    fold_ids = np.full(len(df), -1, dtype=np.int64)
    for i, (_, val_pos) in enumerate(skf.split(df, df[TARGET])):
        fold_ids[val_pos] = i

    df["fold"] = fold_ids
    return df


In [None]:
#!pip install config

Collecting config
  Downloading config-0.5.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading config-0.5.1-py2.py3-none-any.whl (20 kB)
Installing collected packages: config
Successfully installed config-0.5.1


# Training Helpers (native XGBoost API variant)

In [None]:
import numpy as np
import joblib, gc
from pathlib import Path
import xgboost as xgb # Import xgboost
from xgboost import XGBClassifier # Keep import for reference, but won't use for training
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
# from config import XGB_PARAMS, LGB_PARAMS, CAT_PARAMS, TARGET, SEED # Removed the import from config

# Folder that receives the fitted models; exist_ok=True keeps this cell
# re-runnable when the folder is already there.
SAVE_DIR = Path("models")
SAVE_DIR.mkdir(exist_ok=True)

def train_xgb(X_train, y_train, X_val, y_val, name):
    """Train an XGBoost booster through the native API with early stopping.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels; the validation set is
            listed last in ``evals`` so it drives early stopping.
        name: File stem; the booster is saved as ``SAVE_DIR / "<name>.xgb"``.

    Returns:
        The trained ``xgboost.Booster``.

    NOTE(review): a Booster exposes ``predict`` rather than ``predict_proba``
    and is saved as ``.xgb``, while other cells in this notebook call
    ``predict_proba`` and load ``.joblib`` files. Confirm which ``train_xgb``
    variant is meant to be active.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    # `n_estimators` belongs to the scikit-learn wrapper; the native API takes
    # the round count via `num_boost_round`, so keep it out of the booster params.
    booster_params = {key: value for key, value in XGB_PARAMS.items()
                      if key != 'n_estimators'}

    model = xgb.train(
        booster_params,
        dtrain,
        num_boost_round=XGB_PARAMS['n_estimators'],
        evals=[(dtrain, 'train'), (dval, 'eval')],
        early_stopping_rounds=200,
        verbose_eval=False  # Set to True for detailed progress
    )

    # Save model
    model.save_model(SAVE_DIR / f"{name}.xgb")

    # The native `predict` uses every tree by default; limit it to the best
    # iteration so the reported AUC reflects the early-stopped model.
    val_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    val_score = roc_auc_score(y_val, val_pred)
    print(f"XGBoost {name} - Validation AUC: {val_score:.4f}")

    return model

def train_lgb(X_train, y_train, X_val, y_val, name):
    """Train a LightGBM classifier with early stopping on the validation set.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used for early stopping
            and for the printed AUC.
        name: File stem; the model is saved as ``SAVE_DIR / "<name>.joblib"``.

    Returns:
        The fitted ``LGBMClassifier``.
    """
    # Early stopping is a module-level callback factory in LightGBM, not a
    # method on the estimator. Imported locally because the notebook's import
    # cells only bring in LGBMClassifier.
    from lightgbm import early_stopping

    model = LGBMClassifier(**LGB_PARAMS)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(stopping_rounds=200, verbose=False)]
    )

    # Save model
    joblib.dump(model, SAVE_DIR / f"{name}.joblib")

    # Print validation score
    val_pred = model.predict_proba(X_val)[:, 1]
    val_score = roc_auc_score(y_val, val_pred)
    print(f"LightGBM {name} - Validation AUC: {val_score:.4f}")

    return model

def train_cat(X_train, y_train, X_val, y_val, name):
    """Fit a CatBoost classifier with early stopping, save it, and report AUC.

    The model is written to ``SAVE_DIR / "<name>.joblib"`` and the validation
    AUC is printed. Returns the fitted ``CatBoostClassifier``.
    """
    validation_pair = (X_val, y_val)

    model = CatBoostClassifier(**CAT_PARAMS)
    model.fit(
        X_train,
        y_train,
        eval_set=validation_pair,
        early_stopping_rounds=200,
        verbose=False,
    )

    # Persist the fitted model
    target_path = SAVE_DIR / f"{name}.joblib"
    joblib.dump(model, target_path)

    # Report AUC of the positive-class probability on the validation data
    positive_proba = model.predict_proba(X_val)[:, 1]
    val_score = roc_auc_score(y_val, positive_proba)
    print(f"CatBoost {name} - Validation AUC: {val_score:.4f}")

    return model

def train_lr_meta(meta_X, meta_y):
    """Fit the stacking meta-learner: variance scaling + logistic regression.

    The fitted pipeline is saved to ``SAVE_DIR / "meta_lr.joblib"``, its AUC on
    the training inputs is printed, and the pipeline is returned.
    """
    scaler = StandardScaler(with_mean=False)
    classifier = LogisticRegression(max_iter=1000, n_jobs=-1, random_state=SEED)

    lr = make_pipeline(scaler, classifier)
    lr.fit(meta_X, meta_y)
    joblib.dump(lr, SAVE_DIR / "meta_lr.joblib")

    # AUC on the same rows the meta-learner was fitted on
    meta_score = roc_auc_score(meta_y, lr.predict_proba(meta_X)[:, 1])
    print(f"Meta-learner - Training AUC: {meta_score:.4f}")

    return lr

# Model Training

In [None]:
def train_xgb(X_train, y_train, X_val, y_val, name):
    """Train an XGBoost classifier (scikit-learn API) with early stopping.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used for early stopping
            and for the printed AUC.
        name: File stem; the model is saved as ``SAVE_DIR / "<name>.joblib"``.

    Returns:
        The fitted ``XGBClassifier``.
    """
    # Current XGBoost releases take `early_stopping_rounds` in the constructor;
    # passing it to fit() raises TypeError there. Without it all
    # `n_estimators` rounds are trained on every fold.
    model = XGBClassifier(**XGB_PARAMS, early_stopping_rounds=200)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Save model
    joblib.dump(model, SAVE_DIR / f"{name}.joblib")

    # Print validation score
    val_pred = model.predict_proba(X_val)[:, 1]
    val_score = roc_auc_score(y_val, val_pred)
    print(f"XGBoost {name} - Validation AUC: {val_score:.4f}")

    return model

def train_lgb(X_train, y_train, X_val, y_val, name):
    """Train a LightGBM classifier with early stopping on the validation set.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used for early stopping
            and for the printed AUC.
        name: File stem; the model is saved as ``SAVE_DIR / "<name>.joblib"``.

    Returns:
        The fitted ``LGBMClassifier``.
    """
    # LightGBM exposes early stopping as the module-level `early_stopping`
    # callback factory; the estimator has no `callback_early_stopping` method.
    # Imported locally because the notebook's import cells only bring in
    # LGBMClassifier.
    from lightgbm import early_stopping

    model = LGBMClassifier(**LGB_PARAMS)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[
            early_stopping(stopping_rounds=200, verbose=False)
        ]
    )

    # Save model
    joblib.dump(model, SAVE_DIR / f"{name}.joblib")

    # Print validation score
    val_pred = model.predict_proba(X_val)[:, 1]
    val_score = roc_auc_score(y_val, val_pred)
    print(f"LightGBM {name} - Validation AUC: {val_score:.4f}")

    return model

def train_cat(X_train, y_train, X_val, y_val, name):
    """Train CatBoost with early stopping; save the model and print its AUC.

    Saves to ``SAVE_DIR / "<name>.joblib"`` and returns the fitted
    ``CatBoostClassifier``.
    """
    fit_options = {
        'eval_set': (X_val, y_val),
        'early_stopping_rounds': 200,
        'verbose': False,
    }

    model = CatBoostClassifier(**CAT_PARAMS)
    model.fit(X_train, y_train, **fit_options)

    # Write the fitted model to disk
    joblib.dump(model, SAVE_DIR / f"{name}.joblib")

    # Validation AUC from the positive-class probability column
    val_score = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
    print(f"CatBoost {name} - Validation AUC: {val_score:.4f}")

    return model

def train_lr_meta(meta_X, meta_y):
    """Train the logistic-regression meta-learner on stacked base predictions.

    Builds a scaling + logistic-regression pipeline, fits it, saves it as
    ``SAVE_DIR / "meta_lr.joblib"``, prints its AUC on the fitting data and
    returns the pipeline.
    """
    steps = (
        StandardScaler(with_mean=False),
        LogisticRegression(max_iter=1000, n_jobs=-1, random_state=SEED),
    )
    lr = make_pipeline(*steps)
    lr.fit(meta_X, meta_y)

    output_path = SAVE_DIR / "meta_lr.joblib"
    joblib.dump(lr, output_path)

    # Score on the rows used for fitting
    fitted_proba = lr.predict_proba(meta_X)[:, 1]
    meta_score = roc_auc_score(meta_y, fitted_proba)
    print(f"Meta-learner - Training AUC: {meta_score:.4f}")

    return lr

# Load and Prepare Data

In [None]:
# Read the combined training sample and attach stratified fold ids in one step
# Replace the path below with your actual file path
df_train = get_folds(
    load_data("/content/drive/MyDrive/Amex/combined/train_sample_25p.parquet")
)

# Everything except the fold id, the target and the id1..id5 columns is a feature
non_feature_cols = {'fold', TARGET, 'id1', 'id2', 'id3', 'id4', 'id5'}
feature_cols = [col for col in df_train.columns if col not in non_feature_cols]

print(f"Dataset shape: {df_train.shape}")
print(f"Number of features: {len(feature_cols)}")
print(f"Target distribution: {df_train[TARGET].value_counts()}")


Dataset shape: (220329, 392)
Number of features: 385
Target distribution: y
0    183278
1     37051
Name: count, dtype: int64


# Cross-Validation Training Loop

In [None]:
# Remove object-dtype columns from feature_cols itself, once, before training.
# Dropping them only from each fold's frame leaves feature_cols out of sync with
# the columns the models were fitted on, which breaks later cells that index
# with feature_cols (test-time prediction, feature-importance table).
object_cols = df_train[feature_cols].select_dtypes(include='object').columns
if len(object_cols) > 0:
    print(f"Dropping object columns: {list(object_cols)}")
    dropped_cols = set(object_cols)
    feature_cols = [col for col in feature_cols if col not in dropped_cols]

# Work with positional fold labels so the OOF arrays are filled correctly even
# when df_train's index is not 0..n-1.
fold_ids = df_train["fold"].to_numpy()
n_folds = int(fold_ids.max()) + 1

# Convert target to integer type once for all folds
target = df_train[TARGET].astype(int)

# Initialize out-of-fold predictions
oof_predictions = {
    'xgb': np.zeros(len(df_train)),
    'lgb': np.zeros(len(df_train)),
    'cat': np.zeros(len(df_train))
}

# Cross-validation loop
for fold in range(n_folds):
    print(f"\n{'='*50}")
    print(f"Training Fold {fold + 1}/{n_folds}")
    print(f"{'='*50}")

    # Split data: boolean mask for row selection, integer positions for OOF writes
    val_mask = fold_ids == fold
    val_idx = np.flatnonzero(val_mask)

    X_train = df_train.loc[~val_mask, feature_cols]
    y_train = target.loc[~val_mask]
    X_val = df_train.loc[val_mask, feature_cols]
    y_val = target.loc[val_mask]

    print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}")

    # Train models
    xgb_model = train_xgb(X_train, y_train, X_val, y_val, f"xgb_fold_{fold}")
    lgb_model = train_lgb(X_train, y_train, X_val, y_val, f"lgb_fold_{fold}")
    cat_model = train_cat(X_train, y_train, X_val, y_val, f"cat_fold_{fold}")

    # Store OOF predictions at the validation rows' positions
    oof_predictions['xgb'][val_idx] = xgb_model.predict_proba(X_val)[:, 1]
    oof_predictions['lgb'][val_idx] = lgb_model.predict_proba(X_val)[:, 1]
    oof_predictions['cat'][val_idx] = cat_model.predict_proba(X_val)[:, 1]

    # Clean up memory
    del xgb_model, lgb_model, cat_model
    gc.collect()


Training Fold 1/5
Dropping object columns: ['f42', 'f48', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f226', 'f227', 'f228', 'f229', 'f230', 'f231', 'f232', 'f233', 'f234', 'f235', 'f236', 'f237', 'f238', 'f239', 'f240', 'f241', 'f242', 'f243', 'f244', 'f245', 'f246', 'f247', 'f248', 'f249', 'f250', 'f251', 'f252', 'f253', 'f254', 'f255', 'f256', 'f257', 'f258', 'f259', 'f260', 'f261', 'f262', 'f263', 'f264', 'f265', 'f266', 'f267', 'f268', 'f269', 'f270', 'f271', 'f272', 'f273', 'f274', 'f275', 'f276', 'f277', 'f278', 'f279', 'f280', 'f281', 'f282', 'f283', 'f284', 'f285', 'f286', 'f287', 'f288', 'f289', 'f290', 'f291', 'f292', 'f293', 'f294', 'f295', 'f296', 'f297', 'f298', 'f299', 'f300', 'f301', 'f302', 'f303', 'f304', 'f305', 'f306', 'f307', 'f308', 'f309', 'f349', 'f354', 'id6', 'id7', 'f368', 'f369', 'f370', 'f371', 'f372', 'id9', 'f377', 'id10', 'id11', 'f378', 'f374', 'id8', 'id12', 'id13']
Train size: 176263, Validation size: 44066


KeyboardInterrupt: 

# Meta learning

In [None]:
# Write each base model's out-of-fold predictions to SAVE_PRED
for model_name in oof_predictions:
    np.save(SAVE_PRED / f"oof_{model_name}.npy", oof_predictions[model_name])

# Per-model out-of-fold AUC
print("\n" + "=" * 50)
print("Individual Model Performance (AUC)")
print("=" * 50)

for model_name, predictions in oof_predictions.items():
    score = roc_auc_score(df_train[TARGET], predictions)
    print(f"{model_name.upper()}: {score:.4f}")

# Fit the stacking meta-learner on the three OOF prediction columns
print("\n" + "=" * 30)
print("Training Meta-Learner")
print("=" * 30)

meta_features = np.column_stack(
    [oof_predictions[key] for key in ('xgb', 'lgb', 'cat')]
)

meta_model = train_lr_meta(meta_features, df_train[TARGET].values)


# Prediction Function

In [None]:
def predict_base_model(model_name: str, test_features: pd.DataFrame, n_folds: int = 5):
    """Average the positive-class probabilities of one model type over its folds.

    Loads ``SAVE_DIR / "<model_name>_fold_<k>.joblib"`` for ``k`` in
    ``range(n_folds)`` and accumulates each fold's ``predict_proba`` column 1,
    scaled by ``1 / n_folds``.
    """
    predictions = np.zeros(len(test_features))

    fold = 0
    while fold < n_folds:
        model_path = SAVE_DIR / f"{model_name}_fold_{fold}.joblib"
        model = joblib.load(model_path)
        fold_proba = model.predict_proba(test_features)[:, 1]
        predictions += fold_proba / n_folds
        fold += 1

    return predictions

def rank_blend(prediction_dict: dict, weights: dict):
    """Blend model predictions by weighted rank.

    Each model's predictions are converted to ascending ranks (lowest
    prediction -> 0), the ranks are combined with ``weights`` and the result is
    scaled by its maximum so the top-ranked row scores 1.

    Args:
        prediction_dict: Mapping of model name -> 1-D array of predictions,
            all of the same length.
        weights: Mapping of model name -> blend weight; must contain every key
            of ``prediction_dict``.

    Returns:
        Float array in which a larger value means a higher blended rank, i.e. a
        higher predicted probability.

    Raises:
        KeyError: If a model in ``prediction_dict`` has no entry in ``weights``.
    """
    first = np.asarray(next(iter(prediction_dict.values())))
    blended_ranks = np.zeros(first.shape, dtype=float)

    for model_name, preds in prediction_dict.items():
        # Double argsort yields ascending ranks: the largest prediction
        # receives the largest rank.
        model_ranks = np.asarray(preds).argsort().argsort()
        blended_ranks += weights[model_name] * model_ranks

    # Ranks already increase with the prediction, so they must NOT be inverted;
    # returning `1 - ranks` would hand the highest prediction the lowest score.
    top = blended_ranks.max() if blended_ranks.size else 0.0
    if top <= 0:
        # Single row or all-zero weights: nothing to scale, avoid 0/0 -> NaN.
        return np.zeros_like(blended_ranks)
    return blended_ranks / top

def weighted_average(prediction_dict: dict, weights: dict):
    """Return the weighted sum of the models' predictions.

    Args:
        prediction_dict: Mapping of model name -> array of predictions, all of
            the same shape.
        weights: Mapping of model name -> weight; must contain every key of
            ``prediction_dict``. The weights are applied as given (they are
            not normalised here).

    Returns:
        Float array ``sum(weights[m] * prediction_dict[m])``.

    Raises:
        KeyError: If a model in ``prediction_dict`` has no entry in ``weights``.
    """
    first = np.asarray(next(iter(prediction_dict.values())))
    # Accumulate in float regardless of the input dtype: an integer accumulator
    # cannot absorb the float products added in place below.
    result = np.zeros(first.shape, dtype=float)
    for model_name, preds in prediction_dict.items():
        result += weights[model_name] * np.asarray(preds)
    return result



 # Load Test Data and Generate Predictions

In [None]:
# Read the test set
# Replace 'df_test.parquet' with your actual test file path
df_test = load_data("df_test.parquet")

# Fold-averaged predictions of every base model on the same feature frame
print("Generating base model predictions...")
test_features = df_test[feature_cols]
test_predictions = {
    base_name: predict_base_model(base_name, test_features)
    for base_name in ('xgb', 'lgb', 'cat')
}

print("✅ Base model predictions generated")

# Combine the base predictions three different ways
print("\nGenerating ensemble predictions...")

# 1. Weighted soft voting
soft_voting_weights = {'xgb': 0.4, 'lgb': 0.4, 'cat': 0.2}
soft_voting_pred = weighted_average(test_predictions, soft_voting_weights)

# 2. Rank-based blending
rank_weights = {'xgb': 0.35, 'lgb': 0.35, 'cat': 0.30}
rank_blend_pred = rank_blend(test_predictions, rank_weights)

# 3. Stacking: feed the base predictions to the saved meta-learner
meta_model = joblib.load(SAVE_DIR / "meta_lr.joblib")
meta_input = np.column_stack(
    [test_predictions[base_name] for base_name in ('xgb', 'lgb', 'cat')]
)
stacking_pred = meta_model.predict_proba(meta_input)[:, 1]

# 4. Final ensemble: plain mean of the three combinations
final_prediction = (soft_voting_pred + rank_blend_pred + stacking_pred) / 3



# Create Submission File

In [None]:
# Assemble the submission: id columns from the test set plus the prediction
submission = df_test[['id1', 'id2', 'id3', 'id5']].copy()
submission['pred'] = final_prediction.clip(0, 1)  # keep predictions inside [0, 1]

# Write the submission as a tab-separated file
submission_path = OUT_DIR / "ensemble_submission.csv"
submission.to_csv(submission_path, sep='\t', index=False)

print(f"✅ Submission file saved: {submission_path}")
print(f"Submission shape: {submission.shape}")

# Summary statistics of the (unclipped) final prediction
print(f"Prediction statistics:")
print(f"  Min: {final_prediction.min():.4f}")
print(f"  Max: {final_prediction.max():.4f}")
print(f"  Mean: {final_prediction.mean():.4f}")
print(f"  Std: {final_prediction.std():.4f}")

# Preview the first rows
print(submission.head())


In [None]:
# Analyze OOF predictions
print("Out-of-Fold Predictions Analysis")
print("="*40)

# Calculate correlation between models
oof_df = pd.DataFrame({
    'xgb': oof_predictions['xgb'],
    'lgb': oof_predictions['lgb'],
    'cat': oof_predictions['cat'],
    'target': df_train[TARGET]
})

print("Correlation matrix:")
print(oof_df.corr().round(3))

# Create simple ensemble from OOF predictions
oof_ensemble = (oof_predictions['xgb'] + oof_predictions['lgb'] + oof_predictions['cat']) / 3
ensemble_score = roc_auc_score(df_train[TARGET], oof_ensemble)
print(f"\nSimple ensemble OOF AUC: {ensemble_score:.4f}")

# Feature importance analysis (using last fold models as example)
print("\nFeature Importance Analysis")
print("="*30)

# Load last fold models for feature importance
xgb_last = joblib.load(SAVE_DIR / "xgb_fold_4.joblib")
lgb_last = joblib.load(SAVE_DIR / "lgb_fold_4.joblib")

# Get feature importance
xgb_importance = xgb_last.feature_importances_
lgb_importance = lgb_last.feature_importances_

# The CV loop drops object-dtype columns before fitting, so the models carry
# one importance per *remaining* column. Rebuild that exact list (same order)
# instead of pairing the importances with the full feature_cols, which raises
# a length-mismatch error whenever object columns were dropped.
object_feature_cols = set(
    df_train[feature_cols].select_dtypes(include='object').columns
)
trained_features = [col for col in feature_cols if col not in object_feature_cols]

# Create feature importance dataframe
importance_df = pd.DataFrame({
    'feature': trained_features,
    'xgb_importance': xgb_importance,
    'lgb_importance': lgb_importance
})

# Average importance and sort
# NOTE(review): the two libraries may report importances on different scales;
# confirm a plain mean is the intended way to combine them.
importance_df['avg_importance'] = (importance_df['xgb_importance'] + importance_df['lgb_importance']) / 2
importance_df = importance_df.sort_values('avg_importance', ascending=False)

print("Top 10 most important features:")
print(importance_df.head(10)[['feature', 'avg_importance']])


In [None]:
# Installs CatBoost into the running environment.
# NOTE(review): this cell comes after the cells that import catboost, so on a
# fresh runtime it has to be executed before them.
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
