# 📤 Submission Generation

> Competition: {{ COMPETITION_NAME }}

---

## 📦 Setup

In [None]:
import sys
sys.path.append('../../..')

import pandas as pd
import numpy as np
import joblib

from shared.utils import set_seed
from shared.evaluation import create_submission

# Fix the random seed before any predictions are generated. set_seed is imported
# from shared.utils; presumably it seeds the RNGs the project relies on —
# TODO confirm which libraries it covers.
set_seed(42)

# IPython autoreload in mode 2: modules are reloaded before each cell executes,
# so edits to imported helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## ⚙️ Configuration

In [None]:
# --- Directory layout (paths are relative to the working directory) ---
PROCESSED_PATH = "data/processed"   # processed test CSV is read from here
MODELS_PATH = "models"              # per-fold model pickles are loaded from here
SUBMISSIONS_PATH = "submissions"    # timestamped submission CSVs are written here

# --- Competition-specific settings: update these for each competition ---
ID_COL = "id"                # Update this
TARGET_COL = "target"        # Update this
TASK = "classification"      # or 'regression'
N_FOLDS = 5                  # number of fold models loaded per algorithm

## 📂 Load Test Data

In [None]:
# Read the processed test set from the configured directory and report its size.
test_csv_path = f'{PROCESSED_PATH}/test_processed.csv'
test = pd.read_csv(test_csv_path)
n_rows_cols = test.shape
print(f"Test shape: {n_rows_cols}")

In [None]:
# Load the sample submission so the expected column layout and row count
# are available for reference and for the final validation step.
sample_sub = pd.read_csv('data/raw/sample_submission.csv')
expected_columns = sample_sub.columns.tolist()
print(f"Submission columns: {expected_columns}")
# Bare last expression: rendered as the cell's rich output.
sample_sub.head()

## 🤖 Load Models & Generate Predictions

In [None]:
# Feature columns (same as training): every column except the identifier and
# the target. The target is normally absent from the test file, in which case
# listing it here is a no-op; excluding it guards against a processed test
# file that carries a placeholder target column being fed to the models.
EXCLUDE_COLS = [ID_COL, TARGET_COL]
# List comprehension (not a set difference) so the original column order is kept.
FEATURE_COLS = [c for c in test.columns if c not in EXCLUDE_COLS]
X_test = test[FEATURE_COLS]

In [None]:
# LightGBM predictions: average the per-fold models' outputs on the test set.
#
# predict_proba returns an (n_samples, n_classes) array, which cannot be added
# in place to the 1-D accumulator below, so for classification only the second
# column is kept (binary target assumed, consistent with the 0.5 threshold used
# when the submission is created; column 1 corresponds to model.classes_[1]).
# For TASK == 'regression' there is no predict_proba, so predict() is used.
# NOTE(review): joblib.load unpickles the file — only load model files you trust.
lgb_preds = np.zeros(len(test))
for fold in range(N_FOLDS):
    model = joblib.load(f'{MODELS_PATH}/lgb_fold{fold}.pkl')
    if TASK == 'classification':
        fold_preds = model.predict_proba(X_test)[:, 1]
    else:
        fold_preds = model.predict(X_test)
    # Each fold contributes an equal 1/N_FOLDS share of the final average.
    lgb_preds += np.asarray(fold_preds, dtype=float) / N_FOLDS

print(f"LightGBM predictions: mean={lgb_preds.mean():.4f}, std={lgb_preds.std():.4f}")

In [None]:
# XGBoost predictions: average the per-fold models' outputs on the test set.
#
# predict_proba returns an (n_samples, n_classes) array, which cannot be added
# in place to the 1-D accumulator below, so for classification only the second
# column is kept (binary target assumed, consistent with the 0.5 threshold used
# when the submission is created; column 1 corresponds to model.classes_[1]).
# For TASK == 'regression' there is no predict_proba, so predict() is used.
# NOTE(review): joblib.load unpickles the file — only load model files you trust.
xgb_preds = np.zeros(len(test))
for fold in range(N_FOLDS):
    model = joblib.load(f'{MODELS_PATH}/xgb_fold{fold}.pkl')
    if TASK == 'classification':
        fold_preds = model.predict_proba(X_test)[:, 1]
    else:
        fold_preds = model.predict(X_test)
    # Each fold contributes an equal 1/N_FOLDS share of the final average.
    xgb_preds += np.asarray(fold_preds, dtype=float) / N_FOLDS

print(f"XGBoost predictions: mean={xgb_preds.mean():.4f}, std={xgb_preds.std():.4f}")

In [None]:
# Ensemble predictions: equal-weight blend of the two model families.
final_preds = 0.5 * (lgb_preds + xgb_preds)
ens_mean = final_preds.mean()
ens_std = final_preds.std()
print(f"Ensemble predictions: mean={ens_mean:.4f}, std={ens_std:.4f}")

## 📊 Sanity Checks

In [None]:
import matplotlib.pyplot as plt

# One histogram panel per prediction vector: (values, panel title, extra hist kwargs).
# The LightGBM panel passes no colour, so it uses matplotlib's default.
panels = [
    (lgb_preds, 'LightGBM Predictions', {'label': 'LGB'}),
    (xgb_preds, 'XGBoost Predictions', {'label': 'XGB', 'color': 'orange'}),
    (final_preds, 'Ensemble Predictions', {'label': 'Ensemble', 'color': 'green'}),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (values, title, hist_kwargs) in zip(axes, panels):
    ax.hist(values, bins=50, alpha=0.7, **hist_kwargs)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## 💾 Generate Submission

In [None]:
import os
from datetime import datetime

# Ensure the output directory exists; no error if it is already there.
os.makedirs(SUBMISSIONS_PATH, exist_ok=True)

# Minute-resolution timestamp (YYYYMMDD_HHMM) used to version the submission file.
timestamp = f'{datetime.now():%Y%m%d_%H%M}'

In [None]:
# Create submission
# Use the ID column from the test set when present; otherwise fall back to
# positional ids 0..len(test)-1.
if ID_COL in test.columns:
    test_ids = test[ID_COL]
else:
    test_ids = range(len(test))

# A 0.5 threshold is passed for classification; None is passed otherwise.
decision_threshold = 0.5 if TASK == 'classification' else None
submission_file = f'{SUBMISSIONS_PATH}/submission_{timestamp}.csv'

# create_submission is imported from shared.evaluation; presumably it writes
# the CSV to `filename` and returns the submission table — confirm against the helper.
submission = create_submission(
    test_ids=test_ids,
    predictions=final_preds,
    id_col=ID_COL,
    target_col=TARGET_COL,
    filename=submission_file,
    threshold=decision_threshold,
)

In [None]:
# Verify against sample submission: the row counts must agree.
n_expected_rows = len(sample_sub)
n_submission_rows = len(submission)
assert n_submission_rows == n_expected_rows, "Submission length mismatch!"
print("‚úÖ Submission validated!")

## 🚀 Submit to Kaggle (Optional)

In [None]:
# Uncomment to submit via Kaggle API
# !kaggle competitions submit -c COMPETITION_SLUG -f {SUBMISSIONS_PATH}/submission_{timestamp}.csv -m "Ensemble LGB+XGB"

---
**Done!** 🎉 Good luck on the leaderboard!