In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("Starting the Bank Term Deposit Prediction script (Enhanced Version)...")

# --- 1. Load Data ---
# Reads 'train.csv', 'test.csv', and 'sample_submission.csv' from the current
# working directory (e.g. files uploaded to a Google Colab environment).
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    sample_submission_df = pd.read_csv('sample_submission.csv')
    print("Data loaded successfully: train.csv, test.csv, sample_submission.csv")
except FileNotFoundError:
    print("Error: One or more of the required CSV files (train.csv, test.csv, sample_submission.csv) were not found.")
    print("Please ensure these files are uploaded to your Colab environment or the correct path is provided.")
    # Re-raise instead of calling exit(): in a notebook, exit() ends the kernel
    # session rather than reporting the failure, which hides the traceback
    # (including the name of the missing file). Re-raising stops execution at
    # this cell and keeps the original error visible.
    raise





Starting the Bank Term Deposit Prediction script (Enhanced Version)...
Data loaded successfully: train.csv, test.csv, sample_submission.csv


In [None]:
# --- 2. Preprocessing and Feature Engineering ---

# Store test IDs for the final submission file
test_ids = test_df['id']

# Build the feature matrices WITHOUT overwriting train_df / test_df.
# Reassigning the raw frames (dropping 'id' in place of the originals) makes
# this cell fail with a KeyError the second time it is run; deriving fresh
# frames keeps the cell idempotent and the raw data recoverable.
# 'id' is dropped because it is not a predictive feature; 'y' is the target.
X = train_df.drop(columns=['id', 'y'])
y = train_df['y']
X_test = test_df.drop(columns=['id'])

print("Starting enhanced feature engineering...")

# --- Feature Engineering ---

# Statistics shared by both frames are computed once, before any column is
# transformed, so train and test receive exactly the same transformation.

# Cap for 'campaign': 99th percentile of the training data only.
campaign_cap = X['campaign'].quantile(0.99)

# Shift for 'balance': large enough that the smallest balance seen in either
# frame becomes strictly positive before the log transform.
min_balance_train = X['balance'].min()
min_balance_test = X_test['balance'].min()
shift_constant = abs(min(min_balance_train, min_balance_test)) + 1

for features in (X, X_test):
    # 2.1 Handle 'pdays' feature
    # Binary flag: 1 if the client was previously contacted (pdays != -1).
    features['pdays_contacted'] = (features['pdays'] != -1).astype(int)

    # Replace the -1 sentinel with 0, then log1p to reduce skewness.
    features['pdays'] = np.log1p(features['pdays'].replace(-1, 0))

    # 2.2 Numerical Feature Transformations and Outlier Handling
    # Note: In a real-world scenario, 'duration' would not be known at
    # prediction time. However, for Kaggle competitions, it's a powerful feature.
    features['duration'] = np.log1p(features['duration'])

    # Clip 'campaign' outliers at the training-set cap.
    features['campaign'] = features['campaign'].clip(upper=campaign_cap)

    # Shift 'balance' so every value is positive, then apply log1p.
    features['balance'] = np.log1p(features['balance'] + shift_constant)

Starting enhanced feature engineering...


In [None]:
# 2.3 Create Interaction Features
# NOTE: 'balance' and 'duration' were already log1p-transformed in section 2.2,
# so the ratios/products below are built from the log-scaled values.
for features in (X, X_test):
    # Ratio of balance to age (epsilon guards against age being 0, though unlikely)
    features['balance_age_ratio'] = features['balance'] / (features['age'] + 1e-6)

    # Ratio of duration to campaign (epsilon guards against campaign being 0)
    features['duration_per_campaign'] = features['duration'] / (features['campaign'] + 1e-6)

    # Interaction between balance and duration
    features['balance_duration_interaction'] = features['balance'] * features['duration']

    # Interaction between age and duration
    features['age_duration_interaction'] = features['age'] * features['duration']

# 2.4 Handle 'month' feature - convert to numerical
# Values not present in month_map become NaN (Series.map behaviour).
month_map = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}
X['month'] = X['month'].map(month_map)
X_test['month'] = X_test['month'].map(month_map)


# Identify categorical features for one-hot encoding (after numerical month conversion)
categorical_features = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'poutcome' # 'month' is now numerical
]

# Apply One-Hot Encoding to categorical features for both training and test sets
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_features, drop_first=True)

# Align columns between training and test sets after all feature engineering.
train_cols = X.columns
test_cols = X_test.columns

# Dummy columns produced by a category that appears in only one of the frames.
missing_in_test = set(train_cols) - set(test_cols)
missing_in_train = set(test_cols) - set(train_cols)

# Final layout: the training columns in their existing order, followed by any
# test-only columns in the order they appear in the test frame. Building the
# list from the ordered Index objects (not from the sets) keeps the column
# order deterministic across runs, and reindexing BOTH frames against the same
# list guarantees they end up identical even when missing_in_train is non-empty.
aligned_cols = list(train_cols) + [c for c in test_cols if c in missing_in_train]

# Columns absent from a frame are created and filled with zeros.
X = X.reindex(columns=aligned_cols, fill_value=0)
X_test = X_test.reindex(columns=aligned_cols, fill_value=0)

# The model is fitted on X and used to predict X_test, so the two layouts must match.
assert list(X.columns) == list(X_test.columns), "train/test feature columns are misaligned"

print(f"Enhanced feature engineering complete. Training features shape: {X.shape}, Test features shape: {X_test.shape}")

Enhanced feature engineering complete. Training features shape: (750000, 37), Test features shape: (250000, 37)


In [None]:
# --- 3. Model Training with Stratified K-Fold Cross-Validation ---

# Define the number of folds for cross-validation
NFOLDS = 5
# Initialize StratifiedKFold to preserve the percentage of samples for each class
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# Arrays to store out-of-fold predictions and submission predictions.
# Both are re-created here so re-running this cell never accumulates on top of
# predictions from a previous run.
oof_preds = np.zeros(X.shape[0])
sub_preds = np.zeros(X_test.shape[0])

print(f"Starting model training with {NFOLDS}-Fold Stratified Cross-Validation...")

# LightGBM model parameters
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'n_estimators': 3000,         # Upper bound; early stopping may end training sooner
    'learning_rate': 0.005,
    'num_leaves': 31,
    'max_depth': 7,
    'seed': 42,
    'n_jobs': -1,
    'verbose': -1,
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    # Row subsampling is only applied when the bagging frequency is > 0;
    # with the default of 0 the 'subsample' value above is silently ignored.
    'subsample_freq': 1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_child_samples': 20,      # Minimum number of data needed in a child (leaf)
}

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    # A fresh model per fold so no state carries over between folds.
    model = lgb.LGBMClassifier(**lgb_params)

    # Stop when validation AUC has not improved for 200 consecutive rounds.
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              callbacks=[lgb.early_stopping(200, verbose=False)])

    # Out-of-fold probabilities for the positive class (column 1).
    oof_preds[valid_idx] = model.predict_proba(X_valid)[:, 1]
    # Each fold contributes an equal share to the averaged test prediction.
    sub_preds += model.predict_proba(X_test)[:, 1] / NFOLDS

    fold_auc = roc_auc_score(y_valid, oof_preds[valid_idx])
    print(f"Fold {n_fold+1}/{NFOLDS} completed. Validation AUC: {fold_auc:.4f}")

# Calculate the overall Out-Of-Fold AUC score
overall_oof_auc = roc_auc_score(y, oof_preds)
print(f"\nOverall Out-Of-Fold AUC Score: {overall_oof_auc:.4f}")

Starting model training with 5-Fold Stratified Cross-Validation...
Fold 1/5 completed. Validation AUC: 0.9656
Fold 2/5 completed. Validation AUC: 0.9643
Fold 3/5 completed. Validation AUC: 0.9647
Fold 4/5 completed. Validation AUC: 0.9657
Fold 5/5 completed. Validation AUC: 0.9650

Overall Out-Of-Fold AUC Score: 0.9651


In [None]:
# --- 4. Create Submission File ---

# Pair each stored test id with its fold-averaged predicted probability.
submission_df = pd.DataFrame({'id': test_ids}).assign(y=sub_preds)

# Write the submission to the working directory (no index column).
submission_df.to_csv('submission.csv', index=False)

# Closing status messages, printed one per line.
closing_messages = (
    "\n--- Script Execution Complete ---",
    f"Final submission file 'submission.csv' created successfully with {len(submission_df)} predictions.",
    "You can now download 'submission.csv' from the left-hand file panel in Colab and submit it to Kaggle.",
    "This enhanced version should provide a better score. Good luck!",
)
for message in closing_messages:
    print(message)



--- Script Execution Complete ---
Final submission file 'submission.csv' created successfully with 250000 predictions.
You can now download 'submission.csv' from the left-hand file panel in Colab and submit it to Kaggle.
This enhanced version should provide a better score. Good luck!
