In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

# Kaggle input locations of the competition files
train_path = '/kaggle/input/predict-the-success-of-bank-telemarketing/train.csv'
test_path = '/kaggle/input/predict-the-success-of-bank-telemarketing/test.csv'
sample_submission_path = '/kaggle/input/predict-the-success-of-bank-telemarketing/sample_submission.csv'

# Read the three CSV files in order: train, test, sample submission
train_data, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path)
)

# Single scaler instance, created here so the same object can be handed to
# every preprocessing call
scaler = StandardScaler()

def preprocess_data(df, is_train=True, scaler=None):
    """Convert a raw telemarketing frame into a numeric feature frame.

    Steps, in order: split 'last contact date' into year/month/day columns,
    fill missing 'poutcome' with 'unknown', map the yes/no columns to 1/0,
    one-hot encode the categorical columns (first level dropped), scale the
    numeric columns, and, for training data, map 'target' to 1/0.

    Args:
        df: Raw input frame. It is not modified; a copy is processed.
        is_train: When True the scaler is fitted on this frame and the
            'target' column is encoded. When False the scaler is only
            applied and 'target' is not touched.
        scaler: Object exposing ``fit_transform`` and ``transform``. Must
            already be fitted when ``is_train`` is False.

    Returns:
        A new DataFrame with the engineered and encoded columns.

    Raises:
        ValueError: If ``scaler`` is None.
    """
    if scaler is None:
        # Fail up front with a clear message rather than an AttributeError
        # halfway through the transformation.
        raise ValueError("preprocess_data requires a scaler instance")

    # Work on a copy: the column assignments below would otherwise write
    # into the caller's frame before `drop` rebinds `df` to a new object.
    df = df.copy()

    # Convert 'last contact date' to datetime and extract year, month, and day
    df['last contact date'] = pd.to_datetime(df['last contact date'])
    df['contact_year'] = df['last contact date'].dt.year
    df['contact_month'] = df['last contact date'].dt.month
    df['contact_day'] = df['last contact date'].dt.day
    df = df.drop('last contact date', axis=1)

    # Fill missing values for 'poutcome' with 'unknown'
    df['poutcome'] = df['poutcome'].fillna('unknown')

    # Encode binary columns (yes/no); any other value becomes NaN
    binary_columns = ['default', 'housing', 'loan']
    for col in binary_columns:
        df[col] = df[col].map({'yes': 1, 'no': 0})

    # One-hot encode categorical features. The resulting columns depend on
    # the categories present in this particular frame.
    categorical_columns = ['job', 'marital', 'education', 'contact', 'poutcome']
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

    # Scale numeric features: fit on training data, reuse the fit otherwise
    numeric_columns = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
    if is_train:
        df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
    else:
        df[numeric_columns] = scaler.transform(df[numeric_columns])

    # Handle target column for training data
    if is_train:
        df['target'] = df['target'].map({'yes': 1, 'no': 0})

    return df

# Preprocess train and test datasets. The training call fits the shared
# scaler and the test call reuses it, so the order of these two lines matters.
train_data_preprocessed = preprocess_data(train_data, is_train=True, scaler=scaler)
test_data_preprocessed = preprocess_data(test_data, is_train=False, scaler=scaler)

# Separate features and target from the training data
X = train_data_preprocessed.drop('target', axis=1)
y = train_data_preprocessed['target']

# Each frame is one-hot encoded on its own, so a category that appears in only
# one file gives the two frames different columns. Give the test frame exactly
# the training feature columns, in the same order; dummy columns absent from
# the test file are filled with 0.
test_data_preprocessed = test_data_preprocessed.reindex(columns=X.columns, fill_value=0)

# Split training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Check for class imbalance and oversample the training split only, so the
# validation split keeps the original class distribution
class_counts = y_train.value_counts()
if class_counts.min() / class_counts.max() < 0.5:
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

# Define the XGBoost model with its hyperparameters and evaluation metric.
# NOTE(review): scale_pos_weight=1.5 is applied in addition to SMOTE when the
# branch above runs; confirm the double re-weighting is intended.
xgb_model = XGBClassifier(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.02,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=3,
    reg_lambda=5,
    scale_pos_weight=1.5,
    objective='binary:logistic',
    eval_metric='logloss',  # Metric monitored on eval_set
    early_stopping_rounds=50,  # Stop once validation logloss stalls for 50 rounds
    use_label_encoder=False,
    n_jobs=-1,
    random_state=42
)

# Train the model with early stopping: n_estimators is an upper bound and
# training halts when the validation logloss stops improving
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)

# Evaluate on the validation set using F1 macro.
# NOTE: this split also drives early stopping, so the score is somewhat
# optimistic as an estimate of test performance.
y_val_pred = xgb_model.predict(X_val)
f1_macro_val = f1_score(y_val, y_val_pred, average='macro')
print("Validation F1 Macro Score:", f1_macro_val)

# Predict on the test set
test_predictions = xgb_model.predict(test_data_preprocessed)

# Map predictions back to "yes" and "no"
submission = sample_submission.copy()
submission['target'] = test_predictions
submission['target'] = submission['target'].map({1: 'yes', 0: 'no'})

# Save submission to CSV
submission_file_path = 'submission.csv'
submission.to_csv(submission_file_path, index=False)

# Display the saved submission
submission_file = pd.read_csv(submission_file_path)
submission_file

[0]	validation_0-logloss:0.84459
[1]	validation_0-logloss:0.83029
[2]	validation_0-logloss:0.81650
[3]	validation_0-logloss:0.80378
[4]	validation_0-logloss:0.79111
[5]	validation_0-logloss:0.78443
[6]	validation_0-logloss:0.77277
[7]	validation_0-logloss:0.76205
[8]	validation_0-logloss:0.75177
[9]	validation_0-logloss:0.74157
[10]	validation_0-logloss:0.73179
[11]	validation_0-logloss:0.72238
[12]	validation_0-logloss:0.71374
[13]	validation_0-logloss:0.70479
[14]	validation_0-logloss:0.69708
[15]	validation_0-logloss:0.68883
[16]	validation_0-logloss:0.68391
[17]	validation_0-logloss:0.67826
[18]	validation_0-logloss:0.67065
[19]	validation_0-logloss:0.66357
[20]	validation_0-logloss:0.65652
[21]	validation_0-logloss:0.64992
[22]	validation_0-logloss:0.64351
[23]	validation_0-logloss:0.63699
[24]	validation_0-logloss:0.63297
[25]	validation_0-logloss:0.62864
[26]	validation_0-logloss:0.62544
[27]	validation_0-logloss:0.62240
[28]	validation_0-logloss:0.61664
[29]	validation_0-loglos

Unnamed: 0,id,target
0,0,no
1,1,no
2,2,no
3,3,no
4,4,yes
...,...,...
9995,9995,no
9996,9996,yes
9997,9997,yes
9998,9998,yes
