In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold as SKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler,RobustScaler

from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from hyperopt.pyll import scope
import matplotlib.pyplot as plt

In [25]:
#global variables

SEED = 6

FOLDS = 5

EVALS = 2000

TARGET = 'Exited'

FILEPATH = '../data/'

In [3]:

test = pd.read_csv(f'{FILEPATH}test.csv')
train = pd.read_csv(f'{FILEPATH}train.csv')
#original = pd.read_csv(f'{FILEPATH}original.csv').drop('RowNumber', axis=1)
#train = pd.concat([train, original]).reset_index(drop=True).copy()

In [None]:
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("______________________")
    display(df.info(verbose=True, show_counts=True))
    print("\n")
    
    print("DataFrame Head:")
    print("______________________")
    display(df.head())
    print("\n")

    print("DataFrame Tail:")
    print("______________________")
    display(df.tail())
    print("\n")

    print("DataFrame Description:")
    print("______________________")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("______________________")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("______________________")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("______________________")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("______________________")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")
    
    print("DataFrame Columns:")
    print("______________________")
    display(df.columns)
    

analyze_dataframe(train)

In [4]:
# Function to calculate vowel and consonant count
def vowel_consonant_count(word):
    vowels = "aeiouAEIOU"
    vowel_count = sum(1 for char in word if char in vowels)
    consonant_count = sum(1 for char in word if char not in vowels and char.isalpha())
    return vowel_count, consonant_count

def create_surname_features(df):
    df['Length'] = df['Surname'].apply(len)
    df['Initial'] = df['Surname'].str[0]
    df[['Vowels', 'Consonants']] = df['Surname'].apply(lambda x: vowel_consonant_count(x)).tolist()
    df['Uniqueness'] = df['Surname'].apply(lambda x: len(set(x.lower())) / len(x) if x else 0)
    df['IsSenior'] = df['Age'].apply(lambda x: 1 if x >= 65 else 0)
    df['IsActive_by_CreditCard'] = df['HasCrCard'] * df['IsActiveMember']
    df['Products_Per_Tenure'] =  df['Tenure'] / df['NumOfProducts']
    df['Products_Per_Age'] = df['Age'] / df['NumOfProducts']
    df['Age_bins'] = pd.cut(df['Age'], bins=[0, 20, 30, 40, 50, 60, 70, 80, 90, 100], labels=[1, 2, 3, 4, 5, 6, 7, 8, 9])
    
    # Create Credit Rating
    bins = [0,100, 300, 580, 670, 740, 800,850]  # Example bin ranges
    labels = ['Very Poor', 'Poor', 'Fair', 'Good', 'Very Good', 'Exceptional', 'Excellent']
    df['CreditRating'] = pd.cut(df['CreditScore'], bins=bins, labels=labels, include_lowest=True)
    return df

train = create_surname_features(train)
test = create_surname_features(test)

In [None]:
analyze_dataframe(train)

In [5]:
def preprocess_data(df, cat_features, num_features, scaler):
    
    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=cat_features)

    # Normalize numerical features
    df[num_features] = scaler.fit_transform(df[num_features])

    # Drop unnecessary columns
    df = df.drop(['Surname', 'CustomerId'], axis=1, errors='ignore').dropna().drop_duplicates()
    
    return df


In [6]:
cat_features = [
    "Geography",
    "Gender",
    "HasCrCard",
    "IsActiveMember",
    "NumOfProducts",
    "Initial",
    "IsSenior",
    "IsActive_by_CreditCard",
    "Products_Per_Tenure",
    "Products_Per_Age",
    "CreditRating",
    "Age_bins"
]
num_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "EstimatedSalary",
    "Uniqueness",
    "Vowels",
    "Consonants",
    "Length",
    "Balance_Per_Product",
    
]


In [7]:
#preprocessing
train_df = preprocess_data(train, cat_features, num_features, scaler=StandardScaler())
test_df = preprocess_data(test, cat_features, num_features, scaler=StandardScaler())

In [8]:

# Set display options to show all columns
pd.set_option('display.max_columns', None)

# Display all columns of the train_df dataframe
train_df.head()


Unnamed: 0,id,CreditScore,Age,Tenure,Balance,EstimatedSalary,Exited,Length,Vowels,Consonants,Uniqueness,Balance_Per_Product,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,HasCrCard_0.0,HasCrCard_1.0,IsActiveMember_0.0,IsActiveMember_1.0,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Initial_A,Initial_B,Initial_C,Initial_D,Initial_E,Initial_F,Initial_G,Initial_H,Initial_I,Initial_J,Initial_K,Initial_L,Initial_M,Initial_N,Initial_O,Initial_P,Initial_Q,Initial_R,Initial_S,Initial_T,Initial_U,Initial_V,Initial_W,Initial_Y,Initial_Z,IsSenior_0,IsSenior_1,IsActive_by_CreditCard_0.0,IsActive_by_CreditCard_1.0,Products_Per_Tenure_0.0,Products_Per_Tenure_0.25,Products_Per_Tenure_0.3333333333333333,Products_Per_Tenure_0.5,Products_Per_Tenure_0.6666666666666666,Products_Per_Tenure_0.75,Products_Per_Tenure_1.0,Products_Per_Tenure_1.25,Products_Per_Tenure_1.3333333333333333,Products_Per_Tenure_1.5,Products_Per_Tenure_1.6666666666666667,Products_Per_Tenure_1.75,Products_Per_Tenure_2.0,Products_Per_Tenure_2.25,Products_Per_Tenure_2.3333333333333335,Products_Per_Tenure_2.5,Products_Per_Tenure_2.6666666666666665,Products_Per_Tenure_3.0,Products_Per_Tenure_3.3333333333333335,Products_Per_Tenure_3.5,Products_Per_Tenure_4.0,Products_Per_Tenure_4.5,Products_Per_Tenure_5.0,Products_Per_Tenure_6.0,Products_Per_Tenure_7.0,Products_Per_Tenure_8.0,Products_Per_Tenure_9.0,Products_Per_Tenure_10.0,Products_Per_Age_5.5,Products_Per_Age_6.0,Products_Per_Age_6.333333333333333,Products_Per_Age_6.5,Products_Per_Age_6.666666666666667,Products_Per_Age_6.75,Products_Per_Age_7.0,Products_Per_Age_7.25,Products_Per_Age_7.333333333333333,Products_Per_Age_7.5,Products_Per_Age_7.666666666666667,Products_Per_Age_7.75,Products_Per_Age_8.0,Products_Per_Age_8.25,Products_Per_Age_8.333333333333334,Products_Per_Age_8.5,Products_Per_Age_8.666666666666666,Products_Per_Age_8.75,Products_Per_Age_9.0,Products_Per_Age_9.25,Products_Per_Age_9.333333333333334,Products_Per_Age_9.5,Products_Per_Age_9.666666666666666,Products_Per_Age_9.75,Products_Per_Age_10.0,Products_Per_Age_10.25,Products_Per_Age_10.333333333333334,Products_Per_Age_10.5,Products_Per_Age_10.666666666666666,Products_Per_Age_10.75,Products_Per_Age_11.0,Products_Per_Age_11.25,Products_Per_Age_11.333333333333334,Products_Per_Age_11.5,Products_Per_Age_11.666666666666666,Products_Per_Age_11.75,Products_Per_Age_12.0,Products_Per_Age_12.25,Products_Per_Age_12.333333333333334,Products_Per_Age_12.5,Products_Per_Age_12.666666666666666,Products_Per_Age_12.75,Products_Per_Age_13.0,Products_Per_Age_13.25,Products_Per_Age_13.333333333333334,Products_Per_Age_13.5,Products_Per_Age_13.666666666666666,Products_Per_Age_13.75,Products_Per_Age_14.0,Products_Per_Age_14.25,Products_Per_Age_14.333333333333334,Products_Per_Age_14.5,Products_Per_Age_14.666666666666666,Products_Per_Age_14.75,Products_Per_Age_15.0,Products_Per_Age_15.25,Products_Per_Age_15.333333333333334,Products_Per_Age_15.5,Products_Per_Age_15.666666666666666,Products_Per_Age_15.75,Products_Per_Age_16.0,Products_Per_Age_16.17,Products_Per_Age_16.25,Products_Per_Age_16.333333333333332,Products_Per_Age_16.5,Products_Per_Age_16.666666666666668,Products_Per_Age_16.75,Products_Per_Age_17.0,Products_Per_Age_17.333333333333332,Products_Per_Age_17.5,Products_Per_Age_17.666666666666668,Products_Per_Age_17.75,Products_Per_Age_18.0,Products_Per_Age_18.333333333333332,Products_Per_Age_18.5,Products_Per_Age_18.666666666666668,Products_Per_Age_19.0,Products_Per_Age_19.333333333333332,Products_Per_Age_19.5,Products_Per_Age_19.666666666666668,Products_Per_Age_20.0,Products_Per_Age_20.333333333333332,Products_Per_Age_20.5,Products_Per_Age_20.666666666666668,Products_Per_Age_21.0,Products_Per_Age_21.333333333333332,Products_Per_Age_21.5,Products_Per_Age_21.666666666666668,Products_Per_Age_22.0,Products_Per_Age_22.333333333333332,Products_Per_Age_22.5,Products_Per_Age_22.666666666666668,Products_Per_Age_23.0,Products_Per_Age_23.333333333333332,Products_Per_Age_23.5,Products_Per_Age_23.666666666666668,Products_Per_Age_24.0,Products_Per_Age_24.5,Products_Per_Age_25.0,Products_Per_Age_25.5,Products_Per_Age_25.666666666666668,Products_Per_Age_26.0,Products_Per_Age_26.5,Products_Per_Age_27.0,Products_Per_Age_27.5,Products_Per_Age_28.0,Products_Per_Age_28.5,Products_Per_Age_29.0,Products_Per_Age_29.5,Products_Per_Age_30.0,Products_Per_Age_30.5,Products_Per_Age_31.0,Products_Per_Age_31.5,Products_Per_Age_32.0,Products_Per_Age_32.5,Products_Per_Age_33.0,Products_Per_Age_33.5,Products_Per_Age_34.0,Products_Per_Age_34.5,Products_Per_Age_35.0,Products_Per_Age_35.5,Products_Per_Age_36.0,Products_Per_Age_36.44,Products_Per_Age_36.5,Products_Per_Age_37.0,Products_Per_Age_37.5,Products_Per_Age_38.0,Products_Per_Age_38.5,Products_Per_Age_39.0,Products_Per_Age_39.5,Products_Per_Age_40.0,Products_Per_Age_40.5,Products_Per_Age_41.0,Products_Per_Age_41.5,Products_Per_Age_42.0,Products_Per_Age_42.5,Products_Per_Age_43.0,Products_Per_Age_44.0,Products_Per_Age_45.0,Products_Per_Age_46.0,Products_Per_Age_47.0,Products_Per_Age_48.0,Products_Per_Age_49.0,Products_Per_Age_50.0,Products_Per_Age_51.0,Products_Per_Age_52.0,Products_Per_Age_53.0,Products_Per_Age_54.0,Products_Per_Age_55.0,Products_Per_Age_56.0,Products_Per_Age_57.0,Products_Per_Age_58.0,Products_Per_Age_59.0,Products_Per_Age_60.0,Products_Per_Age_61.0,Products_Per_Age_62.0,Products_Per_Age_63.0,Products_Per_Age_64.0,Products_Per_Age_65.0,Products_Per_Age_66.0,Products_Per_Age_67.0,Products_Per_Age_68.0,Products_Per_Age_69.0,Products_Per_Age_70.0,Products_Per_Age_71.0,Products_Per_Age_72.0,Products_Per_Age_73.0,Products_Per_Age_74.0,Products_Per_Age_75.0,Products_Per_Age_76.0,Products_Per_Age_77.0,Products_Per_Age_78.0,Products_Per_Age_79.0,Products_Per_Age_80.0,Products_Per_Age_81.0,Products_Per_Age_82.0,Products_Per_Age_83.0,Products_Per_Age_84.0,Products_Per_Age_85.0,Products_Per_Age_92.0,CreditRating_Very Poor,CreditRating_Poor,CreditRating_Fair,CreditRating_Good,CreditRating_Very Good,CreditRating_Exceptional,CreditRating_Excellent,Age_bins_1,Age_bins_2,Age_bins_3,Age_bins_4,Age_bins_5,Age_bins_6,Age_bins_7,Age_bins_8,Age_bins_9
0,0,0.144135,-0.578074,-0.719973,0.0,1.369486,0,2.852362,2.372797,2.632339,-1.986372,-0.826179,True,False,False,False,True,False,True,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False
1,1,-0.367706,-0.578074,-1.432694,0.0,-1.254085,0,2.469845,3.079516,1.392546,-1.587112,-0.826179,True,False,False,False,True,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False
2,2,0.268974,0.211354,1.774548,0.0,1.437422,0,-0.590292,-0.454078,-0.467144,-0.717611,-0.826179,True,False,False,False,True,False,True,True,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False
3,3,-0.941966,-0.465299,-1.076334,148882.54,-0.557018,0,-1.355327,-0.454078,-1.706938,0.897177,1.798801,True,False,False,False,True,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False
4,4,0.743362,-0.578074,-0.007253,0.0,-1.93877,0,0.939776,0.959359,0.772649,-0.897031,-0.826179,False,False,True,False,True,False,True,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False


In [9]:
# Split the training data
X_train = train_df.drop(["Exited", "id"], axis=1)
y_train = train_df["Exited"]

In [10]:


#hyperparameter tuning
space = {
    # added scope to  make sure the max depth is an integer
    'max_depth': scope.int(hp.quniform('max_depth', 2, 8, 1)),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 4),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'reg_alpha': scope.int(hp.uniform('reg_alpha', 0, 10)),
    'reg_lambda': hp.uniform('reg_lambda', 1, 10),
    'gamma': hp.loguniform('gamma', -10, 10),
    'learning_rate': hp.loguniform('learning_rate', np.log10(0.1), np.log10(0.3)),
    'random_state': SEED,
}

In [18]:
def objective(space):
    #Compute the scale_pos_weight
    ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
    
    model = xgb.XGBClassifier(
        max_depth=int(space['max_depth']),
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree'],
        reg_alpha=space['reg_alpha'],
        reg_lambda=space['reg_lambda'],
        gamma=space['gamma'],
        learning_rate=space['learning_rate'],
        scale_pos_weight=ratio,
        random_state=SEED,
        nthread=-1
        
    )
    
    # Implement cross-validation
    kf = SKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    auc_scores = []

    for train_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr)
        y_pred_prob = model.predict_proba(X_val)[:, 1]
        auc_score = roc_auc_score(y_val, y_pred_prob)
        auc_scores.append(auc_score)

    average_auc_score = np.mean(auc_scores)

    return {'loss': -average_auc_score, 'status': STATUS_OK}


In [19]:
#running the hyperparameter tuning

trials = Trials()
best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=EVALS,
                        trials=trials)

print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

print("The best auc score is: ", "\n")
print(trials.best_trial['result']['loss'])


100%|██████████| 2000/2000 [12:25:47<00:00, 22.37s/trial, best loss: -0.8913700121438571]  
The best hyperparameters are:  

{'colsample_bytree': 0.9993687627936996, 'gamma': 0.012430687070513199, 'learning_rate': 0.3838117712528664, 'max_depth': 3.0, 'min_child_weight': 0.5758186020760541, 'reg_alpha': 8.594946362150308, 'reg_lambda': 8.209106961673578, 'subsample': 0.9942579263623403}
The best auc score is:  

-0.8913700121438571


In [23]:
best_hyperparams = {
    "colsample_bytree": 0.9993687627936996,
    "gamma": 0.012430687070513199,
    "learning_rate": 0.3838117712528664,
    "max_depth": 3.0,
    "min_child_weight": 0.5758186020760541,
    "reg_alpha": 8.594946362150308,
    "reg_lambda": 8.209106961673578,
    "subsample": 0.9942579263623403,
}


# Convert dataset to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)

# Define your parameters
params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": int(best_hyperparams["max_depth"]),
    "min_child_weight": best_hyperparams["min_child_weight"],
    "subsample": best_hyperparams["subsample"],
    "colsample_bytree": best_hyperparams["colsample_bytree"],
    "learning_rate": best_hyperparams["learning_rate"],
    "reg_alpha": best_hyperparams["reg_alpha"],
    "reg_lambda": best_hyperparams["reg_lambda"],
    "gamma": best_hyperparams["gamma"],
    "seed": SEED,
}

# Perform cross-validation with early stopping
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,
    nfold=FOLDS,
    early_stopping_rounds=50,
    verbose_eval=50,
    metrics=["auc"],
)

# Optimal number of boosting rounds
optimal_boost_rounds = cv_results.shape[0]

# Display best boosting rounds
display(cv_results.tail())
print(f"Optimal boosting rounds = {optimal_boost_rounds}")


[0]	train-auc:0.83901+0.00056	test-auc:0.83878+0.00224
[50]	train-auc:0.89262+0.00030	test-auc:0.89042+0.00165
[100]	train-auc:0.89527+0.00037	test-auc:0.89125+0.00171
[150]	train-auc:0.89719+0.00047	test-auc:0.89148+0.00163
[200]	train-auc:0.89882+0.00046	test-auc:0.89158+0.00146
[250]	train-auc:0.90017+0.00051	test-auc:0.89160+0.00147
[275]	train-auc:0.90077+0.00049	test-auc:0.89159+0.00142


Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
221,0.899392,0.000492,0.891598,0.001439
222,0.899423,0.000493,0.891603,0.001454
223,0.899472,0.000502,0.891619,0.00145
224,0.899501,0.000501,0.891628,0.001439
225,0.899531,0.000507,0.891636,0.001443


Optimal boosting rounds = 226


In [30]:
X_test = train_df.drop(["id"], axis=1)
if not list(X_train.columns) == list(X_test.columns):
    print("Columns in X_train and X_test do not match.")
    print("Extra columns in X_train:", set(X_train.columns) - set(X_test.columns))
    print("Extra columns in X_test:", set(X_test.columns) - set(X_train.columns))

Columns in X_train and X_test do not match.
Extra columns in X_train: set()
Extra columns in X_test: {'Exited'}


In [26]:
from sklearn.model_selection import train_test_split

# final model with the optimal number of estimators
final_model = xgb.XGBClassifier(
    n_estimators=optimal_boost_rounds,
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=int(best_hyperparams['max_depth']),
    min_child_weight=best_hyperparams['min_child_weight'],
    subsample=best_hyperparams['subsample'],
    colsample_bytree=best_hyperparams['colsample_bytree'],
    learning_rate=best_hyperparams['learning_rate'],
    reg_alpha=best_hyperparams['reg_alpha'],
    reg_lambda=best_hyperparams['reg_lambda'],
    gamma=best_hyperparams['gamma'],
    random_state=SEED
)
# Fit the final model
final_model.fit(X_train, y_train)


X_test = 

# Predict class probabilities
y_pred_prob = final_model.predict_proba(X_test)[:, 1]


ValueError: feature_names mismatch: ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary', 'Length', 'Vowels', 'Consonants', 'Uniqueness', 'Balance_Per_Product', 'Geography_France', 'Geography_Germany', 'Geography_Spain', 'Gender_Female', 'Gender_Male', 'HasCrCard_0.0', 'HasCrCard_1.0', 'IsActiveMember_0.0', 'IsActiveMember_1.0', 'NumOfProducts_1', 'NumOfProducts_2', 'NumOfProducts_3', 'NumOfProducts_4', 'Initial_A', 'Initial_B', 'Initial_C', 'Initial_D', 'Initial_E', 'Initial_F', 'Initial_G', 'Initial_H', 'Initial_I', 'Initial_J', 'Initial_K', 'Initial_L', 'Initial_M', 'Initial_N', 'Initial_O', 'Initial_P', 'Initial_Q', 'Initial_R', 'Initial_S', 'Initial_T', 'Initial_U', 'Initial_V', 'Initial_W', 'Initial_Y', 'Initial_Z', 'IsSenior_0', 'IsSenior_1', 'IsActive_by_CreditCard_0.0', 'IsActive_by_CreditCard_1.0', 'Products_Per_Tenure_0.0', 'Products_Per_Tenure_0.25', 'Products_Per_Tenure_0.3333333333333333', 'Products_Per_Tenure_0.5', 'Products_Per_Tenure_0.6666666666666666', 'Products_Per_Tenure_0.75', 'Products_Per_Tenure_1.0', 'Products_Per_Tenure_1.25', 'Products_Per_Tenure_1.3333333333333333', 'Products_Per_Tenure_1.5', 'Products_Per_Tenure_1.6666666666666667', 'Products_Per_Tenure_1.75', 'Products_Per_Tenure_2.0', 'Products_Per_Tenure_2.25', 'Products_Per_Tenure_2.3333333333333335', 'Products_Per_Tenure_2.5', 'Products_Per_Tenure_2.6666666666666665', 'Products_Per_Tenure_3.0', 'Products_Per_Tenure_3.3333333333333335', 'Products_Per_Tenure_3.5', 'Products_Per_Tenure_4.0', 'Products_Per_Tenure_4.5', 'Products_Per_Tenure_5.0', 'Products_Per_Tenure_6.0', 'Products_Per_Tenure_7.0', 'Products_Per_Tenure_8.0', 'Products_Per_Tenure_9.0', 'Products_Per_Tenure_10.0', 'Products_Per_Age_5.5', 'Products_Per_Age_6.0', 'Products_Per_Age_6.333333333333333', 'Products_Per_Age_6.5', 'Products_Per_Age_6.666666666666667', 'Products_Per_Age_6.75', 'Products_Per_Age_7.0', 'Products_Per_Age_7.25', 'Products_Per_Age_7.333333333333333', 'Products_Per_Age_7.5', 'Products_Per_Age_7.666666666666667', 'Products_Per_Age_7.75', 'Products_Per_Age_8.0', 'Products_Per_Age_8.25', 'Products_Per_Age_8.333333333333334', 'Products_Per_Age_8.5', 'Products_Per_Age_8.666666666666666', 'Products_Per_Age_8.75', 'Products_Per_Age_9.0', 'Products_Per_Age_9.25', 'Products_Per_Age_9.333333333333334', 'Products_Per_Age_9.5', 'Products_Per_Age_9.666666666666666', 'Products_Per_Age_9.75', 'Products_Per_Age_10.0', 'Products_Per_Age_10.25', 'Products_Per_Age_10.333333333333334', 'Products_Per_Age_10.5', 'Products_Per_Age_10.666666666666666', 'Products_Per_Age_10.75', 'Products_Per_Age_11.0', 'Products_Per_Age_11.25', 'Products_Per_Age_11.333333333333334', 'Products_Per_Age_11.5', 'Products_Per_Age_11.666666666666666', 'Products_Per_Age_11.75', 'Products_Per_Age_12.0', 'Products_Per_Age_12.25', 'Products_Per_Age_12.333333333333334', 'Products_Per_Age_12.5', 'Products_Per_Age_12.666666666666666', 'Products_Per_Age_12.75', 'Products_Per_Age_13.0', 'Products_Per_Age_13.25', 'Products_Per_Age_13.333333333333334', 'Products_Per_Age_13.5', 'Products_Per_Age_13.666666666666666', 'Products_Per_Age_13.75', 'Products_Per_Age_14.0', 'Products_Per_Age_14.25', 'Products_Per_Age_14.333333333333334', 'Products_Per_Age_14.5', 'Products_Per_Age_14.666666666666666', 'Products_Per_Age_14.75', 'Products_Per_Age_15.0', 'Products_Per_Age_15.25', 'Products_Per_Age_15.333333333333334', 'Products_Per_Age_15.5', 'Products_Per_Age_15.666666666666666', 'Products_Per_Age_15.75', 'Products_Per_Age_16.0', 'Products_Per_Age_16.17', 'Products_Per_Age_16.25', 'Products_Per_Age_16.333333333333332', 'Products_Per_Age_16.5', 'Products_Per_Age_16.666666666666668', 'Products_Per_Age_16.75', 'Products_Per_Age_17.0', 'Products_Per_Age_17.333333333333332', 'Products_Per_Age_17.5', 'Products_Per_Age_17.666666666666668', 'Products_Per_Age_17.75', 'Products_Per_Age_18.0', 'Products_Per_Age_18.333333333333332', 'Products_Per_Age_18.5', 'Products_Per_Age_18.666666666666668', 'Products_Per_Age_19.0', 'Products_Per_Age_19.333333333333332', 'Products_Per_Age_19.5', 'Products_Per_Age_19.666666666666668', 'Products_Per_Age_20.0', 'Products_Per_Age_20.333333333333332', 'Products_Per_Age_20.5', 'Products_Per_Age_20.666666666666668', 'Products_Per_Age_21.0', 'Products_Per_Age_21.333333333333332', 'Products_Per_Age_21.5', 'Products_Per_Age_21.666666666666668', 'Products_Per_Age_22.0', 'Products_Per_Age_22.333333333333332', 'Products_Per_Age_22.5', 'Products_Per_Age_22.666666666666668', 'Products_Per_Age_23.0', 'Products_Per_Age_23.333333333333332', 'Products_Per_Age_23.5', 'Products_Per_Age_23.666666666666668', 'Products_Per_Age_24.0', 'Products_Per_Age_24.5', 'Products_Per_Age_25.0', 'Products_Per_Age_25.5', 'Products_Per_Age_25.666666666666668', 'Products_Per_Age_26.0', 'Products_Per_Age_26.5', 'Products_Per_Age_27.0', 'Products_Per_Age_27.5', 'Products_Per_Age_28.0', 'Products_Per_Age_28.5', 'Products_Per_Age_29.0', 'Products_Per_Age_29.5', 'Products_Per_Age_30.0', 'Products_Per_Age_30.5', 'Products_Per_Age_31.0', 'Products_Per_Age_31.5', 'Products_Per_Age_32.0', 'Products_Per_Age_32.5', 'Products_Per_Age_33.0', 'Products_Per_Age_33.5', 'Products_Per_Age_34.0', 'Products_Per_Age_34.5', 'Products_Per_Age_35.0', 'Products_Per_Age_35.5', 'Products_Per_Age_36.0', 'Products_Per_Age_36.44', 'Products_Per_Age_36.5', 'Products_Per_Age_37.0', 'Products_Per_Age_37.5', 'Products_Per_Age_38.0', 'Products_Per_Age_38.5', 'Products_Per_Age_39.0', 'Products_Per_Age_39.5', 'Products_Per_Age_40.0', 'Products_Per_Age_40.5', 'Products_Per_Age_41.0', 'Products_Per_Age_41.5', 'Products_Per_Age_42.0', 'Products_Per_Age_42.5', 'Products_Per_Age_43.0', 'Products_Per_Age_44.0', 'Products_Per_Age_45.0', 'Products_Per_Age_46.0', 'Products_Per_Age_47.0', 'Products_Per_Age_48.0', 'Products_Per_Age_49.0', 'Products_Per_Age_50.0', 'Products_Per_Age_51.0', 'Products_Per_Age_52.0', 'Products_Per_Age_53.0', 'Products_Per_Age_54.0', 'Products_Per_Age_55.0', 'Products_Per_Age_56.0', 'Products_Per_Age_57.0', 'Products_Per_Age_58.0', 'Products_Per_Age_59.0', 'Products_Per_Age_60.0', 'Products_Per_Age_61.0', 'Products_Per_Age_62.0', 'Products_Per_Age_63.0', 'Products_Per_Age_64.0', 'Products_Per_Age_65.0', 'Products_Per_Age_66.0', 'Products_Per_Age_67.0', 'Products_Per_Age_68.0', 'Products_Per_Age_69.0', 'Products_Per_Age_70.0', 'Products_Per_Age_71.0', 'Products_Per_Age_72.0', 'Products_Per_Age_73.0', 'Products_Per_Age_74.0', 'Products_Per_Age_75.0', 'Products_Per_Age_76.0', 'Products_Per_Age_77.0', 'Products_Per_Age_78.0', 'Products_Per_Age_79.0', 'Products_Per_Age_80.0', 'Products_Per_Age_81.0', 'Products_Per_Age_82.0', 'Products_Per_Age_83.0', 'Products_Per_Age_84.0', 'Products_Per_Age_85.0', 'Products_Per_Age_92.0', 'CreditRating_Very Poor', 'CreditRating_Poor', 'CreditRating_Fair', 'CreditRating_Good', 'CreditRating_Very Good', 'CreditRating_Exceptional', 'CreditRating_Excellent', 'Age_bins_1', 'Age_bins_2', 'Age_bins_3', 'Age_bins_4', 'Age_bins_5', 'Age_bins_6', 'Age_bins_7', 'Age_bins_8', 'Age_bins_9'] ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary', 'Length', 'Vowels', 'Consonants', 'Uniqueness', 'Balance_Per_Product', 'Geography_France', 'Geography_Germany', 'Geography_Spain', 'Gender_Female', 'Gender_Male', 'HasCrCard_0.0', 'HasCrCard_1.0', 'IsActiveMember_0.0', 'IsActiveMember_1.0', 'NumOfProducts_1', 'NumOfProducts_2', 'NumOfProducts_3', 'NumOfProducts_4', 'Initial_A', 'Initial_B', 'Initial_C', 'Initial_D', 'Initial_E', 'Initial_F', 'Initial_G', 'Initial_H', 'Initial_I', 'Initial_J', 'Initial_K', 'Initial_L', 'Initial_M', 'Initial_N', 'Initial_O', 'Initial_P', 'Initial_Q', 'Initial_R', 'Initial_S', 'Initial_T', 'Initial_U', 'Initial_V', 'Initial_W', 'Initial_Y', 'Initial_Z', 'IsSenior_0', 'IsSenior_1', 'IsActive_by_CreditCard_0.0', 'IsActive_by_CreditCard_1.0', 'Products_Per_Tenure_0.0', 'Products_Per_Tenure_0.25', 'Products_Per_Tenure_0.3333333333333333', 'Products_Per_Tenure_0.5', 'Products_Per_Tenure_0.6666666666666666', 'Products_Per_Tenure_0.75', 'Products_Per_Tenure_1.0', 'Products_Per_Tenure_1.25', 'Products_Per_Tenure_1.3333333333333333', 'Products_Per_Tenure_1.5', 'Products_Per_Tenure_1.6666666666666667', 'Products_Per_Tenure_1.75', 'Products_Per_Tenure_2.0', 'Products_Per_Tenure_2.25', 'Products_Per_Tenure_2.3333333333333335', 'Products_Per_Tenure_2.5', 'Products_Per_Tenure_2.6666666666666665', 'Products_Per_Tenure_3.0', 'Products_Per_Tenure_3.3333333333333335', 'Products_Per_Tenure_3.5', 'Products_Per_Tenure_4.0', 'Products_Per_Tenure_4.5', 'Products_Per_Tenure_5.0', 'Products_Per_Tenure_6.0', 'Products_Per_Tenure_7.0', 'Products_Per_Tenure_8.0', 'Products_Per_Tenure_9.0', 'Products_Per_Tenure_10.0', 'Products_Per_Age_5.5', 'Products_Per_Age_5.75', 'Products_Per_Age_6.25', 'Products_Per_Age_6.333333333333333', 'Products_Per_Age_6.5', 'Products_Per_Age_6.75', 'Products_Per_Age_7.0', 'Products_Per_Age_7.25', 'Products_Per_Age_7.333333333333333', 'Products_Per_Age_7.5', 'Products_Per_Age_7.666666666666667', 'Products_Per_Age_7.75', 'Products_Per_Age_8.0', 'Products_Per_Age_8.25', 'Products_Per_Age_8.333333333333334', 'Products_Per_Age_8.5', 'Products_Per_Age_8.666666666666666', 'Products_Per_Age_8.75', 'Products_Per_Age_9.0', 'Products_Per_Age_9.25', 'Products_Per_Age_9.333333333333334', 'Products_Per_Age_9.5', 'Products_Per_Age_9.666666666666666', 'Products_Per_Age_9.75', 'Products_Per_Age_10.0', 'Products_Per_Age_10.25', 'Products_Per_Age_10.333333333333334', 'Products_Per_Age_10.5', 'Products_Per_Age_10.666666666666666', 'Products_Per_Age_10.75', 'Products_Per_Age_11.0', 'Products_Per_Age_11.25', 'Products_Per_Age_11.333333333333334', 'Products_Per_Age_11.5', 'Products_Per_Age_11.666666666666666', 'Products_Per_Age_11.75', 'Products_Per_Age_12.0', 'Products_Per_Age_12.25', 'Products_Per_Age_12.333333333333334', 'Products_Per_Age_12.5', 'Products_Per_Age_12.666666666666666', 'Products_Per_Age_12.75', 'Products_Per_Age_13.0', 'Products_Per_Age_13.25', 'Products_Per_Age_13.333333333333334', 'Products_Per_Age_13.5', 'Products_Per_Age_13.666666666666666', 'Products_Per_Age_13.75', 'Products_Per_Age_14.0', 'Products_Per_Age_14.25', 'Products_Per_Age_14.333333333333334', 'Products_Per_Age_14.5', 'Products_Per_Age_14.666666666666666', 'Products_Per_Age_14.75', 'Products_Per_Age_15.0', 'Products_Per_Age_15.25', 'Products_Per_Age_15.333333333333334', 'Products_Per_Age_15.5', 'Products_Per_Age_15.666666666666666', 'Products_Per_Age_15.75', 'Products_Per_Age_16.0', 'Products_Per_Age_16.17', 'Products_Per_Age_16.333333333333332', 'Products_Per_Age_16.5', 'Products_Per_Age_16.666666666666668', 'Products_Per_Age_16.75', 'Products_Per_Age_17.0', 'Products_Per_Age_17.333333333333332', 'Products_Per_Age_17.5', 'Products_Per_Age_17.666666666666668', 'Products_Per_Age_17.75', 'Products_Per_Age_18.0', 'Products_Per_Age_18.22', 'Products_Per_Age_18.333333333333332', 'Products_Per_Age_18.5', 'Products_Per_Age_18.666666666666668', 'Products_Per_Age_19.0', 'Products_Per_Age_19.333333333333332', 'Products_Per_Age_19.5', 'Products_Per_Age_19.666666666666668', 'Products_Per_Age_20.0', 'Products_Per_Age_20.333333333333332', 'Products_Per_Age_20.5', 'Products_Per_Age_20.666666666666668', 'Products_Per_Age_21.0', 'Products_Per_Age_21.333333333333332', 'Products_Per_Age_21.5', 'Products_Per_Age_21.666666666666668', 'Products_Per_Age_22.0', 'Products_Per_Age_22.333333333333332', 'Products_Per_Age_22.5', 'Products_Per_Age_22.666666666666668', 'Products_Per_Age_23.0', 'Products_Per_Age_23.333333333333332', 'Products_Per_Age_23.5', 'Products_Per_Age_23.666666666666668', 'Products_Per_Age_24.0', 'Products_Per_Age_24.32', 'Products_Per_Age_24.5', 'Products_Per_Age_25.0', 'Products_Per_Age_25.5', 'Products_Per_Age_25.666666666666668', 'Products_Per_Age_26.0', 'Products_Per_Age_26.5', 'Products_Per_Age_27.0', 'Products_Per_Age_27.5', 'Products_Per_Age_28.0', 'Products_Per_Age_28.5', 'Products_Per_Age_29.0', 'Products_Per_Age_29.5', 'Products_Per_Age_30.0', 'Products_Per_Age_30.5', 'Products_Per_Age_30.666666666666668', 'Products_Per_Age_31.0', 'Products_Per_Age_31.5', 'Products_Per_Age_32.0', 'Products_Per_Age_32.34', 'Products_Per_Age_32.5', 'Products_Per_Age_33.0', 'Products_Per_Age_33.5', 'Products_Per_Age_34.0', 'Products_Per_Age_34.5', 'Products_Per_Age_35.0', 'Products_Per_Age_35.5', 'Products_Per_Age_36.0', 'Products_Per_Age_36.44', 'Products_Per_Age_36.5', 'Products_Per_Age_37.0', 'Products_Per_Age_37.5', 'Products_Per_Age_38.0', 'Products_Per_Age_38.5', 'Products_Per_Age_39.0', 'Products_Per_Age_39.5', 'Products_Per_Age_40.0', 'Products_Per_Age_40.5', 'Products_Per_Age_41.0', 'Products_Per_Age_41.5', 'Products_Per_Age_42.0', 'Products_Per_Age_42.5', 'Products_Per_Age_43.0', 'Products_Per_Age_44.0', 'Products_Per_Age_45.0', 'Products_Per_Age_45.25', 'Products_Per_Age_46.0', 'Products_Per_Age_47.0', 'Products_Per_Age_48.0', 'Products_Per_Age_49.0', 'Products_Per_Age_50.0', 'Products_Per_Age_51.0', 'Products_Per_Age_52.0', 'Products_Per_Age_53.0', 'Products_Per_Age_54.0', 'Products_Per_Age_55.0', 'Products_Per_Age_56.0', 'Products_Per_Age_57.0', 'Products_Per_Age_58.0', 'Products_Per_Age_59.0', 'Products_Per_Age_60.0', 'Products_Per_Age_61.0', 'Products_Per_Age_62.0', 'Products_Per_Age_63.0', 'Products_Per_Age_64.0', 'Products_Per_Age_65.0', 'Products_Per_Age_66.0', 'Products_Per_Age_67.0', 'Products_Per_Age_68.0', 'Products_Per_Age_69.0', 'Products_Per_Age_70.0', 'Products_Per_Age_71.0', 'Products_Per_Age_72.0', 'Products_Per_Age_73.0', 'Products_Per_Age_74.0', 'Products_Per_Age_75.0', 'Products_Per_Age_76.0', 'Products_Per_Age_77.0', 'Products_Per_Age_78.0', 'Products_Per_Age_79.0', 'Products_Per_Age_80.0', 'Products_Per_Age_81.0', 'Products_Per_Age_82.0', 'Products_Per_Age_83.0', 'Products_Per_Age_84.0', 'Products_Per_Age_85.0', 'Products_Per_Age_88.0', 'Products_Per_Age_92.0', 'CreditRating_Very Poor', 'CreditRating_Poor', 'CreditRating_Fair', 'CreditRating_Good', 'CreditRating_Very Good', 'CreditRating_Exceptional', 'CreditRating_Excellent', 'Age_bins_1', 'Age_bins_2', 'Age_bins_3', 'Age_bins_4', 'Age_bins_5', 'Age_bins_6', 'Age_bins_7', 'Age_bins_8', 'Age_bins_9']
expected Products_Per_Age_6.0, Products_Per_Age_6.666666666666667, Products_Per_Age_16.25 in input data
training data did not have the following fields: Products_Per_Age_32.34, Products_Per_Age_88.0, Products_Per_Age_6.25, Products_Per_Age_18.22, Products_Per_Age_45.25, Products_Per_Age_30.666666666666668, Products_Per_Age_24.32, Products_Per_Age_5.75

In [None]:
feature_importance = final_model.feature_importances_

# Sort feature importance in descending order
sorted_indices = np.argsort(feature_importance)[::-1]
sorted_importance = feature_importance[sorted_indices]
sorted_columns = X_train.columns[sorted_indices][::-1]

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(sorted_columns[::-1], sorted_importance[::-1]) 
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('XGBoost Feature Importance')
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.show()


In [None]:
# Predict probabilities for the test dataset
test_pred_prob = final_model.predict_proba(X_test)[:, 1]

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Exited': test_pred_prob
})
submission_df.to_csv('submission.csv', index=False)
submission_df.head()