In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold as SKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from hyperopt.pyll import scope
import matplotlib.pyplot as plt

In [2]:
#global variables

SEED = 42

FOLDS = 10

EVALS = 1000

FILEPATH = '../data/'

In [3]:

test = pd.read_csv(f'{FILEPATH}test.csv')
train = pd.read_csv(f'{FILEPATH}train.csv')
#original = pd.read_csv(f'{FILEPATH}original.csv').drop('RowNumber', axis=1)
#train = pd.concat([train, original]).reset_index(drop=True).copy()

In [5]:
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0.0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1.0,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2.0,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3.0,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4.0,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [6]:
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("______________________")
    display(df.info(verbose=True, show_counts=True))
    print("\n")
    
    print("DataFrame Head:")
    print("______________________")
    display(df.head())
    print("\n")

    print("DataFrame Tail:")
    print("______________________")
    display(df.tail())
    print("\n")

    print("DataFrame Description:")
    print("______________________")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("______________________")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("______________________")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("______________________")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("______________________")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")
    
    print("DataFrame Columns:")
    print("______________________")
    display(df.columns)
    

analyze_dataframe(train)

DataFrame Information:
______________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175036 entries, 0 to 175035
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  float64
 1   CustomerId       175036 non-null  int64  
 2   Surname          175036 non-null  object 
 3   CreditScore      175036 non-null  int64  
 4   Geography        175035 non-null  object 
 5   Gender           175036 non-null  object 
 6   Age              175035 non-null  float64
 7   Tenure           175036 non-null  int64  
 8   Balance          175036 non-null  float64
 9   NumOfProducts    175036 non-null  int64  
 10  HasCrCard        175035 non-null  float64
 11  IsActiveMember   175035 non-null  float64
 12  EstimatedSalary  175036 non-null  float64
 13  Exited           175036 non-null  int64  
dtypes: float64(6), int64(5), object(3)
memory usage: 18.7+ MB


None



DataFrame Head:
______________________


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0.0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1.0,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2.0,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3.0,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4.0,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0




DataFrame Tail:
______________________


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
175031,,15584532,Liu,709,France,Female,36.0,7,0.0,1,0.0,1.0,42085.58,1
175032,,15682355,Sabbatini,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
175033,,15682355,Sabbatini,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
175034,,15628319,Walker,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0
175035,,15628319,Walker,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0




DataFrame Description:
______________________


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,165034.0,82516.5,47641.3565,0.0,41258.25,82516.5,123774.75,165033.0
CustomerId,175036.0,15691940.0,71428.662023,15565701.0,15632882.0,15690169.0,15756655.0,15815690.0
CreditScore,175036.0,656.1173,81.15183,350.0,597.0,659.0,710.0,850.0
Age,175035.0,38.17139,8.969523,18.0,32.0,37.0,42.0,92.0
Tenure,175036.0,5.019904,2.811125,0.0,3.0,5.0,7.0,10.0
Balance,175036.0,56678.82,62982.46607,0.0,0.0,0.0,120729.77,250898.09
NumOfProducts,175036.0,1.553069,0.54921,1.0,1.0,2.0,2.0,4.0
HasCrCard,175035.0,0.7511869,0.432327,0.0,1.0,1.0,1.0,1.0
IsActiveMember,175035.0,0.4987517,0.5,0.0,0.0,0.0,1.0,1.0
EstimatedSalary,175036.0,111861.0,50815.418008,11.58,73181.39,116969.73,154767.34,199992.48




Number of Null Values:
______________________


id                 10002
CustomerId             0
Surname                0
CreditScore            0
Geography              1
Gender                 0
Age                    1
Tenure                 0
Balance                0
NumOfProducts          0
HasCrCard              1
IsActiveMember         1
EstimatedSalary        0
Exited                 0
dtype: int64



Number of Duplicated Rows:
______________________


2



Number of Unique Values:
______________________


id                 165034
CustomerId          23421
Surname              2932
CreditScore           460
Geography               3
Gender                  2
Age                    73
Tenure                 11
Balance             30239
NumOfProducts           4
HasCrCard               2
IsActiveMember          2
EstimatedSalary     55581
Exited                  2
dtype: int64



DataFrame Shape:
______________________
Rows: 175036, Columns: 14


DataFrame Columns:
______________________


Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [93]:
# Function to calculate vowel and consonant count
def vowel_consonant_count(word):
    vowels = "aeiouAEIOU"
    vowel_count = sum(1 for char in word if char in vowels)
    consonant_count = sum(1 for char in word if char not in vowels and char.isalpha())
    return vowel_count, consonant_count

# Feature engineering with surname
def create_surname_features(df):
    df['Length'] = df['Surname'].apply(len)
    df['Initial'] = df['Surname'].str[0]
    df[['Vowels', 'Consonants']] = df['Surname'].apply(lambda x: vowel_consonant_count(x)).tolist()
    df['Uniqueness'] = df['Surname'].apply(lambda x: len(set(x.lower())) / len(x) if x else 0)
    df
    return df

train = create_surname_features(train)
test = create_surname_features(test)

In [94]:
def preprocess_data(df, cat_features, num_features, scaler):
    
    
    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=cat_features)

    # Normalize numerical features
    df[num_features] = scaler.fit_transform(df[num_features])

    # Drop unnecessary columns
    df = df.drop(['Surname', 'CustomerId'], axis=1, errors='ignore').dropna().drop_duplicates()
    
    return df


In [95]:
cat_features = [
    "Geography",
    "Gender",
    "HasCrCard",
    "IsActiveMember",
    "NumOfProducts",
    "Initial",
]
num_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "EstimatedSalary",
    "Uniqueness",
    "Vowels",
    "Consonants",
    "Length",
]


In [96]:
#preprocessing
train_df = preprocess_data(train, cat_features, num_features, scaler=StandardScaler())
test_df = preprocess_data(test, cat_features, num_features, scaler=StandardScaler())

In [81]:
# Split the training data
X_train = train_df.drop(["Exited", "id"], axis=1)
y_train = train_df["Exited"]

#hyperparameter tuning
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 2),
    # added scope to  make sure the max depth is an integer
    'max_depth': scope.int(hp.quniform('max_depth', 2, 8, 1)),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'reg_alpha': scope.int(hp.uniform('reg_alpha', 0, 10)),
    'reg_lambda': hp.uniform('reg_lambda', 1, 10),
    'gamma': hp.loguniform('gamma', -10, 10),
    'learning_rate': hp.loguniform('learning_rate', np.log10(0.1), np.log10(0.25)),
    'random_state': SEED,
    'nthread': -1,
}

In [101]:
def objective(space):
    #Compute the scale_pos_weight
    ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
    
    model = xgb.XGBClassifier(
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree'],
        reg_alpha=space['reg_alpha'],
        reg_lambda=space['reg_lambda'],
        gamma=space['gamma'],
        learning_rate=space['learning_rate'],
        scale_pos_weight=ratio,
        random_state=SEED,
        
    )
    
    # Implement cross-validation
    kf = SKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    auc_scores = []

    for train_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr)
        y_pred_prob = model.predict_proba(X_val)[:, 1]
        auc_score = roc_auc_score(y_val, y_pred_prob)
        auc_scores.append(auc_score)

    average_auc_score = np.mean(auc_scores)

    return {'loss': -average_auc_score, 'status': STATUS_OK}


In [102]:
#running the hyperparameter tuning

trials = Trials()
best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=EVALS,
                        trials=trials)

print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

print("The best auc score is: ", "\n")
print(trials.best_trial['result']['loss'])


100%|██████████| 1000/1000 [12:04:18<00:00, 43.46s/trial, best loss: -0.8923606973057912] 
The best hyperparameters are:  

{'colsample_bytree': 0.7848336902999598, 'gamma': 0.11104155666381403, 'learning_rate': 0.37051256533289206, 'max_depth': 2.0, 'min_child_weight': 14.552961522899192, 'n_estimators': 860.0, 'reg_alpha': 5.4576130010073385, 'reg_lambda': 6.580967199150385, 'subsample': 0.9486302751512025}
The best auc score is:  

-0.8923606973057912


In [103]:
# best_hyperparms = {
#     "colsample_bytree": 0.6489473296295848,
#     "gamma": 0.038479137406336225,
#     "learning_rate": 0.1723786481573518,
#     "max_depth": 6.0,
#     "min_child_weight": 2.1495750805555214,
#     "reg_alpha": 3.9132673527741226,
#     "reg_lambda": 4.033439716572811,
#     "subsample": 0.8643794476506356,
# }

# Convert dataset to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)

# Define your parameters
params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": int(best_hyperparams["max_depth"]),
    "min_child_weight": best_hyperparams["min_child_weight"],
    "subsample": best_hyperparams["subsample"],
    "colsample_bytree": best_hyperparams["colsample_bytree"],
    "learning_rate": best_hyperparams["learning_rate"],
    "reg_alpha": best_hyperparams["reg_alpha"],
    "reg_lambda": best_hyperparams["reg_lambda"],
    "gamma": best_hyperparams["gamma"],
    "seed": SEED,
    # Add any other relevant parameters
}

# Perform cross-validation with early stopping
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,  # maximum number of boosting rounds
    nfold=10,  # number of folds for cross-validation
    early_stopping_rounds=30,  # stop if performance hasn't improved for 50 rounds
    verbose_eval=100,  # print out progress every 100 rounds
    metrics=["auc"],  # evaluation metrics
)

# Optimal number of boosting rounds
optimal_boost_rounds = cv_results.shape[0]

# Display best boosting rounds
display(cv_results.tail())
print(f"Optimal boosting rounds = {optimal_boost_rounds}")

[0]	train-auc:0.78518+0.00039	test-auc:0.78516+0.00354
[100]	train-auc:0.89059+0.00037	test-auc:0.88935+0.00297
[200]	train-auc:0.89289+0.00032	test-auc:0.89070+0.00278
[300]	train-auc:0.89430+0.00026	test-auc:0.89120+0.00268
[400]	train-auc:0.89536+0.00026	test-auc:0.89158+0.00259
[500]	train-auc:0.89617+0.00023	test-auc:0.89181+0.00253
[587]	train-auc:0.89678+0.00025	test-auc:0.89193+0.00248


Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
553,0.896546,0.000243,0.891908,0.002474
554,0.896552,0.000244,0.891921,0.00247
555,0.89656,0.000243,0.891925,0.00248
556,0.896566,0.000243,0.891933,0.00248
557,0.896571,0.000244,0.891936,0.002481


Optimal boosting rounds = 558


In [104]:
# final model with the optimal number of estimators
final_model = xgb.XGBClassifier(
    n_estimators=optimal_boost_rounds,
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=int(best_hyperparams['max_depth']),
    min_child_weight=best_hyperparams['min_child_weight'],
    subsample=best_hyperparams['subsample'],
    colsample_bytree=best_hyperparams['colsample_bytree'],
    learning_rate=best_hyperparams['learning_rate'],
    random_state=SEED
)

# Fit the final model
final_model.fit(X_train, y_train)


X_test = test_df.drop(["id"], axis=1)

# Predict class probabilities
y_pred_prob = final_model.predict_proba(X_test)[:, 1]


In [105]:
# Predict probabilities for the test dataset
test_pred_prob = final_model.predict_proba(X_test)[:, 1]

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Exited': test_pred_prob
})
submission_df.to_csv('submission.csv', index=False)
submission_df.head()

Unnamed: 0,id,Exited
0,165034,0.042723
1,165035,0.879431
2,165036,0.034663
3,165037,0.264442
4,165038,0.370168
