In [28]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold as SKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler



from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from hyperopt.pyll import scope
import matplotlib.pyplot as plt
import ydata_profiling as ydp  

In [39]:
#global variables

VER = 1

SEED = 42

FOLDS = 5

FILEPATH = 'data/'

In [82]:

test = pd.read_csv(f'{FILEPATH}test.csv')
train = pd.read_csv(f'{FILEPATH}train.csv')
#original = pd.read_csv(f'{FILEPATH}original.csv').drop('RowNumber', axis=1)
# train['isTrain'] = 1
# test['isTrain'] = 0
# tt = pd.concat([train, test]).reset_index(drop=True).copy()

# concating train and original data
#train = pd.concat([train, original]).reset_index(drop=True).copy()

In [None]:
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("______________________")
    display(df.info(verbose=True, show_counts=True))
    print("\n")
    
    print("DataFrame Head:")
    print("______________________")
    display(df.head())
    print("\n")

    print("DataFrame Tail:")
    print("______________________")
    display(df.tail())
    print("\n")

    print("DataFrame Description:")
    print("______________________")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("______________________")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("______________________")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("______________________")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("______________________")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")
    
    print("DataFrame Columns:")
    print("______________________")
    display(df.columns)
    

analyze_dataframe(train)

In [83]:
def combined_preprocessing(df, cat_features, num_features, scaler=StandardScaler()):
    
    
    # Feature Engineering
    # df['BalanceSalaryRatio'] = df.Balance / df.EstimatedSalary
    # df['TenureByAge'] = df.Tenure / (df.Age - 18)
    # df['Age_NumOfProducts'] = df['Age'] * df['NumOfProducts']
    # df['Balance_NumOfProducts'] = df['Balance'] * df['NumOfProducts']
    # df['EngagementScore'] = (df['IsActiveMember'] + df['NumOfProducts']) / df['Tenure']
    # df['LoyaltyScore'] = df['Tenure'] / df['Age']
    # df['FinancialHealth'] = df['Balance'] / (df['EstimatedSalary'] + 1)  # +1 to avoid division by zero

    def categorize_credit_score(score):
        if 800 <= score <= 850:
            return 'Excellent'
        elif 740 <= score <= 799:
            return 'Very Good'
        elif 670 <= score <= 739:
            return 'Good'
        elif 580 <= score <= 669:
            return 'Fair'
        else:
            return 'Poor'
    # df['CreditScoreCategory'] = df.CreditScore.apply(categorize_credit_score)
    # df['CreditScoreGivenAge'] = df.CreditScore / (df.Age - 18)

    # Impute missing values in categorical features with mode
    for feature in cat_features:
        if df[feature].isnull().any():
            mode_value = df[feature].mode()[0]
            df[feature].fillna(mode_value, inplace=True)

    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=cat_features)

    # Check and impute NaN values in numerical features
    for feature in num_features:
        if df[feature].isnull().any():
            median_value = df[feature].median()
            df[feature].fillna(median_value, inplace=True)

    # Check for infinite values and handle them
    df[num_features] = df[num_features].replace([np.inf, -np.inf], np.nan).fillna(0)
    df[num_features] = scaler.fit_transform(df[num_features])

    # Drop unnecessary columns
    df = df.drop(['Surname', 'CustomerId'], axis=1, errors='ignore')

    return df



In [84]:
cat_features = [
    "Geography",
    "Gender",
    "HasCrCard",
    "IsActiveMember",
    "NumOfProducts",
    # "CreditScoreCategory",
]
num_features = [
    "Balance",
    "EstimatedSalary",
    # "CreditScoreGivenAge",
    #"BalanceSalaryRatio",
    # "Balance_NumOfProducts",
    # "FinancialHealth",
    #"TenureByAge",
    "Tenure",
    "Age",
    "CreditScore",
    # "Age_NumOfProducts",
    # "LoyaltyScore",
    # "EngagementScore",
]


In [85]:
#preprocessing
train_df = combined_preprocessing(train, cat_features, num_features, scaler=StandardScaler())
test_df = combined_preprocessing(test, cat_features, num_features, scaler=StandardScaler())

In [None]:
ydp.ProfileReport(train_df)

In [74]:
# Split the training data
X_train = train_df.drop(["Exited", "id"], axis=1)
y_train = train_df["Exited"]

#hyperparameter tuning
space = {
    # added scope to  make sure the max depth is an integer
    'max_depth': scope.int(hp.quniform('max_depth', 1, 6, 1)),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'reg_alpha': scope.int(hp.uniform('reg_alpha', 0, 10)),
    'reg_lambda': hp.uniform('reg_lambda', 1, 10),
    'gamma': hp.loguniform('gamma', -10, 10),
    'learning_rate': hp.loguniform('learning_rate', -6, 0),
    'random_state': SEED,
    'nthread': -1,
}

In [86]:
def objective(space):
    # Compute the scale_pos_weight
    ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
    
    model = xgb.XGBClassifier(
        max_depth=int(space['max_depth']),
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree'],
        reg_alpha=space['reg_alpha'],
        reg_lambda=space['reg_lambda'],
        gamma=space['gamma'],
        learning_rate=space['learning_rate'],
        scale_pos_weight=ratio,
        random_state=SEED,
        
    )
    
    # Implement cross-validation
    kf = SKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    auc_scores = []

    for train_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr)
        y_pred_prob = model.predict_proba(X_val)[:, 1]
        auc_score = roc_auc_score(y_val, y_pred_prob)
        auc_scores.append(auc_score)

    average_auc_score = np.mean(auc_scores)

    return {'loss': -average_auc_score, 'status': STATUS_OK}


In [87]:
#running the hyperparameter tuning

trials = Trials()
best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=2000,
                        trials=trials)

print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

print("The best auc score is: ", "\n")
print(trials.best_trial['result']['loss'])


100%|██████████| 2000/2000 [1:53:02<00:00,  3.39s/trial, best loss: -0.8897348132527071]
The best hyperparameters are:  

{'colsample_bytree': 0.7373895259881883, 'gamma': 0.0015517236665329628, 'learning_rate': 0.1897866928257987, 'max_depth': 4.0, 'min_child_weight': 3.143721378610037, 'reg_alpha': 3.0162632256474904, 'reg_lambda': 4.71901888093094, 'subsample': 0.9243921753319707}
The best auc score is:  

-0.8897348132527071


In [77]:
# best_hyperparms = {
#     "colsample_bytree": 0.7051112705179624,
#     "gamma": 0.32535746402392773,
#     "learning_rate": 0.20866113650237444,
#     "max_depth": 4.0,
#     "min_child_weight": 0.5404641062873045,
#     "reg_alpha": 3.307081064828227,
#     "reg_lambda": 2.7978425370426887,
#     "subsample": 0.9985717249511419,
# auc -0.8896499028593908
# }

# Convert dataset to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)

# Define your parameters
params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": int(best_hyperparams["max_depth"]),
    "min_child_weight": best_hyperparams["min_child_weight"],
    "subsample": best_hyperparams["subsample"],
    "colsample_bytree": best_hyperparams["colsample_bytree"],
    "learning_rate": best_hyperparams["learning_rate"],
    "reg_alpha": best_hyperparams["reg_alpha"],
    "reg_lambda": best_hyperparams["reg_lambda"],
    "gamma": best_hyperparams["gamma"],
    "seed": SEED,
    # Add any other relevant parameters
}

# Perform cross-validation with early stopping
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,  # maximum number of boosting rounds
    nfold=FOLDS,  # number of folds for cross-validation
    early_stopping_rounds=50,  # stop if performance hasn't improved for 50 rounds
    verbose_eval=100,  # print out progress every 100 rounds
    metrics=["auc"],  # evaluation metrics
)

# Optimal number of boosting rounds
optimal_boost_rounds = cv_results.shape[0]

# Display best boosting rounds
display(cv_results.tail())


[0]	train-auc:0.84629+0.00075	test-auc:0.84586+0.00159
[100]	train-auc:0.89381+0.00042	test-auc:0.88965+0.00159
[200]	train-auc:0.89710+0.00034	test-auc:0.88962+0.00149
[202]	train-auc:0.89715+0.00034	test-auc:0.88961+0.00149


Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
149,0.895546,0.000387,0.88975,0.001606
150,0.895578,0.000385,0.88974,0.001611
151,0.895616,0.000389,0.889742,0.001618
152,0.895652,0.000388,0.889748,0.001612
153,0.895683,0.00039,0.889759,0.001609


In [78]:
# final model with the optimal number of estimators
final_model = xgb.XGBClassifier(
    n_estimators=optimal_boost_rounds,
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=int(best_hyperparams['max_depth']),
    min_child_weight=best_hyperparams['min_child_weight'],
    subsample=best_hyperparams['subsample'],
    colsample_bytree=best_hyperparams['colsample_bytree'],
    learning_rate=best_hyperparams['learning_rate'],
    random_state=SEED
)

# Fit the final model
final_model.fit(X_train, y_train)


X_test = test_df.drop(["id"], axis=1)

# Predict class probabilities
y_pred_prob = final_model.predict_proba(X_test)[:, 1]



In [51]:

# plotting feature importance
xgb.plot_importance(final_model)
plt.rcParams["figure.figsize"] = [10, 10]
plt.show()

  plt.show()


In [79]:
# Predict probabilities for the test dataset
test_pred_prob = final_model.predict_proba(X_test)[:, 1]

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Exited': test_pred_prob
})
submission_df.to_csv('submission.csv', index=False)
submission_df.head()

Unnamed: 0,id,Exited
0,165034,0.052688
1,165035,0.827881
2,165036,0.022282
3,165037,0.244513
4,165038,0.330699
