In [77]:
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbl_pipe
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import joblib

## Load Analytical Base Table
df = pd.read_csv(os.path.join("../Resources", "analytical_base_table.csv"))
print(f"Dataframe dimensions: {df.shape}")
df.head()

### Separate dataframe into separate object
y = df.Exited
X = df.drop(['Exited'], axis=1)
print(X.shape, y.shape)

# List numerical features
num_columns = X.select_dtypes(include='number').columns.tolist()

# List categorical features
cat_columns = X.select_dtypes(include='object').columns.tolist()

def class_count(a):
    counter=Counter(a)
    kv=[list(counter.keys()),list(counter.values())]
    dff = pd.DataFrame(np.array(kv).T, columns=['Exited','Count'])
    dff['Count'] = dff['Count'].astype('int64')
    dff['%'] = round(dff['Count'] / a.shape[0] * 100, 2)
    return dff.sort_values('Count',ascending=False)

class_count(y)

## Create a Train Test Split
random_state = 10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=random_state,
                                                   stratify=df.Exited)

print(len(X_train), len(X_test), len(y_train), len(y_test))
X_train.info()

## Pre-processing Pipeline
num_features = num_columns
cat_features = cat_columns

# Define column transformer
preprocess = make_column_transformer(
    (MinMaxScaler(), num_features),
    (OneHotEncoder(sparse_output=False), cat_features)
)

## Build Model Pipeline with SMOTE
model = imbl_pipe(preprocess,
                  SMOTE(sampling_strategy='auto', random_state=random_state),
                  xgb.XGBClassifier(use_label_encoder=False, random_state=random_state))

# Create the GridSearchCV model
param_grid = {'xgbclassifier__gamma': [0.5, 0.8, 1],
              'xgbclassifier__max_depth': [3, 4, 5, 6],
              'xgbclassifier__n_estimators': [50, 100, 200]
        }
grid = GridSearchCV(model, param_grid, verbose=3, cv=5, n_jobs=4, scoring='f1_macro')

# Fit the GridSearch model
grid.fit(X_train, y_train)  # إضافة هذه السطر لتدريب النموذج

# Print the best parameters and score
print("Best Parameters:", grid.best_params_)  # جعل الرسالة أوضح
print("Best Score:", grid.best_score_)  # جعل الرسالة أوضح
print(f"Training Data Score: {grid.score(X_train, y_train)}")
print(f"Testing Data Score: {grid.score(X_test, y_test)}")

# Make predictions with the hypertuned model
pred = grid.predict(X_test)

# Classification metrics
cm = confusion_matrix(y_test, pred)
print("Confusion Matrix:")
print(cm)  # فصل المصفوفة
cm = np.around(cm / cm.sum(axis=1)[:, np.newaxis], 2)
print("Normalized Confusion Matrix:")
print(cm)  # فصل المصفوفة
print("Classification Report:")
print(classification_report(y_test, pred))  # فصل التقرير
print(f"Predicted classes: \t{list(pred[:10])}")
print(f"Actual Labels: \t\t{list(y_test[:10])}")

## Save the Model
filename = '../models/XGBoost_model.sav'
joblib.dump(grid, filename)

## Loading the Model
xgb_model = joblib.load(filename)
print(f"Loaded Model Score: {xgb_model.score(X_test, y_test)}")  # تأكيد تحميل النموذج

### Predict class for new data
pred_new = xgb_model.predict(X_test[:1])
print(f"Predicted classes: \t{pred_new}")
print(f"Actual Labels: \t\t{list(y_test[:1])}")


Dataframe dimensions: (10000, 11)
(10000, 10) (10000,)
7000 3000 7000 3000
<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 8061 to 4741
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      7000 non-null   int64  
 1   Geography        7000 non-null   object 
 2   Gender           7000 non-null   object 
 3   Age              7000 non-null   int64  
 4   Tenure           7000 non-null   int64  
 5   Balance          7000 non-null   float64
 6   NumOfProducts    7000 non-null   int64  
 7   HasCrCard        7000 non-null   int64  
 8   IsActiveMember   7000 non-null   int64  
 9   EstimatedSalary  7000 non-null   float64
dtypes: float64(2), int64(6), object(2)
memory usage: 601.6+ KB
Fitting 5 folds for each of 36 candidates, totalling 180 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.754 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.755 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.777 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.760 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.759 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.750 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.763 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.734 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.766 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.760 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.753 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.753 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.757 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.766 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.762 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.759 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.756 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.746 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.769 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.758 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.765 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.745 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.749 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.745 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.768 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.765 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.745 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.745 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.749 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.752 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.748 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.768 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.741 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.766 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.750 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.739 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.743 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.751 total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.745 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.765 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.750 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.743 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.765 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.745 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.739 total time=   0.3s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.742 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.749 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.766 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.747 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.757 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.756 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.739 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.747 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.746 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.761 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.756 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.739 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.747 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.746 total time=   0.3s
[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.752 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.752 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.761 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.735 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.759 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.767 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.768 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.742 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.765 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.754 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.762 total time=   0.4s
[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.762 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.768 total time=   0.3s
[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.742 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.765 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.754 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.763 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.760 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.748 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.748 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.777 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.766 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.764 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.753 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.774 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.749 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.764 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.766 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.753 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.758 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.749 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.774 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.761 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.746 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.743 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.753 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.763 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.746 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.761 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.749 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.743 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.763 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.743 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.761 total time=   0.3s
[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.746 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.749 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.771 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.740 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.759 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.753 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.771 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.770 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.759 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.740 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.753 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.770 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.740 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 1/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.771 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.753 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.759 total time=   0.3s
[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.761 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=0.8, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.770 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.760 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.735 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.759 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=0.766 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.756 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.764 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.748 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.775 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=0.758 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.748 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.756 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.758 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.764 total time=   0.3s
[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200;, score=0.775 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.762 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.746 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.759 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.768 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=50;, score=0.748 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.761 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.751 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.765 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.762 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.762 total time=   0.4s
[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.761 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=100;, score=0.748 total time=   0.4s
[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.751 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.765 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.738 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.766 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=4, xgbclassifier__n_estimators=200;, score=0.748 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.749 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.742 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=50;, score=0.769 total time=   0.1s
[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=10

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.742 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.749 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.766 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100;, score=0.769 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.738 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.766 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.742 total time=   0.2s
[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.749 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.754 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200;, score=0.769 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.751 total time=   0.1s
[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.781 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.749 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=50;, score=0.747 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.754 total time=   0.1s
[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.751 total time=   0.1s
[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.747 total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.749 total time=   0.2s
[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=100;, score=0.781 total time=   0.2s
[CV 1/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.754 total time=   0.2s
[CV 2/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.751 total time=   0.2s
[CV 3/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.747 total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV 5/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.781 total time=   0.1s
[CV 4/5] END xgbclassifier__gamma=1, xgbclassifier__max_depth=6, xgbclassifier__n_estimators=200;, score=0.749 total time=   0.1s
Best Parameters: {'xgbclassifier__gamma': 0.8, 'xgbclassifier__max_depth': 4, 'xgbclassifier__n_estimators': 100}
Best Score: 0.7612982384579798
Training Data Score: 0.8106322732916824
Testing Data Score: 0.766781181961393
Confusion Matrix:
[[2187  202]
 [ 241  370]]
Normalized Confusion Matrix:
[[0.92 0.08]
 [0.39 0.61]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      2389
           1       0.65      0.61      0.63       611

    accuracy                           0.85      3000
   macro avg       0.77      0.76      0.77      3000
weighted avg       0.85      0.85      0.85      3000

Predicted classes: 	[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
Actual Labels: 

## Load Analytical Base Table

In [78]:
df = pd.read_csv(os.path.join("../Resources", "analytical_base_table.csv"))
print(f"Dataframe dimensions: {df.shape}")
df.head()

Dataframe dimensions: (10000, 11)


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Separate dataframe into separate object

In [79]:
# Object for target variable
y = df.Exited

# object for input features
X = df.drop(['Exited'], axis=1)

# display shapes of X and y
print(X.shape, y.shape)

(10000, 10) (10000,)


In [80]:
# List numerical features
num_columns = X.select_dtypes(include='number').columns.tolist()
num_columns

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [81]:
# List categorical features
cat_columns = X.select_dtypes(include='object').columns.tolist()
cat_columns

['Geography', 'Gender']

In [82]:
def class_count(a):
    counter=Counter(a)
    kv=[list(counter.keys()),list(counter.values())]
    dff = pd.DataFrame(np.array(kv).T, columns=['Exited','Count'])
    dff['Count'] = dff['Count'].astype('int64')
    dff['%'] = round(dff['Count'] / a.shape[0] * 100, 2)
    return dff.sort_values('Count',ascending=False)

In [83]:
class_count(y)

Unnamed: 0,Exited,Count,%
1,0,7963,79.63
0,1,2037,20.37


## Create a Train Test Split

In [84]:
random_state = 10

# Split X and y into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=random_state,
                                                   stratify=df.Exited)

# Print number of observations in X_train, X_test, y_train, and y_test
print(len(X_train), len(X_test), len(y_train), len(y_test))

7000 3000 7000 3000


In [85]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 8061 to 4741
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      7000 non-null   int64  
 1   Geography        7000 non-null   object 
 2   Gender           7000 non-null   object 
 3   Age              7000 non-null   int64  
 4   Tenure           7000 non-null   int64  
 5   Balance          7000 non-null   float64
 6   NumOfProducts    7000 non-null   int64  
 7   HasCrCard        7000 non-null   int64  
 8   IsActiveMember   7000 non-null   int64  
 9   EstimatedSalary  7000 non-null   float64
dtypes: float64(2), int64(6), object(2)
memory usage: 601.6+ KB


## Pre-processing Pipeline

### Scale numerical data and encode categorical data
Construct a pre-processing pipeline from the given transformers: MinMaxScaler and OneHotEncoder

Create lists of indexes from the list of column names

Need to be numeric not string to specify columns name in column transformer

In [86]:
num_features = [] 

for i in num_columns:
    location = X.columns.get_loc(i)
    num_features.append(location)
print(num_features)  

[0, 3, 4, 5, 6, 7, 8, 9]


In [87]:
cat_features = []

for i in cat_columns:
    location = X.columns.get_loc(i)
    cat_features.append(location)
print(cat_features)  

[1, 2]


In [88]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Define your numeric and categorical feature lists
num_features = ['feature1', 'feature2']  # Replace with your actual numeric feature names
cat_features = ['feature3', 'feature4']  # Replace with your actual categorical feature names

# Define column transformer
preprocess = make_column_transformer(
    (MinMaxScaler(), num_features),
    (OneHotEncoder(sparse_output=False), cat_features)
)

preprocess


## Build Model Pipeline with SMOTE

* We are going to use the Pipeline from the imblearn package in place of scikit-learn Pipeline.

* It takes care automatically to re-sample when called fit() on the pipeline, and does not re-sample test data (when called transform() or predict()).

In [89]:
# Import XGBoost
import xgboost as xgb

# Define model with pipeline
model = imbl_pipe(preprocess,
                  SMOTE(sampling_strategy='auto', random_state=random_state),
                  xgb.XGBClassifier(use_label_encoder=False, random_state=random_state))

model

In [90]:
xgb.XGBClassifier().get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [91]:
# Create the GridSearchCV model
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV

param_grid = {'xgbclassifier__gamma': [0.5, 0.8, 1],
              'xgbclassifier__max_depth': [3, 4, 5, 6],
              'xgbclassifier__n_estimators': [50, 100, 200]
        }
grid = GridSearchCV(model, param_grid, verbose=3, cv= 5, n_jobs=4, scoring='f1_macro')


In [92]:
X_train = X_train.values
X_test = X_test.values

In [93]:
import pandas as pd

# Assuming you have your original DataFrame 'X'
X_train = pd.DataFrame(X_train, columns=original_column_names)
grid.fit(X_train, y_train)  
print("Best Parameters:", grid.best_params_)  # جعل الرسالة أوضح
print("Best Score:", grid.best_score_)  # جعل الرسالة أوضح



Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=nan total time=   0.0s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=nan total time=   0.0s
[CV 3/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=nan total time=   0.0s
[CV 4/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=nan total time=   0.0s
[CV 5/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50;, score=nan total time=   0.0s
[CV 1/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=nan total time=   0.0s
[CV 2/5] END xgbclassifier__gamma=0.5, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100;, score=nan total time=   0.0s
[CV 3/5] END xgbclassifier__gamma

ValueError: 
All the 180 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'feature1'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/sklearn/utils/__init__.py", line 505, in _get_column_indices
    col_idx = all_columns.get_loc(col)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'feature1'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/imblearn/pipeline.py", line 329, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/imblearn/pipeline.py", line 255, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/imblearn/pipeline.py", line 1104, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 906, in fit_transform
    self._validate_column_callables(X)
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 496, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahmedkhalid/anaconda3/lib/python3.12/site-packages/sklearn/utils/__init__.py", line 513, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe


In [73]:
print(grid.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [74]:
 # List the best score
print(grid.best_score_)

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [52]:
print(f"Training Data Score: {grid.score(X_train, y_train)}")
print(f"Testing Data Score: {grid.score(X_test, y_test)}")

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [53]:
# Make predictions with the hypertuned model
pred = grid.predict(X_test)
pred

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [54]:
# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, pred)
print(cm)

NameError: name 'pred' is not defined

In [55]:
cm = np.around(cm / cm.sum(axis=1)[:, np.newaxis], 2)
print(cm)

NameError: name 'cm' is not defined

In [24]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      2389
           1       0.66      0.58      0.62       611

    accuracy                           0.85      3000
   macro avg       0.78      0.75      0.76      3000
weighted avg       0.85      0.85      0.85      3000



In [25]:
print(f"Predicted classes: \t{list(pred[:10])}")
print(f"Actual Labels: \t\t{list(y_test[:10])}")

Predicted classes: 	[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
Actual Labels: 		[1, 0, 0, 0, 0, 1, 0, 0, 0, 0]


## Save the Model

In [26]:
import joblib

# We are saving our grid model
filename = '../models/XGBoost_model.sav'
joblib.dump(grid, filename)

['../models/XGBoost_model.sav']

## Loading the Model

In [27]:
# load the model
xgb_model = joblib.load(filename)
print(xgb_model.score(X_test, y_test))

0.7637159144059555


### Predict class for new data

In [28]:
# Let's use the first X_test record as new data
X_test[:1]

array([[638, 'France', 'Male', 36, 6, 188455.19, 1, 0, 0, 47031.4]],
      dtype=object)

In [29]:
pred_new = xgb_model.predict(X_test[:1])

In [30]:
print(f"Predicted classes: \t{pred_new}")
print(f"Actual Labels: \t\t{list(y_test[:1])}")

Predicted classes: 	[0]
Actual Labels: 		[1]
