In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
import os
import kagglehub
import pathlib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
#numpy-1.26.0 is requested to run section 4 of this Notebook ("4. Deep learning model") because of an incompatibility of Tensorflow with numpy2-1-2
#However note that the rest of our package_folder code runs on the more recent version Numpy2-1-2
#Suggestion: pip install numpy-1.26.0 on your branch only if you run section 4

# Load data

In [7]:
ROOT_PATH = pathlib.Path().resolve().parent
# Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'Loan_Default.csv')
# Load the data into a DataFrame
if os.path.exists(raw_data_path):
    data = pd.read_csv(raw_data_path)
    print("✅ Data loaded successfully")
else:
    raise FileNotFoundError(f"The file {raw_data_path} does not exist. Please check the path.")

✅ Data loaded successfully


In [8]:
data=pd.read_csv(f'{raw_data_path}')

In [9]:
print(f"data shape: {data.shape}")

data shape: (148670, 34)


# Data exploration

In [10]:
data.head(1)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0


In [13]:
data.isnull().sum()/len(data)*100

ID                            0.000000
year                          0.000000
loan_limit                    2.249277
Gender                        0.000000
approv_in_adv                 0.610749
loan_type                     0.000000
loan_purpose                  0.090133
Credit_Worthiness             0.000000
open_credit                   0.000000
business_or_commercial        0.000000
loan_amount                   0.000000
rate_of_interest             24.509989
Interest_rate_spread         24.644515
Upfront_charges              26.664425
term                          0.027578
Neg_ammortization             0.081388
interest_only                 0.000000
lump_sum_payment              0.000000
property_value               10.155378
construction_type             0.000000
occupancy_type                0.000000
Secured_by                    0.000000
total_units                   0.000000
income                        6.154571
credit_type                   0.000000
Credit_Score             

**Objective: investigate the missing values in the columns where the % of missing values is non-negligible (> 2.5%)**

In [9]:
#Assumption based on banking knowledge & first visual observation of the dataset:
#Some columns contain null values only for class 1 i.e. when Status=1 (loan has already defaulted).
#Let's check out this assumption!
a=data[['loan_limit', 'rate_of_interest','Interest_rate_spread','Upfront_charges', 'property_value','LTV', 'dtir1','Status']]

In [10]:
#Select and count only the rows where 'rate_of_interest' AND 'Interest_rate_spread' AND 'Upfront_charges' are null values
b=a[(a['Status'] == 1) & (a['rate_of_interest'].isnull()) & (a['Interest_rate_spread'].isnull()) & (a['Upfront_charges'].isnull())]
print(f"Assumption confirmed: {b.shape[0]} rows contain null values for all 3 features. It is a perfect match!") 

Assumption confirmed: 36439 rows contain null values for all 3 features. It is a perfect match!


**Suggestion 1 for preprocessing: remove the columns ['rate_of_interest','Interest_rate_spread','Upfront_charges'] from the features. Why? If we kept those columns as features, they would create an imbalance. Those features would only "influence" the Class 0, never the Class 1. I therefore suggest to treat those 3 columns as targets instead. Based on the outcome of our prediction, we (the bank) will define contractually the interest rate, spread and upfront charges applicable to a customer if and only if the loan has been approved. How? By running a linear regression (or another model) only in the case where our first model (the classification) returns an approval to the applicant loan request.**

In [11]:
#Let's move on to the columns 'property_value' and 'LTV'!
c=a[(a['Status'] == 0) & (a['property_value'].isnull())] 
d=a[(a['Status'] == 1) & (a['property_value'].isnull())] 
print(f"Class 1 Defaulted loans: {d.shape[0]} missing property values Class 0 Performing loans: {c.shape[0]} missing property values only")

Class 1 Defaulted loans: 15096 missing property values Class 0 Performing loans: 2 missing property values only


In [12]:
#Let's check if we have class 1 defaulted loans with property values properly filled in
e = a[(a['Status'] == 1) & (a['property_value'].isnull() == False)]
print(f"Class 1 Defaulted loans: {e.shape[0]} with property values properly filled in")

Class 1 Defaulted loans: 21543 with property values properly filled in


Partial conclusion: out of 36639 Class 1 defaulted loans, 15096 have a missing property value, 21543 do not.

In [13]:
#Assumption: the Class 1 defaulted loans with missing property values are not real estate loans. Let's check this out by looking at the columns
#related to the type or purpose of the loan: "loan_type','loan_purpose' and 'business_or_commercial'

In [14]:
loan_type_columns=['loan_type','loan_purpose','business_or_commercial']
for element in loan_type_columns:
    f=data[(data['Status'] == 1)][element]
    print(f.value_counts())
    print("___________________")

loan_type
type1    25775
type2     7172
type3     3692
Name: count, dtype: int64
___________________
loan_purpose
p3    13996
p4    12590
p1     8935
p2     1083
Name: count, dtype: int64
___________________
business_or_commercial
nob/c    29467
b/c       7172
Name: count, dtype: int64
___________________


Partial conclusion: there is no obvious link. We need to find another angle.

In [15]:
data[(data['Status']==1) & (data['property_value'].isnull())]

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
26,24916,2019,cf,Joint,nopre,type3,p1,l1,nopc,nob/c,...,EQUI,518,EXP,45-54,to_inst,,south,direct,1,
42,24932,2019,cf,Male,nopre,type3,p4,l1,nopc,nob/c,...,EQUI,765,EXP,45-54,to_inst,,North,direct,1,
52,24942,2019,cf,Joint,nopre,type3,p3,l1,nopc,nob/c,...,EQUI,555,EXP,65-74,to_inst,,North,direct,1,
53,24943,2019,ncf,Female,nopre,type1,p3,l1,nopc,nob/c,...,EQUI,603,EXP,65-74,to_inst,,North,direct,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148644,173534,2019,cf,Joint,nopre,type1,p3,l1,nopc,nob/c,...,EQUI,846,EXP,>74,to_inst,,south,direct,1,
148649,173539,2019,cf,Sex Not Available,nopre,type1,p3,l1,nopc,nob/c,...,EQUI,703,EXP,55-64,to_inst,,south,direct,1,
148652,173542,2019,cf,Male,nopre,type1,p1,l1,nopc,nob/c,...,EQUI,608,EXP,55-64,to_inst,,North,direct,1,
148658,173548,2019,cf,Sex Not Available,nopre,type1,p4,l1,nopc,nob/c,...,EQUI,669,EXP,25-34,to_inst,,south,direct,1,


We observe visually that the column 'credit_type' only has EQUI values. That is a hint!

In [19]:
data[['credit_type','Status']][(data['Status']==1) & (data['credit_type']=='EQUI')]

Unnamed: 0,credit_type,Status
1,EQUI,1
26,EQUI,1
42,EQUI,1
52,EQUI,1
53,EQUI,1
...,...,...
148644,EQUI,1
148649,EQUI,1
148652,EQUI,1
148658,EQUI,1


In [21]:
data[['credit_type','Status']][(data['Status']==1) & (data['credit_type']!='EQUI')]

Unnamed: 0,credit_type,Status
0,EXP,1
10,EXP,1
12,CRIF,1
15,EXP,1
16,CRIF,1
...,...,...
148634,EXP,1
148646,CIB,1
148650,EXP,1
148651,EXP,1


In [17]:
#Let's check out the value distribution in that column for the complete dataset (i.e. not focusing on Class 1 non-performing loans)
data['credit_type'].value_counts()

credit_type
CIB     48152
CRIF    43901
EXP     41319
EQUI    15298
Name: count, dtype: int64

It seems the value 'EQUI' in the column 'credit_type' correlates almost exactly with missing values in the 'property_value' column. Let's zoom on the 'EQUI' values to be sure!

In [18]:
g=data[(data['Status']==1) & (data['property_value'].isnull()) & (data['credit_type']=='EQUI')]
print(f"Class 1 Defaulted loans with missing property values, how many are 'EQUI'? Answer: {g.shape[0]}")

Class 1 Defaulted loans with missing property values, how many are 'EQUI'? Answer: 15096


A Google search indicates the following: "Equifax Inc. is an American multinational consumer credit reporting agency headquartered in Atlanta, Georgia and is one of the three largest consumer credit reporting agencies, along with Experian and TransUnion (together known as the "Big Three")"

Assumption: EQUI does not provide the property value for defaulted loans whereas the 3 other credit bureaus (CIB, CRIF and EXP) do.

In [19]:
property_value_filled_in=data[(data['Status']==1) & (data['credit_type']=='CIB')][['credit_type']].shape[0]+data[(data['Status']==1) & (data['credit_type']=='CRIF')][['credit_type']].shape[0] +data[(data['Status']==1) & (data['credit_type']=='EXP')][['credit_type']].shape[0]
print(f"Class 1 Defaulted loans with missing property values, how many are not'EQUI'? Answer: {property_value_filled_in}")

Class 1 Defaulted loans with missing property values, how many are not'EQUI'? Answer: 21342


From the 21543 values actually filled in the column 'property_value', 21342 come from the bureaus CIB, CRIF and EXP. A plausible explanation is simply that unlike the 3 other bureaus, EQUI simply does not provide this information.

**Suggestion 2 for preprocessing: keep the column 'property_value' and remove the missing values in it.
We can apply the same method to the column 'LTV' (Loan-to-Value) which is equal to 'loan_amount' / 'property_value' as there are no missing values in 'loan_amount'**

In [20]:
#Now let's move on to the last column with missing values > 2.5%, which is 'dtir1' (Debt-to-Income_Ratio).
h=data[(data['dtir1'].isnull()) & (data['Status']==1)  & (data['credit_type']=='EQUI')]
print(f"Class 1 Defaulted loans with missing 'dtir1', how many are 'EQUI'? Answer: {h.shape[0]}")

Class 1 Defaulted loans with missing 'dtir1', how many are 'EQUI'? Answer: 15296


In [21]:
#Now let's take a look at the column loan_purpose as it appears visually that all missing 'dtir1' have 'p4' in that column.
i=data[(data['dtir1'].isnull()) & (data['loan_purpose'] == 'p4') & ~(data['credit_type'] =='EQUI')]
print(f"Class 1 Defaulted loans with missing 'dtir1', how many are 'p4'? Answer: {i.shape[0]}")

Class 1 Defaulted loans with missing 'dtir1', how many are 'p4'? Answer: 8817


We have a total of 8817+15296=24113 which matches almost entirely all missing values in 'dtir1.'

**Suggestion 3 for preprocessing: keep the column 'dtir1' and remove the missing values in it. De facto we are withdrawing "only" another (24113-15296)=8817 rows, as we already suggested to remove 15296 EQUI values.** 

# Base model

In [22]:
#import package_folder.preprocessor
from package_folder.preprocessor import clean_data
from package_folder.preprocessor import encode_categorical

In [23]:
'''#Load model
import pickle

# Adjust the path to your .pkl file
with open('path_to_your_file.pkl', 'rb') as file:
    model = pickle.load(file)

# Display some information about the loaded object
print(model)'''

"#Load model\nimport pickle\n\n# Adjust the path to your .pkl file\nwith open('path_to_your_file.pkl', 'rb') as file:\n    model = pickle.load(file)\n\n# Display some information about the loaded object\nprint(model)"

In [24]:
#Let's try a new approach: remove columns with a lot of missing values & highly correlated with 1 status ("loan not approved")
data_light=data.drop(columns=['loan_limit', 'rate_of_interest','Interest_rate_spread','Upfront_charges', 'property_value','LTV', 'dtir1'])
data_light_brutally_cleaned=data_light.dropna()
data_encoded_with_Gilian=encode_categorical(data_light_brutally_cleaned)
data_encoded_with_Gilian.shape

✅Categorical variables encoded successfully.


(138356, 64)

In [25]:
#Create X and y
X=data_encoded_with_Gilian.drop(columns='Status')
y=data_encoded_with_Gilian['Status']

In [26]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(X) 
X_scaled= scaler.transform(X)

In [27]:
X_scaled.shape

(138356, 63)

In [28]:
df=pd.DataFrame(X_scaled,columns=X.columns)
df.head(2)

Unnamed: 0,ID,year,loan_amount,term,income,Credit_Score,Gender_Female,Gender_Joint,Gender_Male,Gender_Sex Not Available,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
0,-0.997366,0.0,-0.75,0.0,-0.8375,0.295,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,1.0,0.0,0.0
1,-0.997352,0.0,-0.375,0.0,-0.1625,-0.735,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
model=LogisticRegression()
base_model_score=cross_validate(model,X_scaled,y,cv=5)['test_score'].mean()
base_model_score

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8614227085447249

In [30]:
print(f"Accuracy of logistic regression model: {round(float(base_model_score),2)}")

Accuracy of logistic regression model: 0.86


# Deep learning model

## First iteration: basic split, Dense Layers only

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

2024-11-01 12:27:39.760777: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-01 12:27:39.765808: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-01 12:27:39.776174: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730460459.792769   54541 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730460459.797464   54541 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-01 12:27:39.815331: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [47]:
#Split data
X_train=X_scaled[0:100000]
X_test=X_scaled[100000:]
y_train=y[0:100000]
y_test=y[100000:]
print(f"X_train:{X_train.shape} X_test: {X_test.shape} y_train: {y_train.shape} y_test:{y_test.shape}")

X_train:(100000, 63) X_test: (38356, 63) y_train: (100000,) y_test:(38356,)


In [36]:
# Model definition
model = Sequential()
model.add(layers.Dense(20, activation='relu', input_dim=63))		
model.add(layers.Dense(10, activation='relu'))			
model.add(layers.Dense(1, activation='sigmoid'))  
model.summary()	

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [38]:
#Compile model
model.compile(
    optimizer='adam', 
    loss='binary_crossentropy',
    metrics = ['accuracy'])

In [39]:
#Train model
history = model.fit(X_train, y_train, batch_size=16, epochs=10)

Epoch 1/10
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 3ms/step - accuracy: 0.8347 - loss: 0.4825
Epoch 2/10
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - accuracy: 0.8642 - loss: 0.3507
Epoch 3/10
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - accuracy: 0.8698 - loss: 0.3422
Epoch 4/10
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - accuracy: 0.8712 - loss: 0.3377
Epoch 5/10
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - accuracy: 0.8720 - loss: 0.3357
Epoch 6/10
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.8721 - loss: 0.3356
Epoch 7/10
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - accuracy: 0.8737 - loss: 0.3301
Epoch 8/10
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.8727 - loss: 0.3331
Epoch 9/10
[1m6

In [52]:
model.evaluate(scaler.transform(X_test), y_test) 



[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.2572 - loss: 1007.4326


[1009.374267578125, 0.2557878792285919]

## Regularized model with drop out layers, early stop & shuffled dataset

In [64]:
from sklearn.utils import shuffle
from tensorflow.keras.callbacks import EarlyStopping

In [59]:
# Shuffle the dataset
df = shuffle(data_encoded_with_Gilian)  

# Define the split ratio
split_ratio = 0.8

# Calculate the split index
split_index = int(split_ratio * len(df))

# Split the data
train_df = df[:split_index]
test_df = df[split_index:]

# Separate features and target
X_train = train_df.drop('Status', axis=1)
y_train = train_df['Status']
X_test = test_df.drop('Status', axis=1)
y_test = test_df['Status']

In [65]:
#Early stopping
es = EarlyStopping(patience=5)

In [61]:
# Model definition
model = Sequential()
model.add(layers.Dense(20, activation='relu', input_dim=63))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(10, activation='relu'))	
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))  
model.summary()	

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [62]:
#Compile model
model.compile(
    optimizer='adam', 
    loss='binary_crossentropy',
    metrics = ['accuracy'])

In [66]:
history=model.fit(X_train, y_train, batch_size=16, epochs=20, validation_data=(X_test, y_test), callbacks=[es])

Epoch 1/20
[1m6918/6918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 4ms/step - accuracy: 0.6835 - loss: 3169.7563 - val_accuracy: 0.7466 - val_loss: 0.5729
Epoch 2/20
[1m6918/6918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - accuracy: 0.7424 - loss: 0.7265 - val_accuracy: 0.7467 - val_loss: 0.5699
Epoch 3/20
[1m6918/6918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - accuracy: 0.7453 - loss: 0.5741 - val_accuracy: 0.7467 - val_loss: 0.5695
Epoch 4/20
[1m6918/6918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - accuracy: 0.7479 - loss: 0.5647 - val_accuracy: 0.7466 - val_loss: 0.5695
Epoch 5/20
[1m6918/6918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - accuracy: 0.7463 - loss: 0.5664 - val_accuracy: 0.7466 - val_loss: 0.5695
Epoch 6/20
[1m6918/6918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 4ms/step - accuracy: 0.7439 - loss: 0.5731 - val_accuracy: 0.7466 - val_loss: 0.5695
Epoch 7

In [67]:
model.evaluate(scaler.transform(X_test), y_test) 

[1m865/865[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7446 - loss: 0.5736


[0.5722287893295288, 0.7466753125190735]

## Base model v.2 with shuffled dataset

In [108]:
lr_model=LogisticRegression(max_iter=1000)
lr_model.fit(X_train,y_train)
lr_model.score(X_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8539679098005204

In [109]:
y_pred=lr_model.predict(X_test)

In [111]:
# Generate a classification report
report = classification_report(y_test, y_pred, target_names=['0','1'])
print("Classification Report:\n", report)

# Generate a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.98      0.91     20661
           1       0.88      0.49      0.63      7011

    accuracy                           0.85     27672
   macro avg       0.86      0.73      0.77     27672
weighted avg       0.86      0.85      0.84     27672

Confusion Matrix:
 [[20170   491]
 [ 3550  3461]]


# XGBRegressor

In [90]:
#Remove characters <, > and , in column names
column_names = list(X_test.columns)
cleaned_column_names = [s.replace('<', 'inf').replace('>', 'sup').replace(',', '') for s in column_names]

In [97]:
X_train_array = np.array(X_train)
X_train_clean_columns=pd.DataFrame(X_train_array,columns=cleaned_column_names)
X_test_array = np.array(X_test)
X_test_clean_columns=pd.DataFrame(X_test_array,columns=cleaned_column_names)

In [103]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Create an XGBoost classifier
xgb_clf = xgb.XGBClassifier(objective="binary:logistic", random_state=42) 

# Train the model
xgb_clf.fit(X_train_array, y_train)

# Make predictions
y_pred = xgb_clf.predict(X_test_array)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8720728534258456


In [107]:
#Evaluate other metrics than accuracy
from sklearn.metrics import classification_report, confusion_matrix

# Generate a classification report
report = classification_report(y_test, y_pred, target_names=['0','1'])
print("Classification Report:\n", report)

# Generate a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.98      0.92     20661
           1       0.92      0.54      0.68      7011

    accuracy                           0.87     27672
   macro avg       0.89      0.76      0.80     27672
weighted avg       0.88      0.87      0.86     27672

Confusion Matrix:
 [[20316   345]
 [ 3195  3816]]
