In [1]:
# imports we need + utils
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import hvplot.pandas
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from datetime import datetime
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, roc_curve, auc,
    # plot_confusion_matrix, plot_roc_curve
)
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
# import imblearn as iblearn
# import sklearn as skl
# print(iblearn.__version__)
# print(skl.__version__)

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


pd.set_option('display.float', '{:.2f}'.format)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

bold_start = "\033[1m"
bold_end = "\033[0m"

# import sklearn as sk
# print(sk.__version__)
# import imblearn as ib
# print(ib.__version__)

In [2]:
# Exploratory Data Analysis code cell
data = pd.read_csv("resources/lending_club_loan_two.csv")


def plot_3_row_histogram(df):
    numerics_df = df.select_dtypes(include=['number'])
    print("Number of numeric cols: ", len(numerics_df.columns), " of ", len(df.columns))
    n_rows = len(numerics_df.columns) // 3
    n_rows += len(numerics_df.columns) % 3

    # Create a figure and axes with matplotlib's subplots()
    fig, axs = plt.subplots(n_rows, 3, figsize=(15, n_rows * 4))

    for idx, col in enumerate(numerics_df.columns):
        i = idx // 3
        j = idx % 3

        sns.histplot(numerics_df[col], ax=axs[i][j], kde=True)
        axs[i][j].set_title(f"Distribution of {col}")

    # Remove empty subplots
    if len(numerics_df.columns) % 3:
        for j in range(len(numerics_df.columns) % 3, 3):
            fig.delaxes(axs[i][j])

    plt.tight_layout()
    plt.show()


def plot_label_distribution(df, label):
    values = df[label].value_counts()
    values.plot(kind='bar', figsize=(5, 3))
    plt.title('Label values distribution')
    plt.show()


def plot_correlation_matrix(df):
    numeric_df = df.select_dtypes(include=['number'])
    plt.figure(figsize=(10, 8))
    sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()


def percentage_of_missing_values(df):
    missing_columns = df.columns[data.isna().any()].to_list()
    missing_values = df[missing_columns].isna().mean() * 100
    missing_values = missing_values.apply(lambda x: f'{x: .2f}%')

    # missing_df = pd.DataFrame({'Column': missing_columns, 'Missing %': missing_values})
    missing_df = pd.DataFrame({'Missing %': missing_values})
    print("Percentage of missing values")
    display(missing_df)


def detect_outliers_zscore(df, column, threshold=3):
    z_scores = np.abs((df[column] - df[column].mean()) / df[column].std())
    outliers = df[z_scores > threshold].index
    return outliers


def show_outliers(df):
    outlier_data = []
    numerics_df = df.select_dtypes(include=['number'])
    for col in numerics_df.columns:
        ol = detect_outliers_zscore(numerics_df, col)
        outlier_data.append({'Column': col, 'Outlier Count': len(ol)})
    outliers_df = pd.DataFrame(outlier_data)
    print("Outliers found in each feature")
    display(outliers_df.sort_values(by='Outlier Count', ascending=False))


def show_unique_values_categorical_features(df):
    categorical_cols = data.select_dtypes(include=['object'])
    unique_categories = []
    for col in categorical_cols.columns:
        unique_categories.append({'Column': col, 'Unique Count': data[col].nunique()})
    unique_categories_df = pd.DataFrame(unique_categories)
    print("Count of unique values in categorical features")
    display(unique_categories_df.sort_values(by='Unique Count', ascending=False))


def feature_importance_random_forest(x, y):
    # Train a Random Forest model
    rf = RandomForestClassifier()
    rf.fit(x, y)

    # Get feature importance
    importance = rf.feature_importances_

    # Create a DataFrame to visualize importance
    feature_importance = pd.DataFrame({'Feature': x.columns, 'Importance': importance})
    feature_importance = feature_importance.sort_values('Importance', ascending=False)

    print(feature_importance)


def pub_rec(number):
    if number == 0.0:
        return 0
    else:
        return 1


def mort_acc(number):
    if number == 0.0:
        return 0
    elif number >= 1.0:
        return 1
    else:
        return number


def pub_rec_bankruptcies(number):
    if number == 0.0:
        return 0
    elif number >= 1.0:
        return 1
    else:
        return number


def eda():
    #label distribution
    plot_label_distribution(data, 'loan_status')

    # feature distribution
    plot_3_row_histogram(data)

    #correlation matrix
    plot_correlation_matrix(data)

    # missing values
    percentage_of_missing_values(data)

    # outliers
    show_outliers(data)

    # show unique values for each categorical feature
    show_unique_values_categorical_features(data)


# Exploratory Data Exploration

In [3]:
# call exploratory data analysis methods
# eda()

## EDA Observations

## Distributions:
#### Right Skewed: 
* *loan_amnt*, *int_rate*, *installment*, *annual_inc*, *dti*, *open_acc*, *pub_rec*, *revol_bal*, *total_acc* 
* These variables have a long tail on the right, indicating a concentration of values at the lower end with a few very high values.

#### Approximately Normal: 
* *revol_util*
* This distribution resembles a bell curve, suggesting a symmetric distribution of values around the mean.

#### Highly Imbalanced: 
* `mort_acc, pub_rec_bankruptcies` 
* These variables have a very uneven distribution, with a large majority of values concentrated in a single category.

### Preprocessing Strategies:
1. Addressing the Imbalance in label instances:
    - Oversampling using SMOTE and ADASYN: From the bar graph above relating to Fully Paid vs Charged-Off, we can see a significant imbalance in label instance that need to be addressed.
    - We're using both Synthetic Minority Sampling and Adaptive Synthetic Sampling to experiment and cross-validate which is better since SMOTE might not perform well with complex distributions
2. Handling Skewness:
    - We chose to cap the extreme values at a certain percentile to reduce the impact of these outliers without removing them entirely. 
3. Normalization/Standardization:
    - We shall standardize `revol_util` since it's close to normal distribution. This will help improve model performance.
    - All other variables to are going to be normalized using min-max scaling to rescale them to a range between 0 and 1. This will prevent features with larger variations from biasing the models we train later on.
4. Because `pub_rec, mort_acc, pub_rec_bankruptcies` are heavily right-skewed and have most of their values as 0, we're choosing to simplify them into binary e.g. 0 - no mort_acc, 1 - has mort_acc  
5. Address feature needs to be augmented/engineered into zip_code because there are too many variations for it to be encoded effeciently.

# Data Preprocessing

The goal here is to do the following:

* Impute missing values
* Remove repeating and irrelevant features
* Encode categorical features into numerical format
* Oversample the minority class to increase its number of instances
* Change some features into variations of themselves (feature engineer)

In [4]:
# Convert features to simple binary 
data['pub_rec'] = data.pub_rec.apply(pub_rec)
data['mort_acc'] = data.mort_acc.apply(mort_acc)
data['pub_rec_bankruptcies'] = data.pub_rec_bankruptcies.apply(pub_rec_bankruptcies)
data['loan_status'] = data.loan_status.map({'Fully Paid': 1, 'Charged Off': 0})

# one-hot encode categories we found as categorical
one_hot_encode_cols = ['verification_status', 'purpose', 'initial_list_status',
                       'application_type', 'home_ownership']
data = pd.get_dummies(data, columns=one_hot_encode_cols, drop_first=True)

# the shape of the data
print(f"The shape of the dataset (instances, features): {data.shape}")

The shape of the dataset (instances, features): (396030, 45)


### 2.2 Converting categorical string features into numerical formats

In [5]:
data.term.unique()

array([' 36 months', ' 60 months'], dtype=object)

In [6]:
term_values = {' 36 months': 36, ' 60 months': 60}
data['term'] = data.term.map(term_values)

In [7]:
data.term.unique()

array([36, 60], dtype=int64)

In [8]:
data.drop('grade', axis=1, inplace=True)

address
We are going to feature engineer a zip code column from the address in the data set. Create a column called 'zip_code' that extracts the zip code from the address column.

In [9]:
data.address.head()

0       0174 Michelle Gateway\r\nMendozaberg, OK 22690
1    1076 Carney Fort Apt. 347\r\nLoganmouth, SD 05113
2    87025 Mark Dale Apt. 269\r\nNew Sabrina, WV 05113
3              823 Reid Ford\r\nDelacruzside, MA 00813
4               679 Luna Roads\r\nGreggshire, VA 11650
Name: address, dtype: object

In [10]:
data['zip_code'] = data.address.apply(lambda x: x[-5:])

In [11]:
data.zip_code.value_counts()

zip_code
70466    56985
30723    56546
22690    56527
48052    55917
00813    45824
29597    45471
05113    45402
11650    11226
93700    11151
86630    10981
Name: count, dtype: int64

In [12]:
# data = pd.get_dummies(data, columns=['zip_code'], drop_first=True)

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the zip_code column
data['zip_code_encoded'] = label_encoder.fit_transform(data['zip_code'])

data.drop('zip_code', axis=1, inplace=True)
data.drop('address', axis=1, inplace=True)

sub_grade_mapping = {
    'A1': 35, 'A2': 34, 'A3': 33, 'A4': 32, 'A5': 31,
    'B1': 30, 'B2': 29, 'B3': 28, 'B4': 27, 'B5': 26,
    'C1': 25, 'C2': 24, 'C3': 23, 'C4': 22, 'C5': 21,
    'D1': 20, 'D2': 19, 'D3': 18, 'D4': 17, 'D5': 16,
    'E1': 15, 'E2': 14, 'E3': 13, 'E4': 12, 'E5': 11,
    'F1': 10, 'F2': 9, 'F3': 8, 'F4': 7, 'F5': 6,
    'G1': 5, 'G2': 4, 'G3': 3, 'G4': 2, 'G5': 1
}
data['sub_grade_encoded'] = data['sub_grade'].map(sub_grade_mapping)
data.drop('sub_grade', axis=1, inplace=True)


Removing `issue_d` feature
This would be data leakage because we would not know the loan issue date beforehand when using our model/in our usecase. Therefore choosing to drop this feature.

In [13]:
data.drop('issue_d', axis=1, inplace=True)

In [14]:
data['earliest_cr_line'] = data['earliest_cr_line'].apply(lambda x: datetime.strptime(x, "%b-%Y").year)

In [15]:
data.earliest_cr_line.nunique()

65

In [16]:
data.earliest_cr_line.value_counts()

earliest_cr_line
2000    29366
2001    29083
1999    26491
2002    25901
2003    23657
        ...  
1951        3
1950        3
1953        2
1944        1
1948        1
Name: count, Length: 65, dtype: int64

### Removing or Imputing missing values

In [17]:
for column in data.columns:
    if data[column].isna().sum() != 0:
        missing = data[column].isna().sum()
        portion = (missing / data[column].shape[0]) * 100
        print(f"'{column}' number of missing values: '{missing}' ==> '{portion:.3f}%'")
        # print(
        #     f"{bold_start}'{column}'{bold_end}: number of missing values: {bold_start}'{missing}'{bold_end} ==> {bold_start}'{portion:.3f}%'{bold_end}")

'emp_title' number of missing values: '22927' ==> '5.789%'
'emp_length' number of missing values: '18301' ==> '4.621%'
'title' number of missing values: '1756' ==> '0.443%'
'revol_util' number of missing values: '276' ==> '0.070%'
'mort_acc' number of missing values: '37795' ==> '9.543%'
'pub_rec_bankruptcies' number of missing values: '535' ==> '0.135%'


In [18]:
data['emp_title'].nunique()

173105

The **emp_title** has too many unique values to encode and dataset will not be sufficent if used with it. So, the best thing to do is drop it.

In [19]:
data.drop('emp_title', axis=1, inplace=True)

Now, let's look at **emp_length**

In [20]:
data["emp_length"].unique()

array(['10+ years', '4 years', '< 1 year', '6 years', '9 years',
       '2 years', '3 years', '8 years', '7 years', '5 years', '1 year',
       nan], dtype=object)

In [21]:
for year in data["emp_length"].unique():
    print(f"{year} years in the positions:")
    # print(f"{bold_start}{year}{bold_end} years in the positions:")
    print(f"{data[data.emp_length == year].loan_status.value_counts(normalize=True)}")
    print("\n=============================================\n")

10+ years years in the positions:
loan_status
1   0.82
0   0.18
Name: proportion, dtype: float64


4 years years in the positions:
loan_status
1   0.81
0   0.19
Name: proportion, dtype: float64


< 1 year years in the positions:
loan_status
1   0.79
0   0.21
Name: proportion, dtype: float64


6 years years in the positions:
loan_status
1   0.81
0   0.19
Name: proportion, dtype: float64


9 years years in the positions:
loan_status
1   0.80
0   0.20
Name: proportion, dtype: float64


2 years years in the positions:
loan_status
1   0.81
0   0.19
Name: proportion, dtype: float64


3 years years in the positions:
loan_status
1   0.80
0   0.20
Name: proportion, dtype: float64


8 years years in the positions:
loan_status
1   0.80
0   0.20
Name: proportion, dtype: float64


7 years years in the positions:
loan_status
1   0.81
0   0.19
Name: proportion, dtype: float64


5 years years in the positions:
loan_status
1   0.81
0   0.19
Name: proportion, dtype: float64


1 year years in the positio

**Charge offs** seem to be extremely similar across all **emp_length**. So, we are going to drop it.

In [22]:
data.drop('emp_length', axis=1, inplace=True)

Now, let's have a look at **title**.

In [23]:
data.title.value_counts().head()

title
Debt consolidation         152472
Credit card refinancing     51487
Home improvement            15264
Other                       12930
Debt Consolidation          11608
Name: count, dtype: int64

In [24]:
data.title.unique().shape

(48817,)

In [25]:
data.drop("title", axis=1, inplace=True)

Let's have a look at **mort_acc**.

In [26]:
data.mort_acc.value_counts()

mort_acc
1.00    218458
0.00    139777
Name: count, dtype: int64

In [27]:
data.mort_acc.isna().sum()

37795

Imputing **mort_acc** should be imputed using either mean or prediction model, for this purpose we will do using **Random Forest**

In [28]:
# Split the data into two parts: one with missing values and one without
missing_data = data[data['mort_acc'].isnull()]
complete_data = data.dropna(subset=['mort_acc'])

# Select features and target variable
X = complete_data.drop(columns=['mort_acc'])
y = complete_data['mort_acc']

# Split the complete data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict missing values
imputed_values = model.predict(missing_data.drop(columns=['mort_acc']))

# Impute missing values in the mort_acc column
data.loc[data['mort_acc'].isnull(), 'mort_acc'] = imputed_values

In [29]:
data.mort_acc.value_counts()

mort_acc
1.00    237764
0.00    158266
Name: count, dtype: int64

In [30]:
for column in data.columns:
    if data[column].isna().sum() != 0:
        missing = data[column].isna().sum()
        portion = (missing / data.shape[0]) * 100
        print(f"'{column}': number of missing values '{missing}' ==> '{portion:.3f}%'")

# drop missing value instances that are less than 0.2% of the dataset
data.dropna(inplace=True)
display(data.shape)
data.loan_status.value_counts()

'revol_util': number of missing values '276' ==> '0.070%'
'pub_rec_bankruptcies': number of missing values '535' ==> '0.135%'


(395219, 40)

loan_status
1    317696
0     77523
Name: count, dtype: int64

Removing `revol_util & pub_rec_bankruptcies` missing values instances
These two features have missing data points, but they account for less than 0.5% of the total data. So we are going to remove the rows that are missing those values in those columns with dropna().

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 395219 entries, 0 to 396029
Data columns (total 40 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   loan_amnt                            395219 non-null  float64
 1   term                                 395219 non-null  int64  
 2   int_rate                             395219 non-null  float64
 3   installment                          395219 non-null  float64
 4   annual_inc                           395219 non-null  float64
 5   loan_status                          395219 non-null  int64  
 6   dti                                  395219 non-null  float64
 7   earliest_cr_line                     395219 non-null  int64  
 8   open_acc                             395219 non-null  float64
 9   pub_rec                              395219 non-null  int64  
 10  revol_bal                            395219 non-null  float64
 11  revol_util        

## Preparing the Test data

### Checking distribution of data

In [32]:
# Define a threshold for Z-score
threshold = 3

# Create a copy of the data to avoid modifying the original DataFrame
data_clean = data.copy()

# Dictionary to store counts of outliers for each column
outliers_count_dict = {}

# Loop through each numerical column in the DataFrame
for column in data_clean.select_dtypes(include=[np.number]).columns:
    # Calculate Z-scores
    z_scores = (data_clean[column] - data_clean[column].mean()) / data_clean[column].std()
    # Identify outliers
    outliers = np.abs(z_scores) > threshold
    # Store the count of previous outliers
    outliers_count_dict[column] = outliers.sum()
    # Calculate the mean value of the column
    mean_value = data_clean[column].mean()
    # If the column is of type int64, cast the mean value to int
    if pd.api.types.is_integer_dtype(data_clean[column]):
        mean_value = int(mean_value)
    # Replace outliers with the mean value of the column
    data_clean.loc[outliers, column] = mean_value

# Print the shape of the original and cleaned data
print(f"Original data shape: {data.shape}")
print(f"Cleaned data shape: {data_clean.shape}")

# Print count of previous outliers
print("\nCount of previous outliers in each column:")
for column, count in outliers_count_dict.items():
    print(f"Column: {column} - Outliers: {count}")

# Optional: Check if there are any remaining outliers
remaining_outliers = ((data_clean - data_clean.mean()) / data_clean.std()).abs() > threshold
print("\nRemaining outliers in each column after cleaning:")
print(remaining_outliers.sum())

Original data shape: (395219, 40)
Cleaned data shape: (395219, 40)

Count of previous outliers in each column:
Column: loan_amnt - Outliers: 184
Column: term - Outliers: 0
Column: int_rate - Outliers: 754
Column: installment - Outliers: 5042
Column: annual_inc - Outliers: 3190
Column: loan_status - Outliers: 0
Column: dti - Outliers: 12
Column: earliest_cr_line - Outliers: 4993
Column: open_acc - Outliers: 4873
Column: pub_rec - Outliers: 0
Column: revol_bal - Outliers: 4771
Column: revol_util - Outliers: 16
Column: total_acc - Outliers: 3396
Column: mort_acc - Outliers: 0
Column: pub_rec_bankruptcies - Outliers: 0
Column: zip_code_encoded - Outliers: 0
Column: sub_grade_encoded - Outliers: 1988

Remaining outliers in each column after cleaning:
loan_amnt                                  0
term                                       0
int_rate                                  51
installment                             2700
annual_inc                              6888
loan_status        

## Oversampling the dataset using SMOTE and ADASYN 

In [33]:
def smote_oversampling(x, y):
    """
    Synthetic Minority Sampling
    :param x: predictors dataframe
    :param y: label dataframe
    :return: x, y dataframes with synthetic instances
    """
    smote = SMOTE(random_state=42)
    # fit and resample the data
    x_resampled, y_resampled = smote.fit_resample(x, y)

    return x_resampled, y_resampled


def adasyn_oversampling(x, y):
    """
    Adaptive Synthetic Sampling
    :param x: predictors dataframe
    :param y: label dataframe
    :return: x, y dataframes with synthetic instances
    """
    adasyn = ADASYN(random_state=42)
    # fit and resample the data
    x_resampled, y_resampled = adasyn.fit_resample(x, y)

    return x_resampled, y_resampled


# Assuming data_clean is your DataFrame and 'target' is the target variable
X = data_clean.drop(columns=['loan_status'])
y = data_clean['loan_status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

smote_x, smote_y = smote_oversampling(X_train, y_train)
adasyn_x, adasyn_y = adasyn_oversampling(X_train, y_train)


print(f"Original dataset shape: {Counter(y_train)}")
print(f"Resampled dataset shape: {Counter(y_train_res)}")
print(f"SMOTE sampled dataset shape: {Counter(smote_y)}")
print(f"ADASYN sampled dataset shape: {Counter(adasyn_y)}")


Original dataset shape: Counter({1: 254157, 0: 62018})
Resampled dataset shape: Counter({0: 254157, 1: 254157})
SMOTE sampled dataset shape: Counter({0: 254157, 1: 254157})
ADASYN sampled dataset shape: Counter({1: 254157, 0: 247625})


## Feature Importance Analysis

In [34]:
print("Feature importance on SMOTE dataset")
feature_importance_random_forest(smote_x, smote_y)

print("Feature importance on ADASYN dataset")
feature_importance_random_forest(adasyn_x, adasyn_y)

Feature importance on SMOTE dataset
                                Feature  Importance
37                     zip_code_encoded        0.22
1                                  term        0.07
38                    sub_grade_encoded        0.06
12                             mort_acc        0.06
2                              int_rate        0.06
32              home_ownership_MORTGAGE        0.05
5                                   dti        0.03
36                  home_ownership_RENT        0.03
4                            annual_inc        0.03
10                           revol_util        0.03
9                             revol_bal        0.03
15         verification_status_Verified        0.03
3                           installment        0.03
7                              open_acc        0.03
14  verification_status_Source Verified        0.03
11                            total_acc        0.03
0                             loan_amnt        0.03
6                      earli

# Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

X_train, X_val, y_train, y_val = train_test_split(smote_x, smote_y, test_size=0.25, random_state=42)

model = LogisticRegression(solver='liblinear', max_iter=1000)
# model = LogisticRegression()
model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)

print(f'Validation Accuracy: {val_accuracy}')
print('Validation Confusion Matrix:')
print(val_conf_matrix)
print('Validation Classification Report:')
print(val_class_report)

Validation Accuracy: 0.8180265818900054
Validation Confusion Matrix:
[[51367 12267]
 [10858 52587]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82     63634
           1       0.81      0.83      0.82     63445

    accuracy                           0.82    127079
   macro avg       0.82      0.82      0.82    127079
weighted avg       0.82      0.82      0.82    127079



In [36]:
y_test_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_class_report = classification_report(y_test, y_test_pred)

print(f'Test Accuracy: {test_accuracy}')
print('Test Confusion Matrix:')
print(test_conf_matrix)
print('Test Classification Report:')
print(test_class_report)

Test Accuracy: 0.7069733313091443
Test Confusion Matrix:
[[10900  4605]
 [18557 44982]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.70      0.48     15505
           1       0.91      0.71      0.80     63539

    accuracy                           0.71     79044
   macro avg       0.64      0.71      0.64     79044
weighted avg       0.80      0.71      0.73     79044



# ANN

In [39]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout

# # Define the model
# input_dim = smote_x.shape[1]
# model = Sequential()
# model.add(Dense(64, input_dim=input_dim))
# model.add(Dropout(0.5))  # Dropout layer with 50% dropout rate
# model.add(Dense(32))
# model.add(Dropout(0.5))  # Dropout layer with 50% dropout rate
# model.add(Dense(16))
# model.add(Dense(1, activation='sigmoid'))

# # Compile the model
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Train the model with class weights
# model.fit(smote_x, smote_y, epochs=10, batch_size=32, validation_split=0.2)

# # Evaluate the model
# loss, accuracy = model.evaluate(X_test, y_test)
# print(f'Test Accuracy: {accuracy:.4f}')

# y_pred = model.predict(X_test)
# y_pred_classes = (y_pred > 0.5).astype(int)

# print(classification_report(y_test, y_pred_classes))

### XGBoost Classifier

In [44]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(smote_x, label=smote_y)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define the parameter dictionary
params = {
    'objective': 'binary:logistic',  # binary classification
    'max_depth': 10,  # maximum depth of a tree
    'eta': 0.03,  # step size shrinkage
    'eval_metric': 'logloss',  # evaluation metric
    'random_state': 42  # seed for reproducibility
}

# Specify the training and validation sets
evals = [(dtrain, 'train'), (dval, 'eval')]

# Train the model
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=evals, early_stopping_rounds=10)

# Predict on the test set
y_pred_prob = bst.predict(dtest)
y_pred = (y_pred_prob > 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy:.4f}')

print(classification_report(y_test, y_pred))

[0]	train-logloss:0.67441	eval-logloss:0.67439
[1]	train-logloss:0.65688	eval-logloss:0.65686
[2]	train-logloss:0.64018	eval-logloss:0.64014
[3]	train-logloss:0.62441	eval-logloss:0.62435
[4]	train-logloss:0.60942	eval-logloss:0.60935
[5]	train-logloss:0.59554	eval-logloss:0.59546
[6]	train-logloss:0.58196	eval-logloss:0.58184
[7]	train-logloss:0.56921	eval-logloss:0.56909
[8]	train-logloss:0.55677	eval-logloss:0.55663
[9]	train-logloss:0.54519	eval-logloss:0.54505
[10]	train-logloss:0.53403	eval-logloss:0.53386
[11]	train-logloss:0.52350	eval-logloss:0.52332
[12]	train-logloss:0.51333	eval-logloss:0.51314
[13]	train-logloss:0.50356	eval-logloss:0.50334
[14]	train-logloss:0.49429	eval-logloss:0.49406
[15]	train-logloss:0.48555	eval-logloss:0.48532
[16]	train-logloss:0.47684	eval-logloss:0.47660
[17]	train-logloss:0.46865	eval-logloss:0.46840
[18]	train-logloss:0.46060	eval-logloss:0.46033
[19]	train-logloss:0.45304	eval-logloss:0.45276
[20]	train-logloss:0.44594	eval-logloss:0.44566
[2