<a id="toc"></a>
# **Table of Contents**

- [1. Set-up](#1)
    - [1.1 Install Pycaret](#1.1)
    - [1.2 Import Libraries](#1.2)
    - [1.3 Import Data](#1.3)
    - [1.4 Summary of Dataset (According to the Authors)](#1.4)
- [2. Exploring Data](#2)
    - [2.1 Total Number of Transactions](#2.1)
    - [2.2 Data Quick Overview](#2.2)
    - [2.3 Data Basic Statistics](#2.3)
    - [2.4 Check Missing Values](#2.4)
    - [2.5 Check Data Types](#2.5) 
- [3. Data Preprocessing](#3)
    - [3.1 Random Sampling](#3.1)
    - [3.2 One-Hot Encoding](#3.2)
    - [3.3 Evaluate Skewness](#3.3)
    - [3.4 Check Feature Scaling](#3.4)
    - [3.5 Logarithmic Scaling](#3.5)
    - [3.6 Drop Highly Correlated Features](#3.6)
    - [3.7 Identify Most Important Features](#3.7)
    - [3.8 Check Feature Importance](#3.8)
- [4. Imbalanced Classification](#4)
    - [4.1 Splitting Train and Test Dataset](#4.1)
    - [4.2 Oversampling Using SMOTE](#4.2)
    - [4.3 UMAP Dimensionality Reduction](#4.3)
    - [4.4 Analysing Results](#4.4)
- [5. Model Selection](#5)
    - [5.1 Creating Models](#5.1)
    - [5.2 Hyperparameter Tuning](#5.2)
    - [5.3 Interpreting Models](#5.3)
        - [5.3.1 Decision Tree](#5.3.1)
        - [5.3.2 Gradient Boosting Machine](#5.3.2)
        - [5.3.3 Logistic Regression](#5.3.3)
- [6. Model Evaluation](#6)
    - [6.1 Decision Tree](#6.1)
    - [6.2 Gradient Boosting Machine](#6.2)
    - [6.3 Logistic Regression](#6.3)
- [7. Final Comparison](#7)
    - [7.1 Evaluation Metrics](#7.1)
    - [7.2 Precision-Recall Curve](#7.2)
    - [7.3 Receiver Operating Characteristic (ROC)](#7.3)

-------------------------------------------------------------------------------
<a id="1"></a>
# <b>1 <span style='color: SteelBlue'>|</span> Set-up</b>

<a id="1.1"></a>
## <b>1.1 <span style='color: SteelBlue'>Install Pycaret</span></b> 

In [None]:
try:
    import pycaret
except:
    !pip install pycaret

try:
    import missingno
except:
    !pip install missingno

<a id="1.2"></a>
## <b>1.2 <span style='color: SteelBlue'>Import Libraries</span></b>

In [None]:
# Data manipulation libraries
import pandas as pd
import numpy as np 

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from pycaret.classification import *
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay
from pycaret.classification import setup
from imblearn.over_sampling import SMOTE

# Data preprocessing and analysis libraries
import umap
import umap.plot
import scipy.stats as stats
from scipy.stats import norm, skew

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


<a id="1.3"></a>
## <b>1.3 <span style='color: SteelBlue'>Import Data</span></b>

In [None]:
# Load Base.csv from the Kaggle input directory into the working frame.
card_df = pd.read_csv('/kaggle/input/bank-account-fraud-dataset-neurips-2022/Base.csv')
# Keep an independent copy of the data exactly as loaded.
card_org = card_df.copy()

-------------------------------------------------------------------------------
<a id="2"></a>
# <b>2 <span style='color:SteelBlue'>|</span> Exploring Data</b>

<a id="2.1"></a>
## <b>2.1<span style='color: SteelBlue'> Total Number of Transactions</span></b>

In [None]:
# Donut chart of the class balance: share of rows per 'fraud_bool' value.
colors = ['#004B87', 'LightBlue']
labels = ['Non-Fraud', 'Fraud']
# value_counts() orders by frequency, so the labels above line up with the
# values only while non-fraud is the more frequent class.
values = card_df['fraud_bool'].value_counts() / card_df['fraud_bool'].shape[0]
# Absolute row counts per class (not used by the chart below).
total_normal = card_df[card_df['fraud_bool'] == False].shape[0]
total_fraudulent = card_df[card_df['fraud_bool'] == True].shape[0]

# hole=.3 turns the pie into a donut.
fig = go.Figure(data=[go.Pie(labels=labels,
                             values=values, hole=.3)])
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='black', width=0.1)))

# White background with black text for title, legend and labels.
fig.update_layout(
    title_text='<b>Credit Card Fraud</b>',
    title_font_color='black',
    title_font=dict(size=24),
    legend_title_font_color='black',
    paper_bgcolor='white',
    plot_bgcolor='white',
    font_color='black',
)

fig.show()

<a id="2.2"></a>
## <b>2.2 <span style='color:SteelBlue'>Data Quick Overview </span></b> 

In [None]:
# Show the first five rows, transposed so that each column becomes a row.
card_df.head(5).T

<a id="2.3"></a>
## <b>2.3 <span style='color:SteelBlue'>Data Basic Statistics</span></b>

In [None]:
# Print the column names, dtypes and non-null counts.
card_df.info()

In [None]:
# Summary statistics of the numeric columns.
card_df.describe()

<a id="2.4"></a>
## <b>2.4 <span style='color:SteelBlue'>Check Missing Values</span></b>

In [None]:
import missingno as msno
# Draw missingno's matrix plot for the whole frame to spot missing values.
msno.matrix(card_df)

<a id="2.5"></a>
## <b>2.5 <span style='color:SteelBlue'>Check Data Types</span></b>

In [None]:
# Bar chart of how many columns there are per dtype.
sns.set(style="ticks", context="talk",font_scale = 1)
plt.style.use("default")
plt.figure(figsize = (8,6))
ax = card_df.dtypes.value_counts().plot(kind='bar',grid = False,fontsize=20,color='SteelBlue')
# Write each bar's count slightly above the bar, centred horizontally.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+ p.get_width() / 2., height + 0.2, height, ha = 'center', size = 25, color='black')
ax.set_title('Data Types', ha = 'center', weight='bold', fontsize=24)
sns.despine()
plt.show()


-------------------------------------------------------------------------------
<a id="3"></a>
# <b>3 <span style='color:SteelBlue'>|</span> Data Preprocessing</b>

<a id="3.1"></a>
## <b>3.1 <span style='color:SteelBlue'>Random Sampling</span></b>

In [None]:
# Replace card_df with a random subset of 400,000 rows; random_state fixes the draw.
card_df = card_df.sample(n = 400000,random_state=42) # Random Sample 400.000 Rows

In [None]:
# (rows, columns) after sampling.
card_df.shape

<a id="3.2"></a>
## <b>3.2 <span style='color:SteelBlue'>One-Hot Encoding</span></b>

In [None]:
# Replace card_df with its pd.get_dummies() encoding (one indicator column per category value).
card_df = pd.get_dummies(card_df) # one hot encoding

In [None]:
# (rows, columns) after one-hot encoding.
card_df.shape

<a id="3.3"></a>
## <b>3.3 <span style='color:SteelBlue'>Evaluate Skewness</span></b>

In [None]:
# Candidate columns for the skewness check: every column from the first one
# through 'month' (label slices are inclusive) that is entirely numeric,
# has no negative values, and is stored as int64 or float64.
numerical_cols = [cname for cname in card_df.loc[:, :'month'].columns if
                  pd.to_numeric(card_df[cname], errors='coerce').notna().all() and
                  card_df[cname].min() >= 0 and
                  card_df[cname].dtype in ['int64', 'float64']]

In [None]:

# Bar chart of the skewness of the selected numeric columns.
plt.figure(figsize=(12, 8)) 
# scipy.stats.skew per column; only columns with skewness above 0.5 are plotted,
# ordered from most to least skewed.
skew_features = card_df[numerical_cols].apply(lambda x: skew(x))
skew_features = skew_features[skew_features > 0.5].sort_values(ascending=False)
ax = sns.barplot(x=skew_features.index, y=skew_features.values, color='SteelBlue')  
ax.set_ylabel('', fontsize=20)  
ax.set_xlabel('', fontsize=20)  
ax.tick_params(axis='both', labelsize=15)  
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=15)  
# Reference line at skewness = 1.
ax.axhline(y=1, color='red', linestyle='--', linewidth=3)
ax.set_title('Skewness', ha = 'center', weight='bold', fontsize=24)
# The label is positioned in axes coordinates (transAxes), not data coordinates.
ax.text(0.01, 1.1, 'Threshold', color='red', transform=ax.transAxes, fontsize=15, weight='bold') 
sns.despine()
plt.gca().set_facecolor('white') 
plt.show()

<a id="3.4"></a>
## <b>3.4 <span style='color:SteelBlue'>Check Feature Scaling</span></b>

In [None]:
def check_normality(features, data_frame):
    """Draw a normal Q-Q plot and print mean, std and skew for each feature.

    Parameters
    ----------
    features : iterable of str
        Column names to inspect.
    data_frame : DataFrame-like
        Object indexed by those names; each column must provide
        ``mean()``, ``std()`` and ``skew()``.
    """
    for name in features:
        column = data_frame[name]

        # A dedicated 8x8 figure with a single axis per feature.
        plt.figure(figsize=(8, 8))
        axis = plt.subplot(1, 1, 1)
        stats.probplot(column, dist=stats.norm, plot=axis)
        axis.set_title(f'{name} Q-Q plot', fontsize=20)
        sns.despine()

        print(
            f'{name} : mean: {column.mean():.4f}, '
            f'std: {column.std():.4f}, skew: {column.skew():.4f}'
        )

In [None]:
# Q-Q plots and summary statistics for the three columns under review.
features = ['days_since_request', 'zip_count_4w', 'proposed_credit_limit']
check_normality(features, card_df)

In [None]:
def plot_histograms(features, data_frame, color=None):
    """Show one Plotly density histogram, with a box-plot margin, per feature.

    Parameters
    ----------
    features : iterable of str
        Column names to plot, one figure each.
    data_frame : DataFrame
        Data passed to ``px.histogram``.
    color : str, optional
        Column used to split and colour the histogram; bars are overlaid.
    """
    for name in features:
        histogram = px.histogram(
            data_frame,
            x=name,
            color=color,
            marginal="box",
            barmode="overlay",
            histnorm='density',
        )
        histogram.update_layout(
            title=f'Histogram of {name}',
            xaxis_title=name,
            yaxis_title='Density',
        )
        histogram.show()

In [None]:
# Density histograms of the three columns, split by the fraud label.
features = ['days_since_request', 'zip_count_4w', 'proposed_credit_limit']
plot_histograms(features, card_df, color='fraud_bool')

<a id="3.5"></a>
## <b>3.5 <span style='color:SteelBlue'>Logarithmic Scaling</span></b>

In [None]:
# Columns that receive the log transform.
columns_to_transform = ['days_since_request', 'zip_count_4w', 'proposed_credit_limit']

# Overwrite the columns in place with log(1 + x); np.log1p keeps zero values finite.
card_df[columns_to_transform] = np.log1p(card_df[columns_to_transform])

In [None]:
# Histogram plus skewness/kurtosis report for a list of columns
def plot_hist_and_check_normality(df, cols):
    """Plot a 30-bin histogram with KDE for each column and print its shape statistics.

    Parameters
    ----------
    df : DataFrame
        Data holding the columns.
    cols : iterable of str
        Column names to plot and summarise.

    Skewness comes from ``scipy.stats.skew`` and kurtosis from the column's
    own ``kurtosis()`` method.
    """
    for col_name in cols:
        series = df[col_name]

        # Histogram with a kernel-density overlay.
        plt.figure(figsize=(8, 6))
        sns.histplot(series, bins=30, kde=True)
        plt.title(f'Histogram of {col_name}')
        plt.xlabel(col_name)
        plt.ylabel('Frequency')
        plt.show()

        # Text report of the distribution's shape.
        print(f'Normality test for {col_name}:')
        print('-------------------------------------')
        print('Skewness:', skew(series))
        print('Kurtosis:', series.kurtosis())
        print('-------------------------------------\n')

# Re-inspect the three columns now that they have been log-transformed.
cols_to_plot = ['days_since_request', 'zip_count_4w', 'proposed_credit_limit']
plot_hist_and_check_normality(card_df, cols_to_plot)

<a id="3.6"></a>
## <b>3.6 <span style='color:SteelBlue'> Drop Highly Correlated Features</span></b>

In [None]:
# Heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(18, 15)) 
cor = card_df.corr()
# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.triu(np.ones_like(cor))
heatmap = sns.heatmap(cor, mask=mask, annot=True, cmap=plt.cm.YlGnBu, annot_kws={"fontsize": 6}, fmt=".2f")  # Set fmt to ".2f" to display 2 decimals
plt.show()


In [None]:
# Function to find features that are highly correlated with an earlier feature
def correlation(dataset, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    For every pair of columns the absolute correlation coefficient is
    compared with ``threshold``; when it is exceeded, the later column of
    the pair (in ``dataset`` column order) is reported. The absolute value
    is used because a strongly negative correlation is just as redundant as
    a strongly positive one.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data whose ``corr()`` matrix is inspected.
    threshold : float
        Absolute correlation above which a column is reported.

    Returns
    -------
    set
        Names of the columns to consider dropping. NaN coefficients never
        exceed the threshold and are therefore ignored.
    """
    corr_matrix = dataset.corr().abs()
    columns = corr_matrix.columns
    col_corr = set()
    # Walk the strict lower triangle so each pair is checked exactly once.
    for i in range(len(columns)):
        for j in range(i):
            if corr_matrix.iloc[i, j] > threshold:
                col_corr.add(columns[i])
                break  # one strong partner is enough to flag this column
    return col_corr

In [None]:
# Columns whose correlation with an earlier column exceeds 0.7.
corr_features = correlation(card_df, 0.7)
print(corr_features)

In [None]:
# Remove the 'payment_type_AA' indicator column from the working frame.
card_df = card_df.drop('payment_type_AA', axis=1)

**<span style="color:Navy">Observation:</span>**
    
We have identified a strong positive correlation between the variables `payment_type_AA` and `intended_balcon_amount`. As a result, we have decided to drop the feature `payment_type_AA` from our dataset to prevent redundancy and potential overfitting in our machine learning model.

<a id="3.7"></a>
## <b>3.7 <span style='color:SteelBlue'> Identify Most Important Features</span></b>

In [None]:
# Single-column heatmap: correlation of every column with 'fraud_bool',
# sorted from highest to lowest and rounded to two decimals.
plt.figure(figsize=(12, 7), dpi=100)
corr = card_df.corr()[['fraud_bool']].sort_values(by='fraud_bool', ascending=False)
corr = corr.round(2)
heatmap = sns.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap='YlGnBu', annot_kws={"size": 8, "color":'white', "alpha":0.7, "ha": 'center', "va": 'center'})

In [None]:
# Rank on the unrounded coefficients. Rounding to one decimal before sorting
# collapses most values into ties, which makes the top-10 order arbitrary.
corr = card_df.corr()

# Find features with strong correlation with target
target_corr = corr['fraud_bool'].abs().sort_values(ascending=False)
# Position 0 is the target itself (correlation 1 with itself), so skip it.
strong_corr_features = target_corr.index[1:11]

print('Top 10 features with highest correlation with target:')
for feature in strong_corr_features:
    # Round only for display.
    print(f"{feature}: {target_corr[feature]:.2f}")

<a id="3.8"></a>
## <b>3.8 <span style='color:SteelBlue'>Check Feature Importance</span></b>

In [None]:
# Hand-picked columns whose correlation with the target is charted below.
features = [
    "device_os_windows",
    "customer_age",
    "proposed_credit_limit",
    "housing_status_BA",
    "credit_risk_score",
    "housing_status_BB",
    "payment_type_AE",
    "employment_status_CA",
    "employment_status_CB",
    "employment_status_CC",
]

# Bar chart of |correlation with fraud_bool| per feature, largest first.
plt.figure(figsize=(13,8))
ax = abs(card_df[features].corrwith(card_df.fraud_bool)).sort_values(ascending=False).plot(kind='bar',color='SteelBlue',fontsize=20)
# Write each bar's value (two decimals) just above the bar.
for p in ax.patches:
    height = p.get_height().round(2)
    ax.text(p.get_x() + p.get_width() / 2., height+0.007, height, ha = 'center', size = 30)
sns.despine()


**<span style="color:Navy">Observation:</span>**

`housing_status_BA` and `device_os_windows` seem to have a strong correlation with `fraud_bool`. We should dig deeper into these variables.

-------------------------------------------------------------------------------
<a id="4"></a>
# <b>4 <span style='color:SteelBlue'>|</span> Imbalanced Classification</b>

<a id="4.1"></a>
## <b>4.1 <span style='color:SteelBlue'>Splitting Train and Test Dataset</span></b>

In [None]:
def get_preprocessed_df(df=None):
    """Return a copy of ``df`` with the 'fraud_bool' column moved to position 0.

    The input frame is left untouched; the relative order of the other
    columns is preserved.
    """
    reordered = df.copy()
    # pop() removes the target column; insert() puts it back at the front.
    reordered.insert(0, 'fraud_bool', reordered.pop('fraud_bool'))
    return reordered

def get_train_test_dataset(df=None):
    """Split ``df`` into train and test sets, stratified on the target.

    The target is the first column of ``get_preprocessed_df(df)`` and the
    features are all remaining columns. 30% of the rows go to the test set
    and ``random_state=0`` fixes the split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    ordered = get_preprocessed_df(df)
    y_target = ordered.iloc[:, 0]    # target: first column
    X_features = ordered.iloc[:, 1:]  # features: everything else
    X_train, X_test, y_train, y_test = train_test_split(
        X_features,
        y_target,
        test_size=0.3,
        random_state=0,
        stratify=y_target,
    )
    return X_train, X_test, y_train, y_test

In [None]:
# Stratified 70/30 train/test split of the preprocessed frame.
X_train, X_test, y_train, y_test = get_train_test_dataset(card_df)

### **Histogram Plot**

In [None]:
# Density histogram of customer_age, split by the fraud label, with a box-plot
# margin; the two classes are overlaid rather than stacked.
fig = px.histogram(card_df, x="customer_age", 
                   color="fraud_bool",
                   marginal="box",
                   barmode="overlay",
                   histnorm='density'
                  )  
# White background with black text.
fig.update_layout(
    title_text="Original Distribution",
    title_font_color="black",
    legend_title_font_color="black",
    paper_bgcolor="white",
    plot_bgcolor='white',
    font_color="black",
)
fig.show()


### **Scatterplot**

In [None]:
# Scatter of customer_age against credit_risk_score, coloured by the fraud
# label, for the data before oversampling.
fig = px.scatter(card_df, x="customer_age", y="credit_risk_score", color="fraud_bool")
# White background with black text; the title is left empty.
fig.update_layout(
    title_text="",
    title_font_color="black",
    legend_title_font_color="black",
    paper_bgcolor="white",
    plot_bgcolor='white',
    font_color="black",
)

<a id="4.2"></a>
## <b>4.2 <span style='color:SteelBlue'>Oversampling Using SMOTE</span></b>

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (seeded for reproducibility).
smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print('Feature/label dataset for training before applying SMOTE: ', X_train.shape, y_train.shape)
print('Feature/label dataset for training after applying SMOTE: ', X_train_smote.shape, y_train_smote.shape)
print('Distribution of label values after applying SMOTE:\n',pd.Series(y_train_smote).value_counts())

# Build the plotting frame from an explicit copy. pd.DataFrame(existing_frame)
# does not copy by default, so adding 'fraud_bool' to it may also add the
# column to X_train_smote, which is used as the feature matrix.
card_df_smote = pd.DataFrame(data=X_train_smote, copy=True)
card_df_smote['fraud_bool'] = y_train_smote

### **Scatterplot**

In [None]:
# Same scatter as for the original data, drawn on the SMOTE-resampled frame.
fig = px.scatter(card_df_smote, x="customer_age", y="credit_risk_score", color="fraud_bool")
# White background with black text; the title is left empty.
fig.update_layout(
    title_text="",
    title_font_color="black",
    legend_title_font_color="black",
    paper_bgcolor="white",
    plot_bgcolor='white',
    font_color="black",
)

<a id="4.3"></a>
## <b>4.3 <span style='color:SteelBlue'>UMAP Dimensionality Reduction</span></b>

In [None]:
sns.set(style="ticks", context="talk",font_scale = 1)
plt.style.use("default")

# Fit UMAP (default settings) on the columns up to and including 'month',
# then plot the embedding with points labelled by 'fraud_bool'.
mapper = umap.UMAP().fit(card_df_smote.loc[:,:'month']) 
umap.plot.points(mapper, labels=card_df_smote.loc[:,'fraud_bool'])

In [None]:
# Runs SMOTE on the training split again with the same seed, rebinding
# smote, X_train_smote and y_train_smote, and prints the shapes and class counts.
smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print('Feature/label dataset for training before applying SMOTE: ', X_train.shape, y_train.shape)
print('Feature/label dataset for training after applying SMOTE: ', X_train_smote.shape, y_train_smote.shape)
print('Distribution of label values after applying SMOTE:\n',pd.Series(y_train_smote).value_counts())



<a id="4.4"></a>
## <b>4.4 <span style='color:SteelBlue'>Analysing Results</span></b>

In [None]:
# Training frame for PyCaret: SMOTE-resampled features plus the target column.
train_df =pd.concat([X_train_smote,y_train_smote],axis=1)
# Evaluate on the untouched hold-out split. Resampling the test set with SMOTE
# would score the models on synthetic rows and an artificial class balance,
# so the reported metrics would not reflect performance on real data.
# The *_smote names are kept so the evaluation cells keep working unchanged.
X_test_smote, y_test_smote = X_test, y_test

In [None]:
# Donut chart of the class balance in the resampled training frame.
colors = ['SteelBlue', '#004B87']
labels = ['Non-Fraud','Fraud']
# Share of rows per 'fraud_bool' value, in value_counts() order.
values = train_df['fraud_bool'].value_counts() / train_df['fraud_bool'].shape[0]
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.3)])
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='white', width=2)))
# White background with black text for title, legend and labels.
fig.update_layout(
    title_text='<b>Credit Card Fraud</b>',
    title_font_color='black',
    title_font=dict(size=24),
    legend_title_font_color='black',
    paper_bgcolor='white',
    plot_bgcolor='white',
    font_color='black',
)

fig.show()

In [None]:
# Initialise the PyCaret classification experiment on train_df with
# 'fraud_bool' as target; PyCaret's own preprocessing is switched off
# (preprocess=False) and console output is suppressed (verbose=0).
classifier = setup(data=train_df, preprocess=False, target='fraud_bool', verbose=0)

**<span style="color:Navy">Observation:</span>**
    
By employing the Synthetic Minority Over-sampling Technique (SMOTE), we were able to effectively address the issue of class imbalance present in the dataset.

-------------------------------------------------------------------------------
<a id="5"></a>
# <b>5 <span style='color:SteelBlue'>|</span> Model Selection</b>

<a id="5.1"></a>
## <b>5.1 <span style='color:SteelBlue'>Creating Models</span></b>

In [None]:
# Train one baseline model per algorithm inside the active PyCaret experiment.
decisiontree = create_model('dt') # decision tree
gbm = create_model('gbc') # gradient boosting machine
logisticregression = create_model('lr') # logistic regression

<a id="5.2"></a>
## <b>5.2 <span style='color:SteelBlue'>Hyperparameter tuning</span></b>

In [None]:
# Tune each baseline model's hyperparameters, selecting on AUC.
tuned_decisiontree = tune_model(decisiontree, optimize = 'AUC') 
tuned_gradientboosting = tune_model(gbm, optimize = 'AUC')
tuned_logisticregression = tune_model(logisticregression, optimize = 'AUC')

<a id="5.3"></a>
## <b>5.3 <span style='color:SteelBlue'>Interpreting Models</span></b>

<a id="5.3.1"></a>
### <b>5.3.1 <span style='color:SteelBlue'>Decision Tree</span></b>

In [None]:
plt.figure(figsize=(8, 8))
# PyCaret 'boundary' (decision boundary) plot for the tuned decision tree.
plot_model(tuned_decisiontree, plot='boundary')

In [None]:
plt.figure(figsize=(8, 8))
# PyCaret 'learning' (learning curve) plot for the tuned decision tree.
plot_model(tuned_decisiontree, plot='learning')

In [None]:
plt.figure(figsize=(8, 8))
# PyCaret 'class_report' (classification report) plot for the tuned decision tree.
plot_model(tuned_decisiontree, plot='class_report')

<a id="5.3.2"></a>
### <b>5.3.2 <span style='color:SteelBlue'>Gradient Boosting Machine</span></b>

In [None]:
plt.figure(figsize=(8, 8))
# PyCaret 'boundary' (decision boundary) plot for the tuned gradient boosting model.
plot_model(tuned_gradientboosting, plot='boundary')


In [None]:
plt.figure(figsize=(8, 8))
# PyCaret 'learning' (learning curve) plot for the tuned gradient boosting model.
plot_model(tuned_gradientboosting, plot='learning')

In [None]:
plt.figure(figsize=(8, 8))
# PyCaret 'class_report' (classification report) plot for the tuned gradient boosting model.
plot_model(tuned_gradientboosting, plot='class_report')

<a id="5.3.3"></a>
### <b>5.3.3 <span style='color:SteelBlue'>Logistic Regression</span></b>

In [None]:
plt.figure(figsize=(8, 8))
# PyCaret 'boundary' (decision boundary) plot for the tuned logistic regression.
plot_model(tuned_logisticregression, plot='boundary')

In [None]:
plt.figure(figsize=(8, 8))
# PyCaret 'learning' (learning curve) plot for the tuned logistic regression.
plot_model(tuned_logisticregression, plot='learning')

In [None]:
plt.figure(figsize=(8, 8))
# PyCaret 'class_report' (classification report) plot for the tuned logistic regression.
plot_model(tuned_logisticregression, plot='class_report')

-------------------------------------------------------------------------------
<a id="6"></a>
# <b>6 <span style='color:SteelBlue'>|</span> Model Evaluation</b>

<a id="6.1"></a>
### <b>6.1 <span style='color:SteelBlue'>Decision Tree</span></b>

In [None]:
# Hard class predictions of the tuned decision tree on the evaluation features.
pred = tuned_decisiontree.predict(X_test_smote)

In [None]:
# Confusion matrix and headline metrics for the predictions currently in `pred`.
final_model_result = confusion_matrix(y_test_smote, pred)
accuracy = accuracy_score(y_test_smote, pred)
precision = precision_score(y_test_smote, pred)
recall = recall_score(y_test_smote, pred)
f1 = f1_score(y_test_smote, pred)
# Built from f-strings: the previous backslash-continued literal glued "F1"
# directly onto the recall value, with no space after the comma.
print(f'accuracy: {accuracy:.4f}, precision: {precision:.4f}, '
      f'recall: {recall:.4f}, F1: {f1:.4f}')

In [None]:
# Annotated heatmap of the confusion matrix, with integer cell labels.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(final_model_result, cmap = 'YlGnBu',annot = True, fmt='d')
ax.set_title('Confusion Matrix (Decision Tree Classifier)')

In [None]:
# Calculate ROC curve and AUC
# Score with the positive-class probability instead of the hard 0/1 labels in
# `pred`: with binary predictions there is only one operating point, so the
# resulting "curve" and AUC do not describe the model's ranking quality.
pred_proba = tuned_decisiontree.predict_proba(X_test_smote)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_smote, pred_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve against the diagonal of a random classifier
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

print('AUC: {0:.4f}'.format(roc_auc))

<a id="6.2"></a>
### <b>6.2 <span style='color:SteelBlue'>Gradient Boosting Machine</span></b>

In [None]:
# Hard class predictions of the tuned gradient boosting model on the evaluation features.
pred = tuned_gradientboosting.predict(X_test_smote)

In [None]:
# Confusion matrix and headline metrics for the predictions currently in `pred`.
final_model_result = confusion_matrix(y_test_smote, pred)
accuracy = accuracy_score(y_test_smote, pred)
precision = precision_score(y_test_smote, pred)
recall = recall_score(y_test_smote, pred)
f1 = f1_score(y_test_smote, pred)
# Built from f-strings: the previous backslash-continued literal glued "F1"
# directly onto the recall value, with no space after the comma.
print(f'accuracy: {accuracy:.4f}, precision: {precision:.4f}, '
      f'recall: {recall:.4f}, F1: {f1:.4f}')

In [None]:
# Annotated heatmap of the confusion matrix, with integer cell labels.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(final_model_result, cmap = 'YlGnBu',annot = True, fmt='d')
ax.set_title('Confusion Matrix (GBM Classifier)')

In [None]:
# Calculate ROC curve and AUC
# Score with the positive-class probability instead of the hard 0/1 labels in
# `pred`: with binary predictions there is only one operating point, so the
# resulting "curve" and AUC do not describe the model's ranking quality.
pred_proba = tuned_gradientboosting.predict_proba(X_test_smote)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_smote, pred_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve against the diagonal of a random classifier
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

print('AUC: {0:.4f}'.format(roc_auc))

<a id="6.3"></a>
### <b>6.3 <span style='color:SteelBlue'>Logistic Regression</span></b>

In [None]:
# Hard class predictions of the tuned logistic regression on the evaluation features.
pred = tuned_logisticregression.predict(X_test_smote) 

In [None]:
# Confusion matrix and headline metrics for the predictions currently in `pred`.
final_model_result = confusion_matrix(y_test_smote, pred)
accuracy = accuracy_score(y_test_smote, pred)
precision = precision_score(y_test_smote, pred)
recall = recall_score(y_test_smote, pred)
f1 = f1_score(y_test_smote, pred)
# Built from f-strings: the previous backslash-continued literal glued "F1"
# directly onto the recall value, with no space after the comma.
print(f'accuracy: {accuracy:.4f}, precision: {precision:.4f}, '
      f'recall: {recall:.4f}, F1: {f1:.4f}')

In [None]:
# Annotated heatmap of the confusion matrix, with integer cell labels.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(final_model_result, cmap = 'YlGnBu',annot = True, fmt='d')
ax.set_title('Confusion Matrix (Logistic Regression)')

In [None]:
# Calculate ROC curve and AUC
# Score with the positive-class probability instead of the hard 0/1 labels in
# `pred`: with binary predictions there is only one operating point, so the
# resulting "curve" and AUC do not describe the model's ranking quality.
pred_proba = tuned_logisticregression.predict_proba(X_test_smote)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_smote, pred_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve against the diagonal of a random classifier
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

print('AUC: {0:.4f}'.format(roc_auc))

-------------------------------------------------------------------------------
<a id="7"></a>
# <b>7 <span style='color:SteelBlue'>|</span> Final Comparison</b>

<a id="7.1"></a>
### <b>7.1 <span style='color:SteelBlue'>Evaluation Metrics</span></b>

In [None]:
# Calculate evaluation metrics for each model
models = [tuned_logisticregression, tuned_gradientboosting, tuned_decisiontree]
model_names = ['Logistic Regression', 'Gradient Boosting', 'Decision Tree']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

# One [accuracy, precision, recall, f1] row per model, in `models` order.
scores = []

for model in models:
    y_pred = model.predict(X_test_smote)
    accuracy = accuracy_score(y_test_smote, y_pred)
    precision = precision_score(y_test_smote, y_pred)
    recall = recall_score(y_test_smote, y_pred)
    f1 = f1_score(y_test_smote, y_pred)
    scores.append([accuracy, precision, recall, f1])

# One shade of blue per metric.
colors = plt.cm.Blues(np.linspace(0.2, 0.8, len(metrics)))

# Plot the bar plot
plt.figure(figsize=(10, 6))
x = np.arange(len(model_names))
width = 0.15

# Grouped bars: metric i is shifted right by i bar widths within each model group.
for i, metric in enumerate(metrics):
    plt.bar(x + (i * width), [score[i] for score in scores], width, label=metric, color=colors[i])

plt.xlabel('Models')
plt.ylabel('Score')
plt.title('Model Evaluation Metrics')
# Centre the tick under the four bars of each group (offsets 0..3 widths -> midpoint 1.5).
plt.xticks(x + (width * 1.5), model_names)
plt.legend()

plt.show()


<a id="7.2"></a>
### <b>7.2 <span style='color:SteelBlue'>Precision-Recall Curve</span></b>

In [None]:
# Calculate precision and recall for each model
models = [tuned_logisticregression, tuned_gradientboosting, tuned_decisiontree]
model_names = ['Logistic Regression', 'Gradient Boosting', 'Decision Tree']

plt.figure(figsize=(8, 6))

# One shade of blue per model.
colors = plt.cm.Blues(np.linspace(0.2, 0.8, len(models)))

for model, name, color in zip(models, model_names, colors):
    # Column 1 of predict_proba is used as the score for the positive class.
    y_pred_prob = model.predict_proba(X_test_smote)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test_smote, y_pred_prob)
    plt.plot(recall, precision, label=name, color=color)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower right')

plt.show()


<a id="7.3"></a>
### <b>7.3 <span style='color:SteelBlue'>Receiver Operating Characteristic (ROC)</span></b>

In [None]:
# Calculate the ROC curve and AUC for each model
models = [tuned_logisticregression, tuned_gradientboosting, tuned_decisiontree]
model_names = ['Logistic Regression', 'Gradient Boosting', 'Decision Tree']

plt.figure(figsize=(8, 6))

# One shade of blue per model.
colors = plt.cm.Blues(np.linspace(0.2, 0.8, len(models)))

for model, name, color in zip(models, model_names, colors):
    # Column 1 of predict_proba is used as the score for the positive class.
    y_pred_prob = model.predict_proba(X_test_smote)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test_smote, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})', color=color)

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')

plt.show()
