In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the transactions CSV from the working directory into `df`.
df = pd.read_csv(
    "creditcard.csv",
    low_memory=False,
)

In [None]:
def label_count_with_percentage(column: pd.Series):
    """Print how often each distinct value occurs in ``column``.

    One line is printed per distinct value, in the order produced by
    ``Series.value_counts()``, formatted as ``<label> = <count> (<pct>%)``.
    ``<pct>`` is the count divided by ``len(column)``, times 100, rounded to
    four decimals. Nothing is returned.
    """
    total = len(column)
    for label, count in column.value_counts().to_dict().items():
        percentage = round((count / total) * 100, 4)
        print(f"{label} = {count} ({percentage}%)")

In [None]:
# The helper expects a single Series. Passing the whole frame would make
# value_counts() count unique *rows* rather than class labels.
label_count_with_percentage(df["Class"])

In [None]:
# Bar chart with one bar per distinct Class value.
class_count_ax = sns.countplot(data=df, x='Class')
class_count_ax.set_title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)

In [None]:
from typing import List
import random
import math


def plot_distributions(column_list: List[str], df: pd.DataFrame):
    """Plot the distribution of each listed column on a two-column grid.

    Parameters
    ----------
    column_list : list of str
        Names of the columns of ``df`` to plot; one subplot is drawn per name,
        filling the grid row by row.
    df : pd.DataFrame
        Frame that holds the columns.

    Notes
    -----
    The grid has ``ceil(len(column_list) / 2)`` rows, so an odd number of
    columns still gets a row for the last plot; the unused trailing cell is
    hidden. A single row works as well. Nothing is drawn for an empty list.
    Each plot gets a colour picked at random from the current seaborn palette.
    """
    n_plots = len(column_list)
    if n_plots == 0:
        return

    n_cols = 2
    n_rows = math.ceil(n_plots / n_cols)
    # squeeze=False keeps `axes` two-dimensional even when there is only one
    # row; flattening it then gives one axis per plot in row-major order.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 15), squeeze=False)
    flat_axes = axes.ravel()
    palette = sns.color_palette()

    for axis, column in zip(flat_axes, column_list):
        # Drop missing values so the axis limits below are well defined.
        values = df[column].dropna().to_numpy()
        colour = random.choice(palette)
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # consider migrating to sns.histplot(..., kde=True, stat="density").
        sns.distplot(values, ax=axis, color=colour)
        axis.set_title(f'Distribution of {column}', fontsize=14)
        if values.size:
            # ndarray.min()/max() avoid a Python-level scan of a large array.
            axis.set_xlim([values.min(), values.max()])

    # Hide the cell left over when the number of plots is odd.
    for axis in flat_axes[n_plots:]:
        axis.set_visible(False)

    plt.tight_layout()
    plt.show()


# Draw the distributions of four columns of df (two rows of two subplots).
plot_distributions(["Amount", "V1", "Time", "V2"], df)

## Scale the Time and Amount features, as the other features are already scaled

In [None]:
# Check for outliers first: whether they are present helps decide which scaler to use.

def detect_outliers_iqr(column: pd.Series):
    """Return the entries of ``column`` that fall outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles computed with ``np.percentile``. Values
    strictly below the lower fence or strictly above the upper fence are
    returned, keeping their original index labels.
    """
    first_quartile, third_quartile = (np.percentile(column, q) for q in (25, 75))
    fence_width = 1.5 * (third_quartile - first_quartile)

    below_lower_fence = column < first_quartile - fence_width
    above_upper_fence = column > third_quartile + fence_width

    return column[below_lower_fence | above_upper_fence]


# Report how many values the IQR rule flags in Time and in Amount.
for feature in ("Time", "Amount"):
    print(f"Number of Outlires in {feature} = ", len(detect_outliers_iqr(df[feature])))

In [None]:
# Check for outliers with a box plot.
# The data is passed as `y` to get a vertical box: combining `x=` with
# orient="v" is contradictory, and seaborn ignores the requested orientation
# when only `x` is supplied.
sns.boxplot(y=df.Time)
plt.show()

In [None]:
# Vertical box plot of the Amount column, drawn on an explicitly created axis.
_, amount_ax = plt.subplots()
sns.boxplot(y=df["Amount"], ax=amount_ax)
plt.show()

In [None]:
# Strip plot of every Amount value along the x axis.
strip_style = {
    "color": "crimson",  # dot colour
    "linewidth": 1,      # dot outline width
    "alpha": 0.4,        # partial transparency
}
sns.stripplot(data=df, x="Amount", y=None, **strip_style)

In [None]:
from sklearn.preprocessing import RobustScaler

# Outliers are present, so the columns are scaled with RobustScaler.
SCALED_COLUMNS = ['Amount', 'Time']
scaler = RobustScaler()

# Fit once on both columns. RobustScaler treats every column independently, so
# the scaled values match two separate fits, but `scaler` now keeps the fitted
# parameters of BOTH columns instead of being overwritten by the Time fit.
df[SCALED_COLUMNS] = scaler.fit_transform(df[SCALED_COLUMNS])

# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so held-out rows influence the scaling statistics. Consider fitting on
# the training partition only and applying transform() to the rest.

In [None]:
# Split the data set into two partitions (80% / 20%).
#
# Any fixed integer random_state yields the same split on every run; without
# it the split is still random but changes from run to run.

X = df.drop('Class', axis=1)
y = df['Class']

# NOTE(review): train_test_split returns (X_train, X_test, y_train, y_test).
# With the unpacking order below, the "*test" names therefore hold the 80%
# partition and the "*train" names hold the 20% partition. The names are kept
# as they are because the following cells build the training frame from
# original_Xtest / original_ytest; rename all of them together in one change.
#
# stratify=y keeps the fraud ratio the same in both partitions, which matters
# because the positive class is extremely rare in this data.
original_Xtest, original_Xtrain, original_ytest, original_ytrain = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# The split is done before under-sampling so that the held-out partition keeps
# the class distribution of the original dataset.

# Next, the partition used for training is under-sampled and the model is
# trained on it.

In [None]:
# Class balance of original_ytest. Despite its name, this is the first label
# partition returned by train_test_split, i.e. the larger (80%) one.
label_count_with_percentage(original_ytest)

In [None]:
# Class balance of original_ytrain. Despite its name, this is the second label
# partition returned by train_test_split, i.e. the smaller (20%) one.
label_count_with_percentage(original_ytrain)

In [None]:
# Build the training frame as a NEW DataFrame. Plain assignment would only
# alias original_Xtest, and adding the "Class" column would then silently put
# the label into that feature matrix as well.
train_df = original_Xtest.assign(Class=original_ytest)

In [None]:
# Rows of the training frame whose Class is 1, followed by how many there are.
is_fraud = train_df["Class"] == 1
fraud_df = train_df.loc[is_fraud]
len(fraud_df)

In [None]:
# Rows of the training frame whose Class is 0, followed by how many there are.
is_non_fraud = train_df["Class"] == 0
non_fraud_df = train_df.loc[is_non_fraud]
len(non_fraud_df)

In [None]:
# Under-sample to a 1:1 class ratio.
# The non-fraud rows are drawn with an explicit, seeded random sample instead
# of taking the first N rows, so the selection does not depend on an earlier
# shuffle. min() keeps the old behaviour of not failing when there are fewer
# non-fraud than fraud rows.
n_per_class = min(len(fraud_df), len(non_fraud_df))
non_fraud_sample = non_fraud_df.sample(n=n_per_class, random_state=42)

# Shuffle the combined rows so the frame is not ordered by class. An ordered
# frame would bias any later step that slices it without shuffling.
under_smpled_df = pd.concat([fraud_df, non_fraud_sample]).sample(
    frac=1, random_state=42)

In [None]:
# Print the class counts and percentages of the under-sampled frame.
label_count_with_percentage(under_smpled_df.Class)

### Exploring correlations in our under-sampled dataframe

In [None]:
# Pairwise correlations of the under-sampled frame, rounded to two decimals
# and drawn as an annotated heatmap.
corr_data = under_smpled_df.corr().round(2)
fig, ax = plt.subplots(figsize=(20, 8))
sns.heatmap(data=corr_data, annot=True, cmap='coolwarm_r', ax=ax)
plt.show()

In [None]:
def find_strong_correlations(df, target_column, threshold=0.4):
    """List the columns whose correlation with ``target_column`` is strong.

    The correlation matrix comes from ``df.corr()``. Every column of ``df``
    other than ``target_column`` is checked in column order: a score of at
    least ``threshold`` counts as a strong positive correlation, a score of at
    most ``-threshold`` as a strong negative one. Each hit is printed.

    Returns
    -------
    tuple of (list, list)
        Names of the strongly positively correlated columns, then names of the
        strongly negatively correlated columns.
    """
    target_corr = df.corr()[target_column]
    positive_corr_cols, negative_corr_cols = [], []

    for col in df.columns:
        if col == target_column:
            continue
        score = target_corr[col]
        if score >= threshold:
            print("Strong positive Correlation--->", col, " = ", score)
            positive_corr_cols.append(col)
        elif score <= -threshold:
            print("Strong negative Correlation--->", col, " = ", score)
            negative_corr_cols.append(col)

    return positive_corr_cols, negative_corr_cols


# Columns of the under-sampled frame that correlate strongly with "Class",
# using the function's default threshold.
positive, negative = find_strong_correlations(under_smpled_df, "Class")

In [None]:
def box_plot_corr(df: pd.DataFrame, col_list: List[str], label_name: str):
    """Show one box plot per column in ``col_list``, grouped by ``label_name``.

    Each column gets its own 12x4 figure with ``label_name`` on the x axis and
    the column's values on the y axis. Every figure is shown as soon as it is
    drawn.
    """
    for feature in col_list:
        _, axis = plt.subplots(nrows=1, ncols=1, figsize=(12, 4))
        sns.boxplot(data=df, x=label_name, y=feature, ax=axis)
        plt.show()

# box_plot_corr(under_smpled_df,negative,"Class")

### Feature Selection

In [None]:
from sklearn.feature_selection import mutual_info_classif

X = under_smpled_df.drop('Class', axis=1)
y = under_smpled_df.Class

# A fixed random_state makes the scores reproducible: the estimator is
# randomised, so unseeded runs give slightly different values.
importances = mutual_info_classif(X, y, random_state=42)

# Label the scores with X's own columns, so the mapping stays correct even if
# "Class" is not the last column of the frame.
feat_importance = pd.Series(importances, index=X.columns)
feat_importance.plot(kind='barh', color='teal')

In [None]:
# Show every column label of df except the last one.
df.columns[:-1]