<a href="https://colab.research.google.com/github/YagyanshB/nhs-data-science/blob/main/smote_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Required Libraries:

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import warnings
warnings.filterwarnings('ignore')

from random import randrange, uniform
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score

# Uploading the Credit Card Fraud Detection dataset from Kaggle:

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/creditcardfraud


In [3]:
print(path)

/kaggle/input/creditcardfraud


In [4]:
# listing all files in the downloaded dataset directory

print(os.listdir(path))

['creditcard.csv']


In [5]:
# uploading the kaggle dataset into our dataframe format

df = pd.read_csv(f"{path}/creditcard.csv")

# Investigating Dataset:

In [6]:
# investigating the first few rows of our credit card fraud detection dataset

df.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


 Due to confidentiality issues, the dataset cannot provide the original features. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are ‘Time’ and ‘Amount’.

In [7]:
df.shape

print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns.')

The dataset has 284807 rows and 31 columns.


In [8]:
df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


From the above output we can see, there are significantly more negative samples than positive samples.

For simplicity and faster processing, we reduce the dimensionality of our dataset by dropping 'Time'.[link text](https://)

In [9]:
df = df.drop('Time', axis=1)

In [10]:
df.shape

(284807, 30)

# Splitting the dataset into features and labels.
This dataset doesn't have SMOTE applied.




In [11]:
X = df.drop('Class', axis=1)
y = df['Class']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialising our Random Forest Classifier.

In [21]:
# rf = RandomForestClassifier(n_estimators=100,
                            # max_depth = 10,
                            # n_jobs=-1, verbose=1, random_state=42)

# rf.fit(X_train, y_train)

In [22]:
# y_pred = rf.predict(X_test)

In [16]:
# using a subset to speed up random forest modelling

df_sample = df.sample(frac=0.1, random_state=42)  # 10% of the dataset
X = df_sample.drop('Class', axis=1)
y = df_sample['Class']d

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=100,
                            max_depth = 10,
                            n_jobs=-1, verbose=1, random_state=42)

rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    7.6s finished


In [17]:
y_pred = rf.predict(X_test)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished


In [18]:
confusion_matrix(y_test, y_pred)

array([[5687,    4],
       [   2,    4]])

In [19]:
recall_score(y_test, y_pred)

0.6666666666666666

# Implementing SMOTE Library:

In [28]:
!pip install imbalanced-learn



In [29]:
from imblearn.over_sampling import SMOTE

# Uploading the Credit Card Dataset Again:

In [30]:
# uploading the kaggle dataset into our dataframe format

df_smote = pd.read_csv(f"{path}/creditcard.csv")

In [32]:
df_smote = df_smote.drop('Time', axis=1)

# Splitting the Training and Testing Dataset:

In [33]:
X = df_smote.drop('Class', axis=1)
y = df_smote['Class']

# Executing the SMOTE Algorithm:

In [34]:
sm = SMOTE(random_state=42)

In [35]:
X_resampled, y_resampled = sm.fit_resample(X, y)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [42]:
confusion_matrix(y_test, y_pred)

array([[56738,    12],
       [    0, 56976]])

In [43]:
recall_score(y_test, y_pred)

1.0