In [3]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv("../data/creditcard.csv")

In [5]:
scaler = StandardScaler()
df['Scaled_Amount'] = scaler.fit_transform(df[['Amount']])
df['Scaled_Time'] = scaler.fit_transform(df[['Time']])

df = df.drop(['Amount','Time'],axis=1)

# Handling Imbalance

In [6]:
print("Class Distribution")
print(df['Class'].value_counts())

Class Distribution
Class
0    284315
1       492
Name: count, dtype: int64


In [7]:
X = df.drop('Class',axis=1)
Y = df['Class']

print(f"Feature Shape {X.shape}")
print(f"Output Shape {Y.shape}")

Feature Shape (284807, 30)
Output Shape (284807,)


In [8]:
smote = SMOTE(random_state=42)
X_resampled, Y_resampled = smote.fit_resample(X,Y)
print('After Smote - ')
print(Y_resampled.value_counts())


After Smote - 
Class
0    284315
1    284315
Name: count, dtype: int64


In [9]:
X_train, X_test, Y_train , Y_test = train_test_split(X_resampled , Y_resampled , test_size = 0.3, random_state=42, stratify = Y_resampled)

print(f"Training set size : {X_train.shape}")
print(f"Testing Set size : {X_test.shape}")

Training set size : (398041, 30)
Testing Set size : (170589, 30)


In [13]:
import joblib
import os

os.chdir("..")
os.makedirs("data/processed_data", exist_ok=True)

joblib.dump(X_train, "data/processed_data/X_train.pkl")
joblib.dump(X_test, "data/processed_data/X_test.pkl")
joblib.dump(Y_train, "data/processed_data/y_train.pkl")
joblib.dump(Y_test, "data/processed_data/y_test.pkl")


['data/processed_data/y_test.pkl']