In [2]:
# Importing required libraries
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [3]:
# Read the raw credit-card transactions CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/raw/creditcard.csv")


In [4]:
# Split the frame into the label column and the remaining feature columns.
y = df["Class"]
X = df.drop(columns="Class")


The target variable `Class` flags fraudulent transactions, which are rare but high-impact events.

In [5]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [6]:
# Standardize the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


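Feature scaling puts all transaction features on a comparable scale so that no single feature dominates distance-based or gradient-based models. The scaler is fitted on the training split only and then applied to the test split, so no test-set statistics leak into training.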

In [9]:
# Handle class imbalance by oversampling with SMOTE.
# Needs the `imblearn` module; the original install hint was `!pip install imblearn`.
from imblearn.over_sampling import SMOTE

# Only the scaled training split is resampled; the test split is not passed in.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)


In [10]:
# Saving processed data
# Persist every processed array as a .npy file under ../data/processed.
processed_dir = Path("../data/processed")

# np.save does not create missing directories, so make sure the target
# folder exists before writing (a fresh checkout may not have it yet).
processed_dir.mkdir(parents=True, exist_ok=True)

processed_arrays = {
    # Scaled splits at the original class distribution (same file names as before).
    "X_train": X_train_scaled,
    "X_test": X_test_scaled,
    "y_train": y_train,
    "y_test": y_test,
    # SMOTE-balanced training set: it was computed by the resampling cell but
    # never written out, so the imbalance handling was lost to downstream
    # steps. The test split is deliberately saved un-resampled only.
    "X_train_resampled": X_train_resampled,
    "y_train_resampled": y_train_resampled,
}

for array_name, array in processed_arrays.items():
    np.save(processed_dir / f"{array_name}.npy", array)
