In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [4]:
# 1. Read the cleaned credit-card dataset from disk
df = pd.read_csv("../data/creditcard_cleaned.csv")

# 2. Separate the label column ('Class') from the feature columns
y = df['Class']
X = df.drop(columns='Class')

# 3. Hold out 20% of the rows as a test set, stratified on the label.
#    The split is done BEFORE any resampling so that SMOTE is fitted on
#    training rows only and nothing synthetic leaks into the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
# 4. Resample the training split with SMOTE (the test split is left untouched)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Report the per-class row counts before and after resampling
print("Before SMOTE:", y_train.value_counts(), sep="\n")
print("\nAfter SMOTE:", y_train_smote.value_counts(), sep="\n")

Before SMOTE:
Class
0    226602
1       378
Name: count, dtype: int64

After SMOTE:
Class
0    226602
1    226602
Name: count, dtype: int64


In [6]:

# 5. Persist the SMOTE-augmented training set

# Rebuild features and label as frames, using the original feature column
# order and a single 'Class' column for the label.
features_resampled = pd.DataFrame(X_train_smote, columns=X.columns)
target_resampled = pd.DataFrame(y_train_smote, columns=['Class'])

# Place the label column to the right of the feature columns
train_smote_df = pd.concat(
    [features_resampled, target_resampled],
    axis="columns",
)

# Write without the index so the CSV holds only feature and label columns
train_smote_df.to_csv("../data/creditcard_train_SMOTE.csv", index=False)