In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# --- Plot appearance ---------------------------------------------------------
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

# --- Locations ---------------------------------------------------------------
# The project root is taken to be the parent of the current working directory;
# data files live in its "data" subfolder, which is created here if missing.
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT.joinpath("data")
DATA_PATH.mkdir(exist_ok=True)

# --- Train/test split settings -----------------------------------------------
TEST_SIZE = 0.2      # passed as `test_size` to train_test_split
RANDOM_STATE = 42    # passed as `random_state` so the split is repeatable

In [None]:
# Load the raw transactions table from the data folder.
raw_csv_path = DATA_PATH / "creditcard.csv"
df = pd.read_csv(raw_csv_path)

# Quick look at the data: dimensions first, then the leading rows.
print(f"Shape: {df.shape}")

print("\nHead:")
display(df.head())

# Column dtypes and non-null counts.
print("\nInfo:")
df.info()

In [None]:
# Count rows per label; index 0 and 1 are looked up by label below.
class_counts = df['Class'].value_counts()
fraud_percentage = (class_counts[1] / class_counts.sum()) * 100

print(f"Legitimate transactions (Class=0): {class_counts[0]}")
print(f"Fraud transactions (Class=1): {class_counts[1]}")
print(f"Fraud percentage: {fraud_percentage:.4f}%")

# Bar chart of the number of rows in each class, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Class', data=df, ax=ax)
ax.set(
    title='Classes distribution: 0 - Legitimate, 1 - Fraud',
    ylabel='Transactions',
    xlabel='Classes',
)
plt.show()

In [None]:
# Side-by-side histograms of the two columns that are dropped or rescaled later.
fig, axes = plt.subplots(ncols=2, figsize=(18, 6))
ax1, ax2 = axes

panels = (
    (ax1, 'Amount', 'Amount distribution'),
    (ax2, 'Time', 'Time distribution'),
)
for axis, column, title in panels:
    sns.histplot(df[column], bins=100, ax=axis)
    axis.set_title(title)

# Only the Amount panel uses a logarithmic count axis.
ax1.set_yscale('log')

plt.show()

In [None]:
# Work on a copy so the raw frame stays untouched and this cell can be re-run.
df_processed = df.copy()

# Fit the scaler on the training rows only. Fitting on every row would let the
# mean/std of the held-out test rows leak into the training features.
# The row positions are derived with the same TEST_SIZE, RANDOM_STATE and
# stratification labels that the train/test split cell uses, so the partition
# computed here matches the one produced there (same row count, same labels,
# same seed).
row_positions = np.arange(len(df_processed))
train_positions, _ = train_test_split(
    row_positions,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=df_processed['Class'],
)

amount_values = df_processed['Amount'].to_numpy().reshape(-1, 1)

scaler = StandardScaler()
scaler.fit(amount_values[train_positions])

# Apply the train-fitted transformation to all rows (train and test alike).
df_processed['scaled_Amount'] = scaler.transform(amount_values).ravel()

# The raw Time and Amount columns are not kept as model features.
df_processed = df_processed.drop(['Time', 'Amount'], axis=1)

# Move the target to the last column: pop removes it, re-assigning appends it.
target_col = df_processed.pop('Class')
df_processed['Class'] = target_col

print("Processed DataFrame:")
display(df_processed.head())

In [None]:
# Separate the feature matrix from the target column.
y = df_processed['Class']
X = df_processed.drop(columns='Class')

# Stratify on the target so both parts keep the same class proportions.
split_options = dict(
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Sanity check: sizes of both parts.
print(f"X_train Shape: {X_train.shape}")
print(f"X_test Shape: {X_test.shape}")

# Sanity check: the share of Class == 1 rows should be similar in both parts.
print(f"\ny_train Fraud percentage: {y_train.mean() * 100:.4f}%")
print(f"y_test Fraud percentage: {y_test.mean() * 100:.4f}%")

In [None]:
# Persist the four split parts as Feather files in the data folder.
#
# After the shuffled split each part still carries the row labels of the
# original frame, i.e. a non-default index. pandas documents `to_feather` as
# requiring a default index, so the index is reset before writing. The same
# reset is applied to features and targets, so row i of X_* still corresponds
# to row i of y_*; the original row labels are intentionally not stored.
X_train.reset_index(drop=True).to_feather(DATA_PATH / 'X_train.feather')
X_test.reset_index(drop=True).to_feather(DATA_PATH / 'X_test.feather')

# Targets are Series; convert each to a one-column frame before writing.
y_train.to_frame().reset_index(drop=True).to_feather(DATA_PATH / 'y_train.feather')
y_test.to_frame().reset_index(drop=True).to_feather(DATA_PATH / 'y_test.feather')