In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

# ------------------------
# STEP 1: Load data
# ------------------------
# Read the raw credit-card transactions CSV into a DataFrame. No index column
# is specified, so pandas assigns its default integer index.
df = pd.read_csv(
    filepath_or_buffer="/Users/akashbhat/credit-card-fraud-detection/data/raw/creditcard.csv",
)

# ------------------------
# STEP 2: Train-test split
# ------------------------
# Split BEFORE fitting the scaler. Fitting StandardScaler on the whole dataset
# lets the test rows' Amount mean/std leak into the training features and into
# the persisted scaler, which makes held-out metrics optimistic.
# "Amount" is kept in this intermediate split only so the scaler can be fitted
# on the training rows; it is dropped from the final feature matrices below.
y = df["Class"]  # target
raw_train, raw_test, y_train, y_test = train_test_split(
    df.drop(columns=["Class", "Time"]),
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ------------------------
# STEP 3: Scale Amount (statistics from the training rows only)
# ------------------------
scaler = StandardScaler()
scaler.fit(raw_train[["Amount"]])

# Apply the train-fitted scaling to every row so that `df` still exposes an
# "Amount_scaled" column, as it did before.
df["Amount_scaled"] = scaler.transform(df[["Amount"]])

# Drop unused columns
X = df.drop(columns=["Class", "Time", "Amount"])  # features

# Select the same rows (in the same order) that the split above produced.
# NOTE: label-based selection relies on `df` having unique index labels, which
# holds for the default index created when the CSV is read without index_col.
X_train = X.loc[raw_train.index]
X_test = X.loc[raw_test.index]

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# ------------------------
# STEP 4: Save preprocessing artifacts
# ------------------------
# Save scaler (fitted on the training split only, so it can be reused at
# inference time without having seen any test data)
joblib.dump(scaler, "/Users/akashbhat/credit-card-fraud-detection/data/scaler.pkl")

# Persist the training feature names in their exact column order so that
# inference code can align incoming data to the same layout.
feature_columns = list(X_train.columns)
joblib.dump(
    value=feature_columns,
    filename="/Users/akashbhat/credit-card-fraud-detection/data/feature_columns.pkl",
)

# Also persist the four split objects together as a single tuple, in the order
# (X_train, X_test, y_train, y_test).
joblib.dump(
    value=(X_train, X_test, y_train, y_test),
    filename="/Users/akashbhat/credit-card-fraud-detection/data/processed_data.pkl",
)

print("✅ Preprocessing complete! Scaler + feature_columns saved.")
