In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import skew
from sklearn.metrics import average_precision_score, precision_recall_curve

from project_utils.autosave_plots import enable_autosave

In [None]:
# Turn on the project's plot autosave helper under the "eda" label, in quiet mode.
# NOTE(review): presumably this saves every figure shown below into a results
# directory — confirm against project_utils.autosave_plots.
enable_autosave("eda", quiet=True)

In [None]:
# Draw matplotlib figures inline in the notebook and ask the inline backend
# for the high-resolution "retina" figure format.
%matplotlib inline
%config InlineBackend.figure_format = "retina"

## Load the data

In [None]:
# Raw dataset location, relative to the notebook's working directory.
path = "../data/raw/creditcard.csv"
df = pd.read_csv(path)
# Preview the first rows as the cell output.
df.head()

## Exploratory Data Analysis

In [None]:
# Quick structural overview, displayed as a single tuple:
# (n_rows, n_columns), the dtype of every column, and the total number of missing cells.
df.shape, df.dtypes, df.isna().sum().sum()

In [None]:
# Validate class imbalance: count the rows per class and report the fraud share.
counts = df["Class"].value_counts().rename({0: "Non-fraud", 1: "Fraud"})
# Look the fraud count up by label instead of by position: value_counts() orders
# its rows by frequency, so a positional lookup only works while fraud happens to
# be the rarer class, and it raises when a class is missing. Default to 0 if the
# label is absent.
ratio = counts.get("Fraud", 0) / counts.sum()
print("Class counts:\n", counts)
print(f"\nFraud ratio: {ratio:.4%}")

### Exploring `Amount`

In [None]:
# Histogram of the raw Amount column (one of the non-PCA features).
fig, ax = plt.subplots()
sns.histplot(data=df, x="Amount", bins=60, ax=ax)
ax.set(title="Amount distribution")
plt.show()

In [None]:
# Skewness of the raw amounts (noted as very high). With scipy's defaults
# (bias=True) this is the Fisher-Pearson coefficient g_1.
raw_amount_skew = skew(df["Amount"])
print(f"Raw Amount skew: {raw_amount_skew}")

In [None]:
# Same histogram after a log1p transform, i.e. log(1 + Amount), which is
# defined for zero amounts as well.
log_amount = np.log1p(df["Amount"])
fig, ax = plt.subplots()
sns.histplot(log_amount, bins=60, ax=ax)
ax.set(title="Log(Amount) distribution")
plt.show()

In [None]:
# Skewness of the log1p-transformed amounts, for comparison with the raw value.
log_amount_skew = skew(np.log1p(df["Amount"]))
print(f"Log-transformed Amount skew: {log_amount_skew}")

In [None]:
# Raw and log1p-transformed amount side by side; each panel title carries the
# skewness of the series it shows.
raw_amount = df["Amount"]
log_amount = np.log1p(raw_amount)

fig, (ax_raw, ax_log) = plt.subplots(ncols=2, figsize=(12, 4))

sns.histplot(raw_amount, bins=60, ax=ax_raw)
ax_raw.set_title(f"Raw Amount (skew={skew(raw_amount):.2f})")

sns.histplot(log_amount, bins=60, ax=ax_log)
ax_log.set_title(f"Log(Amount) (skew={skew(log_amount):.2f})")

plt.show()

In [None]:
# Store the log1p-transformed amount as a new column on df. Later cells plot it,
# score it, and write it to the cleaned dataset at the end of the notebook.
df["AmountLog"] = np.log1p(df["Amount"])

### Exploring `Time`

In [None]:
# Amount and log-amount against the raw Time column, restricted to the first
# 1000 rows of the frame.
head_rows = df[:1000]
panels = [
    ("Amount", "Amount vs Time"),
    ("AmountLog", "Log(Amount) vs Time"),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (column, title) in zip(axes, panels):
    sns.lineplot(data=head_rows, x="Time", y=column, ax=ax)
    ax.set_title(title)
plt.show()

In [None]:
# Fold Time onto a 24-hour cycle and express the result in whole minutes.
# NOTE(review): assumes Time is measured in seconds and that Time == 0 falls on
# midnight — confirm against the dataset description.
seconds_into_day = df["Time"] % (24 * 3600)
df["MinuteOfDay"] = (seconds_into_day / 60).astype(int)

In [None]:
# Amount and log-amount against MinuteOfDay, using every row of the frame.
panels = [
    ("Amount", "Amount vs MinuteOfDay"),
    ("AmountLog", "Log(Amount) vs MinuteOfDay"),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (column, title) in zip(axes, panels):
    sns.lineplot(data=df, x="MinuteOfDay", y=column, ax=ax)
    ax.set_title(title)
plt.show()

### Fraud vs non-fraud overlays

In [None]:
def kde_by_class(feature: str, df: pd.DataFrame) -> None:
    """Overlay the kernel density estimate of one column for each class.

    Args:
        feature: Name of the column in ``df`` to plot.
        df: Frame holding ``feature`` and a ``Class`` column; rows with
            ``Class == 0`` are drawn as "Non-fraud" (blue) and rows with
            ``Class == 1`` as "Fraud" (red).

    Each class is handed to ``sns.kdeplot`` on its own, so each curve is
    estimated from that class's rows only. The figure is shown and nothing
    is returned.
    """
    groups = (
        (0, "Non-fraud", "blue"),
        (1, "Fraud", "red"),
    )
    fig, ax = plt.subplots()
    for class_value, label, color in groups:
        subset = df[df["Class"] == class_value]
        sns.kdeplot(data=subset, x=feature, label=label, ax=ax, color=color)
    ax.set_title(f"KDE: {feature} by class")
    ax.legend()
    plt.show()


# Every column except Time, Class and the raw Amount gets a KDE overlay. Besides
# the remaining original columns this also covers the derived AmountLog and
# MinuteOfDay columns added to df earlier in the notebook.
excluded = {"Time", "Class", "Amount"}
kde_dimensions = sorted(name for name in df.columns if name not in excluded)

for col in kde_dimensions:
    kde_by_class(col, df)

In [None]:
# Average precision (AUPRC) of each feature when its raw values are used directly
# as the fraud score, i.e. without any sign correction.
y = df["Class"]
results = [
    {"feature": feature, "aps": average_precision_score(y, df[feature])}
    for feature in kde_dimensions
]

precision_scores = pd.DataFrame(results).sort_values("aps").reset_index(drop=True)

# Bar chart of the scores in ascending order.
fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(data=precision_scores, x="feature", y="aps", ax=ax)
plt.xticks(rotation=90)
ax.set_title("Average Precision Recall by Feature")
fig.tight_layout()
plt.show()

In [None]:
# Sign-corrected average precision (AUPRC) of each feature used as a fraud score.
y = df["Class"]

# Split the frame by class once up front, rather than re-filtering the whole
# frame twice per feature inside the loop.
non_fraud = df[y == 0]
fraud = df[y == 1]

results = []
for feature in kde_dimensions:
    # average_precision_score treats larger scores as more positive, so negate
    # any feature whose non-fraud mean is higher than its fraud mean.
    if non_fraud[feature].mean() > fraud[feature].mean():
        scores = -df[feature]
    else:
        scores = df[feature]
    aps = average_precision_score(y, scores)
    results.append({"feature": feature, "aps": aps})

precision_scores = pd.DataFrame(results).sort_values("aps").reset_index(drop=True)

# plot scores
fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(data=precision_scores, x="feature", y="aps", ax=ax)

# Reference line at the fraud share of the data (mean of the 0/1 Class column).
baseline = df["Class"].mean()
ax.axhline(y=baseline, color="gray", linestyle="--", linewidth=2)

# Anchor the label on the line and lift it by a fixed number of points. On a
# linear axis a multiplicative offset such as baseline * 1.3 barely moves the
# text when the baseline is small relative to the axis range, leaving it on top
# of the line.
ax.annotate(
    f"Baseline = {baseline:.4f}",
    xy=(-0.5, baseline),
    xytext=(0, 4),
    textcoords="offset points",
    color="black",
    fontsize=9,
)
plt.xticks(rotation=90)
plt.title("Average Precision Recall by Feature (sign corrected)")
plt.tight_layout()
plt.show()

In [None]:
# Sign-corrected average precision (AUPRC) per feature, drawn on a log-scaled
# y-axis.
y = df["Class"]

# Split the frame by class once up front, rather than re-filtering the whole
# frame twice per feature inside the loop.
non_fraud = df[y == 0]
fraud = df[y == 1]

results = []
for feature in kde_dimensions:
    # average_precision_score treats larger scores as more positive, so negate
    # any feature whose non-fraud mean is higher than its fraud mean.
    if non_fraud[feature].mean() > fraud[feature].mean():
        scores = -df[feature]
    else:
        scores = df[feature]
    aps = average_precision_score(y, scores)
    results.append({"feature": feature, "aps": aps})

precision_scores = pd.DataFrame(results).sort_values("aps").reset_index(drop=True)

# plot scores
fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(data=precision_scores, x="feature", y="aps", ax=ax)

# Reference line at the fraud share of the data (mean of the 0/1 Class column).
baseline = df["Class"].mean()
ax.axhline(y=baseline, color="gray", linestyle="--", linewidth=2)
ax.set_yscale("log")

# On a log axis a multiplicative offset is a constant visual distance, so
# baseline * 1.3 places the label slightly above the line.
ax.text(
    x=-0.5,
    y=baseline * 1.3,
    s=f"Baseline = {baseline:.4f}",
    color="black",
    fontsize=9,
)
plt.xticks(rotation=90)
plt.title("Average Precision Recall by Feature (sign corrected, log-scaled)")
plt.tight_layout()
plt.show()

In [None]:
# Display the per-feature scores as a table, highest average precision first,
# with a fresh 0..n-1 index.
precision_scores.sort_values(by="aps", ascending=False).reset_index(drop=True)

In [None]:
# Precision-recall curve for V14 used on its own as a fraud score. The feature is
# negated before scoring so that larger scores count towards the fraud class.
# NOTE(review): the negation presumably follows from the sign-corrected ranking
# computed earlier in the notebook — confirm.
v14_scores = -df["V14"]
aps = average_precision_score(df["Class"], v14_scores)
precision_v14, recall_v14, _ = precision_recall_curve(df["Class"], v14_scores)

# Recall goes on the x-axis and precision on the y-axis so the curve matches the
# axis labels. The arrays were previously passed in the opposite order, which
# drew the curve transposed.
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(recall_v14, precision_v14, lw=2, label=f"V14 | APS = {aps:.3f}")
ax.fill_between(recall_v14, precision_v14, alpha=0.20)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.legend()
ax.set_title("Area Under Precision-Recall Curve | V14")
plt.show()

## Clean and save dataset

In [None]:
# Keep only what is needed downstream: the columns whose name starts with "V",
# the Class label, and the two engineered features (AmountLog, MinuteOfDay).
# str.startswith is used instead of indexing the first character, which raises
# on an empty column label.
cols = [col for col in df.columns if col.startswith("V")] + [
    "Class",
    "AmountLog",
    "MinuteOfDay",
]
# Copy so the saved frame is independent of df, then write it without the index.
df_clean = df[cols].copy()
df_clean.to_csv("../data/processed/creditcard_clean.csv", index=False)