In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: apply the ggplot style sheet first, then set seaborn's
# "Set2" palette on top of it (order kept as in the original cell).
plt.style.use("ggplot")
sns.set_palette("Set2")

# Read the raw CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/raw/mf_nf.csv")
print(f"Dataset shape: {df.shape}")

# Preview the first five rows as the cell's rendered output.
df.head(n=5)



ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Structural overview of the frame (printed by pandas).
df.info()

# Summary statistics, transposed so that each described column is one row.
df.describe().transpose()


In [None]:
# Null count per column; only columns with at least one null are printed.
missing = df.isna().sum()
has_missing = missing > 0
print("Missing values per column:\n", missing[has_missing])

# Number of rows flagged by DataFrame.duplicated() as repeats of an earlier row.
n_duplicates = df.duplicated().sum()
print("\nDuplicates:", n_duplicates)

In [None]:
# Bar chart of the number of rows in each isFraud class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x="isFraud", data=df, ax=ax)
ax.set_title("Fraud vs Non-Fraud Transactions")
ax.set_xlabel("isFraud (0 = Non-Fraud, 1 = Fraud)")
ax.set_ylabel("Count")
plt.show()

# The same split expressed as a percentage of all rows.
fraud_ratio = df["isFraud"].value_counts(normalize=True).mul(100)
print(f"Fraud ratio:\n{fraud_ratio}")


In [None]:
# Number of transactions per type, split by fraud label.
fig, ax = plt.subplots(figsize=(7, 4))

# Pin the hue order so the legend relabelling below cannot drift out of sync
# with the bars (e.g. if one class is absent or the levels get ordered
# differently).
sns.countplot(x="type", hue="isFraud", hue_order=[0, 1], data=df, ax=ax)
ax.set_title("Transaction Type vs Fraud")
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Count")

# Relabel seaborn's own legend handles explicitly instead of passing bare
# labels and letting matplotlib pick which artists they are attached to.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, ["Non-Fraud", "Fraud"])
plt.show()


In [None]:
# Histogram (50 bins) of the amount column with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(7, 4))
sns.histplot(df["amount"], bins=50, kde=True, ax=ax)
ax.set_title("Transaction Amount Distribution")
ax.set_xlabel("Amount")
plt.show()

# Box plot of amount for each isFraud value.
fig, ax = plt.subplots(figsize=(7, 4))
sns.boxplot(x="isFraud", y="amount", data=df, ax=ax)
ax.set_title("Transaction Amount vs Fraud")
ax.set_xlabel("Fraud Label")
ax.set_ylabel("Amount")
ax.set_yscale("log")  # log scale because of outliers
plt.show()


In [None]:
# Derived columns: old balance minus new balance for each side of the
# transaction. They are added to df itself, so later cells that work on df
# see them too.
df["orig_balance_change"] = df["oldbalanceOrg"].sub(df["newbalanceOrig"])
df["dest_balance_change"] = df["oldbalanceDest"].sub(df["newbalanceDest"])

# One histogram (50 bins, KDE overlay) per derived column.
balance_change_plots = [
    ("orig_balance_change", "Original Balance Change Distribution"),
    ("dest_balance_change", "Destination Balance Change Distribution"),
]
for column, title in balance_change_plots:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[column], bins=50, kde=True, ax=ax)
    ax.set_title(title)
    plt.show()


In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Pairwise correlations (pandas' default method) across the numeric columns
# of df; non-numeric columns are excluded by the dtype filter.
numeric_df = df.select_dtypes(include=[np.number])
corr = numeric_df.corr()

# Colour-coded matrix without per-cell annotations.
sns.heatmap(corr, cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Correlation Heatmap of Numeric Features")
plt.show()


In [None]:
# Correlation of every numeric column with isFraud, largest value first.
corr_fraud = corr["isFraud"].sort_values(ascending=False)

# isFraud always correlates 1.0 with itself, so leave it out of the preview
# to keep all ten displayed rows informative. corr_fraud itself is unchanged.
corr_fraud.drop(labels="isFraud").head(10)


In [None]:
# Pair plot on a seeded random sample of rows (presumably sampled to keep
# the plot fast on a large frame).
PAIRPLOT_COLUMNS = ["amount", "oldbalanceOrg", "newbalanceOrig", "isFraud"]
PAIRPLOT_SAMPLE_SIZE = 2000

# Cap the request at the number of rows available: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without replacement).
sample_size = min(PAIRPLOT_SAMPLE_SIZE, len(df))
subset = df.sample(sample_size, random_state=42)[PAIRPLOT_COLUMNS]

sns.pairplot(subset, hue="isFraud", diag_kind="kde")
plt.suptitle("Pairwise Relationships (sample)", y=1.02)
plt.show()
