# Load libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import joblib

import warnings
warnings.filterwarnings('ignore')

# Load sample dataset

In [None]:
# Read the raw sample CSV (the path is relative to the working directory)
# and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../data/raw/sample.csv")
display(df.head(n=5))

In [None]:
# Show every column name as a plain Python list.
column_names = df.columns.tolist()
print(column_names)

# Data cleaning and wrangling

In [None]:
# Step 3: Data Cleaning & Wrangling
# Print pandas' summary of the frame (per-column dtype and non-null count).
df.info()

In [None]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each,
# then preview the result.
df = df.drop_duplicates(keep="first")
df.head(n=5)

In [None]:
# Dimensions of df as a (rows, columns) tuple.
df.shape

# Exploratory Data Analysis 

In [None]:
# Class balance: number of rows for each value of the target column.
count_ax = sns.countplot(x='isFraud', data=df)
count_ax.set_title('Fraud vs Non-Fraud Transactions')
plt.show()

# Correlation heatmap restricted to numeric columns.
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Matrix")
plt.show()

# Feature Engineering & Preprocessing

In [None]:
# Target vector: the fraud label column.
y = df['isFraud']
# Feature matrix: every other column, with categorical columns one-hot
# encoded and the first level of each category dropped.
X = pd.get_dummies(df.drop(columns=['isFraud']), drop_first=True)

In [None]:
# Standardize every feature column (zero mean, unit variance).
# NOTE(review): this fit sees every row of X, i.e. it runs before any
# train/test split has been made.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Train-test split (70/30, stratified on the label, seeded for repeatability).
# The split is done on the unscaled features so that the scaler can be fitted
# on the training rows only; fitting it on the full dataset would let
# test-set means/variances leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Learn the scaling parameters from the training rows, then apply the same
# transform to the held-out rows. Both results are NumPy arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Model Training & Evaluation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import joblib

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Candidate classifiers, keyed by the display name used in the reports.
# random_state is pinned on every estimator that draws random numbers so that
# re-running the notebook reproduces the same metrics (the train/test split is
# seeded with 42 as well). KNN is deterministic, and LogisticRegression with
# its default solver does not use the seed.
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
}

# Fit each model on the training split and report its test-split metrics.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the second class.
    y_score = model.predict_proba(X_test)[:, 1]
    print(f"\n{name}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_score))

# Visual ROC Curve

In [None]:
# Overlay one ROC curve per fitted model on a single set of axes.
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))

for name, model in models.items():
    y_probs = model.predict_proba(X_test)[:, 1]  # second-class probability
    fpr, tpr, _ = roc_curve(y_test, y_probs)
    roc_auc = auc(fpr, tpr)
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")

# Dashed diagonal as the chance-level reference line.
roc_ax.plot([0, 1], [0, 1], "k--")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve Comparison")
roc_ax.legend()
roc_ax.grid(True)
plt.show()

# Confusion Matrix Visualization

In [None]:
# Use the Gradient Boosting model that the comparison loop already fitted and
# scored. Training a second, unseeded copy here would cost another fit and
# could yield a matrix (and a saved model) that differs from the metrics
# printed for "Gradient Boosting" above.
best_model = models["Gradient Boosting"]
ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test, cmap="Blues")
plt.title("Confusion Matrix - Gradient Boosting")
plt.show()

# Save Model

In [None]:
# Persist the fitted classifier. The model was trained on standardized
# features, so the fitted scaler is written next to it: any code that loads
# the model must apply the same transform before calling predict.
joblib.dump(best_model, "gradient_boosting_fraud_model.pkl")
joblib.dump(scaler, "gradient_boosting_fraud_scaler.pkl")

# Class Probability Distribution

In [None]:
# Distribution of the second-class probabilities predicted on the test split.
y_probs = best_model.predict_proba(X_test)[:, 1]
prob_fig, prob_ax = plt.subplots(figsize=(8, 4))
sns.histplot(y_probs, bins=50, kde=True, ax=prob_ax)
prob_ax.set_title("Predicted Fraud Probabilities (Gradient Boosting)")
prob_ax.set_xlabel("Probability of Fraud")
plt.show()

# Model Comparison Chart (manually entered example scores)

In [None]:
# Hand-entered example scores, one row per model. These numbers are typed in
# here; they are not computed from the models fitted in this notebook.
model_names = pd.Index(
    [
        "Logistic Regression", "Decision Tree", "KNN",
        "Random Forest", "Gradient Boosting", "AdaBoost",
    ],
    name="Model",
)
model_results = pd.DataFrame(
    {
        "Precision": [0.72, 0.75, 0.68, 0.84, 0.86, 0.81],
        "Recall": [0.65, 0.60, 0.62, 0.79, 0.83, 0.78],
        "F1-Score": [0.68, 0.66, 0.65, 0.81, 0.84, 0.79],
        "ROC AUC": [0.85, 0.77, 0.75, 0.92, 0.94, 0.91],
    },
    index=model_names,
)

# Grouped bar chart: one group per model, one bar per metric.
comparison_ax = model_results.plot.bar(figsize=(12, 6))
comparison_ax.set_title("Model Performance Comparison")
comparison_ax.set_ylabel("Score")
comparison_ax.set_ylim(0, 1)
plt.xticks(rotation=45)
comparison_ax.legend(loc='lower right')
plt.tight_layout()
plt.show()
model_results

# SHAP Interpretability (optional, install SHAP if needed)

In [None]:
# !pip install shap
import shap

# Explain the chosen model on a small slice of the test split.
explainer = shap.Explainer(best_model)

# X_test is a bare NumPy array after scaling, so the sample is wrapped in a
# DataFrame carrying the encoded column names; otherwise the beeswarm plot
# labels the rows with generic feature indices instead of real names.
shap_sample = pd.DataFrame(X_test[:100], columns=X.columns)  # sample for speed
shap_values = explainer(shap_sample)
shap.plots.beeswarm(shap_values)