# Load libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
import joblib

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. deprecations) — confirm that is intended.
warnings.filterwarnings('ignore')

# Load sample dataset

In [None]:
# Read the raw sample CSV into the working DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../data/raw/sample.csv")
display(df.head(n=5))

In [None]:
# List every column name in the loaded DataFrame.
print(df.columns.tolist())

# Data cleaning and wrangling

In [None]:
# Data cleaning & wrangling: start with a per-column overview of the DataFrame.
df.info()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Remove fully duplicated rows, then preview the first rows of the result.
df = df.drop_duplicates()
df.head()

In [None]:
# (rows, columns) after de-duplication.
df.shape

In [None]:
# Number of distinct values in the 'type' (transaction type) column.
df['type'].nunique()

In [None]:
# Number of distinct values in the 'nameOrig' column.
df['nameOrig'].nunique()

In [None]:
# Number of distinct values in the 'nameDest' column.
df['nameDest'].nunique()

In [None]:
# Number of distinct values in the 'newbalanceOrig' column.
df["newbalanceOrig"].nunique()

# Exploratory Data Analysis 

In [None]:
# The heading was printed with nothing beneath it; show the frame's structure right under it.
print("Structure of the data:\n")
df.info()

In [None]:
# Heading for the summary statistics shown in the following cell.
print("\nSummary statistics:\n")

In [None]:
# Descriptive statistics for the DataFrame's numeric columns.
df.describe()

In [None]:
# Numeric feature columns to compare against the fraud label
numeric_cols = [
    'step',
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'isFlaggedFraud',
]

# Draw one boxplot per numeric feature, split by the isFraud label
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(data=df, x='isFraud', y=col, ax=ax)
    ax.set_title(f"Boxplot of {col} by isFraud")
    ax.set_xlabel("Fraud (0 = No, 1 = Yes)")
    ax.set_ylabel(col)
    fig.tight_layout()
    plt.show()


In [None]:
# Per-class averages of every numeric feature, transposed so features are rows
print("Mean values by fraud status:")
class_means = df.groupby('isFraud')[numeric_cols].mean()
print(class_means.T)

In [None]:
# Categorical feature vs isFraud: transaction counts per 'type', coloured by fraud label
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='type', hue='isFraud', ax=ax)
ax.set_title("Transaction Type by Fraud Status")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Counts of non-fraud (0) and fraud (1) rows per transaction type, plus the fraud share in percent
type_ct = pd.crosstab(df['type'], df['isFraud'])
rows_per_type = type_ct[0] + type_ct[1]
type_ct['fraud_rate_%'] = (type_ct[1] / rows_per_type * 100).round(2)
print("\nFraud rate per transaction type:")
print(type_ct)

In [None]:
# Class balance of the target label
sns.countplot(data=df, x='isFraud')
plt.title('Fraud vs Non-Fraud Transactions')
plt.show()

# Pairwise correlations, restricted to numeric columns
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Feature Correlation Matrix")
plt.show()

# Chi-Square Test for 'type' vs 'isFraud'

In [None]:
from scipy.stats import chi2_contingency

# Observed row counts for every (transaction type, fraud label) combination
contingency_table = pd.crosstab(index=df['type'], columns=df['isFraud'])

print("Contingency Table:", contingency_table, sep="\n")

In [None]:
# Chi-square test of independence on the type-vs-fraud contingency table
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(
    f"\nChi-Square Statistic: {chi2:.2f}",
    f"Degrees of Freedom: {dof}",
    f"P-Value: {p:.4f}",
    sep="\n",
)

In [None]:
# Compare the p-value against a 5% significance level and report the verdict
alpha = 0.05
print(
    " Result: Statistically significant relationship between transaction type and fraud."
    if p < alpha
    else " Result: No significant relationship between transaction type and fraud."
)

# Chi-Square Test for isFlaggedFraud vs isFraud

In [None]:
# Observed row counts for every (isFlaggedFraud, isFraud) combination
flagged_table = pd.crosstab(index=df['isFlaggedFraud'], columns=df['isFraud'])

print("\nContingency Table for isFlaggedFraud vs isFraud:", flagged_table, sep="\n")

In [None]:
# Chi-square test of independence on the flagged-vs-fraud contingency table
chi2, p, dof, expected = chi2_contingency(flagged_table)

print(
    f"\nChi-Square Statistic: {chi2:.2f}",
    f"Degrees of Freedom: {dof}",
    f"P-Value: {p:.4f}",
    sep="\n",
)


In [None]:
# Compare the p-value against a 5% significance level and report the verdict
alpha = 0.05
print(
    " Result: Statistically significant relationship between isFlaggedFraud and isFraud."
    if p < alpha
    else " Result: No significant relationship between isFlaggedFraud and isFraud."
)


In [None]:
# Target vector: the 'isFraud' label column.
y = df['isFraud']

In [None]:
# Relative frequency of each target class.
print(y.value_counts(normalize=True))

In [None]:
# Absolute row count of each target class.
print(y.value_counts())

In [None]:
# Class-weighted baseline estimators.
# NOTE(review): `lm` and `rf` are not referenced again in the visible source; the `models`
# dict in the training section builds its own instances — confirm these are still needed.
lm = LogisticRegression(class_weight='balanced')
rf = RandomForestClassifier(class_weight='balanced')

In [None]:
from imblearn.over_sampling import SMOTE

# Feature Engineering & Preprocessing

In [None]:
# Re-list the available columns before choosing the modelling features.
print(df.columns.tolist())

In [None]:
# Restrict the frame to the modelling columns; every other column is dropped
df = df.loc[:, [
    'step',
    'type',
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'isFlaggedFraud',
    'isFraud',
]]

# Features are every column except the label; the label itself is the target
X = df.drop(columns='isFraud')
y = df['isFraud']

In [None]:
# Categorical and numerical columns are separated after the train-test split below.

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Hold out 30% of the rows for testing. The notebook treats the classes as imbalanced
# (class_weight='balanced' is used for several models), so stratify on the label to keep
# the fraud/non-fraud ratio the same in both partitions; a plain random split can leave
# the test fold with too few positive rows for stable metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Partition each split by dtype: object columns go down the categorical path,
# numeric columns down the scaling path
X_train_cat, X_test_cat = (
    part.select_dtypes(include="object") for part in (X_train, X_test)
)
X_train_num, X_test_num = (
    part.select_dtypes(include="number") for part in (X_train, X_test)
)

In [None]:
# One-hot encode the categorical columns. The category list is fixed up front from the
# 'type' values present in df, and the first listed category is dropped as the reference level.
unique_values = [df['type'].unique().tolist()]
ohe = OneHotEncoder(categories=unique_values, sparse_output=False, drop='first')

# Fit on the training rows only, then apply the same encoding to the test rows
X_train_cat_np = ohe.fit_transform(X_train_cat)
X_test_cat_np = ohe.transform(X_test_cat)

# Wrap the encoded arrays back into DataFrames that keep the original row index
encoded_columns = ohe.get_feature_names_out()
X_train_cat_df = pd.DataFrame(X_train_cat_np, columns=encoded_columns, index=X_train_cat.index)
X_test_cat_df = pd.DataFrame(X_test_cat_np, columns=encoded_columns, index=X_test_cat.index)

In [None]:
# Standardise the numeric features: statistics are learned from the training rows only
# and then re-used unchanged on the test rows
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

# Wrap the scaled arrays back into DataFrames with the original column names and row index
X_train_num_df = pd.DataFrame(
    X_train_num_scaled,
    columns=X_train_num.columns,
    index=X_train_num.index,
)
X_test_num_df = pd.DataFrame(
    X_test_num_scaled,
    columns=X_test_num.columns,
    index=X_test_num.index,
)

In [None]:
# Assemble the final design matrices: scaled numeric columns followed by the one-hot columns
X_train_final, X_test_final = (
    pd.concat(parts, axis="columns")
    for parts in (
        [X_train_num_df, X_train_cat_df],
        [X_test_num_df, X_test_cat_df],
    )
)

# Model Training & Evaluation

In [None]:
# Candidate classifiers. The estimators that use randomness are seeded so repeated runs
# of the notebook give the same scores.
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
}

# Fit and score every model on the engineered matrices (scaled numerics + one-hot 'type').
# The raw X_train / X_test frames still contain the string 'type' column and unscaled
# values, so they must not be passed to the estimators.
for name, model in models.items():
    print(f"Training the model {name}...")
    model.fit(X_train_final, y_train)
    y_pred = model.predict(X_test_final)
    # Probability of the positive (fraud) class, needed for ROC AUC
    y_score = model.predict_proba(X_test_final)[:, 1]
    print(f"\n{name}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_score))

# Visual ROC Curve

In [None]:
# Overlay the ROC curve of every fitted model on a single figure
plt.figure(figsize=(10, 6))

for name, model in models.items():
    # Score on the engineered test matrix; the raw X_test frame still holds the string
    # 'type' column and unscaled values, which the models were not built for
    y_probs = model.predict_proba(X_test_final)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_probs)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")

# Diagonal reference line: a classifier with no discriminative power
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.grid(True)
plt.show()

# Confusion Matrix Visualization

In [None]:
# Refit gradient boosting as the chosen model and plot its confusion matrix.
# Train and evaluate on the engineered matrices: the raw X_train / X_test frames still
# contain the string 'type' column, which the estimator cannot consume.
best_model = GradientBoostingClassifier(random_state=42)
best_model.fit(X_train_final, y_train)
ConfusionMatrixDisplay.from_estimator(best_model, X_test_final, y_test, cmap="Blues")
plt.title("Confusion Matrix - Gradient Boosting")
plt.show()

# Save Model

In [None]:
# Persist the fitted estimator to a file in the current working directory.
# NOTE(review): only the model is saved; the fitted scaler and one-hot encoder are not —
# confirm whether downstream inference needs them persisted as well.
joblib.dump(best_model, "gradient_boosting_fraud_model.pkl")

# Class Probability Distribution

In [None]:
# Histogram of the predicted fraud probabilities on the held-out rows.
# Predict on the engineered test matrix, which has the same feature layout as the
# training matrix; the raw X_test frame still contains the string 'type' column.
y_probs = best_model.predict_proba(X_test_final)[:, 1]
plt.figure(figsize=(8, 4))
sns.histplot(y_probs, bins=50, kde=True)
plt.title("Predicted Fraud Probabilities (Gradient Boosting)")
plt.xlabel("Probability of Fraud")
plt.show()

# Model Comparison Chart (manually entered example scores)

In [None]:
# Illustrative, manually entered scores (one row per model) — these numbers are typed in,
# not computed from the models fitted in this notebook
metric_names = ["Precision", "Recall", "F1-Score", "ROC AUC"]
example_scores = {
    "Logistic Regression": [0.72, 0.65, 0.68, 0.85],
    "Decision Tree": [0.75, 0.60, 0.66, 0.77],
    "KNN": [0.68, 0.62, 0.65, 0.75],
    "Random Forest": [0.84, 0.79, 0.81, 0.92],
    "Gradient Boosting": [0.86, 0.83, 0.84, 0.94],
    "AdaBoost": [0.81, 0.78, 0.79, 0.91],
}
model_results = pd.DataFrame.from_dict(example_scores, orient="index", columns=metric_names)
model_results.index.name = "Model"

# Grouped bar chart: one group per model, one bar per metric
model_results.plot.bar(figsize=(12, 6))
plt.title("Model Performance Comparison")
plt.ylabel("Score")
plt.ylim(0, 1)
plt.xticks(rotation=45)
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()
model_results

# SHAP Interpretability (optional, install SHAP if needed)

In [None]:
# !pip install shap
import shap

# Explain the fitted model on a small slice of the held-out rows (kept small for speed).
# The slice comes from the engineered test matrix so its columns match what the model was
# trained on; the raw X_test frame still contains the string 'type' column.
explainer = shap.Explainer(best_model)
shap_values = explainer(X_test_final.iloc[:100])
shap.plots.beeswarm(shap_values)

# Distribution of Transaction Amount by Fraud Status

In [None]:
# Density of the transaction amount per fraud class. `df_sample` is never defined in this
# notebook, so plot from the working DataFrame `df`. common_norm=False normalises each
# class separately so the smaller class stays visible.
sns.kdeplot(data=df, x='amount', hue='isFraud', fill=True, common_norm=False)
plt.title("Distribution of Transaction Amount by Fraud Status")
plt.xlabel("Transaction Amount")
plt.ylabel("Density")
plt.show()

# Boxplot of Transaction Amount by Fraud

In [None]:
# Boxplot of the transaction amount per fraud class. `df_sample` is never defined in this
# notebook, so plot from the working DataFrame `df`.
sns.boxplot(data=df, x='isFraud', y='amount')
plt.title("Boxplot of Transaction Amount by Fraud")
plt.xlabel("Fraud")
plt.ylabel("Amount")
plt.show()

# Correlation Matrix Zoomed for Top Features

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
# `df_sample` is never defined in this notebook, so use the working DataFrame `df`, and
# restrict to numeric columns because `df` still contains the string 'type' column.
numeric_features = df.select_dtypes(include=['number'])
corr = numeric_features.corr()['isFraud'].sort_values(ascending=False)

# Exclude the target by name (rather than by position) and keep the five highest-ranked features
top_features = corr.drop('isFraud').head(5).index
sns.heatmap(numeric_features[top_features].corr(), annot=True, cmap='coolwarm')
plt.title("Top Correlated Features with isFraud")
plt.show()

# Fraud by Transaction Type (if available)

In [None]:
# Fraud vs non-fraud counts per transaction type, drawn only when the column exists.
# `df_sample` is never defined in this notebook, so use the working DataFrame `df`.
if 'type' in df.columns:
    sns.countplot(data=df, x='type', hue='isFraud')
    plt.title("Fraud Cases by Transaction Type")
    plt.xticks(rotation=45)
    plt.show()

# Pairplot of Key Transaction Features (only if small dataset)

In [None]:
# Pairwise scatter/KDE grid of the balance and amount columns, coloured by fraud label.
# `df_sample` is never defined in this notebook, so use the working DataFrame `df`.
# NOTE(review): a pairplot draws every row in every panel — consider down-sampling `df`
# first if the dataset is large.
selected_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFraud']
sns.pairplot(df[selected_cols], hue='isFraud', diag_kind='kde')
plt.suptitle("Pairplot of Key Transaction Features", y=1.02)
plt.show()