# Load libraries

In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import joblib

import warnings
warnings.filterwarnings('ignore')

# Load dataset

In [2]:
# Read the full raw transactions file into memory and preview the first rows.
raw_csv_path = "../data/raw/AIML_Dataset.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
# Show the (rows, columns) dimensions of the full dataset.
df.shape

(6362620, 11)

In [4]:
# Separate the target column from the predictors on the full dataset.
y = df["isFraud"]
X = df.drop(columns=["isFraud"])

# Draw a reproducible 100,000-row random subsample; the remaining rows are
# discarded (bound to the throwaway name `_`).
# NOTE(review): the draw is not stratified on isFraud, so the sample's fraud
# rate may drift from the full dataset's — consider passing stratify=y.
X_sample, _, y_sample, _ = train_test_split(
    X,
    y,
    train_size=100_000,
    random_state=42,
)

# Create a sample

In [5]:
# Re-attach the target column to the sampled features (it ends up as the last
# column) and renumber the rows from zero.
df_sample = (
    pd.concat([X_sample, y_sample], axis=1)
    .reset_index(drop=True)
)
df_sample.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud,isFraud
0,276,CASH_OUT,82100.66,C1081875690,0.0,0.0,C1946167713,333250.12,415350.78,0,0
1,238,CASH_OUT,354035.48,C841401063,877.0,0.0,C1197061790,55518.64,409554.12,0,0
2,43,PAYMENT,9536.01,C2095821496,0.0,0.0,M1282202824,0.0,0.0,0,0
3,187,CASH_OUT,234984.91,C934116951,213295.0,0.0,C41074757,333675.55,568660.46,0,0
4,290,PAYMENT,3541.37,C2120446326,15285.0,11743.63,M1871624679,0.0,0.0,0,0


In [6]:
# Show the (rows, columns) dimensions of the assembled sample.
df_sample.shape

(100000, 11)

In [7]:
# Persist the sample to disk without the row index.
# NOTE(review): this writes a derived file into the raw-data directory — confirm that is intended.
df_sample.to_csv("../data/raw/sample.csv", index=False)

In [9]:
# Drop the references to the full-size frames and the intermediate sample
# pieces; the cells that follow work from df_sample.
del X, y, df, X_sample, y_sample

# Data cleaning and wrangling

In [None]:
# Step 3: Data Cleaning & Wrangling
# Print the column dtypes, non-null counts and memory usage of the sample.
df_sample.info()

In [None]:
# Count the missing values in each column.
df_sample.isnull().sum()

In [None]:
# Remove rows that are exact duplicates across all columns, then preview.
# The surviving rows keep their existing index labels (the index is not reset).
df_sample = df_sample.drop_duplicates()
df_sample.head()

In [None]:
# Show the (rows, columns) dimensions after de-duplication.
df_sample.shape

# Exploratory Data Analysis (EDA)

In [None]:
# Class balance: number of transactions per isFraud value.
fig_counts, ax_counts = plt.subplots()
sns.countplot(data=df_sample, x='isFraud', ax=ax_counts)
ax_counts.set_title('Fraud vs Non-Fraud Transactions')
plt.show()

# Only use numeric columns for correlation matrix
numeric_df_sample = df_sample.select_dtypes(include=['number'])

# Annotated heatmap of the pairwise correlations between the numeric columns.
fig_corr, ax_corr = plt.subplots(figsize=(12, 8))
sns.heatmap(
    numeric_df_sample.corr(),
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    ax=ax_corr,
)
ax_corr.set_title("Feature Correlation Matrix")
plt.show()

# Feature Engineering & Preprocessing

In [None]:
# Target: the isFraud column only. (Previously the whole frame was assigned to
# y, which breaks stratified splitting and model fitting.)
y = df_sample['isFraud']

# nameOrig / nameDest hold string values that look like account identifiers
# (e.g. "C1231006815"). One-hot encoding them would create roughly one column
# per distinct value, so they are dropped together with the target.
ID_COLUMNS = ['nameOrig', 'nameDest']
X = df_sample.drop(columns=['isFraud'] + ID_COLUMNS)

# Convert the remaining categorical columns to numeric indicator columns;
# drop_first=True omits one level per feature to avoid a redundant column.
X = pd.get_dummies(X, drop_first=True)


In [None]:
# Scale numeric features
# Learn per-column mean / standard deviation from X, then standardise X with them.
# NOTE(review): the scaler is fitted on every row of X; if X is later divided
# into train and test parts, test-set statistics will have influenced the
# scaling — consider fitting on the training rows only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)



In [None]:
# Train-test split
# Split the *unscaled* feature matrix (70 % train / 30 % test, stratified on
# the target, fixed seed) so that no statistic from the test rows can leak
# into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit the scaler on the training rows only, then apply that same
# transformation to both partitions. `scaler` is rebound here so any later use
# of it refers to the train-only fit.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Model Training & Evaluation

In [None]:
# Seed shared by every estimator that uses randomness, so that a fresh
# Restart & Run All reproduces the same metrics.
SEED = 42

# Candidate classifiers, keyed by the label printed in the report below.
# class_weight='balanced' is kept where the original set it.
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=SEED),
    "AdaBoost": AdaBoostClassifier(random_state=SEED)
}

# Fit each model on the training split and report its test-split performance.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Probability of the positive class (second column), used for ROC AUC.
    y_score = model.predict_proba(X_test)[:, 1]
    print(f"\n{name}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_score))