In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import VarianceThreshold


In [3]:
df2 = pd.read_csv("Datasets.csv")
print(df2.shape)
print(df2.columns)

(10127, 17)
Index(['Unnamed: 0', 'step', 'type', 'branch', 'amount', 'nameOrig',
       'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest',
       'newbalanceDest', 'unusuallogin', 'isFlaggedFraud', 'Acct type',
       'Date of transaction', 'Time of day', 'isFraud'],
      dtype='object')


In [4]:
df2.rename(columns={df2.columns[-1]: "isFraud"}, inplace=True)

df2 = df2.replace([np.inf, -np.inf], np.nan)
df2 = df2.dropna()

In [5]:
X2 = df2.drop("isFraud", axis=1)
y2 = df2["isFraud"]

non_numeric_cols = X2.select_dtypes(include=['object']).columns
X2 = X2.drop(columns=non_numeric_cols)

X2 = X2.astype(float)
y2 = y2[X2.index]  

In [6]:
smote = SMOTE(random_state=42)
X2_resampled, y2_resampled = smote.fit_resample(X2, y2)
print("After SMOTE:", np.bincount(y2_resampled))

scaler = StandardScaler()
X2_scaled = scaler.fit_transform(X2_resampled)

After SMOTE: [10020 10020]




In [7]:

X_train, X_test, y_train, y_test = train_test_split(X2_scaled, y2_resampled, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor


models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, solver='saga'),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Isolation Forest": IsolationForest(contamination=0.001, random_state=42),
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=20, contamination=0.001, novelty=True)
}

results = []

for name, model in models.items():
    print(f"\nTraining {name}...")

    if name in ["Isolation Forest", "Local Outlier Factor"]:
        model.fit(X_train)  # Fits only on X for unsupervised
        y_pred = model.predict(X_test)
        # Converts anomaly labels: -1 → 1 (fraud), 1 → 0 (normal)
        y_pred = [1 if x == -1 else 0 for x in y_pred]
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'MCC': mcc
    })

results_df = pd.DataFrame(results)
print("\nModel Performance:\n")
print(results_df)



Training Logistic Regression...

Training Decision Tree...

Training Random Forest...

Training Isolation Forest...

Training Local Outlier Factor...

Model Performance:

                  Model  Accuracy  Precision       MCC
0   Logistic Regression  0.817116   0.790871  0.636676
1         Decision Tree  0.992515   0.989960  0.985041
2         Random Forest  0.997255   0.994977  0.994521
3      Isolation Forest  0.997255   0.994977  0.994521
4  Local Outlier Factor  0.997255   0.994977  0.994521
