In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix,
    precision_recall_curve, average_precision_score,
    accuracy_score
)
from imblearn.over_sampling import SMOTE
import numpy as np

# Load the cleaned transactions and separate the feature matrix from the
# binary target column 'Class'. `df` itself is left untouched.
df = pd.read_csv("../data/creditcard_cleaned.csv")
y = df['Class']
X = df.drop(columns=['Class'])

In [6]:
# Hold out 20% of the rows for testing, stratified on the label so both
# splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Oversample the training split only; the test split is never resampled.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Report the training class balance before and after resampling.
for caption, labels in (
    ("Before SMOTE class counts (train):", y_train),
    ("After SMOTE class counts (train_res):", pd.Series(y_train_sm)),
):
    print(caption)
    print(labels.value_counts())

Before SMOTE class counts (train):
Class
0    226602
1       378
Name: count, dtype: int64
After SMOTE class counts (train_res):
Class
0    226602
1    226602
Name: count, dtype: int64


In [7]:
# Random Forest fitted on the SMOTE-resampled training data; no class
# weights are passed, so the estimator default applies.
rf_sm = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train_sm, y_train_sm
)

# Score the (non-resampled) test split. Column 1 of predict_proba is the
# probability assigned to class 1.
y_proba = rf_sm.predict_proba(X_test)[:, 1]
y_pred = rf_sm.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Average Precision Score:", average_precision_score(y_test, y_proba))


Classification Report:
               precision    recall  f1-score   support

           0     0.9996    0.9999    0.9997     56651
           1     0.9231    0.7579    0.8324        95

    accuracy                         0.9995     56746
   macro avg     0.9613    0.8789    0.9161     56746
weighted avg     0.9995    0.9995    0.9995     56746

Confusion Matrix:
 [[56645     6]
 [   23    72]]
Accuracy: 0.9994889507630493
Average Precision Score: 0.8113838895960894


In [8]:
# Precision/recall trade-off across decision thresholds, computed from the
# class-1 probabilities on the held-out test split.
precisions, recalls, thresholds = precision_recall_curve(
    y_test,
    y_proba,
)

In [9]:
# Rank the forest's global feature importances, labelled with the original
# column names, from most to least important.
feature_importances = (
    pd.Series(rf_sm.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

print("\nTop 10 global feature importances:")
print(feature_importances.head(10))


Top 10 global feature importances:
V14    0.227986
V10    0.113291
V17    0.104117
V4     0.088475
V12    0.078795
V11    0.071133
V3     0.060606
V16    0.046264
V2     0.025569
V7     0.024112
dtype: float64


In [10]:
# Prepare for audit notes: give the test features and labels a fresh 0..n-1
# index, then wrap the prediction arrays in Series that share that index so
# everything can be looked up by the same row number.
X_test_reset, y_test_reset = (
    part.reset_index(drop=True) for part in (X_test, y_test)
)
y_pred_series, y_proba_series = (
    pd.Series(values, index=X_test_reset.index) for values in (y_pred, y_proba)
)

# Names of the five globally most important features; the per-row helper
# reports the row's values for exactly these columns.
global_top_feats = list(feature_importances.head(5).index)

def top_feature_values_for_row(row, feats=None):
    """Format a row's values for a set of features as ``"name=value, ..."``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by feature name (e.g. a ``pd.Series`` row or a
        dict) whose values accept the ``.4f`` format spec.
    feats : iterable of str, optional
        Feature names to report, in output order. When omitted, the
        notebook-level ``global_top_feats`` list is used, which keeps
        existing one-argument calls working unchanged.

    Returns
    -------
    str
        ``"feat=val"`` pairs (values with four decimals) joined by ``", "``;
        an empty string when there are no features to report.
    """
    # Resolve the default at call time so the helper follows whatever
    # `global_top_feats` currently is, but can also be used (and tested)
    # without that global when `feats` is given explicitly.
    if feats is None:
        feats = global_top_feats
    return ", ".join(f"{feat}={row[feat]:.4f}" for feat in feats)

# Outcome label for each (prediction, actual) pair. Any pair not listed
# here is labelled "False Negative".
status_by_outcome = {
    (1, 1): "True Positive",
    (1, 0): "False Positive",
    (0, 0): "True Negative",
}

# Build one audit record per test row. The four sequences share the same
# 0..n-1 index, so walking them in lockstep matches a lookup by label.
audit_rows = []
for i, pred, prob, actual in zip(
    X_test_reset.index, y_pred_series, y_proba_series, y_test_reset
):
    pred, prob, actual = int(pred), float(prob), int(actual)
    row = X_test_reset.loc[i]
    top_vals = top_feature_values_for_row(row)
    status = status_by_outcome.get((pred, actual), "False Negative")
    # The note text carries only the prediction, its probability and the
    # top feature values; the actual label and status are stored in their
    # own columns rather than in the note.
    note = (
        f"Prediction: {'FRAUD' if pred==1 else 'NOT FRAUD'} "
        f"(prob={prob:.3f}). "
        f"Top features (global): {top_vals}."
    )
    audit_rows.append(
        {
            "idx": i,
            "Prediction": pred,
            "Probability": prob,
            "Actual": actual,
            "Status": status,
            "Audit_Note": note,
        }
    )

# Most confident fraud scores first.
audit_df = (
    pd.DataFrame(audit_rows)
    .sort_values(by="Probability", ascending=False)
    .reset_index(drop=True)
)

# Show the ten highest-probability rows that were predicted as fraud.
print("\nTop flagged (predicted fraud) audit notes (top 10 by probability):")
print(
    audit_df[audit_df["Prediction"] == 1]
    .head(10)[["idx", "Probability", "Audit_Note"]]
    .to_string(index=False)
)

# Persist the full audit table (every test row, not just the flagged ones).
audit_df.to_csv("../data/audit_notes_from_rf.csv", index=False)


Top flagged (predicted fraud) audit notes (top 10 by probability):
  idx  Probability                                                                                                               Audit_Note
24442          1.0    Prediction: FRAUD (prob=1.000). Top features (global): V14=-9.2661, V10=-5.6536, V17=-5.7091, V4=5.0370, V12=-5.8837.
30966          1.0 Prediction: FRAUD (prob=1.000). Top features (global): V14=-6.5526, V10=-15.1238, V17=-20.1645, V4=6.2074, V12=-14.1750.
30110          1.0   Prediction: FRAUD (prob=1.000). Top features (global): V14=-12.0438, V10=-5.1728, V17=-1.8799, V4=7.3165, V12=-5.6404.
52997          1.0 Prediction: FRAUD (prob=1.000). Top features (global): V14=-9.8872, V10=-12.6959, V17=-21.7102, V4=8.6986, V12=-11.9609.
20871          1.0   Prediction: FRAUD (prob=1.000). Top features (global): V14=-7.8409, V10=-4.1389, V17=-9.9318, V4=5.9282, V12=-11.1240.
36033          1.0    Prediction: FRAUD (prob=1.000). Top features (global): V14=-6.5275, V1