In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from joblib import dump

# Load the dataset
df = pd.read_csv("transaction_data.csv")

# Feature and label split
X = df.drop(columns=['transaction_id', 'is_fraud'])
y = df['is_fraud']

# Oversample the minority class using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled)

# Define a Random Forest Classifier with GridSearchCV
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42, class_weight='balanced'))
])

param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"AUC: {roc_auc_score(y_test, y_pred_proba)}")

# Save the best model
dump(best_model, "best_fraud_detection_model_improved.sav")
print("Improved model saved as 'best_fraud_detection_model_improved.sav'")


Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.86      0.90      5873
           1       0.87      0.94      0.90      5872

    accuracy                           0.90     11745
   macro avg       0.90      0.90      0.90     11745
weighted avg       0.90      0.90      0.90     11745

AUC: 0.9637886467002972
Improved model saved as 'best_fraud_detection_model_improved.sav'
