In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

from imblearn.over_sampling import SMOTE

import pickle
import warnings
warnings.filterwarnings("ignore")


In [6]:
# Load the credit-card transactions CSV and preview its first five rows.
# NOTE(review): the absolute "/content/..." path only resolves where the file has
# been placed at exactly that location — confirm before running elsewhere.
csv_path = "/content/creditcard.csv"
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [8]:
# Summarise column names, non-null counts and dtypes to spot missing values early.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71396 entries, 0 to 71395
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    71396 non-null  int64  
 1   V1      71396 non-null  float64
 2   V2      71396 non-null  float64
 3   V3      71396 non-null  float64
 4   V4      71396 non-null  float64
 5   V5      71396 non-null  float64
 6   V6      71396 non-null  float64
 7   V7      71396 non-null  float64
 8   V8      71396 non-null  float64
 9   V9      71396 non-null  float64
 10  V10     71396 non-null  float64
 11  V11     71396 non-null  float64
 12  V12     71396 non-null  float64
 13  V13     71396 non-null  float64
 14  V14     71396 non-null  float64
 15  V15     71396 non-null  float64
 16  V16     71396 non-null  float64
 17  V17     71396 non-null  float64
 18  V18     71396 non-null  float64
 19  V19     71396 non-null  float64
 20  V20     71396 non-null  float64
 21  V21     71396 non-null  float64
 22

In [9]:
# Relative frequency of each Class value, scaled to percent.
class_share = df['Class'].value_counts(normalize=True)
class_share.mul(100)

Unnamed: 0_level_0,proportion
Class,Unnamed: 1_level_1
0.0,99.752083
1.0,0.247917


In [10]:
# Discard rows whose label is missing, then separate the features from the target.
has_label = df['Class'].notna()
df = df[has_label]

y = df['Class']
X = df.drop('Class', axis=1)

# Sanity check: the target must not contain missing values past this point.
print("NaNs in target after cleaning:", y.isnull().sum())


NaNs in target after cleaning: 0


In [11]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [12]:
# Oversample the minority class using the training partition only;
# X_test / y_test are not passed to the sampler.
smote = SMOTE(k_neighbors=5, random_state=42, sampling_strategy='auto')
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Show the class counts before and after resampling.
print("Before SMOTE:", y_train.value_counts(), sep="\n")
print("\nAfter SMOTE:", pd.Series(y_train_smote).value_counts(), sep="\n")


Before SMOTE:
Class
0.0    56974
1.0      142
Name: count, dtype: int64

After SMOTE:
Class
0.0    56974
1.0    56974
Name: count, dtype: int64


In [13]:
# Base random forest handed to the hyper-parameter search as its estimator.
# The seed is fixed; n_jobs=-1 requests all available cores.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

In [14]:
# Candidate hyper-parameter values the randomized search draws from.
param_dist = dict(
    n_estimators=[50, 100],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
)

In [16]:
# Randomized search over param_dist: 5 sampled candidates, 3-fold CV, ranked by ROC-AUC.
# NOTE(review): the folds are cut from the already SMOTE-resampled training data, so
# synthetic points interpolated from a validation-fold sample can end up in the
# training folds and the CV scores may be optimistic. Resampling inside each fold
# (e.g. an imblearn Pipeline) would avoid this — confirm whether it matters here.
search_options = dict(
    n_iter=5,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    verbose=2,
    n_jobs=-1,
)
rf_search = RandomizedSearchCV(rf, param_dist, **search_options)

rf_search.fit(X_train_smote, y_train_smote)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [17]:
# Keep the best estimator found by the search and display its hyper-parameters.
best_rf = rf_search.best_estimator_
rf_search.best_params_

{'n_estimators': 50,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': False}

In [18]:
# No explicit fit is needed here: RandomizedSearchCV was created with its default
# refit=True, so best_estimator_ has already been retrained with the winning
# hyper-parameters on the full resampled training set (X_train_smote, y_train_smote).
# Fitting it again on the same data with the same random_state would only repeat
# that work, so the cell just displays the fitted estimator.
best_rf

In [19]:
# Predictions for the held-out test rows: hard class labels, plus the probability
# in column 1 of predict_proba (the second class) for threshold-free metrics.
y_pred = best_rf.predict(X_test)
test_proba = best_rf.predict_proba(X_test)
y_prob = test_proba[:, 1]

In [20]:
# Headline test-set metrics: the first four are computed from the predicted labels,
# ROC-AUC from the predicted probabilities.
label_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for caption, metric in label_metrics:
    print(caption, metric(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

Accuracy: 0.9996498354226486
Precision: 0.9411764705882353
Recall: 0.9142857142857143
F1 Score: 0.927536231884058
ROC-AUC: 0.9545001404099971


In [21]:
# Confusion matrix of true vs. predicted labels on the test set.
conf_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_mat)

Confusion Matrix:
 [[14242     2]
 [    3    32]]


In [22]:
# Per-class precision / recall / F1 summary for the test set.
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     14244
         1.0       0.94      0.91      0.93        35

    accuracy                           1.00     14279
   macro avg       0.97      0.96      0.96     14279
weighted avg       1.00      1.00      1.00     14279



In [23]:
# Pair every feature column with the forest's importance score, then rank the
# table from most to least important and show the top ten.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_rf.feature_importances_,
})
feature_importance = importance_table.sort_values('Importance', ascending=False)

feature_importance.head(10)

Unnamed: 0,Feature,Importance
14,V14,0.307607
3,V3,0.118279
10,V10,0.103907
17,V17,0.099536
12,V12,0.076487
4,V4,0.064551
16,V16,0.055059
2,V2,0.033049
9,V9,0.021208
7,V7,0.019603


In [24]:
# Serialise the fitted forest to disk so it can be reloaded without retraining.
with open("random_forest_fraud_model.pkl", "wb") as model_fh:
    pickle.Pickler(model_fh).dump(best_rf)

print("Model saved as random_forest_fraud_model.pkl")

Model saved as random_forest_fraud_model.pkl


In [25]:
# Round-trip check: read the pickled model back and predict on the first five test rows.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
with open("random_forest_fraud_model.pkl", "rb") as model_fh:
    loaded_model = pickle.Unpickler(model_fh).load()

loaded_model.predict(X_test.head(5))

array([0., 0., 0., 0., 0.])