In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, precision_recall_curve, classification_report, confusion_matrix

In [2]:
# Load the dataset
df= pd.read_csv('/content/creditcard.csv')

In [3]:
# Data Analysis
# Quick look at the data structure
print(df.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [5]:
# Preprocessing
# Separate features and target variable
X = df.drop(['Class'], axis=1)
y = df['Class']

In [7]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Handling imbalanced data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

In [10]:
# Model Training
# Train a Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train_res, y_train_res)

In [11]:
# Hyperparameter Tuning
# Random Forest hyperparameter grid
rf_param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 6, 12, 24],
    'min_samples_split': [2, 5, 10]
}

In [12]:
# Grid search for Random Forest
rf_grid_search = GridSearchCV(rf_clf, rf_param_grid, cv=3, scoring='recall')
rf_grid_search.fit(X_train_res, y_train_res)

In [14]:
# Evaluate Random Forest on the test set
rf_best_clf = rf_grid_search.best_estimator_
rf_predictions = rf_best_clf.predict(X_test_scaled)
print("Random Forest Classifier report: \n", classification_report(y_test, rf_predictions))

Random Forest Classifier report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     15398
           1       0.96      0.79      0.86        28

    accuracy                           1.00     15426
   macro avg       0.98      0.89      0.93     15426
weighted avg       1.00      1.00      1.00     15426



In [20]:
# Generate the confusion matrix for the Random Forest Classifier
rf_predictions = rf_best_clf.predict(X_test_scaled)
conf_matrix = confusion_matrix(y_test, rf_predictions)
print("Confusion Matrix for Random Forest Classifier: \n", conf_matrix)

Confusion Matrix for Random Forest Classifier: 
 [[15397     1]
 [    6    22]]
