In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report

In [16]:
# Load your dataset
df = pd.read_csv('card_transdata.csv')
df = df.head(50000)
# Display class distribution
print("Class Distribution:")
print(df['fraud'].value_counts())

Class Distribution:
0.0    45683
1.0     4317
Name: fraud, dtype: int64


In [17]:
# Separate features and labels in the original dataset
X = df.drop('fraud', axis=1)
y = df['fraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Classifier for the original data
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

In [18]:
# Evaluate the model on the original data
print('Classification Report (Original Data):')
print(classification_report(y_test, y_pred))

Classification Report (Original Data):
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      9154
         1.0       1.00      1.00      1.00       846

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000



In [19]:
# Create a RandomUnderSampler for the minority class only
rus = RandomUnderSampler(sampling_strategy={1: 874}, random_state=42)

In [20]:
# Fit and transform the data
X_resampled, y_resampled = rus.fit_resample(X, y)

In [21]:
# Combine the features and labels into a new DataFrame
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='fraud_label')], axis=1)

In [22]:
# Display the new class distribution
print("\nResampled Class Distribution:")
print(df_resampled['fraud_label'].value_counts())


Resampled Class Distribution:
0.0    45683
1.0      874
Name: fraud_label, dtype: int64


In [23]:
# Create an instance of the Random Forest model for the resampled data
rf_model_resampled = RandomForestClassifier(random_state=42)
rf_model_resampled.fit(X_resampled, y_resampled)

RandomForestClassifier(random_state=42)

In [24]:
# Calculate evaluation metrics on the original data
y_pred_original = rf_model_resampled.predict(X)
accuracy_original = accuracy_score(y, y_pred_original)
precision_original = precision_score(y, y_pred_original)
recall_original = recall_score(y, y_pred_original)
f1_original = f1_score(y, y_pred_original)

print("\nOriginal Data Metrics:")
print(f"Accuracy: {accuracy_original:.4f}")
print(f"Precision: {precision_original:.4f}")
print(f"Recall: {recall_original:.4f}")
print(f"F1 Score: {f1_original:.4f}")

# Calculate evaluation metrics on the resampled data
y_pred_resampled = rf_model_resampled.predict(X_resampled)
accuracy_resampled = accuracy_score(y_resampled, y_pred_resampled)
precision_resampled = precision_score(y_resampled, y_pred_resampled)
recall_resampled = recall_score(y_resampled, y_pred_resampled)
f1_resampled = f1_score(y_resampled, y_pred_resampled)

print("\nResampled Data Metrics:")
print(f"Accuracy: {accuracy_resampled:.4f}")
print(f"Precision: {precision_resampled:.4f}")
print(f"Recall: {recall_resampled:.4f}")
print(f"F1 Score: {f1_resampled:.4f}")


Original Data Metrics:
Accuracy: 0.9989
Precision: 1.0000
Recall: 0.9875
F1 Score: 0.9937

Resampled Data Metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000


In [25]:
# Confusion Matrix
conf_matrix_original = confusion_matrix(y, y_pred_original)
print("\nConfusion Matrix (Original Data):")
print(conf_matrix_original)


Confusion Matrix (Original Data):
[[45683     0]
 [   54  4263]]


In [26]:
conf_matrix_resampled = confusion_matrix(y_resampled, y_pred_resampled)
print("\nConfusion Matrix (Resampled Data):")
print(conf_matrix_resampled)



Confusion Matrix (Resampled Data):
[[45683     0]
 [    0   874]]
