In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

In [2]:
# Read the credit card dataset from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="creditcard.csv")

In [3]:
# The 'Class' column holds the fraud label; every other column is used as a feature.
y = df['Class']  # target vector
X = df.drop(columns='Class')  # feature matrix: all columns except the target

In [4]:
# Hold out 30% of the rows for testing (70% train / 30% test).
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Baseline model: a Random Forest (fixed seed) trained on the original,
# un-resampled training split. fit() returns the estimator itself, so
# construction and training are chained into one statement.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions for the held-out test rows
y_pred = rf.predict(X_test)

In [6]:
# Per-class metrics as a nested dict (class label -> metric name -> value)
report = classification_report(y_test, y_pred, output_dict=True)

# Pull out the entry for class '1' (fraudulent transactions); the report keys are strings
fraud_metrics = report['1']
precision, recall = fraud_metrics['precision'], fraud_metrics['recall']

# Start the comparison table with one row for the baseline model
results_df = pd.DataFrame(
    {"Precision": [precision], "Recall": [recall]},
    index=["Random Forest"],
)
print(results_df)

               Precision    Recall
Random Forest   0.947826  0.801471


In [7]:
# Oversample the minority class with SMOTE (fixed seed).
# Only the training split is resampled; X_test / y_test are not passed in.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

In [8]:
# Create a fresh, untrained Random Forest (same seed as the baseline) for the balanced run.
# Note: this rebinds `rf`, so the baseline model fitted earlier is no longer reachable by name.
rf = RandomForestClassifier(random_state=42)

In [10]:
# Train the model on the SMOTE-resampled training data (the test split is left as-is)
rf.fit(X_train_resampled, y_train_resampled)

In [11]:
# Predict on the original (non-resampled) test set with the model trained on balanced data.
# This overwrites the baseline predictions previously stored in y_pred.
y_pred = rf.predict(X_test)

In [12]:
# Precision and recall of the balanced-data model on the test set
# (these rebind the baseline `precision` / `recall` values)
precision, recall = precision_score(y_test, y_pred), recall_score(y_test, y_pred)

In [13]:
# Add (or overwrite) the row for the balanced-data model in the comparison table
balanced_label = "Random Forest (Balanced)"
results_df.loc[balanced_label] = [precision, recall]

# Show the baseline and balanced rows together
print(results_df)

                          Precision    Recall
Random Forest              0.947826  0.801471
Random Forest (Balanced)   0.832168  0.875000


In [14]:
from sklearn.metrics import confusion_matrix

In [22]:
# Confusion matrix of the current y_pred against the test labels.
# For binary labels it is laid out as [[tn, fp], [fn, tp]], so unpack it row by row.
cm = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cm

# Report the two error types that feed the cost calculation
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")

False Positives (FP): 24
False Negatives (FN): 17


In [19]:
# Per-error cost weights used by the cost cells (which report totals in £):
# each false positive is charged 5, each false negative 50.
FP_cost, FN_cost = 5, 50

In [20]:
# Weight each error count by its unit cost and add the two parts together.
# fp / fn are read from the kernel namespace, i.e. from whichever
# confusion-matrix unpacking assigned them most recently.
fp_total = fp * FP_cost
fn_total = fn * FN_cost
total_cost = fp_total + fn_total

print(f"Total Cost of Misclassification: £{total_cost}")

Total Cost of Misclassification: £995


In [21]:
# Get probabilities
y_prob = rf.predict_proba(X_test)[:, 1]  # Probabilities for class 1 (fraud)

# Set custom threshold (e.g., 0.3 instead of default 0.5)
threshold = 0.3
y_pred_adjusted = (y_prob >= threshold).astype(int)

# Confusion matrix for the thresholded predictions. The counts get their own
# *_adjusted names so they do not overwrite the tn/fp/fn/tp computed for the
# default-threshold predictions, which the total-cost cell reads. Sharing the
# names made that cell's result depend on which cell happened to run last.
tn_adjusted, fp_adjusted, fn_adjusted, tp_adjusted = confusion_matrix(
    y_test, y_pred_adjusted
).ravel()

# Recalculate cost with the same per-error weights
total_cost_adjusted = (fp_adjusted * FP_cost) + (fn_adjusted * FN_cost)
print(f"Total Cost with Threshold {threshold}: £{total_cost_adjusted}")

Total Cost with Threshold 0.3: £995


In [23]:
import time

In [25]:
# Time a single predict() pass over the test set.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the prediction is running.
start_pred_time = time.perf_counter()
y_pred = rf.predict(X_test)
end_pred_time = time.perf_counter()

# Elapsed time in (fractional) seconds
pred_time = end_pred_time - start_pred_time
print(f"Prediction Time: {pred_time:.4f} seconds")

Prediction Time: 1.3688 seconds
