In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("creditcard.csv")

In [3]:
# Define the feature columns and target column (assuming 'class' is the target variable)
X = df.drop('Class', axis=1)  # Features (drop the target column)
y = df['Class']  # Target variable (fraud labels)

In [7]:
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Initialize the model
dt_model = DecisionTreeClassifier(random_state=42)

In [9]:
# Fit the model on the training data
dt_model.fit(X_train, y_train)

In [10]:
# Predictions
y_pred = dt_model.predict(X_test)

# Generate classification report as a dictionary
report = classification_report(y_test, y_pred, output_dict=True)

# Extract precision and recall for the '1' class (fraudulent transactions)
precision = report['1']['precision']
recall = report['1']['recall']

# Store results in a DataFrame
results_df = pd.DataFrame({"Precision": [precision], "Recall": [recall]}, index=["Decision Tree"])
print(results_df)

               Precision    Recall
Decision Tree   0.696429  0.795918


In [11]:
# Initialize Decision Tree with class_weight='balanced'
dt_model = DecisionTreeClassifier(random_state=42, class_weight="balanced")

In [12]:
# Train the model
dt_model.fit(X_train, y_train)

In [13]:
# Predictions
y_pred = dt_model.predict(X_test)

In [15]:
# Compute Precision & Recall
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Store results
results_df.loc["Decision Tree (Balanced)"] = [precision, recall]

# Print results
print(results_df)

                          Precision    Recall
Decision Tree              0.696429  0.795918
Decision Tree (Balanced)   0.732673  0.755102


In [30]:
from sklearn.metrics import confusion_matrix

# Get confusion matrix values
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")

False Positives (FP): 27
False Negatives (FN): 24


In [26]:
# Define costs
FP_cost = 5
FN_cost = 50

In [27]:
# Compute total cost
total_cost = (fp * FP_cost) + (fn * FN_cost)

print(f"Total Cost of Misclassification: £{total_cost}")

Total Cost of Misclassification: £1335


In [28]:
# Get probabilities
y_prob = dt_model.predict_proba(X_test)[:, 1]  # Probabilities for class 1 (fraud)

# Set custom threshold (e.g., 0.3 instead of default 0.5)
threshold = 0.3
y_pred_adjusted = (y_prob >= threshold).astype(int)

# Compute new confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_adjusted).ravel()

# Recalculate cost
total_cost_adjusted = (fp * FP_cost) + (fn * FN_cost)
print(f"Total Cost with Threshold {threshold}: £{total_cost_adjusted}")

Total Cost with Threshold 0.3: £1335


In [31]:
import time

In [32]:
start_pred_time = time.time()
y_pred = dt_model.predict(X_test)
end_pred_time = time.time()

pred_time = end_pred_time - start_pred_time
print(f"Prediction Time: {pred_time:.4f} seconds")

Prediction Time: 0.0908 seconds
