In [None]:
# Import core data handling and plotting libraries

import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

import seaborn as sns

# Import tools for splitting data and balancing with SMOTE
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE



In [None]:
# Read creditcard.csv from the working directory into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer="creditcard.csv")

# Preview the leading 5 rows; as the cell's last expression it renders as a table
df.head(n=5)


In [None]:
# Tally the rows per 'Class' label (1 = fraud, 0 = non-fraud) and print the counts
class_counts = df['Class'].value_counts()
print(class_counts)

# Draw the same tallies as a bar chart so the class imbalance is visible at a glance
ax = sns.countplot(data=df, x='Class')
ax.set_title("Class Distribution")
plt.show()


In [None]:
# Separate the predictors (X) from the target labels (y)
X = df.drop(columns='Class')  # every column except 'Class'
y = df['Class']               # the label column only

# Hold out 20% of the rows for testing (80% train, 20% test).
# stratify=y keeps the fraud / non-fraud ratio the same in both partitions;
# without it, a purely random split of a heavily imbalanced label can leave
# the test set with a fraud share that differs from the full dataset, which
# makes the recall/precision measured later noisy and hard to compare.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



In [None]:
# SMOTE oversampler with a fixed seed, used to balance the minority class (fraud = 1)
smote = SMOTE(random_state=42)

# Resample the training split only; the test split stays untouched so that
# no synthetic samples leak into evaluation
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Report the label counts before and after resampling
counts_before = y_train.value_counts()
counts_after = pd.Series(y_train_resampled).value_counts()
print("Before SMOTE:", counts_before)
print("After SMOTE:", counts_after)


In [None]:
# Random Forest classifier from scikit-learn
from sklearn.ensemble import RandomForestClassifier

# Forest configuration: 20 trees, fixed seed for repeatable results
forest_params = {"n_estimators": 20, "random_state": 42}
model = RandomForestClassifier(**forest_params)

# Fit on the SMOTE-resampled training data; the call is the cell's last
# expression, so the fitted estimator is displayed as the cell output
model.fit(X=X_train_resampled, y=y_train_resampled)



In [None]:
# Metrics helpers used for evaluation
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)

# Predict labels for the held-out test set (which was not resampled)
y_pred = model.predict(X_test)

# Per-class precision, recall and F1-score
print("=== Classification Report ===")
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

# Build the confusion matrix and render it, labelling the axes with the
# classes the model was trained on
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(display_labels=model.classes_, confusion_matrix=cm)
disp.plot()
plt.title("Confusion Matrix")
plt.show()




In [None]:
# Plot feature importances to understand which columns affect predictions the most
import numpy as np

TOP_N = 10  # maximum number of features to show in the chart

importances = model.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first

# Cap the count at the number of features actually available. With a fixed
# range(10), plt.bar / plt.xticks fail with a length mismatch whenever the
# model was trained on fewer than 10 columns.
top_n = min(TOP_N, len(importances))
top_indices = indices[:top_n]

# Plot the top-N most important features
plt.figure(figsize=(10, 6))
plt.title("Top {} Feature Importances".format(top_n))
plt.bar(range(top_n), importances[top_indices], align="center")
plt.xticks(range(top_n), X.columns[top_indices], rotation=45)
plt.ylabel("Importance")
plt.tight_layout()
plt.show()
