# 03. Model Training and Evaluation

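In this notebook, we load the raw credit card dataset, preprocess it, and rebalance the training split with SMOTE. We then train a Random Forest classifier on the resampled data, evaluate it on the test split (classification report, confusion matrix, and ROC curve), and save the fitted model to disk.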

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
import sys
import os

# Add src to path
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
from preprocess import preprocess_data, get_resampled_data

## 1. Load and Preprocess Data

In [None]:
# Location of the raw dataset, relative to the directory the notebook runs from.
data_path = '../data/raw/creditcard.csv'

# Fail fast when the dataset is missing. Merely printing a message would let
# "Run All" continue into the training cell, which would then die with an
# unrelated NameError because none of the variables below were ever created.
if not os.path.exists(data_path):
    raise FileNotFoundError(
        "Data file not found. Please ensure 'creditcard.csv' is in the 'data/raw' directory."
    )

df = pd.read_csv(data_path)

# preprocess_data is imported from the project's `preprocess` module; here it
# is unpacked into train/test features and labels.
X_train, X_test, y_train, y_test = preprocess_data(df)

# Apply SMOTE to the training split only; X_test / y_test are passed through
# unchanged so evaluation happens on data that was not resampled.
X_train_res, y_train_res = get_resampled_data(X_train, y_train, method='SMOTE')

print("Data loaded and preprocessed.")
print(f"Training shape: {X_train_res.shape}")
print(f"Test shape: {X_test.shape}")

## 2. Train Model

In [None]:
# Hyperparameters gathered in one place: 100 trees, a fixed seed so repeated
# fits give the same forest, and n_jobs=-1 to use all available CPU cores.
rf_params = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}

# fit() returns the estimator itself, so construction and training can be
# chained while still binding the trained model to `clf`.
clf = RandomForestClassifier(**rf_params).fit(X_train_res, y_train_res)
print("Model trained.")

## 3. Evaluation

In [None]:
# Per-class probabilities for the test split. Column 1 belongs to the second
# entry of clf.classes_ (presumably the positive label 1 -- confirm against
# the dataset); it is kept as the score used for threshold-based curves.
test_proba = clf.predict_proba(X_test)
y_prob = test_proba[:, 1]

# Hard class predictions at the classifier's default decision rule.
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 summary as plain text.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)

# Draw on an explicit Axes instead of relying on pyplot's implicit
# "current figure" state.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)  # fmt='d': integer counts
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# ROC curve: false/true positive rates across all score thresholds, and the
# area under that curve as a single summary number.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))

# The classifier's curve, with the AUC shown in the legend.
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line (TPR == FPR), the chance-level baseline.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # small headroom so the curve is not clipped at TPR = 1
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

## 4. Save Model

In [None]:
# Destination for the serialized classifier, relative to the directory the
# notebook runs from.
model_path = '../models/random_forest_model.pkl'

# joblib.dump does not create missing parent directories, so on a fresh
# checkout without a models/ folder the save would raise FileNotFoundError
# after the whole training run. Create the folder first; exist_ok=True makes
# re-running this cell harmless.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(clf, model_path)
print("Model saved to models/random_forest_model.pkl")