In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE

# --- 1. Load Processed Data ---
# Read the CSV that Notebook 1 wrote out as its final step.
df = pd.read_csv('processed_train_data.csv')

# Build the label vector y and the feature matrix X.
# 'Target' is the label; X keeps every column except 'PotentialFraud',
# 'Provider' and 'Target' itself.
y = df['Target']
X = df.drop(columns=['PotentialFraud', 'Provider', 'Target'])

# --- 2. Split Data ---
# Hold out 20% of the rows for testing (80/20 split).
# A fixed random_state makes the split reproducible, and stratify=y keeps
# the class proportions the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training Shape: {X_train.shape}")
print(f"Testing Shape:  {X_test.shape}")


# --- 3. Address Imbalance (SMOTE) ---
# Requirement: "Address imbalance using approaches such as... oversampling"
print("\n--- Applying SMOTE (Synthetic Minority Over-sampling) ---")
# Count positives with the vectorized .sum() on the boolean mask; the builtin
# sum() would walk the labels one element at a time in Python.
print(f"Original Fraud Count: {(y_train == 1).sum()}")

# Initialize SMOTE with a fixed seed so the synthetic samples are reproducible.
smote = SMOTE(random_state=42)

# Resample the training split only. The test split is never resampled, so
# evaluation still happens on the original class distribution.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"New Fraud Count:      {(y_train_resampled == 1).sum()}")
print("Strategy: We balanced the training data so the model doesn't ignore the minority class.")


# --- 4. Train Model ---
# Baseline classifier: a Random Forest with a fixed seed, fitted on the
# SMOTE-resampled training data. fit() returns the estimator itself, so the
# construction and the fit are chained into one expression.
print("\n--- Training Random Forest ---")
model = RandomForestClassifier(random_state=42).fit(
    X_train_resampled,
    y_train_resampled,
)


# --- 5. Evaluate with Appropriate Metrics ---
# Requirement: "Select metrics... prioritizing Precision, Recall, F1-score, and PR-AUC"

# Score the untouched test split: hard labels for the report and the
# confusion matrix, second-column probabilities for the PR curve later on.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Confusion matrix of true labels against predicted labels
# (exposes false positives and false negatives).
conf_matrix = confusion_matrix(y_test, y_pred)

print("\n--- Evaluation Metrics ---")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", conf_matrix)


# --- 6. PR-AUC Curve (Visual Requirement) ---
# Precision/recall pairs over the decision thresholds; thresholds are unused.
precision, recall, _ = precision_recall_curve(y_test, y_prob)
# Area under the PR curve via auc(), which integrates with the trapezoidal rule.
# NOTE(review): scikit-learn's docs warn that trapezoidal PR-AUC can be
# optimistic; consider also reporting average_precision_score.
pr_auc_score = auc(recall, precision)

# Same figure as before, drawn through the object-oriented matplotlib API.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(recall, precision, label=f'PR-AUC = {pr_auc_score:.2f}')
ax.set_xlabel('Recall (Sensitivity)')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve (Better for Imbalanced Data)')
ax.legend()
plt.show()


# --- 7. Justification & Trade-offs (Section 1.5.2) ---
# Emit the summary as one print call; sep="\n" puts each entry on its own
# line, exactly as separate print statements would.
print(
    "\n--- Strategy Justification ---",
    "1. Strategy: We used SMOTE to create synthetic examples of Fraud. This prevents the model from simply guessing 'Legitimate' for everyone.",
    "2. Trade-off: Higher Recall (catching more fraud) often lowers Precision (more false alarms).",
    f"3. Result: Our PR-AUC of {pr_auc_score:.2f} indicates how well we balance these trade-offs.",
    sep="\n",
)

ModuleNotFoundError: No module named 'sklearn'