In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import os

# --- Configuration ---
# Seed for reproducible results.
RANDOM_STATE = 42
# Name of the label column in the input tables.
TARGET_COL = "status"
# Output folder for generated results, relative to the working directory.
MODELS_DOCS_DIR = "docs/models"
# ---

# 2. Make sure the results folder exists; an already existing folder is fine.
os.makedirs(MODELS_DOCS_DIR, exist_ok=True)


# 3. LOAD AND SPLIT DATA FROM CATHY
print("3. Loading and splitting data from Cathy...")
# The pre-split CSV files are read from the current working directory,
# which is why bare filenames are used.
try:
    train_df = pd.read_csv('train_clean.csv')
    test_df = pd.read_csv('test_clean.csv')
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is an interactive-shell helper
    # and, inside a notebook kernel, does not cleanly stop the cell. Raising
    # halts execution right here and keeps the original traceback attached.
    raise FileNotFoundError(
        "'train_clean.csv' or 'test_clean.csv' not found in the current folder. "
        "Please place the files in the same directory as this script."
    ) from err
print("Training and test data loaded successfully.")

# Both tables must contain the label column; fail with a message that names
# the offending file rather than a bare KeyError from .drop().
for split_name, split_df in (('train_clean.csv', train_df),
                             ('test_clean.csv', test_df)):
    if TARGET_COL not in split_df.columns:
        raise KeyError(
            f"Target column '{TARGET_COL}' is missing from {split_name}."
        )

# Split features (X) and target (y)
X_train = train_df.drop(columns=[TARGET_COL])
y_train = train_df[TARGET_COL]

X_test = test_df.drop(columns=[TARGET_COL])
y_test = test_df[TARGET_COL]

print(f"Data ready for training. X_train shape: {X_train.shape}")


# 4. Train Random Forest
print("\n4. Starting training of Random Forest model...")
# Hyperparameters gathered in one place so they are easy to spot and tweak.
forest_params = {
    'n_estimators': 100,
    'random_state': RANDOM_STATE,  # fixed seed for repeatable results
    'n_jobs': -1,
}
# fit() returns the estimator itself, so construction and training chain.
rf = RandomForestClassifier(**forest_params).fit(X_train, y_train)
print("Training completed.")


# 5. Predictions
# Predicted labels for the held-out test features.
y_pred = rf.predict(X_test)


# 6. Evaluation: Classification report
print("\n6. Creating classification report...")
# Text summary comparing the true test labels with the model's predictions.
report = classification_report(y_test, y_pred)
print("=== Classification Report (Random Forest) ===", report, sep="\n")

# Save report
# Persist the same text that was printed above into the results folder.
report_path = f'{MODELS_DOCS_DIR}/rf_classification_report.txt'
with open(report_path, mode='w') as report_file:
    report_file.write(report)
print(f"✅ Report saved: {report_path}")


# 7. Confusion matrix (plot)
print("\n7. Creating confusion matrix...")
# Pin the matrix rows/columns to the classes the model was trained on.
# Without `labels`, confusion_matrix derives them from the values present in
# y_test / y_pred, which can differ from rf.classes_ and would then leave the
# heatmap tick labels pointing at the wrong rows and columns.
class_labels = rf.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Explicit figure/axes so every call targets this plot and nothing else.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Random Forest Confusion Matrix')
fig.tight_layout()
cm_path = f'{MODELS_DOCS_DIR}/rf_confusion_matrix.png'
fig.savefig(cm_path)
# Close the figure once it is on disk to release its memory.
plt.close(fig)
print(f"✅ Confusion matrix saved: {cm_path}")


# 8. Feature importances (plot)
print("\n8. Creating feature importance plot...")
# Pair each training column with the importance score the fitted forest
# reports for it.
feature_importances = pd.DataFrame(
    {'feature': X_train.columns, 'importance': rf.feature_importances_}
)

# Select top 15 features
ranked_importances = feature_importances.sort_values(by='importance', ascending=False)
top15 = ranked_importances.head(15)

# Horizontal bar plot: one bar per feature, highest importance first.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=top15, color='teal', ax=ax)
ax.set(
    title='Top 15 Feature Importances (Random Forest)',
    xlabel='Importance',
    ylabel='Feature',
)
fig.tight_layout()
feat_imp_path = f'{MODELS_DOCS_DIR}/rf_feature_importance.png'
fig.savefig(feat_imp_path)
# The figure is only needed on disk, so close it after saving.
plt.close(fig)
print(f"✅ Feature importance plot saved: {feat_imp_path}")

3. Loading and splitting data from Cathy...
Training and test data loaded successfully.
Data ready for training. X_train shape: (738, 30)

4. Starting training of Random Forest model...
Training completed.

6. Creating classification report...
=== Classification Report (Random Forest) ===
              precision    recall  f1-score   support

           0       0.67      0.58      0.62        65
           1       0.79      0.84      0.81       120

    accuracy                           0.75       185
   macro avg       0.73      0.71      0.72       185
weighted avg       0.75      0.75      0.75       185

✅ Report saved: docs/models/rf_classification_report.txt

7. Creating confusion matrix...
✅ Confusion matrix saved: docs/models/rf_confusion_matrix.png

8. Creating feature importance plot...
✅ Feature importance plot saved: docs/models/rf_feature_importance.png
