In [12]:
# ===============================================================
# 02_Model_Training_and_Evaluation.ipynb
# ===============================================================

# ===============================================================
# Import Required Libraries
# ===============================================================
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
import joblib

# ===============================================================
# Set Working Directory and Paths
# ===============================================================
project_dir = r"F:\CustomerChurnProject"
os.chdir(project_dir)

data_path = os.path.join("data", "cleaned_churn_data.csv")
model_dir = os.path.join(project_dir, "models")
report_dir = os.path.join(project_dir, "reports")

os.makedirs(model_dir, exist_ok=True)
os.makedirs(report_dir, exist_ok=True)

# ===============================================================
# Load Data
# ===============================================================
data = pd.read_csv(data_path)
X = data.drop("Target_Churn", axis=1)
y = data["Target_Churn"]

# ===============================================================
# Split Data into Train and Test Sets
# ===============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ===============================================================
# Scale Features
# ===============================================================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

joblib.dump(scaler, os.path.join(model_dir, "scaler.pkl"))

# ===============================================================
# Handle Class Imbalance with SMOTE
# ===============================================================
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# ===============================================================
# Define Models and Hyperparameters
# ===============================================================
models = {
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5]
        }
    },
    "LogisticRegression": {
        "model": LogisticRegression(max_iter=1000, random_state=42),
        "params": {
            "C": [0.1, 1, 10],
            "solver": ["liblinear", "lbfgs"]
        }
    },
    "XGBoost": {
        "model": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [3, 5, 7],
            "learning_rate": [0.01, 0.1, 0.2]
        }
    }
}

best_model_name = None
best_model = None
best_accuracy = 0

# ===============================================================
# Train, Evaluate and Save Models
# ===============================================================
for name, m in models.items():
    clf = GridSearchCV(m["model"], m["params"], cv=3, scoring='accuracy', n_jobs=-1)
    clf.fit(X_train_res, y_train_res)
    
    y_pred = clf.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test_scaled)[:,1]) if hasattr(clf, "predict_proba") else None
    
    # Save metrics to text file
    with open(os.path.join(report_dir, f"{name}_report.txt"), "w") as f:
        f.write(f"Model: {name}\n")
        f.write(f"Best Parameters: {clf.best_params_}\n")
        f.write(f"Accuracy: {accuracy:.4f}\n")
        if roc_auc:
            f.write(f"ROC-AUC: {roc_auc:.4f}\n")
        f.write("Confusion Matrix:\n")
        f.write(str(confusion_matrix(y_test, y_pred)) + "\n")
        f.write("Classification Report:\n")
        f.write(classification_report(y_test, y_pred))
    
    # Plot and save confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.savefig(os.path.join(report_dir, f"{name}_confusion_matrix.png"))
    plt.close()
    
    # Feature importance for tree-based models
    if name in ["RandomForest", "XGBoost"]:
        feat_imp = pd.DataFrame({
            "Feature": X.columns,
            "Importance": clf.best_estimator_.feature_importances_
        }).sort_values(by="Importance", ascending=False)
        feat_imp.to_csv(os.path.join(report_dir, f"{name}_feature_importance.csv"), index=False)
        
        plt.figure(figsize=(8,6))
        sns.barplot(x="Importance", y="Feature", data=feat_imp.head(10))
        plt.title(f"Top 10 Feature Importances - {name}")
        plt.tight_layout()
        plt.savefig(os.path.join(report_dir, f"{name}_feature_importance.png"))
        plt.close()
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = name
        best_model = clf.best_estimator_
        joblib.dump(best_model, os.path.join(model_dir, f"{name}_model.pkl"))

print(f"Best Model: {best_model_name} with Accuracy: {best_accuracy:.4f}")
# ===============================================================
# Generate PDF Summary Report
# ===============================================================
from fpdf import FPDF
from PIL import Image

pdf_path = os.path.join(report_dir, "Model_Training_Summary.pdf")

pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)

for name in models.keys():
    # Add a page for each model
    pdf.add_page()
    pdf.set_font("Arial", 'B', 16)
    pdf.cell(0, 10, f"{name} Model Report", ln=True, align="C")
    
    # Add metrics text
    report_file = os.path.join(report_dir, f"{name}_report.txt")
    with open(report_file, "r") as f:
        lines = f.readlines()
    
    pdf.set_font("Arial", '', 12)
    pdf.ln(5)
    for line in lines:
        pdf.multi_cell(0, 6, line.strip())
    
    # Add confusion matrix image
    cm_path = os.path.join(report_dir, f"{name}_confusion_matrix.png")
    if os.path.exists(cm_path):
        pdf.ln(5)
        pdf.image(cm_path, w=pdf.w - 40)
    
    # Add feature importance image if exists
    fi_path = os.path.join(report_dir, f"{name}_feature_importance.png")
    if os.path.exists(fi_path):
        pdf.ln(5)
        pdf.image(fi_path, w=pdf.w - 40)

# Save PDF
pdf.output(pdf_path)
print(f"✅ PDF Summary report saved at: {pdf_path}")


Best Model: LogisticRegression with Accuracy: 0.5150


ModuleNotFoundError: No module named 'fpdf'