# Health Insurance Cross-Sell Prediction: Data Exploration & Modeling

This notebook walks through a complete workflow for predicting customer interest in health insurance cross-sell using machine learning. The steps include:

- **Data Loading & Cleaning**: Import and inspect the data, handle missing values, and ensure correct data types.
- **Exploratory Data Analysis (EDA)**: Visualize distributions, check for outliers, and understand feature relationships.
- **Feature Engineering**: Encode categorical variables, scale features, and prepare data for modeling.
- **Model Training & Evaluation**: Train several classifiers (Logistic Regression, Random Forest, XGBoost, LightGBM, CatBoost), balance the data, and compare performance.
- **Summary & Next Steps**: Summarize findings, results, and suggest improvements.

Best practices such as reproducibility, clear documentation, and robust evaluation are followed throughout.

In [None]:
# --- Data Loading & Setup ---
# Import required libraries for data handling, visualization, and modeling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For model building and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# For handling imbalanced data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# For models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Read the training CSV into the notebook-wide frame `df`
df = pd.read_csv(filepath_or_buffer='./data/train.csv')

# Display the first five rows as this cell's output
df.head(n=5)

In [None]:
# (Removed: duplicate df.head() call)

In [None]:
### Dataset Features Overview

# - **id**: Customer ID (unique identifier)
# - **Gender**: Male/Female
# - **Age**: Customer age (numeric)
# - **Driving_License**: 1 if customer has a driving license, else 0
# - **Previously_Insured**: 1 if already has health insurance, else 0
# - **Vehicle_Age**: Age of the vehicle ('< 1 Year', '1-2 Year', '> 2 Years')
# - **Vehicle_Damage**: Was vehicle previously damaged? (Yes/No)
# - **Annual_Premium**: Yearly insurance premium (numeric)
# - **Policy_Sales_Channel**: Channel through which policy was sold (numeric code)
# - **Vintage**: Days since customer joined
# - **Response**: Target variable (0 = not interested, 1 = interested)

In [None]:
# Show structure, summary statistics, and missing-value counts.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
df.info()
display(df.describe())
display(df.isnull().sum())

In [None]:
# Summary statistics of df, rendered as this cell's output.
df.describe()

In [None]:
# Number of missing values in each column, rendered as this cell's output.
df.isnull().sum()

In [None]:
# Split the columns by dtype: int64/float64 are treated as numeric,
# object columns as categorical.
numeric_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(df.select_dtypes(include=['object']).columns)

# Histograms of every numeric column on a shared 3x4 grid
df[numeric_cols].hist(bins=30, layout=(3, 4), figsize=(14, 10))
plt.suptitle('Numeric Feature Distributions')
# Leave the top 4% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

# One count plot per categorical column, each on its own figure
for col in categorical_cols:
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x=col)
    plt.title(f'Distribution of {col}')
    plt.show()

In [None]:
# Bar chart of how many rows fall into each Response class
sns.countplot(data=df, x='Response')
plt.gca().set(
    title='Response Distribution (Target Variable)',
    xlabel='Response',
    ylabel='Count',
)
plt.show()

In [None]:
# Draw a wide, short boxplot per numeric column to make outliers visible
for col in numeric_cols:
    plt.figure(figsize=(8, 2))
    sns.boxplot(data=df, x=col)
    plt.title(f'Boxplot of {col}')
    plt.show()

In [None]:
# Remove outliers from Annual_Premium using the 1.5 * IQR rule
q1 = df['Annual_Premium'].quantile(0.25)
q3 = df['Annual_Premium'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# between() is inclusive on both ends, matching the >= / <= comparison.
# The explicit copy makes df an independent frame, so adding columns to it
# afterwards does not raise pandas' SettingWithCopyWarning.
df = df[df['Annual_Premium'].between(lower_bound, upper_bound)].copy()

In [None]:
# Add Annual_Premium_log = log(1 + Annual_Premium) as a new column to reduce
# skewness; the original Annual_Premium column is left in place.
df['Annual_Premium_log'] = np.log1p(df['Annual_Premium'])

In [None]:
# Columns inspected as categorical vs. numerical in the value-count cells
numerical = ['Policy_Sales_Channel']
categorical = [
    'Vehicle_Damage',
    'Previously_Insured',
    'Vehicle_Age',
]

In [None]:
# Print a header, the value counts, and a blank separator line
# for every categorical feature
for col in categorical:
    print(f"Value counts for {col}:", df[col].value_counts(), "", sep="\n")

In [None]:
# Print a header followed by the value counts for every numerical feature
for col in numerical:
    print(f"Value counts for {col}:", df[col].value_counts(), sep="\n")

In [None]:
# Correlation matrix for numerical features.
# df still contains text columns at this point (e.g. Gender, Vehicle_Age,
# Vehicle_Damage). Recent pandas versions raise on non-numeric data in
# DataFrame.corr() instead of dropping it, so select the numeric columns first.
correlation_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
# Reload the raw CSV into df for the modeling cells.
# NOTE(review): this replaces the frame built by the earlier cells, so the IQR
# outlier filter and the Annual_Premium_log column are not part of the data
# used from here on — confirm that starting from the raw file is intended.
df = pd.read_csv("./data/train.csv")

In [None]:
# Feature groups for the modeling cells: `categorical` is used by the scaling
# cell to decide which columns are left unscaled.
numerical = ['Policy_Sales_Channel']
categorical = [
    'Vehicle_Damage',
    'Previously_Insured',
    'Vehicle_Age',
]

In [None]:
# Encode categorical features.
# Vehicle_Age is ordinal, so its codes must increase with the vehicle's age
# (youngest = 0). This is the same mapping DataPreprocessor.engineer_features
# uses in the pipeline cell further down.
vehicle_age_map = {
    "< 1 Year": 0,
    "1-2 Year": 1,
    "> 2 Years": 2
}
df["Vehicle_Age"] = df["Vehicle_Age"].map(vehicle_age_map)

# Label-encode every remaining text column. Vehicle_Age is numeric after the
# mapping above, so the object-dtype selection no longer includes it.
# The encoder is refitted per column; only the encoded values are kept.
le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = le.fit_transform(df[col])

In [None]:
# Standardize every int64/float64 column except the target and the
# categorical indicators. The scaler is refitted for each column in turn.
# NOTE(review): scaling happens on the full frame before the train/test split
# in a later cell, so test rows contribute to the fitted mean/std — consider
# fitting on the training split only.
scaler = StandardScaler()
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    if col == 'Response' or col == 'Vehicle_Age' or col in categorical:
        continue
    df[col] = scaler.fit_transform(df[[col]])

In [None]:
# Preview the first rows of df after the encoding and scaling cells
df.head()

In [None]:
# Target is Response; features are every other column except the row id
y = df['Response']
X = df.drop(columns=['Response', 'id'])

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Rebalance the training split only: SMOTE oversampling first, then random
# undersampling of its output. The test split is left untouched.
# NOTE(review): both samplers use their default sampling strategy; if SMOTE
# already equalises the classes the undersampler presumably has nothing left
# to remove — confirm against the imbalanced-learn documentation.
smote = SMOTE(random_state=42)
rus = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = rus.fit_resample(X_train_res, y_train_res)

In [None]:
# Define a function to train and evaluate models
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model`` and print its hold-out evaluation metrics.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Hold-out features and labels.
        model_name: Label printed in the section header.

    Prints accuracy, ROC AUC, the classification report and the confusion
    matrix. Returns nothing.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC measures ranking quality, so it needs continuous scores.
    # Feeding it hard 0/1 predictions collapses the curve to a single
    # operating point and misreports the model's AUC.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        # Last resort for estimators that expose neither kind of score.
        y_score = y_pred

    print(f"\n--- {model_name} ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_score))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Candidate estimators, each paired with the label used in the printed report
models = [
    (
        LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
        'Logistic Regression',
    ),
    (
        RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
        'Random Forest',
    ),
    (
        XGBClassifier(eval_metric='logloss', scale_pos_weight=1, use_label_encoder=False, random_state=42),
        'XGBoost',
    ),
    (
        LGBMClassifier(class_weight='balanced', random_state=42),
        'LightGBM',
    ),
    (
        CatBoostClassifier(verbose=0, random_state=42),
        'CatBoost',
    ),
]

# Fit each estimator on the balanced training data and report on the
# untouched hold-out split
for estimator, label in models:
    train_and_evaluate(
        estimator,
        X_train_balanced,
        y_train_balanced,
        X_test,
        y_test,
        label,
    )

In [None]:
# (Removed: XGBoost cell, now handled in unified model evaluation)

In [None]:
# Standalone LightGBM run on the balanced training data
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Fixed seed, matching the LightGBM entry in the unified model list
lgbm = LGBMClassifier(class_weight='balanced', random_state=42)
lgbm.fit(X_train_balanced, y_train_balanced)

y_pred = lgbm.predict(X_test)
# Positive-class probabilities: ROC AUC must be computed from scores,
# not from the thresholded 0/1 predictions
y_proba = lgbm.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))



#  --- IGNORE ---
#  --- IGNORE ---



In [None]:
# Standalone CatBoost run on the balanced training data
from catboost import CatBoostClassifier

# Fixed seed, matching the CatBoost entry in the unified model list
catboost = CatBoostClassifier(verbose=0, random_state=42)
catboost.fit(X_train_balanced, y_train_balanced)

y_pred = catboost.predict(X_test)
# Positive-class probabilities: ROC AUC must be computed from scores,
# not from the thresholded 0/1 predictions
y_proba = catboost.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))

In [None]:
# Insurance Health Cross-Sell Prediction - Optimized with MLflow
# =================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import logging
from typing import Tuple, Dict, Any, List
from pathlib import Path
import joblib
import time

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, 
                           roc_auc_score, precision_recall_curve, roc_curve, f1_score)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# MLflow
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm
import mlflow.catboost

# Logging: INFO level on the root logger, plus a module-level logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Silence all Python warnings for the rest of the session
warnings.filterwarnings('ignore')

# Reproducibility: one seed shared by NumPy and the estimators below
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# MLflow: experiment name and tracking store (a local SQLite database).
# For a remote server use e.g. TRACKING_URI = "http://your-mlflow-server:5000"
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "insurance-cross-sell-prediction"

# =================================================================
# 1. CONFIGURATION AND SETUP
# =================================================================

class Config:
    """Static settings for the ML pipeline, read as class attributes."""

    # Input data and model output locations
    DATA_PATH = './data/train.csv'
    MODEL_SAVE_PATH = './models/'

    # Seeding, hold-out fraction and number of cross-validation folds
    RANDOM_STATE = 42
    TEST_SIZE = 0.2
    CV_FOLDS = 5

    # Outlier removal: method name and the IQR multiplier
    OUTLIER_METHOD = 'iqr'
    IQR_FACTOR = 1.5

def setup_mlflow():
    """Point MLflow at the tracking store and activate the project experiment.

    The experiment named ``EXPERIMENT_NAME`` is created when it does not
    exist yet and is then set as the active experiment.

    Returns:
        The ID of the experiment, as reported by MLflow.
    """
    mlflow.set_tracking_uri(TRACKING_URI)

    # Look the experiment up first instead of creating it and catching the
    # failure: a blanket ``except MlflowException`` also swallows unrelated
    # backend errors and then dereferences a lookup result that may be None.
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is None:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
    else:
        experiment_id = experiment.experiment_id

    mlflow.set_experiment(EXPERIMENT_NAME)
    logger.info(f"MLflow experiment '{EXPERIMENT_NAME}' set up successfully")
    return experiment_id

# =================================================================
# 2. DATA LOADING AND PREPROCESSING
# =================================================================

class DataLoader:
    """Helpers for reading the raw CSV and summarising its contents."""

    @staticmethod
    def load_data(file_path: str) -> pd.DataFrame:
        """Read ``file_path`` as CSV and log the load time and frame shape.

        Args:
            file_path: Location of the CSV file.

        Returns:
            The parsed DataFrame.

        Raises:
            FileNotFoundError: If nothing exists at ``file_path``.
        """
        start_time = time.time()

        source = Path(file_path)
        if not source.exists():
            raise FileNotFoundError(f"Data file not found: {file_path}")

        df = pd.read_csv(file_path)
        load_time = time.time() - start_time

        logger.info(f"Data loaded successfully in {load_time:.2f} seconds")
        logger.info(f"Dataset shape: {df.shape}")
        return df

    @staticmethod
    def basic_data_info(df: pd.DataFrame) -> Dict[str, Any]:
        """Summarise ``df`` as a plain dictionary.

        Returns:
            A dict with the frame's shape, deep memory usage in MiB, per-column
            missing-value counts, per-column dtypes, and the value counts of
            the ``Response`` column (``None`` when that column is absent).
        """
        # The target distribution is only available when the target is present
        target_distribution = None
        if 'Response' in df.columns:
            target_distribution = df['Response'].value_counts().to_dict()

        total_bytes = df.memory_usage(deep=True).sum()

        return {
            'shape': df.shape,
            'memory_usage_mb': total_bytes / 1024**2,
            'missing_values': df.isnull().sum().to_dict(),
            'dtypes': df.dtypes.to_dict(),
            'target_distribution': target_distribution,
        }

class DataPreprocessor:
    """Stateful preprocessing pipeline.

    Steps: IQR outlier removal, feature engineering, label encoding of text
    columns and standard scaling of numeric columns. Encoders and scalers
    fitted by ``fit_transform`` are kept on the instance so ``transform`` can
    re-apply them to new data.
    """

    def __init__(self):
        # Fitted StandardScaler per scaled column name
        self.scalers = {}
        # Fitted LabelEncoder per encoded column name
        self.encoders = {}
        # Column names of the frame most recently returned by scale_features()
        self.feature_names = []

    def remove_outliers(self, df: pd.DataFrame, column: str, method: str = 'iqr', factor: float = 1.5) -> pd.DataFrame:
        """Drop rows whose ``column`` value lies outside the IQR fences.

        Args:
            df: Input frame (not modified).
            column: Column the fences are computed on.
            method: Only ``'iqr'`` is supported.
            factor: Multiplier applied to the IQR to place the fences.

        Returns:
            The rows with ``q1 - factor*iqr <= value <= q3 + factor*iqr``.

        Raises:
            ValueError: For any ``method`` other than ``'iqr'``.
        """
        if method != 'iqr':
            raise ValueError(f"Unsupported outlier removal method: {method}")

        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        outliers_count = int(((values < lower_bound) | (values > upper_bound)).sum())
        df_cleaned = df[(values >= lower_bound) & (values <= upper_bound)]

        logger.info(f"Removed {outliers_count} outliers from {column}")
        return df_cleaned

    def engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with the derived feature columns added.

        Added columns: ``Annual_Premium_log``, ``Age_Group``,
        ``Premium_Age_Ratio`` and ``Vehicle_Age_Numeric``. Reads
        ``Annual_Premium``, ``Age`` and ``Vehicle_Age`` from the input.
        """
        df = df.copy()

        # Log transform for Annual_Premium
        df['Annual_Premium_log'] = np.log1p(df['Annual_Premium'])

        # Age groups. pd.cut returns a pandas categorical; it is converted to
        # plain string labels (object dtype) so encode_categorical() picks the
        # column up. Left as a categorical it would skip encoding and reach
        # the resamplers/models as raw strings.
        age_groups = pd.cut(df['Age'], bins=[0, 25, 35, 50, 100],
                            labels=['Young', 'Adult', 'Middle_Age', 'Senior'])
        df['Age_Group'] = age_groups.astype(object)

        # Premium per age ratio
        df['Premium_Age_Ratio'] = df['Annual_Premium'] / df['Age']

        # Vehicle age as an ordinal code (youngest = 0)
        vehicle_age_map = {
            '< 1 Year': 0,
            '1-2 Year': 1,
            '> 2 Years': 2
        }
        df['Vehicle_Age_Numeric'] = df['Vehicle_Age'].map(vehicle_age_map)

        return df

    def encode_categorical(self, df: pd.DataFrame, fit: bool = True) -> pd.DataFrame:
        """Label-encode the text/categorical columns of a copy of ``df``.

        Args:
            df: Input frame (not modified).
            fit: When True, fit (or refit) one encoder per column and store it
                in ``self.encoders``. When False, apply the stored encoders;
                columns without a stored encoder are left unchanged.

        The ``Response`` column is never encoded.
        """
        df = df.copy()
        # 'category' is included alongside 'object' so pandas categoricals
        # cannot slip through unencoded.
        categorical_cols = [
            col for col in df.select_dtypes(include=['object', 'category']).columns
            if col != 'Response'
        ]

        for col in categorical_cols:
            # Hand the encoder plain Python objects regardless of the
            # column's pandas dtype
            values = df[col].astype(object)
            if fit:
                if col not in self.encoders:
                    self.encoders[col] = LabelEncoder()
                df[col] = self.encoders[col].fit_transform(values)
            elif col in self.encoders:
                df[col] = self.encoders[col].transform(values)

        return df

    def scale_features(self, df: pd.DataFrame, fit: bool = True) -> pd.DataFrame:
        """Standard-scale the int64/float64 columns of a copy of ``df``.

        ``Response``, ``id`` and the categorical indicator columns are left
        unscaled. With ``fit=True`` one scaler per column is fitted and stored
        in ``self.scalers``; with ``fit=False`` only columns that already have
        a stored scaler are transformed. ``self.feature_names`` is set to the
        returned frame's columns on every call.
        """
        df = df.copy()

        categorical_indicators = ['Vehicle_Damage', 'Previously_Insured', 'Vehicle_Age_Numeric']
        cols_to_exclude = ['Response', 'id'] + categorical_indicators
        numerical_cols = [
            col for col in df.select_dtypes(include=['int64', 'float64']).columns
            if col not in cols_to_exclude
        ]

        for col in numerical_cols:
            if fit:
                if col not in self.scalers:
                    self.scalers[col] = StandardScaler()
                df[col] = self.scalers[col].fit_transform(df[[col]]).flatten()
            elif col in self.scalers:
                df[col] = self.scalers[col].transform(df[[col]]).flatten()

        self.feature_names = df.columns.tolist()
        return df

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run the full pipeline on ``df``, fitting encoders and scalers.

        Order: outlier removal on ``Annual_Premium``, feature engineering,
        categorical encoding, scaling. Logs the elapsed time.
        """
        logger.info("Starting data preprocessing...")
        start_time = time.time()

        df = self.remove_outliers(df, 'Annual_Premium')
        df = self.engineer_features(df)
        df = self.encode_categorical(df, fit=True)
        df = self.scale_features(df, fit=True)

        preprocessing_time = time.time() - start_time
        logger.info(f"Preprocessing completed in {preprocessing_time:.2f} seconds")

        return df

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the already-fitted encoders and scalers to new data.

        Runs feature engineering, encoding and scaling with ``fit=False``;
        no rows are removed (outlier filtering is a fit-time step only).
        """
        df = self.engineer_features(df)
        df = self.encode_categorical(df, fit=False)
        df = self.scale_features(df, fit=False)
        return df

# =================================================================
# 3. DATA EXPLORATION AND VISUALIZATION
# =================================================================

class DataExplorer:
    """Plotting helpers for exploring the dataset."""

    @staticmethod
    def plot_distributions(df: pd.DataFrame, figsize: Tuple[int, int] = (15, 10)):
        """Plot a histogram for every int64/float64 column except ``Response``.

        Histograms are laid out four per row. Nothing is drawn when ``df`` has
        no such columns.
        """
        numeric_cols = [
            col for col in df.select_dtypes(include=['int64', 'float64']).columns
            if col != 'Response'
        ]
        if not numeric_cols:
            # subplots() cannot build a zero-row grid
            return

        n_rows = -(-len(numeric_cols) // 4)  # ceiling division

        # squeeze=False always yields a 2-D array of axes, so flattening works
        # for a single row as well (wrapping the row in a list would hand an
        # array, not an Axes, to hist()).
        fig, axes = plt.subplots(n_rows, 4, figsize=figsize, squeeze=False)
        axes = axes.flatten()

        for ax, col in zip(axes, numeric_cols):
            df[col].hist(bins=30, ax=ax)
            ax.set_title(col)
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')

        # Hide unused subplots
        for ax in axes[len(numeric_cols):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

    @staticmethod
    def plot_correlation_matrix(df: pd.DataFrame, figsize: Tuple[int, int] = (12, 10)):
        """Plot an annotated heatmap of the correlations between the
        int64/float64 columns of ``df``."""
        numeric_df = df.select_dtypes(include=['int64', 'float64'])
        correlation_matrix = numeric_df.corr()

        plt.figure(figsize=figsize)
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f',
                    square=True, linewidths=0.5)
        plt.title('Feature Correlation Matrix')
        plt.tight_layout()
        plt.show()

    @staticmethod
    def plot_target_distribution(df: pd.DataFrame):
        """Plot the class counts of ``Response`` with count/percentage labels."""
        plt.figure(figsize=(8, 6))

        # value_counts() orders by frequency; sort by class value and pass the
        # same order to countplot so label i always sits on bar i.
        target_counts = df['Response'].value_counts().sort_index()

        sns.countplot(data=df, x='Response', order=target_counts.index.tolist())
        plt.title('Target Variable Distribution')

        # Add count and percentage labels slightly above each bar
        total = len(df)
        for position, count in enumerate(target_counts.values):
            plt.text(position, count + total*0.01, f'{count}\n({count/total*100:.1f}%)',
                     ha='center', va='bottom')

        plt.show()

# =================================================================
# 4. MODEL TRAINING AND EVALUATION
# =================================================================

class ModelTrainer:
    """Train, evaluate and MLflow-log the candidate classifiers."""

    def __init__(self, experiment_name: str):
        self.experiment_name = experiment_name
        # Fitted estimators keyed by model name; filled by train_all_models()
        self.models = {}
        # Result dicts of the most recent train_all_models() call
        self.results = {}

    def prepare_data(self, X: pd.DataFrame, y: pd.Series,
                     test_size: float = 0.2, balance_data: bool = True) -> Tuple:
        """Split into train/test and optionally rebalance the training part.

        Args:
            X: Feature matrix.
            y: Target labels (used for stratification).
            test_size: Fraction held out for testing.
            balance_data: When True, apply SMOTE followed by random
                undersampling to the training split only.

        Returns:
            ``(X_train, X_test, y_train, y_test)``; the test split is never
            resampled.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=RANDOM_STATE, stratify=y
        )

        if not balance_data:
            return X_train, X_test, y_train, y_test

        logger.info("Applying SMOTE + Random Under Sampling for class balancing...")

        # NOTE(review): both samplers run with their default strategy; if
        # SMOTE already equalises the classes the undersampler presumably has
        # nothing left to remove — confirm against the imbalanced-learn docs.
        smote = SMOTE(random_state=RANDOM_STATE)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

        rus = RandomUnderSampler(random_state=RANDOM_STATE)
        X_train_balanced, y_train_balanced = rus.fit_resample(X_train_res, y_train_res)

        logger.info(f"Original training set: {X_train.shape[0]} samples")
        logger.info(f"Balanced training set: {X_train_balanced.shape[0]} samples")

        return X_train_balanced, X_test, y_train_balanced, y_test

    def get_model_configs(self) -> List[Tuple]:
        """Return the ``(estimator, model_name)`` pairs to train."""
        return [
            (LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE),
             'Logistic_Regression'),
            (RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=RANDOM_STATE),
             'Random_Forest'),
            (XGBClassifier(eval_metric='logloss', scale_pos_weight=1, use_label_encoder=False,
                           random_state=RANDOM_STATE), 'XGBoost'),
            (LGBMClassifier(class_weight='balanced', random_state=RANDOM_STATE, verbose=-1),
             'LightGBM'),
            (CatBoostClassifier(verbose=0, random_state=RANDOM_STATE),
             'CatBoost')
        ]

    def evaluate_model(self, model, X_test: pd.DataFrame, y_test: pd.Series,
                       model_name: str) -> Dict[str, float]:
        """Compute hold-out metrics for a fitted ``model``.

        Returns:
            Dict with ``accuracy``, ``roc_auc``, ``f1_score``, ``precision``
            and ``recall``; precision and recall refer to class ``1``.
            ROC AUC uses positive-class probabilities when the model exposes
            ``predict_proba`` and the hard predictions otherwise.
        """
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else y_pred

        # Build the report once and read both class-1 figures from it
        positive_class = classification_report(y_test, y_pred, output_dict=True)['1']

        return {
            'accuracy': accuracy_score(y_test, y_pred),
            'roc_auc': roc_auc_score(y_test, y_pred_proba),
            'f1_score': f1_score(y_test, y_pred),
            'precision': positive_class['precision'],
            'recall': positive_class['recall']
        }

    def plot_confusion_matrix(self, y_true, y_pred, model_name: str):
        """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap."""
        cm = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
        plt.title(f'Confusion Matrix: {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

    def cross_validate_model(self, model, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]:
        """Run shuffled, stratified k-fold cross-validation scored by ROC AUC.

        Returns:
            Dict with ``cv_mean``, ``cv_std`` and the per-fold ``cv_scores``.
        """
        folds = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=RANDOM_STATE)
        cv_scores = cross_val_score(model, X, y, cv=folds, scoring='roc_auc')

        return {
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'cv_scores': cv_scores.tolist()
        }

    @staticmethod
    def _mlflow_flavor(model_name: str):
        """Return the mlflow flavor module used to log ``model_name``."""
        flavors = {
            'XGBoost': mlflow.xgboost,
            'LightGBM': mlflow.lightgbm,
            'CatBoost': mlflow.catboost,
        }
        return flavors.get(model_name, mlflow.sklearn)

    def train_all_models(self, X_train: pd.DataFrame, y_train: pd.Series,
                         X_test: pd.DataFrame, y_test: pd.Series) -> Dict[str, Any]:
        """Fit every configured model inside its own MLflow run.

        For each model: fit, cross-validate on the training data, evaluate on
        the test data, log parameters/metrics/model to MLflow, print a summary
        and plot the confusion matrix.

        Returns:
            Dict keyed by model name with ``model``, ``test_metrics``,
            ``cv_results``, ``training_time`` and ``run_id``. The same dict is
            stored in ``self.results`` and the fitted estimators in
            ``self.models``.
        """
        results = {}

        for model, model_name in self.get_model_configs():
            flavor = self._mlflow_flavor(model_name)

            with mlflow.start_run(run_name=f"{model_name}_run") as run:
                logger.info(f"Training {model_name}...")
                start_time = time.time()

                # Enable auto-logging only where the flavor provides it: not
                # every mlflow flavor module has an autolog() function (the
                # CatBoost one does not), and calling a missing attribute
                # would abort the whole training loop.
                autolog = getattr(flavor, 'autolog', None)
                if callable(autolog):
                    autolog()

                # Train model
                model.fit(X_train, y_train)
                training_time = time.time() - start_time

                # NOTE(review): main() passes the already-resampled training
                # split, so these folds are drawn from resampled data and the
                # CV score is likely optimistic — treat the test metrics as
                # the reference.
                cv_results = self.cross_validate_model(model, X_train, y_train)

                # Evaluate on test set
                test_metrics = self.evaluate_model(model, X_test, y_test, model_name)

                # Log custom parameters and metrics
                mlflow.log_param("model_name", model_name)
                mlflow.log_param("training_time", training_time)
                mlflow.log_param("training_samples", len(X_train))
                mlflow.log_param("test_samples", len(X_test))

                mlflow.log_metric("cv_roc_auc_mean", cv_results['cv_mean'])
                mlflow.log_metric("cv_roc_auc_std", cv_results['cv_std'])

                for metric_name, metric_value in test_metrics.items():
                    mlflow.log_metric(f"test_{metric_name}", metric_value)

                # Save model with the flavor matching its library
                model_path = f"models/{model_name.lower()}_model"
                flavor.log_model(model, model_path)

                # Store results
                results[model_name] = {
                    'model': model,
                    'test_metrics': test_metrics,
                    'cv_results': cv_results,
                    'training_time': training_time,
                    'run_id': run.info.run_id
                }
                self.models[model_name] = model

                # Print results
                print(f"\n--- {model_name} Results ---")
                print(f"Training Time: {training_time:.2f} seconds")
                print(f"Cross-Validation ROC-AUC: {cv_results['cv_mean']:.4f} (±{cv_results['cv_std']:.4f})")
                print(f"Test Accuracy: {test_metrics['accuracy']:.4f}")
                print(f"Test ROC-AUC: {test_metrics['roc_auc']:.4f}")
                print(f"Test F1-Score: {test_metrics['f1_score']:.4f}")

                # Plot confusion matrix
                y_pred = model.predict(X_test)
                self.plot_confusion_matrix(y_test, y_pred, model_name)

                logger.info(f"{model_name} training completed")

        self.results = results
        return results

# =================================================================
# 5. MAIN EXECUTION PIPELINE
# =================================================================

def main():
    """Run the whole pipeline end to end.

    Sets up MLflow, loads and explores the data, preprocesses it, trains all
    models and logs a summary of the best one (highest test ROC-AUC).

    Returns:
        ``(results, preprocessor, best_model)`` — the per-model result dict,
        the fitted DataPreprocessor and the best fitted estimator.
    """
    logger.info("Starting Insurance Cross-Sell Prediction Pipeline")

    # MLflow tracking store + experiment
    setup_mlflow()

    # Raw data
    df = DataLoader.load_data(Config.DATA_PATH)

    # Dataset facts and exploration plots are recorded under their own run
    with mlflow.start_run(run_name="data_exploration"):
        data_info = DataLoader.basic_data_info(df)
        mlflow.log_param("dataset_shape", str(data_info['shape']))
        mlflow.log_param("memory_usage_mb", data_info['memory_usage_mb'])
        mlflow.log_param("target_distribution", str(data_info['target_distribution']))

        DataExplorer.plot_target_distribution(df)
        DataExplorer.plot_distributions(df)
        DataExplorer.plot_correlation_matrix(df)

    # Preprocessing (fits the encoders/scalers kept on `preprocessor`)
    preprocessor = DataPreprocessor()
    df_processed = preprocessor.fit_transform(df)

    # Target and feature matrix
    y = df_processed['Response']
    X = df_processed.drop(['Response', 'id'], axis=1, errors='ignore')

    logger.info(f"Feature matrix shape: {X.shape}")
    logger.info(f"Target vector shape: {y.shape}")

    # Split, balance and train
    trainer = ModelTrainer(EXPERIMENT_NAME)
    X_train, X_test, y_train, y_test = trainer.prepare_data(X, y, test_size=Config.TEST_SIZE)
    results = trainer.train_all_models(X_train, y_train, X_test, y_test)

    # Side-by-side comparison: one row per model
    banner = "="*50
    print("\n" + banner)
    print("MODEL COMPARISON SUMMARY")
    print(banner)

    summary = {}
    for model_name, result in results.items():
        test_metrics = result['test_metrics']
        summary[model_name] = {
            'ROC-AUC': test_metrics['roc_auc'],
            'Accuracy': test_metrics['accuracy'],
            'F1-Score': test_metrics['f1_score'],
            'CV ROC-AUC': result['cv_results']['cv_mean'],
            'Training Time (s)': result['training_time']
        }
    comparison_df = pd.DataFrame(summary).T

    print(comparison_df.round(4))

    # The winner is the model with the highest test ROC-AUC
    best_model_name = comparison_df['ROC-AUC'].idxmax()
    best_model = results[best_model_name]['model']
    best_row = comparison_df.loc[best_model_name]

    print(f"\nBest Model: {best_model_name}")
    print(f"Best ROC-AUC Score: {comparison_df.loc[best_model_name, 'ROC-AUC']:.4f}")

    # Record the winner in a dedicated summary run
    with mlflow.start_run(run_name="best_model_summary"):
        mlflow.log_param("best_model", best_model_name)
        mlflow.log_metric("best_roc_auc", best_row['ROC-AUC'])
        mlflow.log_metric("best_accuracy", best_row['Accuracy'])
        mlflow.log_metric("best_f1_score", best_row['F1-Score'])

    logger.info("Pipeline execution completed successfully!")

    return results, preprocessor, best_model

# =================================================================
# 6. UTILITY FUNCTIONS
# =================================================================

def save_preprocessor(preprocessor: DataPreprocessor, filepath: str):
    """Serialize ``preprocessor`` to ``filepath`` with joblib.

    Args:
        preprocessor: The fitted preprocessor to persist.
        filepath: Destination file; missing parent directories are created.
    """
    # joblib.dump does not create directories, so make sure the target
    # folder exists instead of relying on the caller having created it.
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(preprocessor, filepath)
    logger.info(f"Preprocessor saved to {filepath}")

def load_preprocessor(filepath: str) -> DataPreprocessor:
    """Read a preprocessor back from ``filepath`` and log where it came from.

    NOTE(review): joblib files are pickle-based, so loading one can execute
    arbitrary code — only load files from a trusted source.
    """
    restored = joblib.load(filepath)
    logger.info(f"Preprocessor loaded from {filepath}")
    return restored

def predict_new_data(model, preprocessor: DataPreprocessor, new_data: pd.DataFrame) -> Tuple[np.ndarray, Any]:
    """Preprocess ``new_data`` and score it with ``model``.

    Args:
        model: Fitted estimator exposing ``predict`` (and optionally
            ``predict_proba``).
        preprocessor: Fitted DataPreprocessor.
        new_data: Raw rows in the same layout as the training CSV; the
            ``Response`` and ``id`` columns may be present or absent.

    Returns:
        ``(predictions, probabilities)`` where ``probabilities`` holds the
        positive-class probability per row, or ``None`` when the model has no
        ``predict_proba``.
    """
    processed_data = preprocessor.transform(new_data)

    # main() trains on the processed frame without 'Response' and 'id', so the
    # same columns must be removed here or the model receives features it was
    # never fitted on.
    features = processed_data.drop(columns=['Response', 'id'], errors='ignore')

    predictions = model.predict(features)
    probabilities = model.predict_proba(features)[:, 1] if hasattr(model, 'predict_proba') else None

    return predictions, probabilities

# =================================================================
# EXECUTION
# =================================================================

if __name__ == "__main__":
    # Create necessary directories
    Path("models").mkdir(exist_ok=True)
    Path("data").mkdir(exist_ok=True)

    # Run the main pipeline
    results, preprocessor, best_model = main()

    # Save the preprocessor
    save_preprocessor(preprocessor, "models/preprocessor.joblib")

    # Build the hint from TRACKING_URI so it stays correct if the tracking
    # store is changed in the configuration section.
    print(f"\nMLflow Tracking URI: {TRACKING_URI}")
    print(f"To view the MLflow UI, run: mlflow ui --backend-store-uri {TRACKING_URI}")