# Insurance Health Cross-Sell Prediction
# =================================================================

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import logging
from typing import Tuple, Dict, Any, List
from pathlib import Path
import joblib
import time

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, 
                           roc_auc_score, precision_recall_curve, roc_curve, f1_score)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# # MLflow
# import mlflow
# import mlflow.sklearn
# import mlflow.xgboost
# import mlflow.lightgbm
# import mlflow.catboost



# =================================================================
# 1. CONFIGURATION AND SETUP
# =================================================================

In [12]:
# Configuration
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# MLflow Configuration
EXPERIMENT_NAME = "insurance-cross-sell-prediction"
TRACKING_URI = "sqlite:///mlflow.db"  # Local SQLite database
# For remote server: TRACKING_URI = "http://your-mlflow-server:5000"

In [13]:
class Config:
    """Configuration class for the ML pipeline"""
    RANDOM_STATE = 42
    TEST_SIZE = 0.2
    CV_FOLDS = 5
    DATA_PATH = './data/train.csv'
    MODEL_SAVE_PATH = './models/'
    
    # Outlier removal parameters
    OUTLIER_METHOD = 'iqr'
    IQR_FACTOR = 1.5

In [14]:
def setup_mlflow():
    """Setup MLflow tracking URI and experiment"""
    mlflow.set_tracking_uri(TRACKING_URI)
    
    # create experiment if it doesn't exist
    
    try:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
    except mlflow.exceptions.MlflowException as e:
        logger.warning(f"Experiment '{EXPERIMENT_NAME}' already exists.")
        experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

    mlflow.start_run(experiment_id=experiment_id)
    logger.info(f"MLflow experiment '{EXPERIMENT_NAME}' started with ID: {experiment_id}")
    return experiment_id


# =================================================================
# 2. DATA LOADING AND PREPROCESSING
# =================================================================

In [15]:
# Load the training data
df = pd.read_csv('./data/train.csv')
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [16]:
# Check data info and missing values
display(df.info())
display(df.isnull().sum())

# Remove outliers from Annual_Premium
q1 = df['Annual_Premium'].quantile(0.25)
q3 = df['Annual_Premium'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
df = df[(df['Annual_Premium'] >= lower_bound) & (df['Annual_Premium'] <= upper_bound)]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


None

id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [17]:
# Log transform Annual_Premium
df['Annual_Premium_log'] = np.log1p(df['Annual_Premium'])

# Encode categorical features
vehicle_age_map = {
    '1-2 Year': 0,
    '< 1 Year': 1,
    '> 2 Years': 2
}
df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_map)
le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    if col != 'Vehicle_Age':
        df[col] = le.fit_transform(df[col])

# Standardize numerical features (excluding target and categorical)
categorical = ['Vehicle_Damage', 'Previously_Insured', 'Vehicle_Age']
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    if col != 'Response' and col not in categorical and col != 'Vehicle_Age':
        df[col] = StandardScaler().fit_transform(df[[col]])

In [None]:
# Boxplots for numeric features to check for outliers
for col in numeric_cols:
    plt.figure(figsize=(8, 2))
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot of {col}')
    plt.show()

In [None]:
# Correlation matrix for numerical features
correlation_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
# Numeric and categorical columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Plot numeric features
df[numeric_cols].hist(figsize=(14, 10), bins=30, layout=(3, 4))
plt.suptitle('Numeric Feature Distributions')
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

# Plot categorical features
for col in categorical_cols:
    plt.figure(figsize=(6, 4))
    sns.countplot(x=col, data=df)
    plt.title(f'Distribution of {col}')
    plt.show()

# Target variable distribution
sns.countplot(x='Response', data=df)
plt.title('Response Distribution (Target Variable)')
plt.xlabel('Response')
plt.ylabel('Count')
plt.show()

In [18]:
# split the data in curr and ref
ref, cur = train_test_split(df, test_size=0.5, random_state=42)

# Prepare features and target
X = cur.drop(['Response', 'id'], axis=1)
y = cur['Response']

# Split data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE, stratify=y)

# Handle class imbalance
smote = SMOTE(random_state=RANDOM_STATE)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
rus = RandomUnderSampler(random_state=RANDOM_STATE)
X_train_balanced, y_train_balanced = rus.fit_resample(X_train_res, y_train_res)

In [19]:
X_train_balanced

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Annual_Premium_log
0,0,1.316675,0.045409,-0.258241,0,2,1,-0.226320,-1.601474,-1.187343,0.182333
2,0,-1.014909,0.045409,0.192535,1,1,0,0.114519,0.730152,1.035666,0.359914
3,0,1.899571,0.045409,-1.159793,1,0,0,-0.499669,0.212013,-0.087790,0.012992
4,1,0.150883,0.045409,-1.385181,0,0,1,-1.806591,0.804172,0.880294,-2.111359
5,1,-0.108182,0.045409,-1.159793,0,0,1,-0.490919,0.804172,1.214941,0.018880
...,...,...,...,...,...,...,...,...,...,...,...
190764,1,1.536325,0.045409,-1.296958,0,0,1,-1.806591,-1.601474,0.018750,-2.111359
224212,1,0.798545,0.045409,-1.385181,0,0,1,0.691161,-1.601474,0.342470,0.602759
217638,0,0.821297,0.045409,0.117406,0,0,1,0.106955,0.212013,0.348767,0.356289
7185,1,0.992844,0.045409,0.117406,0,2,1,-0.146282,0.212013,-1.414425,0.226948


In [20]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm
import mlflow.catboost
import os
import tempfile
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt



In [21]:
mlflow.set_tracking_uri("http://localhost:5000")

# Set MLflow experiment 
mlflow.set_experiment("Model_Comparison_Experiment")


2025/07/29 12:45:10 INFO mlflow.tracking.fluent: Experiment with name 'Model_Comparison_Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/530509601835095243', creation_time=1753829110775, experiment_id='530509601835095243', last_update_time=1753829110775, lifecycle_stage='active', name='Model_Comparison_Experiment', tags={}>

In [22]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name):
    """Train the model, log metrics to MLflow, save for registry, and visualize evaluation."""

    with mlflow.start_run(run_name=model_name):

        # Set tags
        mlflow.set_tag("mlflow.runName", f"{model_name}_Run")
        mlflow.set_tag("experiment_type", "baseline")
        mlflow.set_tag("model_type", model_name)
        mlflow.set_tag("description", f"Baseline {model_name} model for insurance cross-sell prediction using a simple train-test split")

        # Log parameters
        if hasattr(model, 'get_params'):
            mlflow.log_params(model.get_params())

        # Train and predict
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Evaluate
        acc = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        cm = confusion_matrix(y_test, y_pred)

        # Log metrics
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metrics({
            "precision": report['weighted avg']['precision'],
            "recall": report['weighted avg']['recall'],
            "f1-score": report['weighted avg']['f1-score']
        })

        # Confusion matrix
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        plt.title(f'Confusion Matrix: {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')

        tmp_dir = tempfile.mkdtemp()
        cm_path = os.path.join(tmp_dir, f"{model_name}_confusion_matrix.png")
        plt.savefig(cm_path)
        mlflow.log_artifact(cm_path, artifact_path="confusion_matrices")
        plt.close(fig)

        # Save and log model to specific path
        model_dir = os.path.join(tmp_dir, model_name.replace(" ", "_"))
        os.makedirs(model_dir, exist_ok=True)

        # Save model to disk (path-based log_model)
        try:
            if 'XGBClassifier' in str(type(model)):
                mlflow.xgboost.save_model(model, path=model_dir)
            elif 'LGBMClassifier' in str(type(model)):
                mlflow.lightgbm.save_model(model, path=model_dir)
            elif 'CatBoostClassifier' in str(type(model)):
                mlflow.catboost.save_model(model, path=model_dir)
            else:
                mlflow.sklearn.save_model(model, path=model_dir)

            # Log model as artifact
            mlflow.log_artifacts(model_dir, artifact_path="saved_models")
        except Exception as e:
            print(f"Model saving failed for {model_name}: {e}")

        # Console output
        print(f'\n--- {model_name} ---')
        print('Accuracy:', acc)
        print('ROC AUC Score:', roc_auc)
        print('Classification Report:\n', classification_report(y_test, y_pred))

NameError: name 'X' is not defined

In [23]:
# Define your models
models = [
    (LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE), 'Logistic Regression'),
    (RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=RANDOM_STATE), 'Random Forest'),
    (XGBClassifier(eval_metric='logloss', scale_pos_weight=1, use_label_encoder=False, random_state=RANDOM_STATE), 'XGBoost'),
    (LGBMClassifier(class_weight='balanced', random_state=RANDOM_STATE), 'LightGBM'),
    (CatBoostClassifier(verbose=0, random_state=RANDOM_STATE), 'CatBoost')
]


# Loop through models
for model, name in models:
    train_and_evaluate(model, X_train_balanced, y_train_balanced, X_test, y_test, name)



--- Logistic Regression ---
Accuracy: 0.6389183552383179
ROC AUC Score: 0.7831436045093316
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.59      0.74     48834
           1       0.25      0.97      0.40      6785

    accuracy                           0.64     55619
   macro avg       0.62      0.78      0.57     55619
weighted avg       0.90      0.64      0.70     55619


--- Random Forest ---
Accuracy: 0.8277926607813877
ROC AUC Score: 0.6467243110063603
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.89      0.90     48834
           1       0.33      0.41      0.37      6785

    accuracy                           0.83     55619
   macro avg       0.62      0.65      0.63     55619
weighted avg       0.84      0.83      0.84     55619


--- XGBoost ---
Accuracy: 0.8076376777719844
ROC AUC Score: 0.7055527970488445
Classification Report:
               pr

In [30]:
from mlflow import MlflowClient
import mlflow

client = MlflowClient()

# First model registration (only call create_registered_model once)
run_id_1 = "2a4df295d713432e8202714c455b0f27"
model_name = "My_Insurance_Model"
model_uri_1 = f"runs:/{run_id_1}/saved_models"

# Only run this ONCE to create the model in the registry
try:
    client.create_registered_model(model_name)
except Exception as e:
    print(f"Model already exists or error: {e}")

mlflow.register_model(model_uri_1, model_name)  # Registers as version 1

# Second model registration (no need to recreate the model name)
run_id_2 = "38fdc04ede364f3ca0e298821bff9cef"
model_uri_2 = f"runs:/{run_id_2}/saved_models"

mlflow.register_model(model_uri_2, model_name)  # Automatically becomes version 2


Registered model 'My_Insurance_Model' already exists. Creating a new version of this model...
2025/07/29 13:10:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: My_Insurance_Model, version 2


Model already exists or error: RESOURCE_ALREADY_EXISTS: Registered Model (name=My_Insurance_Model) already exists.


Created version '2' of model 'My_Insurance_Model'.
Registered model 'My_Insurance_Model' already exists. Creating a new version of this model...
2025/07/29 13:10:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: My_Insurance_Model, version 3
Created version '3' of model 'My_Insurance_Model'.


<ModelVersion: aliases=[], creation_timestamp=1753830635310, current_stage='None', description='', last_updated_timestamp=1753830635310, name='My_Insurance_Model', run_id='38fdc04ede364f3ca0e298821bff9cef', run_link='', source='mlflow-artifacts:/530509601835095243/38fdc04ede364f3ca0e298821bff9cef/artifacts/saved_models', status='READY', status_message='', tags={}, user_id='', version='3'>

In [25]:
# Transition the model to production
client.transition_model_version_stage(
    name=model_name,
    version=1,  # Adjust based on your model version
    stage="Production"
)


<ModelVersion: aliases=[], creation_timestamp=1753830456990, current_stage='Production', description='', last_updated_timestamp=1753830488652, name='My_Insurance_Model', run_id='2a4df295d713432e8202714c455b0f27', run_link='', source='mlflow-artifacts:/530509601835095243/2a4df295d713432e8202714c455b0f27/artifacts/saved_models', status='READY', status_message='', tags={}, user_id='', version='1'>

In [31]:
# Transition the model to production
client.transition_model_version_stage(
    name=model_name,
    version=3,  # Adjust based on your model version
    stage="Staging"  # Change to "Production" if you want to move it to production stage
)

# Transition the model to production
client.transition_model_version_stage(
    name=model_name,
    version=2,  # Adjust based on your model version
    stage="Staging"  # Change to "Production" if you want to move it to production stage
)


<ModelVersion: aliases=[], creation_timestamp=1753830635175, current_stage='Staging', description='', last_updated_timestamp=1753830663716, name='My_Insurance_Model', run_id='2a4df295d713432e8202714c455b0f27', run_link='', source='mlflow-artifacts:/530509601835095243/2a4df295d713432e8202714c455b0f27/artifacts/saved_models', status='READY', status_message='', tags={}, user_id='', version='2'>

In [35]:
# Download the production model and save it in the app/models dir 
import mlflow

# Download the model from MLflow Model Registry
model_name = "My_Insurance_Model"
model_version = 1  # Adjust based on your model version
model_uri = f"models:/{model_name}/{model_version}"

# Download the model artifacts to a local directory
model_save_path = './app/models/My_Insurance_Model_v1'
mlflow.artifacts.download_artifacts(artifact_uri=model_uri, dst_path=model_save_path)


Downloading artifacts: 100%|██████████| 5/5 [00:02<00:00,  2.21it/s]


'c:\\Users\\mkrym\\OneDrive\\Desktop\\Machine learning\\MLops\\project-1\\app\\models\\My_Insurance_Model_v1\\'

In [63]:
# save the cur and ref files 
cur.to_csv('./data/cur_data.csv', index=False)
ref.to_csv('./data/ref_data.csv', index=False)