# Baseline Data Modeling

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau


In [62]:
test = pd.read_parquet('../data/processed/processed_test.parquet.gzip')

In [2]:
train = pd.read_parquet('../data/processed/processed_train.parquet.gzip')
train

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Premium_to_Age_Ratio,age_group_senior,age_group_young,Vehicle_Age_LT_1_Year,Vehicle_Age_GT_2_Years,Vehicle_Damage_Yes
0,1,0.333777,1,28.0,0,0.070366,26.0,0.748795,1,0.022331,0,0,0,1,1
1,1,2.396751,1,3.0,0,0.057496,26.0,0.342443,0,-0.773246,1,0,0,0,0
2,1,0.527181,1,28.0,0,0.066347,26.0,-1.521998,1,-0.151783,0,0,0,1,1
3,1,-1.148985,1,11.0,1,0.048348,152.0,0.581474,0,0.760095,0,1,1,0,0
4,0,-0.633242,1,41.0,1,0.046259,152.0,-1.378580,0,0.070132,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381104,1,2.267815,1,26.0,1,0.051234,26.0,-0.792954,0,-0.829086,1,0,0,0,0
381105,1,-0.568774,1,37.0,1,0.069551,152.0,-0.279037,0,0.711938,0,0,1,0,0
381106,1,-1.148985,1,30.0,1,0.060439,160.0,0.079509,0,1.275025,0,1,1,0,0
381107,0,1.881007,1,14.0,0,0.078110,124.0,-0.960275,0,-0.415730,1,0,0,1,1


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Gender                  381109 non-null  object 
 1   Age                     381109 non-null  float64
 2   Driving_License         381109 non-null  object 
 3   Region_Code             381109 non-null  object 
 4   Previously_Insured      381109 non-null  object 
 5   Annual_Premium          381109 non-null  float64
 6   Policy_Sales_Channel    381109 non-null  object 
 7   Vintage                 381109 non-null  float64
 8   Response                381109 non-null  int64  
 9   Premium_to_Age_Ratio    381109 non-null  float64
 10  age_group_senior        381109 non-null  object 
 11  age_group_young         381109 non-null  object 
 12  Vehicle_Age_LT_1_Year   381109 non-null  object 
 13  Vehicle_Age_GT_2_Years  381109 non-null  object 
 14  Vehicle_Damage_Yes  

In [4]:
X = train.drop('Response', axis=1)
y = train['Response']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)

In [5]:
print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Class distribution in training set:")
print(y_train.value_counts(normalize=True))

Training set shape: (304887, 14)
Validation set shape: (76222, 14)
Class distribution in training set:
Response
0    0.877437
1    0.122563
Name: proportion, dtype: float64


In [6]:
# Function to evaluate model
def evaluate_model(model, X, y, model_name):
    # Predict both class labels and probabilities
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)[:, 1]
    
    # Calculate various evaluation metrics
    # ROC AUC: Good for imbalanced datasets, measures ability to distinguish between classes
    roc_auc = roc_auc_score(y, y_pred_proba)
    # PR AUC: Also good for imbalanced datasets, focuses on positive class performance
    pr_auc = average_precision_score(y, y_pred_proba)
    # F1 Score: Harmonic mean of precision and recall
    f1 = f1_score(y, y_pred)
    # Confusion Matrix: Gives a breakdown of correct and incorrect classifications
    cm = confusion_matrix(y, y_pred)
    
    # Print results for easy comparison between models
    print(f"{model_name} Results:")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("\n")

In [7]:
# Handle class imbalance using SMOTE
# SMOTE creates synthetic examples of the minority class to balance the dataset
smote = SMOTE(random_state=13)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [11]:
print("Training set shape:", X_train_resampled.shape)
print("Validation set shape:", X_val.shape)
print("Class distribution in training set:")
print(y_train_resampled.value_counts(normalize=True))

Training set shape: (535038, 14)
Validation set shape: (76222, 14)
Class distribution in training set:
Response
0    0.5
1    0.5
Name: proportion, dtype: float64


In [14]:
# Logistic Regression - Our baseline model
# Simple, interpretable, and often performs well on linearly separable data
lr_model = LogisticRegression(class_weight='balanced', random_state=13, max_iter=1000)
# class_weight='balanced' adjusts weights inversely proportional to class frequencies
lr_model.fit(X_train_resampled, y_train_resampled)
evaluate_model(lr_model, X_val, y_val, "Logistic Regression")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Results:
ROC AUC: 0.8403
PR AUC: 0.3251
F1 Score: 0.4019
Confusion Matrix:
[[40734 26146]
 [  417  8925]]




In [15]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=13)
rf_model.fit(X_train_resampled, y_train_resampled)
evaluate_model(rf_model, X_val, y_val, "Random Forest")

Random Forest Results:
ROC AUC: 0.8327
PR AUC: 0.3154
F1 Score: 0.3782
Confusion Matrix:
[[58136  8744]
 [ 5125  4217]]




### Logistic Regression vs Random Forest

The comparison between the Logistic Regression and **Random Forest models** reveals interesting insights into our insurance cross-sell prediction task. While both models show moderate predictive power, the Random Forest **slightly outperforms** the Logistic Regression in terms of ROC AUC (0.8327 vs 0.8403), indicating a better ability to distinguish between classes. 

However, the **Logistic Regression model** demonstrates a **higher precision-recall balance** with a superior F1 Score (0.4019 vs 0.3782). This suggests that for this particular dataset, the simpler linear model might be capturing some important linear relationships that the more complex Random Forest is potentially overfitting. 

The confusion matrices indicate that **both models struggle with false positives**, but the **Random Forest appears to have a lower false negative** rate, which could be valuable if **minimizing missed opportunities** is a **priority for the insurance company's marketing strategy**.

### Important Features

In [16]:
# Feature importance for Random Forest
# This helps us understand which features are most influential in making predictions
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf_model.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False).head(10)
print("Top 10 Important Features:")
print(feature_importance)

Top 10 Important Features:
                  feature  importance
13     Vehicle_Damage_Yes    0.183252
1                     Age    0.161707
4      Previously_Insured    0.159697
7                 Vintage    0.117761
8    Premium_to_Age_Ratio    0.106742
5          Annual_Premium    0.082149
3             Region_Code    0.059031
6    Policy_Sales_Channel    0.049347
11  Vehicle_Age_LT_1_Year    0.030532
10        age_group_young    0.019255


# PyTorch

In [26]:
X_train_resampled = X_train_resampled.astype(float)
X_val = X_val.astype(float)

# Verify the conversion
# print(X_train_resampled.dtypes)
# print(X_val_scaled.dtypes)

In [27]:
# Convert data to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_resampled.values)
y_train_tensor = torch.FloatTensor(y_train_resampled.values)
X_val_tensor = torch.FloatTensor(X_val.values)
y_val_tensor = torch.FloatTensor(y_val.values)

In [28]:
# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [51]:
# Define the neural network
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)  # First hidden layer
        self.bn1 = nn.BatchNorm1d(64)         # Batch Normalization
        self.fc2 = nn.Linear(64, 32)          # Second hidden layer
        self.bn2 = nn.BatchNorm1d(32)         # Batch Normalization
        self.fc3 = nn.Linear(32, 1)           # Output layer
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))  # ReLU activation for non-linearity
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))  # Sigmoid for binary classification
        return x


In [43]:
# Initialize the model
model = Net(X_train_resampled.shape[1])

# Define loss function and optimizer
# Binary Cross Entropy loss is suitable for binary classification
criterion = nn.BCELoss()
# Adam optimizer is generally a good default choice
optimizer = optim.Adam(model.parameters())

In [52]:
# Training loop
best_val_loss = float('inf')
patience = 30
no_improve = 0

num_epochs = 100
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()  # Clear previous gradients
        outputs = model(batch_X).squeeze()
        loss = criterion(outputs, batch_y)
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights
    
    # Evaluate on validation set every 10 epochs
    if (epoch + 1) % 10 == 0:
        model.eval()  # Set model to evaluation mode
        with torch.no_grad():
            val_outputs = model(X_val_tensor).squeeze()
            val_loss = criterion(val_outputs, y_val_tensor)
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {val_loss.item():.4f}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        no_improve = 0
    else:
        no_improve += 1
        
    if no_improve >= patience:
        print(f"Early stopping at epoch {epoch}")
        break

Epoch [10/100], Loss: 0.4602
Epoch [20/100], Loss: 0.4463
Epoch [30/100], Loss: 0.4444
Epoch [40/100], Loss: 0.4865
Epoch [50/100], Loss: 0.4520
Epoch [60/100], Loss: 0.4487
Early stopping at epoch 59


### Training Loop Results

The training loop results for the neural network model provide some interesting insights into its learning process. Over the course of 50 epochs, we can observe the validation loss fluctuating, but generally decreasing from an initial value of 0.4641 to 0.4255 by epoch 30. This indicates that the model is learning and improving its predictions on the validation set.

However, there's a notable increase in loss towards the end, rising to 0.4982 by epoch 50. This uptick suggests that the model might be starting to overfit the training data, as its performance on the validation set begins to degrade. It's possible that the optimal number of training epochs for this model is around 30-40, where the validation loss was at its lowest. 

This observation could guide us in implementing early stopping or adjusting the model's complexity to prevent overfitting and achieve better generalization on unseen data

In [55]:
class PyTorchModelWrapper:
    """
    A wrapper class to make PyTorch models compatible with scikit-learn style evaluation.
    This wrapper bridges the gap between PyTorch's functional API and scikit-learn's object-oriented API.
    """

    def __init__(self, model, X_val):
        """
        Initialize the wrapper with a PyTorch model and validation data.
        
        :param model: A trained PyTorch model
        :param X_val: Validation data (can be DataFrame or numpy array)
        """
        self.model = model
        # Convert DataFrame to numpy array if necessary
        # This ensures compatibility regardless of input type
        self.X_val = X_val.values if isinstance(X_val, pd.DataFrame) else X_val
        self.val_outputs = None
        self._compute_outputs()

    def _compute_outputs(self):
        """
        Precompute the model outputs for the validation set.
        This is done once to avoid redundant computations in predict and predict_proba.
        """
        self.model.eval()  # Set the model to evaluation mode
        with torch.no_grad():  # Disable gradient computation for efficiency
            X_val_tensor = torch.FloatTensor(self.X_val)  # Convert to PyTorch tensor
            # Compute and store the raw model outputs
            self.val_outputs = self.model(X_val_tensor).squeeze().numpy()

    def predict(self, X):
        """
        Return class predictions (0 or 1) based on a threshold of 0.5.
        
        :param X: Input data (ignored as we use precomputed outputs)
        :return: Class predictions as a numpy array
        """
        # Convert raw outputs to class predictions
        return (self.val_outputs > 0.5).astype(float)

    def predict_proba(self, X):
        """
        Return probability estimates for both classes.
        
        :param X: Input data (ignored as we use precomputed outputs)
        :return: Probability estimates as a 2D numpy array
        """
        # Create a 2D array with probabilities for both classes
        return np.column_stack((1 - self.val_outputs, self.val_outputs))

# Wrap our PyTorch model
wrapped_model = PyTorchModelWrapper(model, X_val)

In [56]:
# Original evaluation
model.eval()
with torch.no_grad():
    val_outputs = model(X_val_tensor).squeeze()
    val_predictions = (val_outputs > 0.5).float()
    accuracy = (val_predictions == y_val_tensor).float().mean()
    print(f'Validation Accuracy: {accuracy.item():.4f}')

# Additional evaluation using evaluate_model
evaluate_model(wrapped_model, X_val.values, y_val.values, "Neural Network")

Validation Accuracy: 0.7239
Neural Network Results:
ROC AUC: 0.8465
PR AUC: 0.3455
F1 Score: 0.4350
Confusion Matrix:
[[47073 19807]
 [ 1239  8103]]




# Conclusion: Cross-Selling Vehicle Insurance to Health Insurance Customers

### Model Performance Overview

The analysis employs three distinct models to predict customer interest in vehicle insurance: Logistic Regression, Random Forest, and a Neural Network. Each model offers unique insights into customer behavior and preferences.

### Neural Network: The Slight Edge
The Neural Network emerges as the top performer with an ROC AUC of 0.8465 and an impressive accuracy of 72.39%. This suggests that complex, non-linear relationships within the data are captured effectively by this model. However, its slight edge over simpler models raises questions about the trade-off between performance and interpretability.

### Logistic Regression: Simplicity Meets Effectiveness
Close behind, the Logistic Regression model achieves an ROC AUC of 0.8403. Its standout feature is the highest F1 Score of 0.4019, indicating a well-balanced precision and recall. This balance is crucial for optimizing marketing efforts, ensuring we capture a good proportion of interested customers without excessive false positives.

### Random Forest: Robust but Not Superior
The Random Forest model, while robust, doesn't outperform its counterparts with an ROC AUC of 0.8327. However, its ability to handle non-linear relationships and feature interactions shouldn't be overlooked in our overall strategy.

### Business Insights and Strategic Implications
The similar performance across models unveils several key insights for our insurance cross-selling strategy:

1. Targeted Marketing Potential: With prediction accuracies exceeding 70%, we have a solid foundation for more targeted marketing campaigns. This precision allows for significant improvements in marketing efficiency and potential cost savings.
2. Conservative Prediction Approach: All models demonstrate a tendency towards false positives rather than false negatives. In practical terms, this means we're more likely to market to uninterested customers than to miss potentially interested ones. While this ensures comprehensive coverage, it also suggests room for fine-tuning to reduce unnecessary marketing spend.
3. Customer Segmentation Opportunities: The models' predictive capabilities enable more sophisticated customer segmentation. We can now tailor our communication strategies and product offerings based on the likelihood of interest in vehicle insurance, potentially leading to higher conversion rates and customer satisfaction.
4. Balancing Complexity and Interpretability: While the Neural Network shows slightly superior performance, the Logistic Regression model offers a compelling balance between predictive power and interpretability. This balance is crucial for stakeholder buy-in and for deriving actionable insights from the model's predictions.

## Recommended Action Plan
Given these insights, we propose the following action plan:

1. **Implement Logistic Regression Model**: Deploy the Logistic Regression model as our primary predictive tool. Its balance of performance and interpretability makes it ideal for initial implementation and stakeholder communication.
2. **Develop Tiered Marketing Approach**: Create a tiered marketing strategy based on predicted interest levels. High-probability customers receive more personalized, high-touch approaches, while lower-probability customers receive more general, cost-effective communications.
3. **Continuous Model Refinement**: Regularly update and refine the model with new data. Consider A/B testing different models in real-world scenarios to validate their performance beyond mere statistical metrics.
4. **Investigate Feature Importance**: Conduct a deep dive into the features driving predictions in the Logistic Regression model. Use these insights to inform product development and marketing messaging.
5. **Cross-Functional Collaboration**: Foster collaboration between the data science team and marketing department to ensure model insights are effectively translated into actionable marketing strategies.

By implementing this data-driven approach, we position ourselves to significantly enhance our cross-selling effectiveness, optimize our marketing expenditure, and ultimately drive growth in our vehicle insurance portfolio among our existing health insurance customer base.