In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# import xgboost as xgb
# from scipy.stats import randint, uniform

# Seed NumPy's global RNG so stochastic steps repeat from run to run
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot defaults: stock style, 12x8 figures, 10pt text
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

print("Advanced modeling libraries imported successfully!")


Advanced modeling libraries imported successfully!


In [4]:
# Load and prepare data (using the same preprocessing from notebook 04)
def load_and_prepare_data(path='../MachineLearningRating_v3.txt', delimiter='|', reference_year=2024):
    """Load the raw policy file and append the engineered modeling features.

    Parameters
    ----------
    path : str or path-like, default '../MachineLearningRating_v3.txt'
        Location of the delimited text file to read.
    delimiter : str, default '|'
        Field separator used in the file.
    reference_year : int, default 2024
        Year from which ``RegistrationYear`` is subtracted to obtain
        ``VehicleAge``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with these columns added: ``HasClaim``,
        ``ClaimRatio``, ``PremiumPerInsured``, ``VehicleAge``,
        ``Gender_MaritalStatus``, ``Province_VehicleType``,
        ``ProvinceRiskScore``, ``MakeRiskScore`` and ``IsNewDriver``.

    Raises
    ------
    KeyError
        If a raw column needed by the feature engineering is absent.
    """

    # Load the dataset
    df = pd.read_csv(path, delimiter=delimiter, low_memory=False)

    # Fail up front with the full list of absent columns instead of on the
    # first failing lookup further down.
    required_cols = ['TotalClaims', 'TotalPremium', 'SumInsured', 'RegistrationYear',
                     'Gender', 'MaritalStatus', 'Province', 'VehicleType', 'make']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Input data is missing required columns: {missing_cols}")

    # Feature engineering (same as notebook 04)
    df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)
    # The 1e-6 offset keeps the ratios finite when the denominator is zero
    df['ClaimRatio'] = df['TotalClaims'] / (df['TotalPremium'] + 1e-6)
    df['PremiumPerInsured'] = df['TotalPremium'] / (df['SumInsured'] + 1e-6)
    # Ages are bounded to the 0-50 range
    df['VehicleAge'] = (reference_year - df['RegistrationYear']).clip(0, 50)
    df['Gender_MaritalStatus'] = df['Gender'].astype(str) + '_' + df['MaritalStatus'].astype(str)
    df['Province_VehicleType'] = df['Province'].astype(str) + '_' + df['VehicleType'].astype(str)

    # Risk scores: mean of HasClaim per group, mapped back onto every row.
    # NOTE(review): the means are taken over the whole frame, so these columns
    # are derived from HasClaim itself. If HasClaim is later used as the label,
    # they should be fitted on the training split only - confirm against the
    # modeling cells.
    province_risk = df.groupby('Province')['HasClaim'].mean()
    df['ProvinceRiskScore'] = df['Province'].map(province_risk)

    make_risk = df.groupby('make')['HasClaim'].mean()
    df['MakeRiskScore'] = df['make'].map(make_risk)

    # Despite its name, this flag is built only from Gender == 'Male' and
    # VehicleAge < 5.
    df['IsNewDriver'] = ((df['Gender'] == 'Male') & (df['VehicleAge'] < 5)).astype(int)

    return df

# Build the engineered frame once for the cells that follow
print("Loading and preparing data...")
df = load_and_prepare_data()
print(f"Dataset loaded: {df.shape}")

# Columns that are kept out of both feature lists
exclude_cols = [
    'PolicyID', 'TotalClaims', 'TotalPremium', 'HasClaim', 'ClaimRatio',
    'UnderwrittenCoverID', 'TransactionMonth',
]

# Numeric features: every numeric column that is not excluded
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)
numerical_features = [name for name in numerical_cols if name not in exclude_cols]

# Categorical features: object columns that are not excluded and have
# fewer than 50 distinct values
categorical_features = [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name not in exclude_cols and df[name].nunique() < 50
]

all_features = numerical_features + categorical_features
print(f"Features for modeling: {len(all_features)} ({len(numerical_features)} numerical + {len(categorical_features)} categorical)")


Loading and preparing data...
Dataset loaded: (1000098, 61)
Features for modeling: 50 (16 numerical + 34 categorical)


In [6]:
# Hyperparameter tuning configurations
def get_param_grids():
    """Return the hyperparameter search grids, keyed by estimator class name.

    Each grid maps a pipeline parameter name of the form ``<step>__<param>``
    to the list of candidate values. Regressor grids use the ``regressor``
    step prefix and classifier grids use ``classifier``; the candidate values
    for a given model family are the same in both.
    """

    # Candidate values shared by the random-forest regressor and classifier
    forest_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 15, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
    }

    # Candidate values shared by the XGBoost regressor and classifier
    boosting_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [6, 8, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
    }

    def _with_step_prefix(step, space):
        # Copy each value list so no two grids share a list object
        return {f'{step}__{option}': list(values) for option, values in space.items()}

    return {
        'RandomForestRegressor': _with_step_prefix('regressor', forest_space),
        'XGBRegressor': _with_step_prefix('regressor', boosting_space),
        'RandomForestClassifier': _with_step_prefix('classifier', forest_space),
        'XGBClassifier': _with_step_prefix('classifier', boosting_space),
    }

# Create preprocessing pipeline
def create_preprocessor(numerical_features, categorical_features):
    """Assemble the column-wise preprocessing transformer.

    Columns named in ``numerical_features`` are median-imputed and then
    standardized. Columns named in ``categorical_features`` have missing
    entries replaced by the literal 'missing' and are then one-hot encoded
    with ``handle_unknown='ignore'`` and ``sparse_output=False``.

    Returns the unfitted ``ColumnTransformer`` combining both branches.
    """

    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]

    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]

    return ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), numerical_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ])

# Instantiate the search grids and the shared preprocessing transformer
param_grids = get_param_grids()
preprocessor = create_preprocessor(
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

print("Hyperparameter grids and preprocessing pipeline created.")


Hyperparameter grids and preprocessing pipeline created.
