# Statistical Modeling for AlphaCare Insurance Solutions (ACIS)

This notebook performs statistical modeling on the insurance claim data to predict `TotalClaims` (the target column used in the code below) and to analyze feature importance.

In [11]:
# statistical_modeling_notebook.ipynb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score  # Import these for evaluation
from sklearn.preprocessing import LabelEncoder

# Put the project's sibling ``src`` directory (resolved relative to the current
# working directory) at the front of the import path so the local modules
# below can be imported from the notebook.
src_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
sys.path.insert(0, src_dir)

# Evict any previously imported copies of the local modules so that the
# imports below re-execute them and pick up edits made since the last run.
for stale_module in ('data_loader', 'statistical_modeling'):
    sys.modules.pop(stale_module, None)

from data_loader import DataLoader
from statistical_modeling import StatisticalModeling

# End-to-end modeling pipeline: load the data, prepare it through the
# StatisticalModeling helper, split and scale it, fit a linear-regression
# baseline, tune a Random Forest and an XGBoost regressor with grid search,
# and finally plot SHAP feature importance for the tuned Random Forest.

# Fixed seed shared by every stochastic estimator below so that a
# Restart & Run All reproduces the same tuned models and metrics.
RANDOM_STATE = 42

# Load data
data_loader = DataLoader('../resources/Data/machineLearning.txt')
data = data_loader.load_data()

if data is not None:
    print(data.head())
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in print() would append a stray "None".
    data.info()

    # Data Preparation and Modeling
    modeling = StatisticalModeling(data)
    data = modeling.handle_missing_data()
    data = modeling.feature_engineering()

    # Encode categorical data
    data = modeling.encode_categorical_data()

    # Split data
    X_train, X_test, y_train, y_test = modeling.train_test_split(target_column='TotalClaims')

    # Work on independent copies: the split may hand back slices of the full
    # frame, and assigning scaled columns into a slice can trigger pandas'
    # SettingWithCopyWarning (and is not guaranteed to write through).
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Scale numerical features
    numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns
    scaler = StandardScaler()  # or MinMaxScaler()
    # Fit on the training split only, then reuse those statistics on the test
    # split so no information from the test data leaks into the scaler.
    X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
    X_test[numerical_features] = scaler.transform(X_test[numerical_features])

    # Train models and evaluate
    lr_preds, lr_mse, lr_r2 = modeling.linear_regression(X_train, X_test, y_train, y_test)
    print(f"Linear Regression MSE: {lr_mse}, R2: {lr_r2}")

    # Hyperparameter tuning for Random Forest
    param_grid_rf = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    # 81 parameter combinations x 5 folds: n_jobs=-1 runs the cross-validation
    # fits in parallel across all available cores instead of one at a time.
    grid_search_rf = GridSearchCV(
        estimator=RandomForestRegressor(random_state=RANDOM_STATE),
        param_grid=param_grid_rf,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search_rf.fit(X_train, y_train)
    best_rf_model = grid_search_rf.best_estimator_

    rf_preds = best_rf_model.predict(X_test)
    rf_mse = mean_squared_error(y_test, rf_preds)
    rf_r2 = r2_score(y_test, rf_preds)
    print(f"Random Forest MSE: {rf_mse}, R2: {rf_r2}")

    # Hyperparameter tuning for XGBoost
    param_grid_xgb = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 6, 9],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    }
    # 243 parameter combinations x 5 folds, parallelised the same way; the
    # seed matters here because subsample/colsample_bytree < 1 draw randomly.
    grid_search_xgb = GridSearchCV(
        estimator=XGBRegressor(random_state=RANDOM_STATE),
        param_grid=param_grid_xgb,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search_xgb.fit(X_train, y_train)
    best_xgb_model = grid_search_xgb.best_estimator_

    xgb_preds = best_xgb_model.predict(X_test)
    xgb_mse = mean_squared_error(y_test, xgb_preds)
    xgb_r2 = r2_score(y_test, xgb_preds)
    print(f"XGBoost MSE: {xgb_mse}, R2: {xgb_r2}")

    # Feature importance with SHAP using the trained Random Forest model
    # (X_train serves both as the explainer's background data and as the set
    # of rows being explained).
    rf_shap_values = shap.Explainer(best_rf_model, X_train)(X_train)
    shap.summary_plot(rf_shap_values, X_train)

else:
    print("Data loading failed.")


  pd.DataFrame: Loaded and preprocessed data
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # For numeric columns, fill missing values with median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for col in categorical_columns:


   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0             145249.0   12827.0  2015-03-01 00:00:00             True   
1             145249.0   12827.0  2015-05-01 00:00:00             True   
2             145249.0   12827.0  2015-07-01 00:00:00             True   
3             145255.0   12827.0  2015-05-01 00:00:00             True   
4             145255.0   12827.0  2015-07-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...                    ExcessSelected CoverCategory  \
0  Current account  ...             Mobility - 

AttributeError: 'StatisticalModeling' object has no attribute 'feature_importance'