# Statistical Modeling for AlphaCare Insurance Solutions (ACIS)

This notebook fits statistical models to the insurance claim data to predict both TotalClaims and TotalPremium, compares model performance across the two targets, and analyzes feature importance.

In [7]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Make the project's src/ directory importable from this notebook.
# The path is resolved relative to the current working directory, so the
# notebook is expected to be run from a directory that is a sibling of src/.
src_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
# Guard the insert so that re-running this cell does not stack duplicate
# entries on sys.path.
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)



In [14]:
# Evict any cached copies of the project modules from sys.modules so the
# import statements below re-import them when this cell is re-run.
for stale_module in ('data_loader', 'statistical_modeling', 'statistical_modeling_improved'):
    sys.modules.pop(stale_module, None)

from data_loader import DataLoader
from statistical_modeling import StatisticalModeling

# Load the dataset through the project's DataLoader.
data_loader = DataLoader('../resources/Data/machineLearning.txt')
data = data_loader.load_data()

  self.data = pd.read_csv(self.file_path, sep='|')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])


In [None]:
# Initialize StatisticalModeling
# Wrap the loaded dataset in the project's StatisticalModeling helper; the
# same `modeling` object is reused by the analysis cells further down.
modeling = StatisticalModeling(data)

# Handle missing data and perform feature engineering
# NOTE(review): `data` is rebound to the return value here, so re-running this
# cell on its own passes that returned object back into StatisticalModeling
# instead of the freshly loaded data. Re-run the data-loading cell first when
# a clean state is needed.
data = modeling.handle_missing_data()


In [5]:
# Function to plot feature importance
def plot_feature_importance(importance_df, title, top_n=20):
    """Draw a bar chart of the leading rows of a feature-importance table.

    Parameters
    ----------
    importance_df : pandas.DataFrame
        Table with a 'feature' column and an 'importance' column. Rows are
        plotted in the order given; the frame is presumably already sorted
        from most to least important -- TODO confirm against the code that
        builds it.
    title : str
        Title shown above the chart.
    top_n : int, optional
        Number of leading rows to plot. Defaults to 20, the value that was
        previously hard-coded.
    """
    # Use an explicit Figure/Axes pair rather than pyplot's implicit
    # "current figure" so the chart cannot leak into another figure.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=importance_df.head(top_n), ax=ax)
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

# Function to plot SHAP values
def plot_shap_values(shap_values, X_train, title):
    """Draw a titled SHAP bar summary plot.

    Parameters
    ----------
    shap_values
        SHAP values as accepted by ``shap.summary_plot``.
    X_train
        Feature matrix the SHAP values were computed for; passed to
        ``shap.summary_plot`` as its ``features`` argument.
    title : str
        Title shown above the plot.
    """
    plt.figure(figsize=(10, 6))
    # show=False: summary_plot calls plt.show() itself by default, which would
    # render the chart before the title is applied and leave plt.title()
    # decorating a new, empty figure.
    # plot_size=None: keep the 10x6 figure created above; the default "auto"
    # makes shap resize the current figure.
    shap.summary_plot(shap_values, X_train, plot_type="bar", show=False, plot_size=None)
    plt.title(title)
    plt.tight_layout()
    plt.show()



In [None]:
# Analyze TotalClaims
# The returned `results_claims` is iterated with .items() below as
# model name -> result entry; each entry is read for 'mse', 'r2',
# 'feature_importance', 'model' and, when present, 'shap_values'.
print("Analyzing TotalClaims...")
results_claims = modeling.run_analysis('TotalClaims')



In [None]:
# Report the metrics and diagnostic plots for every model fitted on TotalClaims.
for name, outcome in results_claims.items():
    print(f"\n{name} Results:")
    mse_value = outcome['mse']
    print(f"MSE: {mse_value:.4f}")
    r2_value = outcome['r2']
    print(f"R2 Score: {r2_value:.4f}")

    importance_title = f"{name} Feature Importance for TotalClaims"
    plot_feature_importance(outcome['feature_importance'], importance_title)

    # SHAP values are optional: skip the SHAP plot for entries without them.
    if 'shap_values' not in outcome:
        continue
    shap_title = f"{name} SHAP Values for TotalClaims"
    # NOTE(review): relies on the stored model object exposing an `X_train`
    # attribute -- confirm against StatisticalModeling.run_analysis.
    plot_shap_values(outcome['shap_values'], outcome['model'].X_train, shap_title)

In [None]:
# Analyze TotalPremium: fit the models, then report each one's metrics and plots.
print("\nAnalyzing TotalPremium...")
results_premium = modeling.run_analysis('TotalPremium')

for name, outcome in results_premium.items():
    print(f"\n{name} Results:")
    mse_value = outcome['mse']
    print(f"MSE: {mse_value:.4f}")
    r2_value = outcome['r2']
    print(f"R2 Score: {r2_value:.4f}")

    importance_title = f"{name} Feature Importance for TotalPremium"
    plot_feature_importance(outcome['feature_importance'], importance_title)

    # SHAP values are optional: skip the SHAP plot for entries without them.
    if 'shap_values' not in outcome:
        continue
    shap_title = f"{name} SHAP Values for TotalPremium"
    # NOTE(review): relies on the stored model object exposing an `X_train`
    # attribute -- confirm against StatisticalModeling.run_analysis.
    plot_shap_values(outcome['shap_values'], outcome['model'].X_train, shap_title)



In [None]:
# Compare model performance
def compare_models(results_claims, results_premium):
    """Plot grouped bar charts comparing MSE and R2 across the two targets.

    Parameters
    ----------
    results_claims : dict
        Model name -> result entry for the TotalClaims target; each entry
        is read for its 'mse' and 'r2' values.
    results_premium : dict
        Same structure for the TotalPremium target. It is indexed with the
        model names taken from ``results_claims``, so it must contain every
        one of those keys.
    """
    model_names = list(results_claims.keys())

    def metric_values(results, metric):
        # One value per model, in the order fixed by `model_names`.
        return [results[name][metric] for name in model_names]

    claims_mse = metric_values(results_claims, 'mse')
    claims_r2 = metric_values(results_claims, 'r2')
    premium_mse = metric_values(results_premium, 'mse')
    premium_r2 = metric_values(results_premium, 'r2')

    fig, (mse_ax, r2_ax) = plt.subplots(1, 2, figsize=(15, 6))

    positions = np.arange(len(model_names))
    bar_width = 0.35
    half_width = bar_width / 2

    def draw_panel(ax, claims_vals, premium_vals, ylabel, panel_title):
        # Two bars per model: claims shifted left of the tick, premium right.
        ax.bar(positions - half_width, claims_vals, bar_width, label='TotalClaims')
        ax.bar(positions + half_width, premium_vals, bar_width, label='TotalPremium')
        ax.set_ylabel(ylabel)
        ax.set_title(panel_title)
        ax.set_xticks(positions)
        ax.set_xticklabels(model_names)
        ax.legend()

    draw_panel(mse_ax, claims_mse, premium_mse, 'Mean Squared Error', 'MSE Comparison')
    draw_panel(r2_ax, claims_r2, premium_r2, 'R2 Score', 'R2 Score Comparison')

    plt.tight_layout()
    plt.show()

# Render the side-by-side metric comparison for both targets.
compare_models(results_claims, results_premium)

# Print final conclusions: the best model per target is the one with the
# highest R2 (the first such entry wins on ties).
print("\nFinal Conclusions:")
best_model_claims = max(results_claims.items(), key=lambda entry: entry[1]['r2'])[0]
best_model_premium = max(results_premium.items(), key=lambda entry: entry[1]['r2'])[0]

print(f"Best model for TotalClaims: {best_model_claims}")
print(f"Best model for TotalPremium: {best_model_premium}")

# .head() returns the first five rows, so "top 5" presumes each importance
# table is already sorted in descending order -- TODO confirm.
print("\nTop 5 features for TotalClaims:")
claims_importance = results_claims[best_model_claims]['feature_importance']
print(claims_importance.head())

print("\nTop 5 features for TotalPremium:")
premium_importance = results_premium[best_model_premium]['feature_importance']
print(premium_importance.head())