### Statistical Modeling

In [5]:
# Standard-library modules needed to adjust the import search path
import os
import sys

# Resolve the parent directory to an absolute path and, unless it is already
# listed, put it at the front of sys.path so packages living there (such as
# `scripts`) can be imported from this notebook.
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)


In [7]:
# Load the helpers from the script directory by name rather than with a
# wildcard import, so the origin of every function used in this notebook is
# explicit and unrelated names cannot be silently shadowed.
# pandas is imported explicitly because `pd` is used for `pd.read_csv` below.
import pandas as pd

from scripts.statistical_modeling import (
    analyze_feature_importance,
    build_models,
    prepare_features,
    preprocess_data,
    summarize_model_performance,
    train_and_evaluate_models,
)

In [8]:
# Load the cleaned dataset from ../data.
# index_col=False keeps pandas from using the first column as the index;
# low_memory=False makes the parser read the file in one pass rather than in
# internal chunks.
df = pd.read_csv(
    '../data/cleaned_data.csv',
    index_col=False,
    low_memory=False,
)

In [11]:
# Columns handed to prepare_features as numeric inputs.
numeric_features = [
    'SumInsured',
    'CalculatedPremiumPerTerm',
    'RegistrationYear',
    'PostalCode',
]

# Columns handed to prepare_features as categorical inputs.
# NOTE(review): 'RegistrationYear' and 'PostalCode' are listed here AND in
# numeric_features, so each appears in both groups — confirm this overlap is
# intended and how prepare_features / the preprocessor treat it.
categorical_features = [
    'Province',
    'CoverType',
    'VehicleType',
    'make',
    'Gender',
    'MaritalStatus',
    'PostalCode',
    'Model',
    'CoverCategory',
    'NewVehicle',
    'RegistrationYear',
    'Citizenship',
]

In [12]:
# Set up features according to the given columns.
# prepare_features is a project helper (presumably from
# scripts.statistical_modeling); it receives the DataFrame plus the numeric and
# categorical column lists, and its return value is what preprocess_data is
# given as the feature selection.
selected_features = prepare_features(df, numeric_features, categorical_features)

In [15]:
# Preprocess the data. preprocess_data returns seven values, unpacked here as:
# the train/test feature splits, the train/test targets for the premium task,
# the train/test targets for the claims task, and the preprocessor object that
# is later passed to train_and_evaluate_models.
(
    X_train,
    X_test,
    y_premium_train,
    y_premium_test,
    y_claims_train,
    y_claims_test,
    preprocessor,
) = preprocess_data(df, selected_features)

In [17]:
# Build models
# build_models() is called without arguments; the object it returns is handed
# to train_and_evaluate_models for both targets. Presumably a mapping of
# display name -> estimator (the results are later indexed by
# 'Random Forest') — TODO confirm.
models = build_models()

In [18]:
# Fit and score every candidate model on the TotalPremium target.
# The result is a mapping keyed by model name; each entry is read further down
# through its 'MSE', 'R2' and 'Model' keys.
premium_results = train_and_evaluate_models(
    X_train,
    X_test,
    y_premium_train,
    y_premium_test,
    preprocessor,
    models,
)

In [19]:
# Train and evaluate models for TotalClaims
# A fresh set of estimators is built for this run instead of reusing `models`
# from the TotalPremium run: scikit-learn pipelines do not clone their steps,
# so if train_and_evaluate_models wraps the passed estimators directly
# (presumably — TODO confirm), refitting the same objects here would overwrite
# the fitted regressors kept in premium_results[...]['Model'].
claims_results = train_and_evaluate_models(
    X_train,
    X_test,
    y_claims_train,
    y_claims_test,
    preprocessor,
    build_models(),
)

In [23]:
# Report MSE and R2 (four decimals) for every model trained on TotalPremium
print("Results for TotalPremium prediction:")
for name in premium_results:
    result = premium_results[name]
    print(f"{name}: MSE = {result['MSE']:.4f}, R2 = {result['R2']:.4f}")

Results for TotalPremium prediction:
Linear Regression: MSE = 16838.2191, R2 = 0.3897
Decision Tree: MSE = 12243.2447, R2 = 0.5562
Random Forest: MSE = 8822.0527, R2 = 0.6802
XGBoost: MSE = 10755.5932, R2 = 0.6101


In [24]:
# Report MSE and R2 (four decimals) for every model trained on TotalClaims
print("\nResults for TotalClaims prediction:")
for name in claims_results:
    result = claims_results[name]
    print(f"{name}: MSE = {result['MSE']:.4f}, R2 = {result['R2']:.4f}")


Results for TotalClaims prediction:
Linear Regression: MSE = 4869297.8188, R2 = 0.0035
Decision Tree: MSE = 6634768.6749, R2 = -0.3578
Random Forest: MSE = 5325011.4553, R2 = -0.0898
XGBoost: MSE = 5215581.6576, R2 = -0.0674


In [46]:
# Evaluate feature significance for the top-performing model
# Pull the estimator stored under the 'regressor' step of the Random Forest
# pipeline for each target.
best_premium_model = premium_results['Random Forest']['Model'].named_steps['regressor']
# NOTE(review): in the TotalClaims results printed above, Random Forest does
# not have the highest R2 (Linear Regression does); it is presumably used here
# because it exposes feature importances — confirm.
best_claims_model = claims_results['Random Forest']['Model'].named_steps['regressor']
# Show one model per target (the premium model was never printed before).
print(best_premium_model)
print(best_claims_model)

RandomForestRegressor(random_state=42)
RandomForestRegressor(random_state=42)


In [48]:
# Compute feature importances for the two regressors extracted from the
# Random Forest pipelines, one per target.
# NOTE(review): the regressors were taken out of pipelines and X_train is
# presumably the frame as it was before the pipeline's preprocessing step —
# confirm analyze_feature_importance maps importances to the right feature
# names.
premium_importance = analyze_feature_importance(best_premium_model, X_train)
claims_importance = analyze_feature_importance(best_claims_model, X_train)

In [35]:
# Side-by-side R2 comparison of all models, one section per target
print("\nModel Performance Comparison:")
comparison_sections = (
    ("\nTotalPremium:", premium_results),
    ("\nTotalClaims:", claims_results),
)
for section_heading, section_results in comparison_sections:
    print(section_heading)
    for name, result in section_results.items():
        print(f"{name}: R2 = {result['R2']:.4f}")


Model Performance Comparison:

TotalPremium:
Linear Regression: R2 = 0.3897
Decision Tree: R2 = 0.5562
Random Forest: R2 = 0.6802
XGBoost: R2 = 0.6101

TotalClaims:
Linear Regression: R2 = 0.0035
Decision Tree: R2 = -0.3578
Random Forest: R2 = -0.0898
XGBoost: R2 = -0.0674


In [50]:
# Summarize model performance
# NOTE(review): the helper is called without arguments, so it is not handed
# premium_results, claims_results or the importances computed in this
# notebook — confirm where its summary text comes from. Its printed output
# lists the same key features for both targets, and its first sentence names
# no model.
summarize_model_performance()


Summary:
Based on the R2 scores, the best model for predicting both TotalPremium and TotalClaims.
Best model for TotalPremium: Random Forest
Best model for TotalClaims: Linear Regression

Key features for predicting TotalPremium include:
['Province', 'Model', 'NewVehicle', 'make', 'Gender']

Key features for predicting TotalClaims include:
['Province', 'Model', 'NewVehicle', 'make', 'Gender']
