## Statistical modeling

#### Data Preparation

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from src.data_quality_checks import check_missing_data
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# from xgboost import XGBRegressor

In [2]:
# Loading historical data
file_path = "../data/MachineLearningRating_v3.txt"

df = pd.read_csv(file_path, sep='|', low_memory=False)

In [3]:
# selecting relevant columns
relevant_cols = ['TotalPremium', 'TotalClaims', 'Gender', 'PostalCode', 'Province', 'VehicleType', 'RegistrationYear', 'make', 'Model', 'NewVehicle']

df_relevant_cols = df[relevant_cols]

In [4]:
# Feature engineering: Example
df_relevant_cols['CarAge'] = 2024 - df_relevant_cols['RegistrationYear']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant_cols['CarAge'] = 2024 - df_relevant_cols['RegistrationYear']


In [5]:
# checking for columns with missing values
missing_data = check_missing_data(df_relevant_cols)
print(missing_data)

   Column Name  Missing Values  Percentage Missing
2       Gender            9536            0.953507
5  VehicleType             552            0.055195
7         make             552            0.055195
8        Model             552            0.055195
9   NewVehicle          153295           15.327998


In [6]:
# data cleaning
df_relevant_cols.loc[:, 'VehicleType'] = df_relevant_cols['VehicleType'].fillna(df_relevant_cols['VehicleType'].mode()[0])
df_relevant_cols.loc[:, 'make'] = df_relevant_cols['make'].fillna(df_relevant_cols['make'].mode()[0])
df_relevant_cols.loc[:, 'Model'] = df_relevant_cols['Model'].fillna(df_relevant_cols['Model'].mode()[0])
df_relevant_cols.loc[:, 'NewVehicle'] = df_relevant_cols['NewVehicle'].fillna(df_relevant_cols['NewVehicle'].mode()[0])
df_relevant_cols.loc[:, 'Gender'] = df_relevant_cols['Gender'].fillna(df_relevant_cols['Gender'].mode()[0])

In [7]:
# checking for columns with missing values
missing_data = check_missing_data(df_relevant_cols)
print(missing_data)

Success: No missing values.


In [8]:
df_relevant_cols.head(10)

Unnamed: 0,TotalPremium,TotalClaims,Gender,PostalCode,Province,VehicleType,RegistrationYear,make,Model,NewVehicle,CarAge
0,21.929825,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,More than 6 months,20
1,21.929825,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,More than 6 months,20
2,0.0,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,More than 6 months,20
3,512.84807,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,More than 6 months,20
4,0.0,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,More than 6 months,20
5,3.256435,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,More than 6 months,20
6,50.474737,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,More than 6 months,20
7,35.332316,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,More than 6 months,20
8,0.0,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,More than 6 months,20
9,1.009474,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,More than 6 months,20


In [9]:
# Encoding categorical data
categorical_features = ['Province', 'PostalCode', 'Gender', 'VehicleType', 'CarAge', 'NewVehicle']
encoder = OneHotEncoder(drop='first')
encoded_features = encoder.fit_transform(df_relevant_cols[categorical_features])

# Create DataFrame
feature_names = encoder.get_feature_names_out(input_features=categorical_features)
encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=feature_names)
data = df_relevant_cols.drop(categorical_features, axis=1)
data = pd.concat([data, encoded_features_df], axis=1)

: 

#### Model Building

#### Model Evaluation

#### Feature Importance Analysis

#### Reporting Results