# Random Forest Regressor Model

In [1]:
from sklearn.ensemble import RandomForestRegressor

# Defining random forest regressor model object

In [2]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

In [3]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split

# Reading the dataset

In [4]:
rf_df = pd.read_csv('/data/insurance_data50k.csv')

In [5]:
print(rf_df)

       Age  Gender    Region  PolicyYears  PolicyType  PastClaimsCount  \
0       63    Male  Suburban            2        Home                0   
1       41    Male  Suburban            2      Health                1   
2       25    Male     Urban            5        Life                2   
3       44  Female     Urban            9        Auto                2   
4       48    Male  Suburban            4        Auto                2   
...    ...     ...       ...          ...         ...              ...   
49995   45  Female     Urban            5        Life                0   
49996   26  Female     Urban           12        Home                1   
49997   48    Male     Urban            3  Disability                1   
49998   49  Female     Urban           13        Life                1   
49999   19  Female  Suburban            7        Life                4   

       PastClaimsAmount  
0          99010.167897  
1          86606.465354  
2           3329.873640  
3      

In [6]:
#Correcting outliers
columns_to_filter = ['Age', 'PolicyYears', 'PastClaimsCount', 'PastClaimsAmount']

#  Applying ordinal encoding on categorical columns

In [7]:
#Applying ordinalw encoding on gender, region and policy type columns
gen = {'Gender' :['Male', 'Female']}
reg = {'Region' :['Urban', 'Suburban']}
pol_type = {'PolicyType' :['Health', 'Life', 'Auto', 'Home', 'Disability']}
df_ordinal_gender=pd.DataFrame(gen,columns=["Gender"])
df_ordinal_region=pd.DataFrame(reg,columns=["Region"])
df_ordinal_pol_type=pd.DataFrame(pol_type,columns=["PolicyType"])
gen_dict = {'Male': 1,'Female': 2}
reg_dict = {'Urban': 1,'Suburban': 2}
pol_type_dict = {'Health': 1,'Life': 2, 'Auto': 3, 'Home': 4, 'Disability': 5}
rf_df["gender_ordinal"]=rf_df.Gender.map(gen_dict)
rf_df["region_ordinal"]=rf_df.Region.map(reg_dict)
rf_df["pol_type_ordinal"]=rf_df.PolicyType.map(pol_type_dict)

In [8]:
#Printing dataframe after ordinal encoding on gender, region and policy type columns and dumping it in csv for further use
print(rf_df)
rf_df.to_csv("/data/insurance_data_encoded1L.csv")

       Age  Gender    Region  PolicyYears  PolicyType  PastClaimsCount  \
0       63    Male  Suburban            2        Home                0   
1       41    Male  Suburban            2      Health                1   
2       25    Male     Urban            5        Life                2   
3       44  Female     Urban            9        Auto                2   
4       48    Male  Suburban            4        Auto                2   
...    ...     ...       ...          ...         ...              ...   
49995   45  Female     Urban            5        Life                0   
49996   26  Female     Urban           12        Home                1   
49997   48    Male     Urban            3  Disability                1   
49998   49  Female     Urban           13        Life                1   
49999   19  Female  Suburban            7        Life                4   

       PastClaimsAmount  gender_ordinal  region_ordinal  pol_type_ordinal  
0          99010.167897            

# Defining train test data split function

In [9]:
X= rf_df[['Age','gender_ordinal','region_ordinal','PolicyYears','pol_type_ordinal','PastClaimsCount']] 
y= rf_df['PastClaimsAmount']

In [10]:
# using the train test split function 
X_train, X_test, y_train, y_test = train_test_split(X,y , 
                                   random_state=42,  
                                   test_size=0.2,  
                                   shuffle=True)

# Model training

In [None]:
#Model training
rf_model.fit(X_train, y_train)

# Calculating model metrics like MAE, MSE, R2

In [13]:
rf_predictions = rf_model.predict(X_test)

In [14]:
#Calculating model metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_mse = mean_squared_error(y_test, y_pred, squared=False)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, y_pred)

In [15]:
print(f"RandomForest MAE: {rf_mae}")
print(f"RandomForest MSE: {rf_mse}")
print(f"RandomForest RMSE: {rf_rmse}")
print(f"RandomForest R2: {rf_r2}")

RandomForest MAE: 53384.065330367324
RandomForest MSE: 63264.39460915747
RandomForest RMSE: 251.52414319336717
RandomForest R2: -0.1979742262766515


# Hyperparameter tuning to reduce MAE, MSE value using grid search technique

In [16]:
#Fine tuning hyper parameters with GridSearch
from sklearn.model_selection import GridSearchCV

In [17]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [1000, 2000, 3000, 4000, 5000],
    'max_depth': [10, 15, 20],
    'min_samples_leaf': [1000, 2000, 3000, 4000, 5000]
}

In [18]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

In [19]:
# Perform the grid search
grid_search.fit(X, y)

In [20]:
# Get the best model
best_rf = grid_search.best_estimator_

In [21]:
# Evaluate the best model
y_pred = best_rf.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_mse = mean_squared_error(y_test, y_pred, squared=False)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, y_pred)

In [22]:
#Printing new RMSE and R2 values
print("RandomForest Best Parameters:", grid_search.best_params_)
print(f"RandomForest MAE: {rf_mae}")
print(f"RandomForest MSE: {rf_mse}")
print(f"RandomForest RMSE: {rf_rmse}")
print(f"RandomForest R2: {rf_r2}")

RandomForest Best Parameters: {'max_depth': 10, 'min_samples_leaf': 5000, 'n_estimators': 1000}
RandomForest MAE: 53384.065330367324
RandomForest MSE: 57789.048789742956
RandomForest RMSE: 240.3935290097114
RandomForest R2: 0.000414716571723317


# Gradient Boosting Regressor Model

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

# Defining gradient boosting regressor model

In [24]:
gb_model = GradientBoostingRegressor(random_state=42)

# Reading the dataset 

In [25]:
gb_df = pd.read_csv("/data/insurance_data_encoded1L.csv")

In [26]:
print(gb_df)

       Unnamed: 0  Age  Gender    Region  PolicyYears  PolicyType  \
0               0   74  Female  Suburban            3  Disability   
1               1   57    Male     Urban            5        Auto   
2               2   50  Female     Urban            7        Home   
3               3   41    Male  Suburban            5  Disability   
4               4   30  Female     Urban           12        Home   
...           ...  ...     ...       ...          ...         ...   
99995       99995   37    Male  Suburban            4      Health   
99996       99996   76  Female     Urban           15        Life   
99997       99997   72  Female  Suburban            3  Disability   
99998       99998   56  Female     Urban            1      Health   
99999       99999   42    Male  Suburban            3        Home   

       PastClaimsCount  PastClaimsAmount  gender_ordinal  region_ordinal  \
0                    6     190278.786408               2               2   
1                  

# Defining train test data split function

In [27]:
X= gb_df[['Age','gender_ordinal','region_ordinal','PolicyYears','pol_type_ordinal','PastClaimsCount']] 
y= gb_df['PastClaimsAmount']

In [28]:
# using the train test split function 
X_train, X_test, y_train, y_test = train_test_split(X,y , 
                                   random_state=42,  
                                   test_size=0.2,  
                                   shuffle=True)

# Model training

In [29]:
#Model training
gb_model.fit(X_train, y_train)

# Evaluating Gradient boosting regressor model

In [30]:
gb_predictions = gb_model.predict(X_test)

In [31]:
# Evaluation metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = gb_model.predict(X_test)
gb_mae = mean_absolute_error(y_test, gb_predictions)
gb_mse = mean_squared_error(y_test, y_pred, squared=False)
gb_rmse = np.sqrt(gb_mse)
gb_r2 = r2_score(y_test, y_pred)

In [32]:
print(f"GradientBoosting MAE: {gb_mae}")
print(f"GradientBoosting MSE: {gb_mse}")
print(f"GradientBoosting RMSE: {gb_rmse}")
print(f"GradientBoosting R2: {gb_r2}")

GradientBoosting MAE: 50063.021964497515
GradientBoosting MSE: 57815.4312109695
GradientBoosting RMSE: 240.44839614971337
GradientBoosting R2: -0.0004981726429131861


# Hyperparameter tuning using grid search technique

In [33]:
#Fine tuning hyper parameters with GridSearch
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'n_estimators': [1000, 2000, 3000, 4000, 5000],
    'max_depth': [10, 15, 20],
    'min_samples_leaf': [1000, 2000, 3000, 4000, 5000]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

In [None]:
# Perform the grid search
grid_search.fit(X, y)

In [None]:
# Get the best model
best_gb = grid_search.best_estimator_

# Evaluate the best model
y_pred = best_gb.predict(X_test)
gb_mae = mean_absolute_error(y_test, gb_predictions)
gb_mse = mean_squared_error(y_test, y_pred, squared=False)
gb_rmse = np.sqrt(gb_mse)
gb_r2 = r2_score(y_test, y_pred)

#Printing new RMSE and R2 values
print("GradientBoosting Best Parameters:", grid_search.best_params_)
print(f"GradientBoosting MAE: {gb_mae}")
print(f"GradientBoosting MSE: {gb_mse}")
print(f"GradientBoosting RMSE: {gb_rmse}")
print(f"GradientBoosting R2: {gb_r2}")

In [None]:
# def rf_cv(n_estimators, max_depth):
#     # Convert hyperparameters to integers
#     n_estimators = int(n_estimators)
#     max_depth = int(max_depth)
    
#     # Define the Gradient Boosting Regressor model with hyperparameters
#     best_model = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=max_depth)
    
#     # Use cross-validation to evaluate the model
#     cv_score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
    
#     return cv_score

In [None]:
# # Define the parameter space for Bayesian Optimization
# pbounds = {'n_estimators': (10, 1000), 'max_depth': (1, 100)}

# # Initialize BayesianOptimization
# optimizer = BayesianOptimization(f=rf_cv, pbounds=pbounds, random_state=1)

# # Perform the optimization
# optimizer.maximize(init_points=5, n_iter=10)

# # Print the best hyperparameters found
# print(optimizer.max)

In [None]:
# # Evaluate the byesian optimized model
# bayesian_opt_model = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=max_depth)
# bayesian_opt_model.fit(X_train, y_train)

In [None]:
# bayesian_opt_predictions = bayesian_opt_model.predict(X_test)
# y_pred = bayesian_opt_model.predict(X_test)
# bo_gb_mae = mean_absolute_error(y_test, bayesian_opt_predictions)
# bo_gb_mse = mean_squared_error(y_test, y_pred, squared=False)
# bo_gb_rmse = np.sqrt(bo_gb_mse)
# bo_gb_r2 = r2_score(y_test, y_pred)

# #Printing new RMSE and R2 values
# print(f"GradientBoosting MAE: {bo_gb_mae}")
# print(f"GradientBoosting MSE: {bo_gb_mse}")
# print(f"GradientBoosting RMSE: {bo_gb_rmse}")
# print(f"GradientBoosting R2: {bo_gb_r2}")

# Gradient Boosting Regressor Model

In [None]:
from xgboost import XGBRegressor

# Defining XGBoost regressor model

In [None]:
xgb_model = XGBRegressor(random_state=42)

# Reading the dataset

In [None]:
xgb_df = pd.read_csv("/data/insurance_data_encoded.csv")

In [None]:
print(xgb_df)

# Defining train test data split function

In [None]:
X= xgb_df[['Age','gender_ordinal','region_ordinal','PolicyYears','pol_type_ordinal','PastClaimsCount']] 
y= xgb_df['PastClaimsAmount']

In [None]:
# using the train test split function 
X_train, X_test, y_train, y_test = train_test_split(X,y , 
                                   random_state=20,  
                                   test_size=0.3,  
                                   shuffle=True)

# Model training

In [None]:
#Model training
xgb_model.fit(X_train, y_train)

# Evaluating XGBoost regressor model

In [None]:
xgb_predictions = xgb_model.predict(X_test)

In [None]:
# Evaluation metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = xgb_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_mse = mean_squared_error(y_test, y_pred, squared=False)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, y_pred)

In [None]:
print(f"XGBoost MAE: {xgb_mae}")
print(f"XGBoost MSE: {xgb_mse}")
print(f"XGBoost RMSE: {xgb_rmse}")
print(f"XGBoost R2: {xgb_r2}")

# Hyperparameter tuning using grid search technique

In [None]:
#Fine tuning hyper parameters with GridSearch
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'n_estimators': [500, 700, 1000],
    'max_depth': [10, 15, 20],
    'min_samples_leaf': [1, 2, 5]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

In [None]:
# Perform the grid search
grid_search.fit(X, y)

In [None]:
# Get the best model
best_xgb = grid_search.best_estimator_

# Evaluate the best model
y_pred = best_xgb.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_mse = mean_squared_error(y_test, y_pred, squared=False)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, y_pred)

#Printing new RMSE and R2 values
print("XGBoost Best Parameters:", grid_search.best_params_)
print(f"XGBoost MAE: {xgb_mae}")
print(f"XGBoost MSE: {xgb_mse}")
print(f"XGBoost RMSE: {xgb_rmse}")
print(f"XGBoost R2: {xgb_r2}")

In [None]:
# xgb_tuned = XGBRegressor(
#     max_depth= 15, 
#     min_samples_leaf= 2, 
#     n_estimators= 1000,
#     learning_rate=0.1,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     random_state=42
# )

In [None]:
# xgb_tuned.fit(X_train, y_train)

In [None]:
# # Evaluate the tuned model
# y_pred = best_xgb.predict(X_test)
# xgb_mae = mean_absolute_error(y_test, xgb_predictions)
# xgb_mse = mean_squared_error(y_test, y_pred, squared=False)
# xgb_rmse = np.sqrt(xgb_mse)
# xgb_r2 = r2_score(y_test, y_pred)

# #Printing new RMSE and R2 values
# print(f"XGBoost MAE: {xgb_mae}")
# print(f"XGBoost MSE: {xgb_mse}")
# print(f"XGBoost RMSE: {xgb_rmse}")
# print(f"XGBoost R2: {xgb_r2}")

# Export best model and create final predictions csv

In [None]:
import joblib

In [None]:
# As per the parameter comparison, Gradient boosting model is best performing model
joblib.dump(gb_model, '/data/best_model.joblib')

In [None]:
final_predictions_df = X_test.copy()  # Start with the test features
final_predictions_df['Actual'] = y_test  # Add the actual target values
final_predictions_df['Predicted'] = gb_model.predict(X_test)  # Add the model's predictions

In [None]:
final_predictions_df.to_csv('/data/final_predictions5k.csv', index=False)