In [1]:
!pip install pandas



In [5]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



data = pd.read_csv('insurance_claims.csv')

data.head()

Unnamed: 0,months_as_customer,age,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,incident_type,collision_type,incident_severity,...,number_of_vehicles_involved,property_damage,bodily_injuries,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year
0,328,48,OH,250/500,1000,1406.91,0,Single Vehicle Collision,Side Collision,Major Damage,...,1,YES,1,71610,6510,13020,52080,Saab,92x,2004
1,228,42,IN,250/500,2000,1197.22,5000000,Vehicle Theft,?,Minor Damage,...,1,?,0,5070,780,780,3510,Mercedes,E400,2007
2,134,29,OH,100/300,2000,1413.14,5000000,Multi-vehicle Collision,Rear Collision,Minor Damage,...,3,NO,2,34650,7700,3850,23100,Dodge,RAM,2007
3,256,41,IL,250/500,2000,1415.74,6000000,Single Vehicle Collision,Front Collision,Major Damage,...,1,?,1,63400,6340,6340,50720,Chevrolet,Tahoe,2014
4,228,44,IL,500/1000,1000,1583.91,6000000,Vehicle Theft,?,Minor Damage,...,1,NO,0,6500,1300,650,4550,Accura,RSX,2009


In [34]:
#Independent and Dependent Variables
X = data.drop(columns=['total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim'])
y = data['total_claim_amount']

# Convert categorical variables to dummy/indicator variables
X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [35]:
# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
r2 = r2_score(y_test, y_pred_rf)

print(f'Random Forest MAE: {mae_rf}')
print(f'Random Forest RMSE: {rmse_rf}')
print(f'R-squared: {r2}')



Random Forest MAE: 10675.467666666666
Random Forest RMSE: 14953.599425807153
R-squared: 0.6862917754676663


In [48]:
# Define the parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 500, 1000],        # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],         # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],         # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],           # Minimum number of samples required to be at a leaf node
    'max_features': [1, 'sqrt', 'log2'], # Number of features to consider when looking for the best split
    'bootstrap': [True, False],              # Whether bootstrap samples are used when building trees
}

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)

# Perform GridSearchCV to find the best hyperparameters
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, 
                              cv=5, n_jobs=-1, scoring='neg_mean_absolute_error', verbose=2)

# Fit the grid search to the data
rf_grid_search.fit(X_train, y_train)

# Get the best model and parameters
best_rf_model = rf_grid_search.best_estimator_
print("Best parameters for Random Forest:", rf_grid_search.best_params_)

# Evaluate the best model
rf_mae = mean_absolute_error(y_test, best_rf_model.predict(X_test))
rf_rmse = np.sqrt(mean_squared_error(y_test, best_rf_model.predict(X_test)))
rf_r2 = r2_score(y_test, best_rf_model.predict(X_test))

print(f'Random Forest MAE: {rf_mae:.2f}')
print(f'Random Forest RMSE: {rf_rmse:.2f}')
print(f'Random Forest R-squared: {rf_r2:.4f}')

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Best parameters for Random Forest: {'bootstrap': False, 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Random Forest MAE: 10782.99
Random Forest RMSE: 14699.47
Random Forest R-squared: 0.6969


In [49]:
# Save the trained model to a file
joblib.dump(best_rf_model, 'best_gb_model.pkl')

['best_gb_model.pkl']

In [36]:
# Initialize and train the Gradient Boosting model
gb_model = GradientBoostingRegressor(n_estimators=1000, subsample=0.8, learning_rate=0.01, max_depth=7, min_samples_split=5, min_samples_leaf=4, max_features='sqrt', random_state=42)
gb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_gb = gb_model.predict(X_test)

# Evaluate the model
mae_gb = mean_absolute_error(y_test, y_pred_gb)
rmse_gb = mean_squared_error(y_test, y_pred_gb, squared=False)
r2 = r2_score(y_test, y_pred_gb)


print(f'Gradient Boosting MAE: {mae_gb}')
print(f'Gradient Boosting RMSE: {rmse_gb}')
print(f'R-squared: {r2}')


Gradient Boosting MAE: 11018.196310201414
Gradient Boosting RMSE: 15118.755827588515
R-squared: 0.6793239498185234


In [47]:
# Define the parameter grid for Gradient Boosting
gb_param_grid = {
    'n_estimators': [100, 500, 1000],        # Number of boosting stages to be run
    'learning_rate': [0.01, 0.05, 0.1],      # Step size shrinkage
    'max_depth': [3, 5, 7],                  # Maximum depth of the individual regression estimators
    'min_samples_split': [2, 5, 10],         # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],           # Minimum number of samples required to be at a leaf node
    'max_features': [1, 'sqrt', 'log2'], # Number of features to consider when looking for the best split
    'subsample': [0.8, 1.0],                 # Fraction of samples used for fitting the individual base learners
}

# Initialize the Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=42)

# Perform GridSearchCV to find the best hyperparameters
gb_grid_search = GridSearchCV(estimator=gb_model, param_grid=gb_param_grid, 
                              cv=5, n_jobs=-1, scoring='neg_mean_absolute_error', verbose=2)

# Fit the grid search to the data
gb_grid_search.fit(X_train, y_train)

# Get the best model and parameters
best_gb_model = gb_grid_search.best_estimator_
print("Best parameters for Gradient Boosting:", gb_grid_search.best_params_)

# Evaluate the best model
gb_mae = mean_absolute_error(y_test, best_gb_model.predict(X_test))
gb_rmse = np.sqrt(mean_squared_error(y_test, best_gb_model.predict(X_test)))
gb_r2 = r2_score(y_test, best_gb_model.predict(X_test))

print(f'Gradient Boosting MAE: {gb_mae:.2f}')
print(f'Gradient Boosting RMSE: {gb_rmse:.2f}')
print(f'Gradient Boosting R-squared: {gb_r2:.4f}')

Fitting 5 folds for each of 1458 candidates, totalling 7290 fits
Best parameters for Gradient Boosting: {'learning_rate': 0.01, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 1000, 'subsample': 1.0}
Gradient Boosting MAE: 10942.03
Gradient Boosting RMSE: 15047.36
Gradient Boosting R-squared: 0.6823


In [50]:
# Save the trained model to a file
joblib.dump(best_gb_model, 'best_gb_model.pkl')


['best_gb_model.pkl']