#### Zeru Zhou zeruzhou9@gmail.com

### This model aimed to predict the insurance premium price for the user based on health condition

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pickle

### Import data

In [3]:
my_df = pd.read_csv('../data/Medicalpremium.csv')
my_df.head()

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice
0,45,0,0,0,0,155,57,0,0,0,25000
1,60,1,0,0,0,180,73,0,0,0,29000
2,36,1,1,0,0,158,59,0,0,1,23000
3,52,1,1,0,1,183,93,0,0,2,28000
4,38,0,0,0,1,166,88,0,0,1,23000


In [5]:
my_df.isna().value_counts()

Age    Diabetes  BloodPressureProblems  AnyTransplants  AnyChronicDiseases  Height  Weight  KnownAllergies  HistoryOfCancerInFamily  NumberOfMajorSurgeries  PremiumPrice
False  False     False                  False           False               False   False   False           False                    False                   False           986
dtype: int64

In [7]:
my_df.duplicated().value_counts()

False    986
dtype: int64

### Split data

In [9]:
x = my_df.drop(columns=['PremiumPrice'])
y = my_df['PremiumPrice']

In [13]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.15, random_state=42)

In [14]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((838, 10), (838,), (148, 10), (148,))

In [26]:
train_x = MinMaxScaler().fit_transform(train_x)
test_x = MinMaxScaler().fit_transform(test_x)

### Linear Regression

In [32]:
LR = sm.OLS(train_y, train_x).fit()
y_pred = LR.predict(test_x)
mape = mean_absolute_percentage_error(y_pred, test_y)
mae = mean_absolute_error(y_pred, test_y)
mape, mae

(0.24611767165221682, 4503.3562487906365)

### SVM

In [34]:
svr = SVR()
linearsvr = LinearSVR()

params = {'C': np.linspace(0.01, 3, 50)}
clf = GridSearchCV(svr, params).fit(train_x, train_y)
y_pred = clf.predict(test_x)
mape = mean_absolute_percentage_error(y_pred, test_y)
mae = mean_absolute_error(y_pred, test_y)
mape, mae

(0.22497192812045264, 5212.911422280605)

In [35]:
clf = GridSearchCV(linearsvr, params).fit(train_x, train_y)
y_pred = clf.predict(test_x)
mape = mean_absolute_percentage_error(y_pred, test_y)
mae = mean_absolute_error(y_pred, test_y)
mape, mae

(3.6243763603144177, 19623.383451753598)

### Tree Based Methods

In [36]:
rdf = RandomForestRegressor(criterion='absolute_error', max_features='sqrt')
params = {'ccp_alpha': np.logspace(-4, 4, 50)}

clf = GridSearchCV(rdf, params).fit(train_x, train_y)
y_pred = clf.predict(test_x)
mape = mean_absolute_percentage_error(y_pred, test_y)
mae = mean_absolute_error(y_pred, test_y)
mape, mae

(0.07345996455865725, 1820.2702702702702)

### Ridge and Lasso

In [37]:
ridge = Ridge()
lasso = Lasso()
params = {'alpha': np.logspace(-4, 4, 50)}

clf = GridSearchCV(ridge, params).fit(train_x, train_y)
y_pred = clf.predict(test_x)
mape = mean_absolute_percentage_error(y_pred, test_y)
mae = mean_absolute_error(y_pred, test_y)
mape, mae

(0.11740223809367199, 2815.0486289245514)

In [38]:
clf = GridSearchCV(lasso, params).fit(train_x, train_y)
y_pred = clf.predict(test_x)
mape = mean_absolute_percentage_error(y_pred, test_y)
mae = mean_absolute_error(y_pred, test_y)
mape, mae

(0.11713108686280707, 2807.5822474897896)

### Since this dataset doesn't have many data, and the features are mostly categorical, and we are doing a prediction task, it is common that some bias to appear. Here, based on the test results, I'll choose Random Forest Regressor.

In [39]:
x = MinMaxScaler().fit_transform(x)

In [40]:
rdf = RandomForestRegressor(criterion='absolute_error', max_features='sqrt')
params = {'ccp_alpha': np.logspace(-4, 4, 100)}
clf = GridSearchCV(rdf, params).fit(x, y)

In [42]:
pickle.dump(clf, open('model.pkl', 'wb'))