In [1]:
import pickle

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

#Load the raw insurance dataset; the path is resolved relative to the working directory
data = pd.read_csv(filepath_or_buffer="insurance.csv")


In [None]:
'''
check data

data.head()
data.tail()
data.shape
data.info()
data.isnull().sum()
data.describe(include='all')
'''


In [3]:
#Convert non-numerical values to numerical codes.
#Series.map turns every value that is missing from the mapping into NaN, so each
#column is only mapped while it still holds the raw labels (re-running this cell
#is a no-op) and unknown labels raise instead of silently becoming NaN.
category_codes = {
    'sex': {'female':0,'male':1},
    'smoker': {'yes':1,'no':0},
    'region': {'southwest':1,'southeast':2,
               'northwest':3,'northeast':4},
}

for column, codes in category_codes.items():
    #Already encoded (e.g. the cell was run before): leave the column untouched
    if data[column].isin(list(codes.values())).all():
        continue

    encoded = data[column].map(codes)
    unmapped = encoded.isnull()
    if unmapped.any():
        #Fail loudly here rather than handing NaN features to the regressors
        unknown = sorted(data.loc[unmapped, column].astype(str).unique())
        raise ValueError(f"Unexpected values in column '{column}': {unknown}")
    data[column] = encoded

In [5]:
#Name of the target column the models learn to predict
predict = 'charges'

#The target is kept as its own Series; every remaining column is a feature
y = data[predict]
X = data.drop(columns=[predict])

In [6]:
#Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
#Hyperparameter tuning rec : test size = [0.1,0.2,0.3], random_state = [0,42]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [None]:
#Train four different regression models on the training split
lr = LinearRegression()
lr.fit(X_train,y_train)

#NOTE(review): SVR is fit on unscaled features here; scikit-learn's SVM guide
#recommends scaling the inputs, so a StandardScaler pipeline may be needed - confirm
svm = SVR()
svm.fit(X_train,y_train)

#The tree ensembles are randomised; a fixed seed (same value as the train/test
#split) makes the scores computed from them repeatable across kernel restarts
rfr = RandomForestRegressor(random_state=0)
rfr.fit(X_train,y_train)
gbr = GradientBoostingRegressor(random_state=0)
gbr.fit(X_train,y_train)

In [10]:
#Predict the held-out test rows with every trained model (same order as training)
y_pred1, y_pred2, y_pred3, y_pred4 = (
    model.predict(X_test) for model in (lr, svm, rfr, gbr)
)

#Side-by-side table: the actual target next to each model's prediction
comparison = {
    'Actual': y_test,
    'Lr': y_pred1,
    'svm': y_pred2,
    'rfr': y_pred3,
    'gbr': y_pred4,
}
df1 = pd.DataFrame(comparison)
df1

In [None]:
#For better visual understanding we plot the first 11 test rows of every model
#against the actual values, one panel per model.
#df1 keeps the shuffled row labels of the test split and matplotlib uses a
#Series' index as the x-axis, so the index is reset to plot the rows in table
#order (0..10) instead of jumping back and forth between original row numbers.
preview = df1.iloc[0:11].reset_index(drop=True)

#(column in df1, legend label) for each of the four panels
panels = [('Lr', 'Lr'), ('svm', 'svr'), ('rfr', 'rfr'), ('gbr', 'gbr')]

fig, axes = plt.subplots(2, 2)
for ax, (model_column, label) in zip(axes.flat, panels):
    ax.plot(preview['Actual'], label='Actual')
    ax.plot(preview[model_column], label=label)
    ax.set_xlabel('Test sample')
    ax.set_ylabel(predict)
    ax.legend()

fig.tight_layout()

In [None]:
#R-squared of every model on the held-out test rows (order: lr, svm, rfr, gbr)
score1, score2, score3, score4 = (
    metrics.r2_score(y_test, predicted)
    for predicted in (y_pred1, y_pred2, y_pred3, y_pred4)
)

print(score1, score2, score3, score4)

In [None]:
#Measure the mean absolute error (MAE) of each model on the test rows.
#Note this is the absolute error, not the squared error; the result is in the
#same unit as the target column.
s1, s2, s3, s4 = (
    metrics.mean_absolute_error(y_test, predicted)
    for predicted in (y_pred1, y_pred2, y_pred3, y_pred4)
)

print(s1,s2,s3,s4)

In [None]:
#Data to be tested on the trained model.
#Kept under its own name so the `data` DataFrame loaded at the top is not
#replaced by a dict (which would break re-running the earlier cells).
#Categorical fields use the same numeric codes as the encoding cell.
#NOTE(review): key order presumably mirrors the training feature columns - confirm
new_customer = {'age' : 25,
                'sex' : 1,
                'bmi' : 25.4,
                'children' : 3,
                'smoker' : 1,
                'region' : 2}

#Single-row frame (index 0) so it can be passed straight to predict()
df = pd.DataFrame(new_customer,index=[0])
df

In [None]:
#Predict for the single-row frame; predict() returns an array, so take element 0
new_pred = gbr.predict(df)
predicted_cost = new_pred[0]
print("Medical Insurance cost for the customer : ", predicted_cost)

In [None]:
#Train the final model on the entire dataset (training and test rows together)
#Hyperparameter tuning : learning rate = [0.1,0.01], n_estimators = [100,1000,10000]
#A fixed seed keeps this refit - and the pickle saved from it - reproducible
gbr = GradientBoostingRegressor(learning_rate=0.01,n_estimators=10000,random_state=0)
gbr.fit(X,y)

In [None]:
#Compare the previously trained model with the newly tuned one.
#gbr has been refit on the entire dataset, which includes X_test, so scoring it
#on X_test would reward rows it has already seen. An unfitted copy with the same
#hyperparameters is therefore trained on the training split only and scored on
#the held-out rows; gbr itself is left untouched for the final prediction.
gbr_holdout = clone(gbr).fit(X_train,y_train)
y_pred = gbr_holdout.predict(X_test)
df2 = pd.DataFrame({'Actual':y_test,'Old gbr':y_pred4,'New gbr':y_pred})
print(df2)

#R-squared of the old model (score4) next to the tuned model on the same rows
fscore = metrics.r2_score(y_test,y_pred)
print(score4, fscore)

In [None]:
#Predict for the single-row frame again, this time with the current gbr
new_pred = gbr.predict(df)
predicted_cost = new_pred[0]
print("Medical Insurance cost for New Customer is : ", predicted_cost)

In [None]:
#Persist the fitted model; the with-block closes the file once the dump finishes
with open("MedicCostRegression.pickle","wb") as model_file:
    pickle.dump(gbr, model_file)

'''
Load the model (only unpickle files from a trusted source)
with open("MedicCostRegression.pickle","rb") as pickle_in:
    gbr = pickle.load(pickle_in)
'''