# Step 1: Importing libraries

In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

# Step 2: Loading the dataset

In [77]:
# Read the insurance records from the CSV file in the working directory
# and preview the first ten rows.
df = pd.read_csv(filepath_or_buffer="insurance.csv")
df.head(n=10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.77,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.88,0,no,northwest,3866.86
5,31,female,25.74,0,no,southeast,3756.62
6,46,female,33.44,1,no,southeast,8240.59
7,37,female,27.74,3,no,northwest,7281.51
8,37,male,29.83,2,no,northeast,6406.41
9,60,female,25.84,0,no,northwest,28923.14


# Step 3: Data preprocessing

# Set the display precision for floating-point numbers

In [78]:
# Show every float with exactly two decimal places in pandas output.
pd.set_option("display.float_format", lambda value: format(value, ".2f"))

# Handling missing values

In [79]:
# Count the missing values in each column; a column of zeros means
# nothing has to be imputed or dropped before moving on.
df.isnull().sum(axis=0)

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Removing any duplicates

In [80]:
# Drop fully duplicated rows in place (if any), keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)

# Detecting and handling the outliers

In [81]:
def limits(data,feature,whisker=1.5):
    """Return the IQR-based outlier fences for one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column to inspect.
    feature : str
        Label of the column whose fences are computed.
    whisker : float, default 1.5
        Multiple of the interquartile range added beyond the quartiles.
        The default reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit)`` where
        ``lower_limit = Q1 - whisker * IQR`` and
        ``upper_limit = Q3 + whisker * IQR``.
    """
    column=data[feature]
    first_quartile=column.quantile(0.25)
    third_quartile=column.quantile(0.75)
    iqr=third_quartile-first_quartile

    lower_limit=first_quartile-whisker*iqr
    upper_limit=third_quartile+whisker*iqr
    return lower_limit,upper_limit
# Cap bmi at its IQR fences: a value beyond a fence is replaced by that fence,
# values inside the fences are left as they are.
lower_limit,upper_limit=limits(df,"bmi")

bmi_values=df["bmi"]
bmi_floor_capped=np.where(bmi_values<lower_limit,lower_limit,bmi_values)
df["bmi"]=np.where(bmi_values>upper_limit,upper_limit,bmi_floor_capped)

# Encode categorical features

In [82]:
#the categorical features that we have are: sex, smoker and region
categorical_features=["sex","smoker","region"]

#fit one label encoder per categorical feature and keep each of them:
#a single shared encoder is refit on every column, so only the mapping of the
#last column would survive and the other codes could not be decoded
#(inverse_transform) or re-applied to new data (transform)
label_encoders={}
for feature in categorical_features:
    label_encoders[feature]=LabelEncoder()
    df[feature]=label_encoders[feature].fit_transform(df[feature])

#kept for backward compatibility: as before, this name ends up bound to the
#encoder that was fitted last (the one of the last categorical feature)
label_encoder=label_encoders[categorical_features[-1]]

#modified dataframe
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.90,0,1,3,16884.92
1,18,1,33.77,1,0,2,1725.55
2,28,1,33.00,3,0,2,4449.46
3,33,1,22.70,0,0,1,21984.47
4,32,1,28.88,0,0,1,3866.86
...,...,...,...,...,...,...,...
1333,50,1,30.97,3,0,1,10600.55
1334,18,0,31.92,0,0,0,2205.98
1335,18,0,36.85,0,0,2,1629.83
1336,21,0,25.80,0,0,3,2007.94


# Split dataset into training, test and validation

In [83]:
#features (independent variables) are every column but the last one,
#the output (dependent variable) is the last column
x=df.iloc[:,:-1].values
y=df.iloc[:,-1].values

#splitting the data to training, validation and testing.
#The sizes are passed as absolute sample counts instead of fractions derived
#from those counts: train_test_split rounds a float test_size up with ceil, so
#a fraction such as 1 - 1000/len(x) can land one sample off the intended size
#because of floating-point error.
n_train_val=1000  #samples kept for training + validation; the rest is the test set
n_val=250         #samples taken out of those for validation
x_temp, x_test, y_temp, y_test = train_test_split(x,y,test_size=len(x)-n_train_val,random_state=4)
x_train, x_val, y_train, y_val = train_test_split(x_temp,y_temp,test_size=n_val,random_state=4)

# Feature scaling

In [90]:
#learn the per-feature mean and standard deviation from the training set only
scaler=StandardScaler().fit(x_train)

#standardize all three partitions with the statistics learned from x_train
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)
x_val=scaler.transform(x_val)

# Step 4: Modelling

# Linear regression

In [85]:
# Fit ordinary least squares on the training set and score it on the test set;
# the R² score is scaled to a percentage.
liner = LinearRegression().fit(x_train, y_train)
y_pred = liner.predict(x_test)
Liner_r2_score = 100 * r2_score(y_test, y_pred)

# Random forest

In [86]:
# Pick the number of trees on the validation set, then score the winner on the test set.
# The running best starts at -inf rather than 0 because R² can be negative:
# starting at 0 would leave best_n_trees undefined if every candidate scored below zero.
RF_r2_score_val = float("-inf")
best_n_trees = None
for n_trees in range(10,100,10):
    rf = RandomForestRegressor(n_estimators=n_trees, random_state=4)
    rf.fit(x_train, y_train)
    y_val_pred = rf.predict(x_val)
    # compute the validation score once per candidate (R² as a percentage)
    val_score = r2_score(y_val, y_val_pred) * 100
    # ">=" means that on an exact tie the larger forest is kept
    if val_score >= RF_r2_score_val:
        RF_r2_score_val = val_score
        best_n_trees = n_trees


# Refit with the selected number of trees and evaluate on the held-out test set.
rf = RandomForestRegressor(n_estimators=best_n_trees, random_state=4)
rf.fit(x_train,y_train)
y_test_pred = rf.predict(x_test)
RF_r2_score = r2_score(y_test,y_test_pred)*100

# KNN

In [87]:
# Pick the number of neighbours on the validation set, then score the winner on the test set.
# The running best starts at -inf rather than 0 because R² can be negative:
# starting at 0 would leave best_k undefined if no candidate scored above zero.
KNN_r2_score_val = float("-inf")
best_k = None
for k in range(1,15):
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(x_train, y_train)
    y_val_pred = knn.predict(x_val)
    # compute the validation score once per candidate (R² as a percentage)
    val_score = r2_score(y_val, y_val_pred) * 100
    # strict ">" means that on an exact tie the smaller k is kept
    if val_score > KNN_r2_score_val:
        KNN_r2_score_val = val_score
        best_k = k

# Refit with the selected k and evaluate on the held-out test set.
knn = KNeighborsRegressor(n_neighbors=best_k)
knn.fit(x_train,y_train)
y_test_pred = knn.predict(x_test)
KNN_r2_score = r2_score(y_test,y_test_pred)*100

# Step 5: Printing the test-set R² score (reported as "accuracy", in %) of each model

In [88]:
# Report the test-set R² score (as a percentage) of each model, one line per model.
for report_template, test_score in (
    ("Linear Regression Accuracy: %.2f", Liner_r2_score),
    ("Random Forest Regression Accuracy:%.2f", RF_r2_score),
    ("KNN Regression Accuracy: %.2f", KNN_r2_score),
):
    print(report_template % test_score, "%")

Linear Regression Accuracy: 75.85 %
Random Forest Regression Accuracy:86.31 %
KNN Regression Accuracy: 83.71 %
