In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Read the health-insurance workbook from the Kaggle input directory.
dataset_path = "/kaggle/input/health-insuarance-price/Health_insurance_cost (1).xlsx"
df = pd.read_excel(dataset_path)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Fill missing values in the numeric columns with the column mean.
# NOTE(review): 'health_insurance_price' appears to be the regression target
# (it is what the later plots and models are about). Imputing the target, and
# fitting the imputer before the train/test split, both leak information into
# evaluation - confirm this is intended, or drop rows with a missing target.
null_vals = ['age', 'BMI', 'health_insurance_price']
si = SimpleImputer(strategy="mean")  # "mean" is the default; spelled out for readers
# fit_transform() fits and transforms in one step, so no separate fit() call is needed.
df[null_vals] = si.fit_transform(df[null_vals])

In [None]:
# Re-count missing values per column now that the numeric columns were imputed.
df.isnull().sum()

# Pairplot without classification; the numeric features show a tendency to contain outliers

In [None]:
# Numeric columns inspected for outliers throughout the notebook.
nums = ['age', 'BMI', 'health_insurance_price']
# Pairwise scatter plots (histograms on the diagonal) of the numeric columns.
sns.pairplot(data=df, vars=nums)

# Check for outliers using boxplots

In [None]:
# One horizontal box plot per numeric column, side by side, to expose outliers.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
for column, axis in zip(nums, axes):
    sns.boxplot(data=df, x=column, ax=axis)

In [None]:
def plots(df, x, y):
    """Show a three-panel summary of numeric column ``x`` split by column ``y``.

    Panels, left to right:
      1. histogram of ``x`` with a KDE overlay, coloured by ``y``;
      2. bar chart of the mean of ``x`` inside each ``y`` group, with value labels;
      3. pie chart of the number of rows in each ``y`` category.

    The figure is displayed immediately; nothing is returned.
    """
    fig, (hist_ax, bar_ax, pie_ax) = plt.subplots(nrows=1, ncols=3, figsize=(20, 10))

    # Panel 1: distribution of x per category.
    sns.histplot(data=df, x=x, hue=y, kde=True, ax=hist_ax)

    # Panel 2: per-category mean of x, each bar annotated with its height.
    group_means = df.groupby(y)[x].mean()
    sns.barplot(x=group_means.index, y=group_means, ax=bar_ax)
    for bars in bar_ax.containers:
        bar_ax.bar_label(bars, size=15, color='black')

    # Panel 3: share of rows per category.
    category_counts = df[y].value_counts()
    pie_ax.pie(category_counts, labels=category_counts.index, autopct='%0.2f%%')

    fig.suptitle("Insurance prices histplots and mean barplots grouped by {}".format(y),
                 size=20)
    fig.tight_layout()
    plt.show()

In [None]:
# Categorical columns used to break down the insurance-price distribution.
cats = ['gender', 'smoking_status', 'location']
for cat_col in cats:
    plots(df, x='health_insurance_price', y=cat_col)

In [None]:
def remove_outliers(df, i, iqr_factor=1.5):
    """Cap the outliers of column ``i`` at the IQR fences and return a new frame.

    Despite the name, no rows are dropped: values below
    ``Q1 - iqr_factor * IQR`` are raised to that bound and values above
    ``Q3 + iqr_factor * IQR`` are lowered to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is NOT modified; a copy is returned, so
        re-running a cell that calls this function cannot corrupt its input.
    i : column label
        Numeric column to cap.
    iqr_factor : float, default 1.5
        Multiple of the inter-quartile range that defines the fences.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose column ``i`` is clipped to the fences.
    """
    # Series.quantile ignores NaN, so a stray missing value cannot turn both
    # bounds into NaN (which would silently disable the capping).
    q1, q3 = df[i].quantile([0.25, 0.75])
    iqr = q3 - q1
    min_bound = q1 - iqr * iqr_factor
    max_bound = q3 + iqr * iqr_factor

    capped = df.copy()
    # clip() upcasts as needed, unlike .loc assignment of a float into an
    # integer column, which pandas has deprecated.
    capped[i] = capped[i].clip(lower=min_bound, upper=max_bound)
    return capped

In [None]:
# Apply the IQR-based capping to every numeric column in turn.
# NOTE(review): this rebinds `df`, so re-running the cell caps already-capped data.
for num_col in nums:
    df = remove_outliers(df, num_col)

# Data after capping the outliers (values beyond the 1.5×IQR fences are clipped, not dropped)

In [None]:
# Same box plots as before, redrawn on the capped data for comparison.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
for numeric_col, panel in zip(nums, axes):
    sns.boxplot(data=df, x=numeric_col, ax=panel)

In [None]:
# Redraw the per-category price summaries on the capped data.
for cat_col in cats:
    plots(df, x='health_insurance_price', y=cat_col)

# Encoding data for training a regression task

In [None]:
# Integer-encode each categorical column with its OWN LabelEncoder.
# A single shared instance is refitted on every column, so storing it repeatedly
# would leave `encoders` holding several references to one object that only knows
# the classes of the last column - earlier columns could not be inverse-transformed.
# encoders[k] corresponds to cats[k].
encoders = []
for i in cats:
    le = LabelEncoder()
    df[i] = le.fit_transform(df[i])
    encoders.append(le)

In [None]:
# Features are every column except the last; the last column is the target.
# NOTE(review): assumes the final column is the insurance price - confirm against df.head().
feature_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]
x, y = feature_frame.values, target_series.values

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Training and evaluation pipeline

In [None]:
def training(model, name):
    """Fit ``model`` on the training split, print its test-set metrics, return it.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Regressor to train; it is fitted in place.
    name : str
        Label printed above the metrics.

    Returns
    -------
    The same ``model`` object, now fitted.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # scikit-learn metrics take (y_true, y_pred). MAE and MSE are symmetric, but
    # R^2 is not: with the arguments swapped it is normalised by the variance of
    # the predictions instead of the true targets and reports a wrong score.
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(name)
    print("MAE: {}\nMSE: {}\nr2: {}".format(mae, mse, r2))
    print("-"*10, "\n")
    return model

# Defining models and tuning their hyperparameters

In [None]:
# Candidate regressors. The stochastic estimators (AdaBoost, Random Forest, XGBoost)
# are seeded so that Restart & Run All reproduces the same scores; 42 matches the
# seed used for the train/test split. SVR, k-NN and linear regression are deterministic.
abr = AdaBoostRegressor(learning_rate=0.0001, random_state=42)
rfr = RandomForestRegressor(max_depth=50, random_state=42)
svr = SVR(C=0.5)
knr = KNeighborsRegressor(n_neighbors=5)
lnr = LinearRegression()
xgb = XGBRegressor(random_state=42)
# `models` and `names` are parallel lists: names[k] is the display label of models[k].
models = [abr, rfr, svr, knr, lnr, xgb]
names = ['Ada Boost', 'Random Forest', 'SVM',
        'Nearest Neighbors', 'Linear Regression',
        'XGB']

# Results

In [None]:
# Train and evaluate every candidate; keep the fitted models in the same order as `models`.
trained = [training(model, label) for model, label in zip(models, names)]