### Dataset preparation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv(os.path.join(os.getcwd(), 'insurance.csv'))

df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [3]:
df.head().nunique()

age         5
sex         2
bmi         5
children    3
smoker      2
region      3
charges     5
dtype: int64

In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
features = list(df.columns)
target = 'charges'
features.remove(target)

df.sex.replace({'female' : 0, 'male' : 1}, inplace=True)
df.smoker.replace({'no' : 0, 'yes' : 1}, inplace=True)

ohe = OneHotEncoder(sparse=False)
ohe.fit(df.region.values.reshape(-1, 1))
preps = ohe.transform(df.region.values.reshape(-1, 1)).T

for i in range(preps.shape[0]):
    category = ohe.categories_[0][i]
    df[category] = preps[i]
    features.append(category)
    
features.remove('region')

In [6]:
df[features].dtypes

age            int64
sex            int64
bmi          float64
children       int64
smoker         int64
northeast    float64
northwest    float64
southeast    float64
southwest    float64
dtype: object

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, 
                                                    random_state=47, stratify=pd.qcut(df[target], q=[0, 0.25, 0.5, 0.75, 1]))

In [8]:
y_train.mean(), y_test.mean()

(13352.334457971969, 12943.384778839552)

In [9]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

predict = linreg.predict(X_test)

In [10]:
predict = linreg.predict(X_train)
mae = mean_absolute_error(y_train, predict)
mse = mean_squared_error(y_train, predict, squared=False)
r2 = r2_score(y_train, predict)

print(f"MAE: {mae:.2f}, RMSE: {mse:.2f}, r**2: {r2:.4f}" )

MAE: 4153.73, RMSE: 6068.57, r**2: 0.7532


In [11]:
predict = linreg.predict(X_test)
mae = mean_absolute_error(y_test, predict)
mse = mean_squared_error(y_test, predict, squared=False)
r2 = r2_score(y_test, predict)

print(f"MAE: {mae:.2f}, RMSE: {mse:.2f}, r**2: {r2:.4f}" )

MAE: 4289.90, RMSE: 5961.42, r**2: 0.7381


In [12]:
pd.concat([X_train, y_train], axis=1).to_csv('train.csv', index=False)
pd.concat([X_test, y_test], axis=1).to_csv('test.csv', index=False)