In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [3]:
# Read the insurance dataset from its relative location and preview a few rows.
DATA_PATH = "../datasets/insurance.csv"
df = pd.read_csv(DATA_PATH)
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [4]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# One-hot encode the text columns, dropping the first level of each.
# NOTE: `df` is rebound here, so the original text columns are no longer
# available afterwards and this cell cannot simply be re-run on its own output.
categorical_columns = ["sex", "smoker", "region"]
df = pd.get_dummies(
    data=df,
    columns=categorical_columns,
    drop_first=True,
)
df.head(3)

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,False,True,False,False,True
1,18,33.77,1,1725.5523,True,False,False,True,False
2,28,33.0,3,4449.462,True,False,False,True,False


In [6]:
# Features: every column except the target.
x = df.drop(columns=["charges"])
# Target, kept as a single-column DataFrame (double brackets).
y = df[["charges"]]

In [10]:
# Use 75% of the rows for training and the remainder for testing;
# the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.75,
    random_state=57,
)

In [11]:
# Fit an ordinary least squares regression on the training split only.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [15]:
# Score the fitted model on both splits (the estimator's default `score` metric).
train_score = model.score(x_train, y_train)
test_score = model.score(x_test, y_test)
print(f"Training set score: {train_score}")
print(f"Test set score: {test_score}")

Training set score: 0.7420548276411689
Test set score: 0.7743092620329011


In [19]:
# Start the error-analysis frame with the observed charges as column "y".
df_hata = pd.DataFrame({"y": df["charges"]})

In [21]:
# Target and features for the complete dataset (no train/test separation).
y_all = df["charges"]
x_all = df.drop(columns="charges")

In [23]:
# Predict charges for every row of the dataset. This includes the rows the
# model was trained on, so errors derived from this column are not a pure
# hold-out estimate.
# The model was fit on a single-column DataFrame target, so `predict`
# presumably returns an (n, 1) array; np.ravel flattens it to 1-D so the
# column assignment does not depend on pandas accepting a 2-D value.
df_hata["prediction"] = np.ravel(model.predict(x_all))
df_hata.head(3)

Unnamed: 0,y,prediction
0,16884.924,25352.393518
1,1725.5523,3442.865459
2,4449.462,6707.316337


In [24]:
# Signed residual: observed charge minus predicted charge (index-aligned).
df_hata["error"] = df["charges"].sub(df_hata["prediction"])
df_hata.head(3)

Unnamed: 0,y,prediction,error
0,16884.924,25352.393518,-8467.469518
1,1725.5523,3442.865459,-1717.313159
2,4449.462,6707.316337,-2257.854337


In [25]:
# Square each residual; the mean of this column is the MSE.
df_hata["squared_error"] = df_hata["error"].pow(2)
df_hata.head(3)

Unnamed: 0,y,prediction,error,squared_error
0,16884.924,25352.393518,-8467.469518,71698040.0
1,1725.5523,3442.865459,-1717.313159,2949164.0
2,4449.462,6707.316337,-2257.854337,5097906.0


In [26]:
# Magnitude of each residual. The column is called "abstract_error" but holds
# the absolute error; the name is kept because the next cell reads it.
df_hata["abstract_error"] = df_hata["error"].abs()
df_hata.head(3)

Unnamed: 0,y,prediction,error,squared_error,abstract_error
0,16884.924,25352.393518,-8467.469518,71698040.0,8467.469518
1,1725.5523,3442.865459,-1717.313159,2949164.0,1717.313159
2,4449.462,6707.316337,-2257.854337,5097906.0,2257.854337


In [27]:
 df_hata["percent_error"] = df_hata["abstract_error"] / df_hata["y"]
df_hata.head(3)

Unnamed: 0,y,prediction,error,squared_error,abstract_error,percent_error
0,16884.924,25352.393518,-8467.469518,71698040.0,8467.469518,0.501481
1,1725.5523,3442.865459,-1717.313159,2949164.0,1717.313159,0.995225
2,4449.462,6707.316337,-2257.854337,5097906.0,2257.854337,0.507444


In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [30]:
# Mean squared error over all rows of df_hata (training and test rows together).
mean_squared_error(
    y_true=df_hata["y"],
    y_pred=df_hata["prediction"],
)

36549366.43836404

In [31]:
# Mean absolute error over all rows of df_hata (training and test rows together).
mean_absolute_error(
    y_true=df_hata["y"],
    y_pred=df_hata["prediction"],
)

4199.204837855979

In [32]:
# Mean absolute percentage error over all rows of df_hata, returned as a
# fraction rather than a percentage (see the 0.426... output).
mean_absolute_percentage_error(
    y_true=df_hata["y"],
    y_pred=df_hata["prediction"],
)

0.42608825147929624