In [37]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt


ds = pd.read_csv('insurance.csv')

print(ds.head(15))

    age     sex     bmi  children smoker     region      charges
0    19  female  27.900         0    yes  southwest  16884.92400
1    18    male  33.770         1     no  southeast   1725.55230
2    28    male  33.000         3     no  southeast   4449.46200
3    33    male  22.705         0     no  northwest  21984.47061
4    32    male  28.880         0     no  northwest   3866.85520
5    31  female  25.740         0     no  southeast   3756.62160
6    46  female  33.440         1     no  southeast   8240.58960
7    37  female  27.740         3     no  northwest   7281.50560
8    37    male  29.830         2     no  northeast   6406.41070
9    60  female  25.840         0     no  northwest  28923.13692
10   25    male  26.220         0     no  northeast   2721.32080
11   62  female  26.290         0    yes  southeast  27808.72510
12   23    male  34.400         0     no  southwest   1826.84300
13   56  female  39.820         0     no  southeast  11090.71780
14   27    male     NaN  

In [3]:
#checking for null numbers
count_nan = ds.isnull().sum()
print(count_nan[count_nan>0])

bmi    5
dtype: int64


In [4]:
#filling null numeb
ds['bmi'].fillna(ds['bmi'].mean(), inplace = True)

In [5]:
#checking if it has filled
count_nan = ds.isnull().sum()
print(count_nan[count_nan>0])

Series([], dtype: int64)


In [6]:
#create array for encoding
sex = ds.iloc[:,1:2].values
smoker = ds.iloc[:,4:5].values

In [7]:
#label encoder for sex
le = LabelEncoder()
sex[:,0] = le.fit_transform(sex[:,0])
sex = pd.DataFrame(sex)
sex.columns = ['sex']
le_sex_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"results> ", le_sex_mapping)
                      

results>  {'female': 0, 'male': 1}


In [8]:
#label encoder for smoker
le = LabelEncoder()
smoker[:,0] = le.fit_transform(smoker[:,0])
smoker = pd.DataFrame(smoker)
smoker.columns = ['smoker']
le_smoker_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"results> ", le_smoker_mapping)

results>  {'no': 0, 'yes': 1}


In [9]:
#label encoder for region
region = ds.iloc[:,5:6].values
ohe = OneHotEncoder()
region = ohe.fit_transform(region).toarray()
region = pd.DataFrame(region)
region.columns = ['northeast', 'northwest', 'southeast', 'southwest']
print(region[:10])

   northeast  northwest  southeast  southwest
0        0.0        0.0        0.0        1.0
1        0.0        0.0        1.0        0.0
2        0.0        0.0        1.0        0.0
3        0.0        1.0        0.0        0.0
4        0.0        1.0        0.0        0.0
5        0.0        0.0        1.0        0.0
6        0.0        0.0        1.0        0.0
7        0.0        1.0        0.0        0.0
8        1.0        0.0        0.0        0.0
9        0.0        1.0        0.0        0.0


In [18]:
#splitting data for test purposes
X_num = ds[['age', 'bmi', 'children']]
X_final = pd.concat([X_num, sex, smoker, region], axis = 1)
y_final = ds[['charges']].copy()
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0)


      age     bmi  children sex smoker  northeast  northwest  southeast  \
0      19  27.900         0   0      1        0.0        0.0        0.0   
1      18  33.770         1   1      0        0.0        0.0        1.0   
2      28  33.000         3   1      0        0.0        0.0        1.0   
3      33  22.705         0   1      0        0.0        1.0        0.0   
4      32  28.880         0   1      0        0.0        1.0        0.0   
...   ...     ...       ...  ..    ...        ...        ...        ...   
1333   50  30.970         3   1      0        0.0        1.0        0.0   
1334   18  31.920         0   0      0        1.0        0.0        0.0   
1335   18  36.850         0   0      0        0.0        0.0        1.0   
1336   21  25.800         0   0      0        0.0        0.0        0.0   
1337   61  29.070         0   0      1        0.0        1.0        0.0   

      southwest  
0           1.0  
1           0.0  
2           0.0  
3           0.0  
4        

In [16]:
#normalized scaler
n_scaler = MinMaxScaler()
X_train = n_scaler.fit_transform(X_train.astype(float))
X_test = n_scaler.transform(X_train.astype(float))


      age     bmi  children
0      19  27.900         0
1      18  33.770         1
2      28  33.000         3
3      33  22.705         0
4      32  28.880         0
...   ...     ...       ...
1333   50  30.970         3
1334   18  31.920         0
1335   18  36.850         0
1336   21  25.800         0
1337   61  29.070         0

[1338 rows x 3 columns]


In [26]:
#standard scaler
s_scaler = StandardScaler()
X_train = s_scaler.fit_transform(X_train.astype(float))
X_test = s_scaler.transform(X_train.astype(float))


In [28]:
#linear regression
lr = LinearRegression().fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_predict = lr.predict(X_test)

print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_ {}".format(lr.intercept_))


lr.coef_: [[3624.36356197 1966.90473927  661.35603447  -44.54996175 9310.54961689
   242.57758422  -29.49212715 -104.19142495  -99.14488063]]
lr.intercept_ [13141.35083164]


In [35]:
#polynomial regression
poly_lr = LinearRegression().fit(X_train, y_train)

y_train_pred = poly_lr.predict(X_train)
y_test_pred = poly_lr.predict(X_test)


print(poly_lr.score(X_train, y_train))



0.7283337653139448


In [38]:
#Support vector analysis 
svr = SVR(kernel='linear', C=300)

#standard scaler
s_scaler = StandardScaler()
X_train = s_scaler.fit_transform(X_train.astype(float))
X_test = s_scaler.transform(X_train.astype(float))

svr = svr.fit(X_train, y_train.values.ravel())
y_train_pred = svr.predict(X_train)
y_test_pred = svr.predict(X_test)

print(y_train_pred)