In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor


from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt


ds = pd.read_csv('insurance.csv')

print(ds.head(15))

    age     sex     bmi  children smoker     region      charges
0    19  female  27.900         0    yes  southwest  16884.92400
1    18    male  33.770         1     no  southeast   1725.55230
2    28    male  33.000         3     no  southeast   4449.46200
3    33    male  22.705         0     no  northwest  21984.47061
4    32    male  28.880         0     no  northwest   3866.85520
5    31  female  25.740         0     no  southeast   3756.62160
6    46  female  33.440         1     no  southeast   8240.58960
7    37  female  27.740         3     no  northwest   7281.50560
8    37    male  29.830         2     no  northeast   6406.41070
9    60  female  25.840         0     no  northwest  28923.13692
10   25    male  26.220         0     no  northeast   2721.32080
11   62  female  26.290         0    yes  southeast  27808.72510
12   23    male  34.400         0     no  southwest   1826.84300
13   56  female  39.820         0     no  southeast  11090.71780
14   27    male     NaN  

In [8]:
#checking for null numbers
count_nan = ds.isnull().sum()
print(count_nan[count_nan>0])

bmi    5
dtype: int64


In [9]:
#filling null numeb
ds['bmi'].fillna(ds['bmi'].mean(), inplace = True)

In [10]:
#checking if it has filled
count_nan = ds.isnull().sum()
print(count_nan[count_nan>0])

Series([], dtype: int64)


In [11]:
#create array for encoding
sex = ds.iloc[:,1:2].values
smoker = ds.iloc[:,4:5].values

In [12]:
#label encoder for sex
le = LabelEncoder()
sex[:,0] = le.fit_transform(sex[:,0])
sex = pd.DataFrame(sex)
sex.columns = ['sex']
le_sex_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"results> ", le_sex_mapping)
                      

results>  {'female': 0, 'male': 1}


In [13]:
#label encoder for smoker
le = LabelEncoder()
smoker[:,0] = le.fit_transform(smoker[:,0])
smoker = pd.DataFrame(smoker)
smoker.columns = ['smoker']
le_smoker_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"results> ", le_smoker_mapping)

results>  {'no': 0, 'yes': 1}


In [14]:
#label encoder for region
region = ds.iloc[:,5:6].values
ohe = OneHotEncoder()
region = ohe.fit_transform(region).toarray()
region = pd.DataFrame(region)
region.columns = ['northeast', 'northwest', 'southeast', 'southwest']
print(region[:10])

   northeast  northwest  southeast  southwest
0        0.0        0.0        0.0        1.0
1        0.0        0.0        1.0        0.0
2        0.0        0.0        1.0        0.0
3        0.0        1.0        0.0        0.0
4        0.0        1.0        0.0        0.0
5        0.0        0.0        1.0        0.0
6        0.0        0.0        1.0        0.0
7        0.0        1.0        0.0        0.0
8        1.0        0.0        0.0        0.0
9        0.0        1.0        0.0        0.0


In [15]:
#splitting data for test purposes
X_num = ds[['age', 'bmi', 'children']]
X_final = pd.concat([X_num, sex, smoker, region], axis = 1)
y_final = ds[['charges']].copy()
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0)


In [16]:
#normalized scaler
n_scaler = MinMaxScaler()
X_train = n_scaler.fit_transform(X_train.astype(float))
X_test = n_scaler.transform(X_train.astype(float))




In [17]:
#standard scaler
s_scaler = StandardScaler()
X_train = s_scaler.fit_transform(X_train.astype(float))
X_test = s_scaler.transform(X_train.astype(float))


In [18]:
#linear regression
lr = LinearRegression().fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_predict = lr.predict(X_test)

print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_ {}".format(lr.intercept_))


lr.coef_: [[3624.36356197 1966.90473927  661.35603447  -44.54996175 9310.54961689
   242.57758422  -29.49212715 -104.19142495  -99.14488063]]
lr.intercept_ [13141.35083164]


In [19]:
#polynomial regression
poly_lr = LinearRegression().fit(X_train, y_train)

y_train_pred = poly_lr.predict(X_train)
y_test_pred = poly_lr.predict(X_test)


print(poly_lr.score(X_train, y_train))



0.7283337653139448


In [20]:
#Support vector analysis 
svr = SVR(kernel='linear', C=300)

#standard scaler
s_scaler = StandardScaler()
X_train = s_scaler.fit_transform(X_train.astype(float))
X_test = s_scaler.transform(X_train.astype(float))

svr = svr.fit(X_train, y_train.values.ravel())
y_train_pred = svr.predict(X_train)
y_test_pred = svr.predict(X_test)



[ 3257.75779835 18674.0402738   4451.01660112 11576.3519452
   967.86714563 26524.38705965  9507.72169017  3858.9602114
  3165.654251    9234.53867716  9770.87108285  8522.4198571
 11688.15723281  4206.06379747  1979.66347853  2296.52804792
  4069.77015787  7889.31816667 17722.42815777 12107.6888466
  8001.94691364  8803.33600346 11499.50933335  1426.7259441
 10826.3976031  10888.63479943  3228.86158603  1015.83770316
 25645.74062007  8049.7659534   8650.00935147  3260.4803684
 11986.89778626  6310.92812436 11439.82248412 17942.97147299
 19300.35237488  9528.75240379  8175.51209082   730.34846033
  4569.43965082  8584.45384959  1613.42838008  9656.80564672
  1255.66717371  3390.85631073 23545.00619916  6963.80860948
  7941.20810375  5050.17883313 25827.13215396 10359.49148712
  8633.42826562  5078.4839957   6667.18812424  3769.68417337
  1562.03084398  9106.87802481 25215.62699128  5348.28916369
 25394.96096361 10199.91470364  7503.21211999  3087.81562714
  6812.13030405 17739.47848596

In [22]:
dt = DecisionTreeRegressor(random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0)

s_scaler = StandardScaler()
X_train = s_scaler.fit_transform(X_train.astype(float))
X_test = s_scaler.transform(X_train.astype(float))

dt = dt.fit(X_train, y_train.values.ravel())
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

print('dt train score %.3f, dt test score: %.3f' % (dt.score(X_train, y_train), dt.score(X_test, y_test)))



ValueError: Found input variables with inconsistent numbers of samples: [442, 896]

In [None]:
#random forest regression
forest = RandomForestRegressor(n_estimators = 100, criterios = 'mse', random_state = 1, n_jobs = -1)

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0)

s_scaler = StandardScaler()
X_train = s_scaler.fit_transform(X_train.astype(float))
X_test = s_scaler.transform(X_train.astype(float))

forest.fit(X_train, y_train.values.ravel())
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)