In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# == Importing the dataset ==
dataset = pd.read_csv('insurance.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [13]:
# == Taking care of missing data ==
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(x[:,0:1])
x[:,0:1] = imputer.transform(x[:,0:1])
imputer.fit(x[:,2:4])
x[:,2:4] = imputer.transform(x[:,2:4])

In [14]:
# == Encoding categorical data ==
## Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers = [('encoder',OneHotEncoder(),[1,4,5])],remainder = 'passthrough')
x = np.array(ct.fit_transform(x))
## Encoding the Dependent Variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [15]:
# == Splitting the dataset into the Training set and Test set ==
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size =0.2, random_state =1)

In [16]:
# == Feature Scaling (Standardisation method) ==
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:,8:] = sc.fit_transform(x_train[:,8:])
x_test[:,8:] = sc.transform(x_test[:,8:])

In [17]:
## Showing results
print("\n == x ==")
print(x)
print("\n == y ==")
print(y)
print("\n==============")

print("\n == x_train ==")
print(x_train)
print("\n == x_test ==")
print(x_test)
print("\n == y_train ==")
print(y_train)
print("\n == y_test ==")
print(y_test)
print("\n\n\n")


 == x ==
[[1.0 0.0 0.0 ... 19.0 27.9 0.0]
 [0.0 1.0 1.0 ... 18.0 33.77 1.0]
 [0.0 1.0 1.0 ... 28.0 33.0 3.0]
 ...
 [1.0 0.0 1.0 ... 18.0 36.85 0.0]
 [1.0 0.0 1.0 ... 21.0 25.8 0.0]
 [1.0 0.0 0.0 ... 61.0 29.07 0.0]]

 == y ==
[1005   57  306 ...   32   91 1171]


 == x_train ==
[[1.0 0.0 1.0 ... 1.0022862943641633 -0.6647447195470174
  -0.9070577122378711]
 [0.0 1.0 1.0 ... 1.0022862943641633 -1.51402368505697
  -0.07894188280568505]
 [0.0 1.0 1.0 ... -1.5042660745131606 1.0811768460878646
  -0.9070577122378711]
 ...
 [1.0 0.0 0.0 ... 0.8590547304283163 0.7006345403882132 0.749173946626501]
 [1.0 0.0 0.0 ... 0.0712811287811573 -1.3800989251111697
  0.749173946626501]
 [0.0 1.0 1.0 ... 1.2887494222358575 -0.44589206305022205
  -0.07894188280568505]]

 == x_test ==
[[0.0 1.0 1.0 ... -1.4326502925452371 0.7937285808383426
  -0.9070577122378711]
 [0.0 1.0 1.0 ... 1.2887494222358575 0.14207029768743673
  -0.9070577122378711]
 [0.0 1.0 1.0 ... 0.8590547304283163 1.0338132114728866
  -0.9070

In [18]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
print("== Predicting the Test set results (Multiple Linear Regression) ==")
print(y_pred)

print("\n == Predicting single result (row-50) ==")
print(regressor.predict([[0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.5752125501075516,0.1469699840269174,-0.9070577122378711
]]))
print("row-50: ", y_pred[50])

== Predicting the Test set results (Multiple Linear Regression) ==
[ 217.375  758.25   669.375  760.375  192.625 1118.25   701.25   797.375
  325.875 1089.25   778.625  849.5    531.125  564.25   172.     654.625
  366.     515.25   847.25   847.25   725.625 1132.375  612.625  608.25
  245.5    538.875  656.625  641.375  479.125  367.375  800.375  434.5
 1456.125  841.25  1338.125  667.25   992.875  937.     814.    1280.
  405.625  785.75   735.875  817.375  408.25   822.5    235.125  946.5
  642.875  932.375  791.5    788.375  237.875  653.375  869.     454.75
 1294.5    826.25   352.375  210.25   371.875  859.25   922.625  276.125
  859.75   659.125  717.5    761.875  231.5    930.875 1427.5   1142.125
  195.75   805.75   825.    1270.5    269.625  304.75   723.75   724.375
  260.125  830.5    732.25   238.125 1216.625 1024.     547.75  1330.
  775.375  622.5   1092.125 1402.875  873.125  666.375  159.75   664.5
  682.25   837.375  828.     525.5    769.25   894.75  1033.75   828.87