In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# == Importing the dataset ==
dataset = pd.read_csv('insurance.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [15]:
# == Taking care of missing data ==
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(x[:,0:1])
x[:,0:1] = imputer.transform(x[:,0:1])
imputer.fit(x[:,2:4])
x[:,2:4] = imputer.transform(x[:,2:4])

In [16]:
# == Encoding categorical data ==
## Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers = [('encoder',OneHotEncoder(),[1,4,5])],remainder = 'passthrough')
x = np.array(ct.fit_transform(x))
## Encoding the Dependent Variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [17]:
# == Splitting the dataset into the Training set and Test set ==
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size =0.2, random_state =1)

In [18]:
# == Feature Scaling (Standardisation method) ==
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:,8:] = sc.fit_transform(x_train[:,8:])
x_test[:,8:] = sc.transform(x_test[:,8:])

In [19]:
## Showing results
print("\n == x ==")
print(x)
print("\n == y ==")
print(y)
print("\n==============")

print("\n == x_train ==")
print(x_train)
print("\n == x_test ==")
print(x_test)
print("\n == y_train ==")
print(y_train)
print("\n == y_test ==")
print(y_test)


 == x ==
[[1.0 0.0 0.0 ... 19.0 27.9 0.0]
 [0.0 1.0 1.0 ... 18.0 33.77 1.0]
 [0.0 1.0 1.0 ... 28.0 33.0 3.0]
 ...
 [1.0 0.0 1.0 ... 18.0 36.85 0.0]
 [1.0 0.0 1.0 ... 21.0 25.8 0.0]
 [1.0 0.0 0.0 ... 61.0 29.07 0.0]]

 == y ==
[1005   57  306 ...   32   91 1171]


 == x_train ==
[[1.0 0.0 1.0 ... 1.0022862943641633 -0.6647447195470174
  -0.9070577122378711]
 [0.0 1.0 1.0 ... 1.0022862943641633 -1.51402368505697
  -0.07894188280568505]
 [0.0 1.0 1.0 ... -1.5042660745131606 1.0811768460878646
  -0.9070577122378711]
 ...
 [1.0 0.0 0.0 ... 0.8590547304283163 0.7006345403882132 0.749173946626501]
 [1.0 0.0 0.0 ... 0.0712811287811573 -1.3800989251111697
  0.749173946626501]
 [0.0 1.0 1.0 ... 1.2887494222358575 -0.44589206305022205
  -0.07894188280568505]]

 == x_test ==
[[0.0 1.0 1.0 ... -1.4326502925452371 0.7937285808383426
  -0.9070577122378711]
 [0.0 1.0 1.0 ... 1.2887494222358575 0.14207029768743673
  -0.9070577122378711]
 [0.0 1.0 1.0 ... 0.8590547304283163 1.0338132114728866
  -0.9070

In [20]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
print("== Predicting the Test set results (Multiple Linear Regression) ==")
print(y_pred)

print("\n == Predicting single result (row-50) ==")
print(regressor.predict([[0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.5752125501075516,0.1469699840269174,-0.9070577122378711
]]))
print("row-50: ", y_pred[50])

== Predicting the Test set results (Multiple Linear Regression) ==
[ 215.74418089  757.50021296  668.54471863  749.66205203  195.06300683
 1128.62813924  689.32537242  796.1052557   324.51607307 1093.75791617
  771.46711771  847.26011823  520.49137847  566.51283834  161.41485476
  654.92776269  367.38715205  517.32140993  836.99958647  848.65234572
  722.50924919 1133.07604217  614.20464544  609.92611397  243.9185895
  538.65870559  654.13398494  632.22714857  480.28065149  366.20943929
  801.03078707  435.61235763 1468.02694591  852.99666938 1341.93599587
  668.21008624 1005.15995172  950.7173194   803.38375336 1293.23562824
  396.03007337  784.23570523  727.20042386  816.16787092  411.6071233
  820.45267733  224.13102026  949.30051411  646.78519149  930.00654835
  781.44352242  790.17047396  239.09762752  655.87700444  879.92822066
  443.01251978 1296.77112202  827.27562001  351.74566045  197.64758381
  362.21483294  856.01409078  926.38150216  277.70601857  855.75636204
  658.008029