In [82]:
import numpy as np # Mathematical Library
import pandas as pd # Dataset manage library

In [83]:
# import dataset
#.CSV files store tabular data in plain text, like Excel
dataset = pd.read_csv('datasets/Data.csv')
# data.iloc[<row selection],<column selection>]
X = dataset.iloc[ : , :-1].values #except last column of data frame
Y = dataset.iloc[ : , 3].values # the 3rd column of data frame

In [84]:
dataset


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [85]:
print(X)
print(Y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [86]:
#handling the missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan , strategy = 'mean')
# Replace the NaN data by the mean of column
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [88]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [89]:
# Encoding categorical data
'''
Label Encoder class: 
convert categorical text data into model-understandable numerical data.
'''
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# from sklearn.compose import ColumnTransformer
labelencoder_X = LabelEncoder()
X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0]) # encoded a set of country names into numerical data

In [90]:
print(X)

[[0 44.0 72000.0]
 [2 27.0 48000.0]
 [1 30.0 54000.0]
 [2 38.0 61000.0]
 [1 40.0 63777.77777777778]
 [0 35.0 58000.0]
 [2 38.77777777777778 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]


In [91]:
# onehotencoder = OneHotEncoder(categorical_features = [0])
# X = onehotencoder.fit_transform(X).toarray()
'''
OneHotEncoder: 
in case of a column has data with some kind of order or hierarchy.
 splits the column into multiple columns. 
 The numbers are replaced by 1s and 0s,
 depending on which column has what value. 
'''
onehotencoder = OneHotEncoder(categories='auto')#Determine categories automatically from the training data
X = onehotencoder.fit_transform(X)
labelencoder_Y = LabelEncoder()
Y =  labelencoder_Y.fit_transform(Y)

In [92]:
print("X:",X)
print("Y:",Y)

X:   (0, 0)	1.0
  (0, 10)	1.0
  (0, 20)	1.0
  (1, 2)	1.0
  (1, 3)	1.0
  (1, 13)	1.0
  (2, 1)	1.0
  (2, 4)	1.0
  (2, 15)	1.0
  (3, 2)	1.0
  (3, 7)	1.0
  (3, 17)	1.0
  (4, 1)	1.0
  (4, 9)	1.0
  (4, 18)	1.0
  (5, 0)	1.0
  (5, 5)	1.0
  (5, 16)	1.0
  (6, 2)	1.0
  (6, 8)	1.0
  (6, 14)	1.0
  (7, 0)	1.0
  (7, 11)	1.0
  (7, 21)	1.0
  (8, 1)	1.0
  (8, 12)	1.0
  (8, 22)	1.0
  (9, 0)	1.0
  (9, 6)	1.0
  (9, 19)	1.0
Y: [0 1 0 0 1 1 0 1 0 1]


In [56]:
# Splitting the datasets into training sets and Test sets
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split( X , Y , test_size = 0.2, random_state = 0)

In [57]:
print("X_train:",X_train)
print("X_test:",X_test)
print("Y_train:",Y_train)
print("Y_test:",Y_test)

X_train:   (0, 1)	1.0
  (0, 9)	1.0
  (0, 18)	1.0
  (1, 0)	1.0
  (1, 6)	1.0
  (1, 19)	1.0
  (2, 2)	1.0
  (2, 3)	1.0
  (2, 13)	1.0
  (3, 2)	1.0
  (3, 8)	1.0
  (3, 14)	1.0
  (4, 0)	1.0
  (4, 11)	1.0
  (4, 21)	1.0
  (5, 2)	1.0
  (5, 7)	1.0
  (5, 17)	1.0
  (6, 0)	1.0
  (6, 10)	1.0
  (6, 20)	1.0
  (7, 0)	1.0
  (7, 5)	1.0
  (7, 16)	1.0
X_test:   (0, 1)	1.0
  (0, 4)	1.0
  (0, 15)	1.0
  (1, 1)	1.0
  (1, 12)	1.0
  (1, 22)	1.0
Y_train: [1 1 1 0 1 0 0 1]
Y_test: [0 0]


In [39]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler(with_mean=False) 
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)