### Loadin toolboxes

In [1]:
import numpy as np
import pandas as pd

### Loading data

In [2]:
data = pd.read_csv('Data.csv')

#### Summary of the data

In [3]:
data.describe(include = 'all').transpose

<bound method DataFrame.transpose of        Country        Age        Salary Purchased
count       10   9.000000      9.000000        10
unique       3        NaN           NaN         2
top     France        NaN           NaN        No
freq         4        NaN           NaN         5
mean       NaN  38.777778  63777.777778       NaN
std        NaN   7.693793  12265.579662       NaN
min        NaN  27.000000  48000.000000       NaN
25%        NaN  35.000000  54000.000000       NaN
50%        NaN  38.000000  61000.000000       NaN
75%        NaN  44.000000  72000.000000       NaN
max        NaN  50.000000  83000.000000       NaN>

#### Finding the missed values

In [4]:
data.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

The above result indicates that there are one missed values in both *Age* and *Salary* columns

#### Slicing the data into the input and output sets
Input data are defined as below

In [5]:
X = data.iloc[:,:-1].values

And outputs are defined as

In [6]:
y = data.iloc[:,3].values

### Missed values

In [7]:
from sklearn.preprocessing import Imputer 

imputer  = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])

X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Dealing with categorical values
There are two columns with categorical values: 
* *Country*
* *Purchased*
So for teh inputs

In [8]:
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:,0] = labelencoder_X.fit_transform(X[:,0])

X


array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

But  *Countries* is a nominal data so

In [9]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()

X.astype(int)

array([[    1,     0,     0,    44, 72000],
       [    0,     0,     1,    27, 48000],
       [    0,     1,     0,    30, 54000],
       [    0,     0,     1,    38, 61000],
       [    0,     1,     0,    40, 63777],
       [    1,     0,     0,    35, 58000],
       [    0,     0,     1,    38, 52000],
       [    1,     0,     0,    48, 79000],
       [    0,     1,     0,    50, 83000],
       [    1,     0,     0,    37, 67000]])

And for the outputs  

In [10]:
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

### Splitting into trainig and test sets

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

print("X_train-------------------------------------------------------------------------------------------------------")
print(X_train.astype(int))

print("X_test--------------------------------------------------------------------------------------------------------")
print(X_test.astype(int))

print("y_train-------------------------------------------------------------------------------------------------------")
print(y_train.astype(int))

print("y_test--------------------------------------------------------------------------------------------------------")
print(y_test.astype(int))

X_train-------------------------------------------------------------------------------------------------------
[[    1     0     0    48 79000]
 [    1     0     0    37 67000]
 [    0     0     1    38 52000]
 [    0     1     0    40 63777]
 [    0     0     1    27 48000]
 [    1     0     0    44 72000]
 [    0     0     1    38 61000]
 [    0     1     0    50 83000]]
X_test--------------------------------------------------------------------------------------------------------
[[    0     1     0    30 54000]
 [    1     0     0    35 58000]]
y_train-------------------------------------------------------------------------------------------------------
[1 1 0 1 1 0 0 0]
y_test--------------------------------------------------------------------------------------------------------
[0 1]
