# Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing the dataset

In [5]:
# Read the raw CSV into a DataFrame and preview its first five rows
dataset = pd.read_csv(filepath_or_buffer='Data.csv')
dataset.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [6]:
# Summary statistics for every column; include='all' also reports the non-numeric columns
dataset.describe(include='all')

Unnamed: 0,Country,Age,Salary,Purchased
count,10,9.0,9.0,10
unique,3,,,2
top,France,,,No
freq,4,,,5
mean,,38.777778,63777.777778,
std,,7.693793,12265.579662,
min,,27.0,48000.0,
25%,,35.0,54000.0,
50%,,38.0,61000.0,
75%,,44.0,72000.0,


In [3]:
# Split the frame into independent and dependent variables.
# X: every column except the last (Country, Age, Salary); y: the last column (Purchased).
# .to_numpy() is the pandas-recommended replacement for .values and always
# returns a NumPy ndarray.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [4]:
# Inspect the feature matrix (Country, Age, Salary)
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
# Inspect the target vector (Purchased)
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# Handling missing data

In [6]:
# Show only the rows that contain at least one missing value
rows_with_missing = dataset.isna().any(axis=1)
dataset[rows_with_missing]

Unnamed: 0,Country,Age,Salary,Purchased
4,Germany,40.0,,Yes
6,Spain,,52000.0,No


In [7]:
# SimpleImputer will replace the missing Age and Salary values with the column mean
from sklearn.impute import SimpleImputer

In [8]:
# Feature matrix before imputation; the NaN entries are still present
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [9]:
# Replace each NaN in the numeric columns (Age, Salary) with that column's mean.
# NOTE(review): the imputer is fit on all rows, before the train/test split further
# down, so rows that end up in the test set contribute to the means. Consider
# fitting on the training rows only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
numeric_block = X[:, 1:]
X[:, 1:] = imputer.fit(numeric_block).transform(numeric_block)
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Encoding categorical data

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [11]:
# One-hot encode the Country column (index 0); the remaining columns pass through unchanged.
# NOTE: this cell overwrites X, so re-running it would encode the already-encoded matrix again.
country_step = ('encoder', OneHotEncoder(), [0])
ct = ColumnTransformer(
    transformers=[country_step],
    remainder='passthrough',
)
X = ct.fit_transform(X)
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [12]:
# Encoding the dependent variable

from sklearn.preprocessing import LabelEncoder

In [13]:
# Map the class labels to integers ('No' -> 0, 'Yes' -> 1)
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Splitting data for training and testing

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [19]:
# Sanity-check the shapes of the train/test splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8, 5), (2, 5), (8,), (2,))

In [20]:
# Training features (not yet scaled)
X_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [21]:
# Test features (not yet scaled)
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [22]:
# Training labels
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [23]:
# Test labels
y_test

array([0, 1])

# Feature scaling

Normalisation is recommended when most features follow a normal distribution, while standardisation works well in general.
The scaler must not be fit on the test set: it is fit on the training set only, and the same fitted parameters are then used to scale the test set.

Dummy variables (one-hot encoded) do not need to be standardised. Feature scaling should be only applied to the numerical values.

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
# Scaler instance; it is fit on the training features and reused to transform the test features
sc = StandardScaler()

In [28]:
# Fit the scaler on the training Age and Salary columns (index 3 onward) and
# standardise them in place; the one-hot columns 0-2 are left untouched
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_train

array([[0.0, 0.0, 1.0, -0.19159184384578545, -1.0781259408412425],
       [0.0, 1.0, 0.0, -0.014117293757057777, -0.07013167641635372],
       [1.0, 0.0, 0.0, 0.566708506533324, 0.633562432710455],
       [0.0, 0.0, 1.0, -0.30453019390224867, -0.30786617274297867],
       [0.0, 0.0, 1.0, -1.9018011447007988, -1.420463615551582],
       [1.0, 0.0, 0.0, 1.1475343068237058, 1.232653363453549],
       [0.0, 1.0, 0.0, 1.4379472069688968, 1.5749910381638885],
       [1.0, 0.0, 0.0, -0.7401495441200351, -0.5646194287757332]],
      dtype=object)

In [29]:
# Scale the test features with the already-fitted scaler (no refit on test data)
test_numeric = X_test[:, 3:]
X_test[:, 3:] = sc.transform(test_numeric)
X_test

array([[0.0, 1.0, 0.0, -1.4661817944830124, -0.9069571034860727],
       [1.0, 0.0, 0.0, -0.44973664397484414, 0.2056403393225306]],
      dtype=object)

# Summary

**Libraries**

From sklearn:
- impute.SimpleImputer: filling missing values.
- compose.ColumnTransformer: class to apply transformers to columns.
- preprocessing.OneHotEncoder: one-hot encoding categorical variables.
- preprocessing.LabelEncoder: encoding labels (categorical dependent variables) as integers.
- model_selection.train_test_split: splitting the data into training and testing.
- preprocessing.StandardScaler: scaling numerical variables.

**Steps**
1. Dataset import
2. Handling missing data
3. Encoding categorical data
4. Splitting train and test dataset
5. Feature scaling