# Data Preparation

Credit: Notes based on Data Preparation Tutorial using Spyder IDE by Hadelin de Ponteves from SuperDataScience Team

# 1. Importing Data

In [166]:
pwd

'C:\\Users\\Aaron\\Desktop\\Deep Learning'

In [167]:
# importing the libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [168]:
# Load the CSV dataset into a pandas DataFrame.
# "Data.csv" is a relative path, so it is resolved against the current working directory.

data = pd.read_csv("Data.csv")

In [169]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [170]:
# Feature matrix X: all rows, every column except the last one
# (the last column is the dependent variable), taken as a NumPy array
X = data.iloc[:, :-1].values

In [171]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [172]:
# Target vector y: all rows of the last column (the dependent variable), taken as a NumPy array
y = data.iloc[:, -1].values

In [173]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

# 2. Taking Care of missing data

In [174]:
# Note: Missing data labelled "NaN"
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [175]:
# Two common ways to approach this problem

# Method 1: Remove rows with incomplete data

# Method 2: To take mean/median of columns (More commonly used)

In [176]:
# Will use Method 2 for this example

In [177]:
# Import Preprocessing library from Scikit Learn
from sklearn.preprocessing import Imputer

In [178]:
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)

# Shift Tab to view Imputer commands
# Init signature: Imputer(missing_values='NaN', strategy='mean' or 'median' or 'most_frequent', axis=0, verbose=0, copy=True), 
# Docstring:     
# Imputation transformer for completing missing values.

In [179]:
# Learn the column means and fill in the gaps in one step.
# The missing values sit in the 2nd and 3rd columns, i.e. indices 1 and 2
# (Python is zero-indexed). A slice excludes its upper bound, hence 1:3.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [180]:
# Missing values (NaN) have been replaced with the column means:
# Age mean value of 38.7777
# Salary mean value of 63777.7777
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# 3. Encode Categorical Data

In [181]:
# Will be encoding two (2) variables: "Country" & "Purchased"

In [182]:
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()

In [183]:
# Fit LabelEncoder on 1st column category "Country"
# Note that the Country labels have been converted to numbers

X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
X[:, 0]

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0], dtype=object)

In [184]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [185]:
# Problem: Since Machine Learning models are based on equations, the model might think that higher numbers are higher values.
# Is Spain > Germany > France ? (No)

In [186]:
# Solution: Dummy Variables
# Splitting Spain, Germany, and France into three (3) columns,
#    1 = yes (Is Country)
#    0 = no (Not Country)

In [187]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# OneHotEncode 1st column
# `OneHotEncoder(categorical_features = [0])` was removed in scikit-learn 0.22;
# selecting which columns to encode is now done with ColumnTransformer.
#   - the one-hot columns come first, followed by the untouched columns (Age, Salary),
#     which is the same layout the old `categorical_features = [0]` call produced
#   - sparse_threshold = 0 always returns a dense array (replaces the old `.toarray()`)
# The name `onehotencoder` is kept so any later use of the fitted transformer still works.
onehotencoder = ColumnTransformer(
    transformers = [('country', OneHotEncoder(), [0])],
    remainder = 'passthrough',
    sparse_threshold = 0,
)

# X is an object array at this point, so cast explicitly to get a float matrix
X = np.asarray(onehotencoder.fit_transform(X), dtype = float)

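Init signature (scikit-learn < 0.20): OneHotEncoder(n_values='auto', categorical_features='all', dtype=<class 'numpy.float64'>, sparse=True, handle_unknown='error')

Note: the `n_values` and `categorical_features` parameters were deprecated in scikit-learn 0.20 and removed in 0.22.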

Docstring:     
Encode categorical integer features using a one-hot aka one-of-K scheme.

In [188]:
# Column 1: France
# Column 2: Germany
# Column 3: Spain

X[:, 0:3]

array([[ 1.,  0.,  0.],
       [ 0.,  0.,  1.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.]])

In [189]:
# Encoding Purchase category
# Only need to use LabelEncoder as the column only has Yes / No values

In [190]:
# Encode the target labels as integers (in this notebook's data: 'No' -> 0, 'Yes' -> 1)
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

In [191]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

# 4. Split data into training set & test set

In [192]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# train_test_split now lives in `sklearn.model_selection` (available since 0.18).
from sklearn.model_selection import train_test_split
import warnings
# NOTE(review): this turns every warning raised from here on into an exception
# for the rest of the session - confirm that is intended ('default' or 'ignore' may have been meant).
warnings.simplefilter('error')

In [193]:
# Hold out 20% of the rows as the test set; the rest are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,  # random_state aka random seed (for reproducibility)
)

In [204]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,0.0,40.0,63777.777778
1,1.0,0.0,0.0,37.0,67000.0
2,0.0,0.0,1.0,27.0,48000.0
3,0.0,0.0,1.0,38.777778,52000.0
4,1.0,0.0,0.0,48.0,79000.0
5,0.0,0.0,1.0,38.0,61000.0
6,1.0,0.0,0.0,44.0,72000.0
7,1.0,0.0,0.0,35.0,58000.0


In [201]:
pd.DataFrame(y_train)

Unnamed: 0,0
0,1
1,1
2,1
3,0
4,1
5,0
6,0
7,1


In [195]:
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,0.0,30.0,54000.0
1,0.0,1.0,0.0,50.0,83000.0


In [197]:
pd.DataFrame(y_test)

Unnamed: 0,0
0,0
1,0


# 5. Feature Scaling

In [206]:
# Age & Salary variables are not on the same scale

In [210]:
# Age
pd.DataFrame(X[:, 3])

Unnamed: 0,0
0,44.0
1,27.0
2,30.0
3,38.0
4,40.0
5,35.0
6,38.777778
7,48.0
8,50.0
9,37.0


In [212]:
# Salary
pd.DataFrame(X[:,4])

Unnamed: 0,0
0,72000.0
1,48000.0
2,54000.0
3,61000.0
4,63777.777778
5,58000.0
6,52000.0
7,79000.0
8,83000.0
9,67000.0


In [213]:
# Method 1: Normalize 
# Method 2: Standard Scaling

In [214]:
from sklearn.preprocessing import StandardScaler 

In [215]:
# Learn each column's mean and standard deviation from the training set only,
# then apply that same transformation to both the training and the test set.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)


In [216]:
# Do we need to scale the dummy variables? (They are already 0 or 1)
# Scaling the dummy variables may help some models, but the scaled columns lose their direct 0/1 interpretation

In [217]:
# In this case, we shall scale the dummy variables

In [221]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4
0,-1.0,2.645751,-0.774597,0.263068,0.123815
1,1.0,-0.377964,-0.774597,-0.253501,0.461756
2,-1.0,-0.377964,1.290994,-1.975398,-1.530933
3,-1.0,-0.377964,1.290994,0.052614,-1.11142
4,1.0,-0.377964,-0.774597,1.640585,1.720297
5,-1.0,-0.377964,1.290994,-0.081312,-0.167514
6,1.0,-0.377964,-0.774597,0.951826,0.986148
7,1.0,-0.377964,-0.774597,-0.597881,-0.482149


In [222]:
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4
0,-1.0,2.645751,-0.774597,-1.458829,-0.901663
1,-1.0,2.645751,-0.774597,1.984964,2.139811


In [224]:
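# All variables are now standardized: each column has mean 0 and unit variance on the training set
# (values are not bounded to [-1, +1]; e.g. 2.645751 appears above).
# This helps scale-sensitive models (e.g. gradient-descent-based or distance-based ones) train better and converge faster.
# Tree-based models such as Decision Trees are largely unaffected by feature scaling.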

In [226]:
# Do we need to perform feature scaling on y? (Categorical variable of 0 & 1)     

# (No)

In [227]:
# Note: For regression problems, where y is continuous and can span a large range, scaling y may also be needed for scale-sensitive models