## Data Preprocessing

In [40]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Import the Dataset

In [41]:
# Load the spreadsheet into a DataFrame; the path is relative to the notebook
dataset = pd.read_excel('data.xlsx')

In [6]:
# Preview the first three rows to check the columns that were loaded
dataset.head(3)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No


#### Create the input feature matrix from the first 3 variables: Country, Age and Salary

In [42]:
# Feature matrix: every row, every column except the last one (index -1).
# .values turns the DataFrame slice into a NumPy array.
X = dataset.iloc[:, :-1].values

In [43]:
type(X)  # X is a NumPy array now: .values converted the DataFrame slice

numpy.ndarray

#### Extract 'Purchased' (column index 3) as the output/dependent variable

In [44]:
# Target vector: the last column ('Purchased') as a NumPy array.
# Indexing with -1 instead of a hard-coded position keeps y complementary to
# X, which is built from "every column except the last one".
y = dataset.iloc[:, -1].values

In [19]:
# Display the target array (raw 'Yes'/'No' labels at this point)
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

## Missing Data
#### Replace the missing data with mean values of the columns

#### Use scikit-learn's imputer class to fill in the missing values

In [45]:
from sklearn.impute import SimpleImputer
# The dataset contains two missing values, one in Age and one in Salary,
# which appear as NaN (see the Excel file).
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
# removed in 0.22; SimpleImputer is its replacement.

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Create the imputer object. SimpleImputer always works column by column,
# which is what axis=0 selected on the old Imputer class.

imputer = imputer.fit(X[:, 1:3])
# Age and Salary (column indices 1 and 2) are the columns containing NaN;
# fitting computes the mean of each of them.

X[:, 1:3] = imputer.transform(X[:, 1:3])
# Replace each missing value with the mean of its column


In [29]:
X
# No NaN values remain: the missing Age was imputed as 38.77 (column mean)
# and the missing Salary as 63777.77 (column mean)

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encode Categorical Data
### Country and Purchased are categorical variables

In [46]:
# Both encoder classes come from scikit-learn's preprocessing module
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encoder instance dedicated to the Country column
labelencoder_X = LabelEncoder()

# Fit the encoder on column 0 ('Country') and overwrite that column of X
# with the resulting integer codes, all in one fit_transform call
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# Show X: every entry of the input matrix is numeric now
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [47]:
# 0, 1 and 2 are the encoded values for France, Germany and Spain.
# Because 2 > 1 > 0, a machine learning model could interpret one country as
# "greater" than another, so the Country column is replaced by dummy
# (one-hot) variables, one column per country.

# OneHotEncoder(categorical_features=[0]) was deprecated in scikit-learn 0.20
# and removed in 0.22; ColumnTransformer is the supported way to encode one
# column and leave the others untouched.
from sklearn.compose import ColumnTransformer

onehotencoder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],  # one-hot encode column 0
    remainder='passthrough',  # keep Age and Salary, placed after the dummies
    sparse_threshold=0,       # always return a dense array (replaces .toarray())
)

X = np.asarray(onehotencoder.fit_transform(X), dtype=float)
# Fit the transformer to X and apply it. The cast to float is needed because
# the passed-through columns come from an object-dtype array, which would
# otherwise make the stacked result object-dtype as well.

In [49]:
X.shape  # 5 columns: 3 dummy columns for Country, plus Age and Salary

(10, 5)

## Use Label Encoder for the Dependent Variable

In [52]:
# A plain label encoding is used for the output variable: the model treats it
# as a category, with no order implied between Yes and No.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

In [53]:
# Display the target after label encoding (integer codes instead of 'Yes'/'No')
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

## Train Test Split

In [56]:
# train_test_split comes from scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set (test_size=0.2); the fixed
# random_state gives the same split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling

In [59]:
# Age and Salary are not on the same scale. Scaling keeps one variable from
# dominating the other; the two usual options are:
#   standardisation: (x - mean) / std
#   normalisation:   (x - min) / (max - min)
from sklearn.preprocessing import StandardScaler

# Scaler object (standardisation) for the input features
sc_X = StandardScaler()

# Fit the scaler on the training set and scale the training set with it
X_train = sc_X.fit_transform(X_train)

# The test set is only transformed: sc_X has already been fitted on the
# training set, so no second fit is needed
X_test = sc_X.transform(X_test)