In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('Data.csv')

## Preparing Dataset

In [3]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
input_values = dataset.iloc[:,:-1]

In [5]:
input_values

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [6]:
output_values = dataset.iloc[:,3:4]

In [7]:
output_values

Unnamed: 0,Purchased
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes


## Handling Missing Values

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
?SimpleImputer

In [10]:
new_values = SimpleImputer(missing_values = np.nan , strategy = 'mean')

In [11]:
new_values

SimpleImputer()

In [12]:
input_values.iloc[:,1:] = new_values.fit_transform(input_values.iloc[:,1:])

In [13]:
print(input_values)
to_encode = input_values

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


In [14]:
to_encode

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


## Encoding Categorical Class

### LabelEncoding ? 

<p>By doing so , Machine treats Spain (Encoded Value : 2) as a higher value than others , which doesn't make any sense <br/>
Hence we use one hot encoding</p>

### One Hot Encodings

In [15]:
to_encode

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [16]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
?OneHotEncoder

In [18]:
encoder = OneHotEncoder()

In [19]:
X = encoder.fit_transform(to_encode.iloc[:,0:1]).toarray()

In [20]:
X

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [21]:
new_df = pd.concat([to_encode, pd.DataFrame(X)], axis=1)

new_df.drop('Country' , axis = 1)

Unnamed: 0,Age,Salary,0,1,2
0,44.0,72000.0,1.0,0.0,0.0
1,27.0,48000.0,0.0,0.0,1.0
2,30.0,54000.0,0.0,1.0,0.0
3,38.0,61000.0,0.0,0.0,1.0
4,40.0,63777.777778,0.0,1.0,0.0
5,35.0,58000.0,1.0,0.0,0.0
6,38.777778,52000.0,0.0,0.0,1.0
7,48.0,79000.0,1.0,0.0,0.0
8,50.0,83000.0,0.0,1.0,0.0
9,37.0,67000.0,1.0,0.0,0.0


In [22]:
new_df = new_df.reindex(columns = [0,1,2,'Age','Salary'])

In [23]:
new_df

Unnamed: 0,0,1,2,Age,Salary
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


### LabelEncoding

<p>Applying LabelEncoding in 'Purchased' column<br/>
Since Purchased column (within output column) , only contains Yes/No , Hence we can use LabelEncoding to encode it </p>

In [24]:
from sklearn.preprocessing import LabelEncoder

In [25]:
labelendcoder = LabelEncoder()

In [26]:
encoded_output = labelendcoder.fit_transform(output_values)

  return f(**kwargs)


In [27]:
encoded_output

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [28]:
pd.DataFrame(encoded_output , columns = ['Purshased'])
# 0 - No
# 1 - Yes

Unnamed: 0,Purshased
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


## Splitting The Data

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
x_train,x_test,y_train,y_test = train_test_split(new_df , encoded_output , test_size = .2)

In [31]:
x_train

Unnamed: 0,0,1,2,Age,Salary
5,1.0,0.0,0.0,35.0,58000.0
3,0.0,0.0,1.0,38.0,61000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
0,1.0,0.0,0.0,44.0,72000.0
4,0.0,1.0,0.0,40.0,63777.777778
6,0.0,0.0,1.0,38.777778,52000.0
9,1.0,0.0,0.0,37.0,67000.0


In [32]:
len(y_train)

8

In [33]:
x_test

Unnamed: 0,0,1,2,Age,Salary
8,0.0,1.0,0.0,50.0,83000.0
7,1.0,0.0,0.0,48.0,79000.0


In [34]:
y_test

array([0, 1])

## Feature Scaling

In [35]:
x_train

Unnamed: 0,0,1,2,Age,Salary
5,1.0,0.0,0.0,35.0,58000.0
3,0.0,0.0,1.0,38.0,61000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
0,1.0,0.0,0.0,44.0,72000.0
4,0.0,1.0,0.0,40.0,63777.777778
6,0.0,0.0,1.0,38.777778,52000.0
9,1.0,0.0,0.0,37.0,67000.0


<p>As we can infer from the training dataframe that Salary is in order of 10^3 and Age in order of 10^1<br/>
Hence , We use feature scaling to scale them down to their corresponding proportionate value .</p>
<p>We can either <strong>Standardization</strong> or <strong>Normalization</strong>

#### Standardization

In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
scaler = StandardScaler() # Used for standardization of data

In [38]:
stdx_train = scaler.fit_transform(x_train)
stdx_test = scaler.fit_transform(x_test)

In [39]:
pd.DataFrame(stdx_train) # all values are centered around mean and unit standard deviation

Unnamed: 0,0,1,2,3,4
0,1.290994,-0.57735,-0.774597,-0.238371,-0.19542
1,-0.774597,-0.57735,1.290994,0.346722,0.202795
2,-0.774597,-0.57735,1.290994,-1.798619,-1.522804
3,-0.774597,1.732051,-0.774597,-1.213526,-0.726374
4,1.290994,-0.57735,-0.774597,1.516908,1.662916
5,-0.774597,1.732051,-0.774597,0.736784,0.571512
6,-0.774597,-0.57735,1.290994,0.498413,-0.99185
7,1.290994,-0.57735,-0.774597,0.151691,0.999225


In [40]:
pd.DataFrame(stdx_test)

Unnamed: 0,0,1,2,3,4
0,-1.0,1.0,0.0,1.0,1.0
1,1.0,-1.0,0.0,-1.0,-1.0


<p>We wont apply feature scaling to output values (y_train , t_test) as it a categorical value with only 2 categories (either 0 or 1)
<br/>
However , We can apply it when the number of categories are more .</p>

#### Normalization

In [41]:
from sklearn.preprocessing import MinMaxScaler

In [42]:
normalizescaler = MinMaxScaler() # all values are between 0 and 1

In [43]:
nx_train = normalizescaler.fit_transform(x_train)
nx_test = normalizescaler.fit_transform(x_test)

In [44]:
pd.DataFrame(nx_train)

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,0.470588,0.416667
1,0.0,0.0,1.0,0.647059,0.541667
2,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.176471,0.25
4,1.0,0.0,0.0,1.0,1.0
5,0.0,1.0,0.0,0.764706,0.657407
6,0.0,0.0,1.0,0.69281,0.166667
7,1.0,0.0,0.0,0.588235,0.791667


In [45]:
pd.DataFrame(nx_test)

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,0.0,1.0,1.0
1,1.0,0.0,0.0,0.0,0.0
