In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pts

# Import the dataset

In [2]:
# Load the raw table from Data.csv (path is relative to the working directory).
dataset = pd.read_csv(filepath_or_buffer='Data.csv')

In [3]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Separating matrix of features and dependent variable

In [4]:
# Feature matrix: every column except the last one.
X = dataset.iloc[:, :-1].values
# Dependent variable: the last column. Selecting it with -1 (instead of the
# hard-coded position 3) mirrors the `:-1` slice used for X, so the two
# selections cannot drift apart if the number of columns changes.
Y = dataset.iloc[:, -1].values

In [5]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# Take care of missing data

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
# Mean imputer: every np.nan is replaced by the mean of its column.
# `verbose` is intentionally not passed: it was deprecated in scikit-learn 1.1
# and removed in 1.3, where supplying it raises a TypeError. It only controlled
# logging verbosity, so the imputed values are unchanged.
impute = SimpleImputer(missing_values=np.nan, strategy='mean')

In [10]:
# Fit the imputer on columns 1 and 2 of X (the numeric columns; column 0 holds
# the country strings). The result of fit() is bound back to the same name.
impute = impute.fit(X[:,1:3])

In [11]:
# Overwrite columns 1 and 2 of X in place with their imputed values.
# Column 0 is left untouched, so X still mixes strings and numbers.
X[:,1:3] = impute.transform(X[:,1:3])

In [12]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Encoding categorical data

![image.png](attachment:image.png)

    In the above case, a machine learning model would interpret the numeric codes as ordered (1 < 2 < 3), i.e. it would treat one category as greater than another. Because the categories have no such order, dummy encoding is necessary.

# Encoding Categorical Data - Dummy Encoding

In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [14]:
# One-hot encode column 0; all remaining columns are passed through unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [15]:
# Apply the column transformer and store the result as a float array.
# The builtin `float` is used instead of `np.float`: that alias was deprecated
# in NumPy 1.20 and removed in 1.24 (AttributeError). It was only an alias for
# the builtin, so the resulting dtype (float64) is the same.
X = np.array(ct.fit_transform(X), dtype=float)

In [16]:
# View the first four columns of X: the three one-hot country columns followed
# by Age (the last column, Salary, is not shown by this slice)
X[:,0:4]

array([[ 1.        ,  0.        ,  0.        , 44.        ],
       [ 0.        ,  0.        ,  1.        , 27.        ],
       [ 0.        ,  1.        ,  0.        , 30.        ],
       [ 0.        ,  0.        ,  1.        , 38.        ],
       [ 0.        ,  1.        ,  0.        , 40.        ],
       [ 1.        ,  0.        ,  0.        , 35.        ],
       [ 0.        ,  0.        ,  1.        , 38.77777778],
       [ 1.        ,  0.        ,  0.        , 48.        ],
       [ 0.        ,  1.        ,  0.        , 50.        ],
       [ 1.        ,  0.        ,  0.        , 37.        ]])

    LabelEncoder is used here for the output (dependent variable), which has only two classes:

In [17]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder that is kept in `le` rather than a second, throwaway
# LabelEncoder(): the fitted mapping (le.classes_) then stays available, e.g.
# to convert encoded values back to the original labels with
# le.inverse_transform.
le = LabelEncoder()
Y = le.fit_transform(Y)

In [18]:
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

#### LabelEncoder 
    It can turn [dog,cat,dog,mouse,cat] into [1,2,1,3,2], but then the imposed ordinality means that the average of dog and mouse is cat. Still there are algorithms like decision trees and random forests that can work with categorical variables just fine and LabelEncoder can be used to store values using less disk space.
    1- The categorical feature is ordinal (Jr. kg, Sr. kg, Primary school, high school ,etc).
    2- When we can come up with a label encoder that assigns close labels to similar categories: this leads to fewer splits in the trees, hence reducing the execution time.
    3- When the number of categorical features in the dataset is huge: one-hot encoding a categorical feature with a huge number of values can lead to (1) high memory consumption and (2) the case where non-categorical features are rarely used by the model.

#### One-Hot-Encoding
    It has the advantage that the result is binary rather than ordinal and that everything sits in an orthogonal vector space. The disadvantage is that for high cardinality, the feature space can really blow up quickly and you start fighting with the curse of dimensionality. In these cases, I typically employ one-hot-encoding followed by PCA for dimensionality reduction.
    1- When the values that are close to each other in the label encoding correspond to target values that aren't close (non - linear data).
    2- When the categorical feature is not ordinal (dog, cat, mouse).

# Splitting the Dataset into the Training set and Test set

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Feature Scaling
This step is important because many models rely on the Euclidean distance between observations. The Age and Salary columns are on very different scales (for example, an age of 48 next to a salary of 79000), so without scaling the Salary column would dominate the distance and Age would have almost no influence.
![image.png](attachment:image.png)

In [22]:
from sklearn.preprocessing import StandardScaler

In [24]:
# Scaler for the feature matrix; it is fitted on the training features in a later cell.
sts_X = StandardScaler()

#### Feature scaling is most useful when, for example, you use linear regression and the feature matrix and/or the dependent variable span a very wide range of values

In [25]:
# Fit the scaler on the training features only and scale them.
X_train = sts_X.fit_transform(X_train)
# Scale the test features with the statistics learned from the training set
# (transform only, no refit), so no information from the test set is used.
X_test = sts_X.transform(X_test)

In [26]:
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

# THE BASIC TEMPLATE IS:
### #1- Import the libraries
### #2- Import the dataset
### #3- Split the dataset into the training set and test set
### #4- If required feature scaling