### Data Preprocessing


#### Import the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline



In [2]:
# Load the raw dataset from the CSV file in the working directory and display it.
data = pd.read_csv(filepath_or_buffer='Data.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
# Independent variables (the matrix of features): every row, and every
# column except the last one, converted to a NumPy array.
X = data.iloc[:, :-1].to_numpy()

X  # 2-D array (a matrix)

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [4]:
# Dependent variable: every row of the last column, converted to a NumPy array.

y = data.iloc[:, -1].to_numpy()
y  # 1-D array (a vector)

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

#### Taking care of missing data

In [5]:
from sklearn.impute import SimpleImputer

# Fill every NaN in columns 1 and 2 of X using the 'mean' strategy.
# NOTE(review): the imputer is fitted on all rows of X, i.e. before the
# train/test split performed further down in this notebook.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])  # fit on the two columns, then write the filled values back

In [11]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

#### Encoding categorical data
This applies when the categorical data is stored as strings: always encode strings as numbers.

#### Feature matrix encoding

In [6]:
# The Country column (column 0) holds strings, so it has to be encoded.
# One-hot encoding replaces it with 0/1 indicator columns, one per distinct value.
# NOTE: this cell overwrites X, so re-running it on its own would encode the
# already-encoded first column again; re-run the cells above it first.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],  # column 0 is the one we want to encode
    remainder='passthrough',  # leave the other columns as they are
    # Always return a dense array. With the default threshold (0.3) a
    # low-density result would come back as a SciPy sparse matrix, which
    # np.array() below would not turn into a 2-D matrix.
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))  # np.array makes sure the result is a NumPy matrix
X


array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

#### Encoding the dependent vector

In [7]:
from sklearn.preprocessing import LabelEncoder

# Replace the string class labels in y with integer codes.
le = LabelEncoder()
le.fit(y)            # learn the set of distinct labels
y = le.transform(y)  # swap each label for its integer code
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

#### Splitting into train and test set

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. Fixing random_state makes the
# split come out the same every time the cell is run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [9]:
X_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [10]:
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [11]:
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [12]:
y_test

array([0, 1])

#### Feature scaling

This is necessary when some features are dominant and others are made insignificant, but you often won't need to do this, since most machine learning models already compensate for it.
Standardisation is appropriate for most scenarios, whereas normalisation is best suited to normally distributed random variables.

In [14]:
# Standardise only the columns from index 3 onwards; the dummy variables in
# the leading columns must not be scaled.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train[:, 3:])  # the scaler is fitted on the training set only
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])  # the test set reuses the scaler fitted on the training set



In [15]:
X_train

array([[0.0, 0.0, 1.0, -0.1915918438457856, -1.0781259408412427],
       [0.0, 1.0, 0.0, -0.014117293757057902, -0.07013167641635401],
       [1.0, 0.0, 0.0, 0.5667085065333239, 0.6335624327104546],
       [0.0, 0.0, 1.0, -0.3045301939022488, -0.30786617274297895],
       [0.0, 0.0, 1.0, -1.901801144700799, -1.4204636155515822],
       [1.0, 0.0, 0.0, 1.1475343068237056, 1.2326533634535488],
       [0.0, 1.0, 0.0, 1.4379472069688966, 1.5749910381638883],
       [1.0, 0.0, 0.0, -0.7401495441200352, -0.5646194287757336]],
      dtype=object)

In [16]:
(X_test)

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

##### 