# Data Preprocessing Tools

## Importing the libraries

In [1]:
# For arrays
import numpy as np
# For loading the dataset and extracting the feature matrix and dependent-variable vector
import pandas as pd
# For visualization and plotting
import matplotlib.pyplot as plt

## Importing the dataset

In [2]:
# Load the raw table from Data.csv (path is relative to the working directory).
dataset = pd.read_csv("Data.csv")
print(dataset)

# Matrix of features (independent variables): all rows, every column except
# the last. to_numpy() is the pandas-recommended replacement for .values and
# always returns a NumPy ndarray.
x = dataset.iloc[:, :-1].to_numpy()

# Dependent-variable vector: all rows, last column only.
y = dataset.iloc[:, -1].to_numpy()

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [3]:
# Inspect the raw feature matrix; the nan entries are the missing Age/Salary values.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
# Inspect the dependent-variable vector (still the raw 'Yes'/'No' labels).
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [5]:
from sklearn.impute import SimpleImputer

# Mean imputation: every np.nan cell is replaced by the mean of the
# non-missing values in its own column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Columns 1 and 2 (Age, Salary) are the numeric ones. fit_transform learns
# the column means and returns the filled-in values in a single call; the
# result is written back into the same slice of x.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split done later in this file.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])


In [6]:
# Confirm that the nan entries were replaced by their column means.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [7]:
# One-hot encoding replaces a categorical column with one 0/1 column per
# category, so Country (3 categories) becomes 3 columns.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# transformers: (name, encoder object, indexes of the columns to encode);
#   here only column 0 (Country) is one-hot encoded.
# remainder='passthrough': keep the columns the transformation is not
#   applied to (Age, Salary).
ct = ColumnTransformer(
    remainder='passthrough',
    transformers=[('encoder', OneHotEncoder(), [0])],
)

# fit_transform fits and encodes in one step; wrapping the result in
# np.array makes sure x is stored as a NumPy array.
x = np.array(ct.fit_transform(x))

In [8]:
# The one-hot Country columns come first (France, Germany, Spain), followed by Age and Salary.
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [9]:
from sklearn.preprocessing import LabelEncoder

# Turn the class labels of y into integers: 'No' becomes 0, 'Yes' becomes 1.
le = LabelEncoder()
# Learn the distinct labels first, then replace each label by its integer code.
le.fit(y)
y = le.transform(y)

In [10]:
# 'No' is now 0 and 'Yes' is now 1.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [15]:
from sklearn.model_selection import train_test_split

# Produces four arrays: training/test feature matrices and training/test
# dependent-variable vectors. test_size=0.2 holds out 20% of the rows.
# The split involves randomness; random_state pins it so that every run
# yields the same partition.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)


In [16]:
# Training features: 8 of the 10 rows.
print(x_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [17]:
# Test features: the remaining 2 rows.
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [18]:
# Encoded labels for the training rows.
print(y_train)

[0 1 0 0 1 1 0 1]


In [19]:
# Encoded labels for the test rows.
print(y_test)

[0 1]


## Feature Scaling

In [25]:
# Standardisation is used here rather than normalisation.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Only columns 3 onward (Age, Salary) are scaled. The one-hot dummy columns
# are left alone: they are already 0/1, and scaling them would lose that
# information.
# fit computes the mean and standard deviation of each column from the
# training rows only.
sc.fit(x_train[:, 3:])

# transform applies the standardisation formula with those statistics.
x_train[:, 3:] = sc.transform(x_train[:, 3:])

# The test set goes through the same fitted scaler, so it is scaled with the
# training mean and standard deviation: transform only, no second fit.
x_test[:, 3:] = sc.transform(x_test[:, 3:])



In [26]:
# Age and Salary are now standardised; the one-hot columns are unchanged.
print(x_train)

[[0.0 0.0 1.0 -0.19159184384578554 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057846 -0.07013167641635404]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022487 -0.307866172742979]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]


In [28]:
# NOTE(review): the output recorded below still shows the unscaled Age/Salary
# values, so it appears to predate the scaling cell -- re-run to refresh it.
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
