# Data Preprocessing Tools

## Importing the libraries

In [None]:
# For array operations
import numpy as np
# For Data Visualization
import matplotlib.pyplot as plt
# For Data Manipulation using Dataframes
import pandas as pd

## Importing the dataset

In [None]:
# Load the raw table; the relative path is resolved against the current working directory
dataset = pd.read_csv('Data.csv')
# Features: every column except the last one.
# to_numpy() is the pandas-recommended replacement for .values and always
# returns a NumPy array, whatever the column dtypes are.
X = dataset.iloc[:, :-1].to_numpy()
# Target/dependent variable: assumed to be the last column
y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Inspect the raw features; the nan entries are the missing values still to be handled
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Inspect the raw target labels (still strings at this point)
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

If missing data accounts for less than 1% of the dataset, we can simply discard the affected rows. In all other cases, we have to replace the missing values. They can be replaced with the mean, the median, the most frequent value or a constant using `SimpleImputer` from `sklearn.impute`. Other tools in `sklearn.impute` include `IterativeImputer` and `KNNImputer`, as well as `MissingIndicator`, which flags which values were missing rather than filling them in.



In [None]:
from sklearn.impute import SimpleImputer
# Fill every NaN with the mean of its own column
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# A mean only makes sense for the numeric columns (indices 1 and 2), so the
# imputer is restricted to that slice; fit_transform learns the column means
# and substitutes them in one call.
# NOTE(review): the means are learned from all rows, before the train/test split.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
# Confirm that the nan entries were replaced by their column means
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [None]:
# One-hot encoding of 'Country' column
# Replace 'Country' column with 3 new columns (since we have 3 categories for 'Country')
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder emits a sparse matrix. With many categories ColumnTransformer
# would return a sparse stack, which np.array() cannot turn into a regular
# 2-D array; sparse_threshold=0 forces a dense result in every case.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',  # keep the remaining columns unchanged
    sparse_threshold=0,
)
# Column 0 is replaced by its one-hot columns, which come first in the output
X = np.array(ct.fit_transform(X))

In [None]:
# Inspect the features: the first three columns are now the one-hot encoded categories
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [None]:
# Replace the 'Purchased' column values with 0's and 1's
# (one-hot encoding is not needed because there are only two categories)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# fit_transform learns the distinct labels and returns their integer codes in one step
y = le.fit_transform(y)

In [None]:
# Inspect the encoded target: the labels are now integers 0 and 1
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Inspect the training features
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [None]:
# Inspect the held-out test features
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [None]:
# Inspect the training labels
print(y_train)

[0 1 0 0 1 1 0 1]


In [None]:
# Inspect the test labels
print(y_test)

[0 1]


## Feature Scaling

Feature Scaling is needed when the features are on very different scales, i.e. when some features dominate others so strongly that the machine learning model may effectively ignore the least dominant ones. It is applicable only to machine learning models that do not inherently take care of the Feature Scaling problem.

For example, Linear Regression Models inherently take care of the Feature Scaling since Target Variable can be expressed as an explicit equation in terms of the features. In that scenario, coefficients compensate for high degree of variation between different features. But models (for example, Support Vector Regression) that relate Target Variable to Features as an implicit relationship need Feature Scaling before training the model.

Feature Scaling can be done either using Standardization or using Normalization.

1.   Standardization rescales each feature to zero mean and unit variance, so that most values lie roughly between -2 and +2. It is applicable in almost all scenarios.
2.   Normalization scales the features to lie between 0 and 1. It is applicable in scenarios where most of the features follow a Normal Distribution.

![Feature Scaling - Standardization Vs Normalization](Feature-Scaling-Standardization-Vs-Normalization.PNG)

Since Standardization is generally preferred, we use it for Feature Scaling here. The one-hot encoded `Country` columns only take the values 0 and 1 (which already lie between -2 and +2), so we do not apply Feature Scaling to them: doing so would change those values and make the one-hot encoded columns harder to interpret. Likewise, the Target Variable `Purchased` was encoded with `LabelEncoder` into the values 0 and 1, so it does not need Feature Scaling either.

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize only the numeric columns (index 3 onward); the dummy variables that
# replaced 'Country' stay as 0/1 to keep the model interpretable
sc = StandardScaler()
# Learn the scaling statistics from the training rows only
sc.fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
# Apply the training statistics to the test rows; the scaler is not refit on X_test
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [None]:
# Inspect the training features after scaling: the last two columns are standardized
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [None]:
# Inspect the test features after applying the scaler fitted on the training set
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
