# Data Preprocessing Tools

## 1. Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## 2. Importing the dataset

In [2]:
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values # features
Y = dataset.iloc[:,-1].values

In [15]:
print('---------Features---------')
print(X)
print('--------------------DependentVariable------------------')
print(Y)

---------Features---------
[[1.0 0.0 1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 1.0 0.0 0.0 35.0 58000.0]
 [0.0 1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 1.0 0.0 0.0 37.0 67000.0]]
--------------------DependentVariable------------------
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## 3. Taking care of missing data

In [10]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3]) # lookup for missing values and compute strategy
X[:, 1:3] = imputer.transform(X[:, 1:3]) # replace
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## 4. Encoding categorical data

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Encoding the Independent Variable
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X

array([[1.0, 0.0, 1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [17]:
# Encoding the Dependent Variable
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
Y = le.fit_transform(Y)
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## 5. Spliting the dataset into the Training set and Test set

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
print('-------------Training-Features------------------')
print(X_train)
print('------Training-DependentVar---------')
print(Y_train)
print('------Test-Features----------------')
print(X_test)
print('------Test-DependentVar------------')
print(Y_test)

-------------Training-Features------------------
[[0.0 1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 1.0 0.0 0.0 35.0 58000.0]]
------Training-DependentVar---------
[0 1 0 0 1 1 0 1]
------Test-Features----------------
[[0.0 1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 1.0 0.0 0.0 37.0 67000.0]]
------Test-DependentVar------------
[0 1]


## 6. Feature Scaling

### Standardization(works all the time)
$$x_{\text{stand}}=\frac{x-\text{mean(x)}}{\text{std(x)}}$$

### Normalization(recommended when most features follow a normal distribution)
$$x_{\text{norm}}=\frac{x-\text{min(x)}}{\text{max(x)-min(x)}}$$

In [33]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train[:, 5:] = sc.fit_transform(X_train[:, 5:])
X_test[:, 5:] = sc.transform(X_test[:, 5:])
print(X_train)
print(X_test)

[[0.0 1.0 0.0 0.0 1.0 -0.1915918438457856 -1.0781259408412427]
 [0.0 1.0 0.0 1.0 0.0 -0.014117293757057902 -0.07013167641635401]
 [1.0 0.0 1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 1.0 0.0 0.0 1.0 -0.3045301939022488 -0.30786617274297895]
 [0.0 1.0 0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]
[[0.0 1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 1.0 0.0 0.0 37.0 67000.0]]
