# Data Preprocessing Tools

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the raw table; every column except the last is a feature and the last
# column ('Purchased') is the target.
dataset = pd.read_csv('Data.csv')
# to_numpy() is the pandas-recommended replacement for .values and always
# returns a NumPy ndarray, which the NumPy-style slicing and in-place
# assignments on X further down rely on.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [3]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [6]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


### Slicing

In [7]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [8]:
# All rows; columns from index 0 up to (but excluding) the second-to-last one,
# which leaves only the Country column of this 3-column array.
X[:, 0:-2]

array([['France'],
       ['Spain'],
       ['Germany'],
       ['Spain'],
       ['Germany'],
       ['France'],
       ['Spain'],
       ['France'],
       ['Germany'],
       ['France']], dtype=object)

In [9]:
# All rows; every column except the last one (Country and Age, Salary dropped).
X[:,0:-1]

array([['France', 44.0],
       ['Spain', 27.0],
       ['Germany', 30.0],
       ['Spain', 38.0],
       ['Germany', 40.0],
       ['France', 35.0],
       ['Spain', nan],
       ['France', 48.0],
       ['Germany', 50.0],
       ['France', 37.0]], dtype=object)

In [10]:
# All rows; columns 0, 1 and 2 (the stop index 3 is exclusive), i.e. the whole array.
X[:,0:3]

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [11]:
# All rows; columns 1 and 2 only (Age and Salary) - the numeric columns that
# are imputed later in the notebook.
X[:,1:3]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, nan],
       [35.0, 58000.0],
       [nan, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

In [12]:
# Omitting the start index defaults it to 0, so this selects the same columns
# as X[:,0:-1]: everything except the last column.
X[:,:-1]

array([['France', 44.0],
       ['Spain', 27.0],
       ['Germany', 30.0],
       ['Spain', 38.0],
       ['Germany', 40.0],
       ['France', 35.0],
       ['Spain', nan],
       ['France', 48.0],
       ['Germany', 50.0],
       ['France', 37.0]], dtype=object)

In [13]:
# Same as X[:, 0:-2] with the start index left implicit: everything except the
# last two columns, i.e. only Country.
X[:,:-2]

array([['France'],
       ['Spain'],
       ['Germany'],
       ['Spain'],
       ['Germany'],
       ['France'],
       ['Spain'],
       ['France'],
       ['Germany'],
       ['France']], dtype=object)

## Taking care of missing data

In [14]:
from sklearn.impute import SimpleImputer

# Fit a mean imputer on the two numeric columns (Age at index 1, Salary at
# index 2); the country strings in column 0 are left out of the fit.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer.fit(X[:, 1:3])

SimpleImputer()

In [15]:
# Fill the NaNs in the Age and Salary columns (indices 1-2) with the column
# means held by the fitted imputer, writing the result back into X in place.
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [16]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the country column (index 0) and pass Age/Salary through
# unchanged; the encoded dummy columns come first in the result.
# sparse_threshold=0 forces a dense result: by default ColumnTransformer may
# return a SciPy sparse matrix when the stacked output is mostly zeros (e.g.
# many categories), and np.array() on a sparse matrix yields a 0-d object
# array instead of a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# NOTE: this overwrites X, so re-running this cell on its own would encode the
# already-encoded matrix; re-run from the data-loading cell instead.
X = np.array(ct.fit_transform(X))

In [18]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [19]:
from sklearn.preprocessing import LabelEncoder
# Encode the target labels as integers; with this data 'No' becomes 0 and
# 'Yes' becomes 1.
le = LabelEncoder()
# y is overwritten with its integer encoding; `le` keeps the fitted mapping.
y = le.fit_transform(y)

In [20]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (2 of the 10) as the test set; the fixed
# random_state keeps the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=1,
)

In [22]:
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [23]:
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [24]:
print(y_train)

[0 1 0 0 1 1 0 1]


In [25]:
print(y_test)

[0 1]


## Feature Scaling

### When should feature scaling be done?
After splitting the data into the training set and the test set. If scaling is done before splitting, the mean and standard deviation are computed from all of the data, including the test set, which causes information (data) leakage. Fitting the scaler on the training set only, after the split, prevents that leakage.

In [26]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Columns 0-2 are the one-hot country dummies and are left as they are; only
# the columns from index 3 onward (Age, Salary) are standardized.
# The scaling statistics are learned from the training rows only ...
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
# ... and the same training statistics are reused on the test rows, so nothing
# about the test set influences the scaler.
X_test[:, 3:] = sc.transform(X_test[:, 3:])
# NOTE: both assignments overwrite the arrays in place, so re-running this cell
# on its own would scale already-scaled values; re-run from the split cell.

We call the fit_transform() method on the training data and the transform() method on the test data.

The fit step calculates the mean and variance of each feature present in the data. The transform step then standardizes every feature using its respective mean and variance.

#### transform()
Dengan menggunakan metode transformasi, kita dapat menggunakan mean dan varians yang sama seperti yang dihitung dari data pelatihan untuk mengubah data pengujian. Dengan demikian, parameter yang dipelajari oleh model kami menggunakan data pelatihan akan membantu kami mengubah data pengujian kami.

#### fit_transform()
fit_transform() digunakan pada data pelatihan sehingga kita dapat menskalakan data pelatihan dan juga mempelajari parameter penskalaan data tersebut. Di sini, model yang kami buat akan mempelajari mean dan varians dari fitur-fitur training set. Parameter yang dipelajari ini kemudian digunakan untuk menskalakan data pengujian kami.



When you one-hot encode your categorical variables, the values in encoded variables become 0 and 1. Therefore, encoded variables will not negatively affect your model. The fact that you encode variables and pass them to ML learning algorithms is good, as you may gain additional insights from ML models.

When scaling your dataset, make sure you pay attention to 2 things:

    Some ML algorithms require data to be scaled, and some do not. It is a good practice to only scale your data for models that are sensitive to un-scaled data, such as kNN.

    There are different methods to scale your data. StandardScaler() is one of them, but it is vulnerable to outliers. Therefore, make sure you are using the scaling method that best fits your business needs. You can learn more about different scaling methods here: 
    https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
Both StandardScaler and MinMaxScaler are very sensitive to the presence of outliers.

Encoded categorical variables contain only the values 0 and 1. Therefore, there is no need to scale them. However, scaling methods will still be applied to them if you choose to scale your entire dataset before using it with scale-sensitive ML models.

Sources : https://stackoverflow.com/questions/63304223/scaling-of-categorical-variable

### Encoding Data
#### Mengubah data kategorikal ke data numerical, jenis data kategorikal :
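1. Ordinal data (Nilai matkul, level pendidikan, dsb) diubah menjadi angka (1,2,3,4,..) menggunakan label encoder
        A. Ex : df_dataset['education'] = label_encoder.fit_transform(df_dataset['education'])
        Catatan: LabelEncoder memberi angka berdasarkan urutan alfabet kelas, bukan urutan tingkatannya; jika urutan ordinal penting, gunakan OrdinalEncoder dengan parameter categories yang ditentukan secara eksplisit.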
2. Gender, married -> change to (1,0)
        B. Ex : df_dataset["married"] = label_encoder.fit_transform(df_dataset["married"])
        C. Ex : df_dataset['loan_status'] = df_dataset['loan_status'].apply(lambda x: 1 if x=='APPROVE' else 0)
3. kind of property (house, apartment, dsb) -> get_dummies
        D. Ex : Property_Type = pd.get_dummies(df_dataset["property_type"], prefix="type")

### Normalisasi dan Standardisasi dilakukan pada data numeric.
#### Ketika data di-split menjadi train set dan test set, feature scaling dilakukan setelah splitting.
1. Untuk data numerical, dilakukan standardisasi dan normalisasi. 
2. Untuk data kategorikal tidak perlu dilakukan standardisasi dan normalisasi.
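3. Untuk data target (y) pada kasus klasifikasi (seperti notebook ini) tidak perlu dilakukan scaling, cukup di-encode (mis. dengan LabelEncoder). StandardScaler dengan fit_transform pada y hanya relevan untuk kasus regresi tertentu (mis. SVR).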

In [27]:
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [28]:
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
