<a href="https://colab.research.google.com/github/bintangvirgy/learn-machinelearning-python/blob/main/01_data_preprocessing_tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

In [None]:
# The preprocessing step is the most important step in ML modelling.
# In the preprocessing step, we prepare the datasets that will be used to create the ML model.
# The two main steps in data preprocessing are:
# 1. Importing the dataset
# 2. Splitting the dataset into a training set and a test set; each of the two sets includes features and a label

# We may need other steps depending on the kind of data:
# encoding categorical data : transform string-based values into numbers, because ML models can't process string data
# filling missing data : fill null values using some strategy (e.g. the mean of all other values)
# feature scaling : transform unscaled data so that all features share the same scale
#                   (e.g. when amount and age columns exist in the same dataset, they must be transformed to the same scale)

# *A feature is an independent variable: a variable that determines the label
# *The label is the dependent variable: a variable that is determined by the features

In [None]:
# Mount Google Drive inside the Colab runtime so that files stored on Drive
# can be read through ordinary filesystem paths.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Importing the libraries

In [None]:
# this is the basic library for data processing in python
import numpy as np # for work with array
import matplotlib.pyplot as pl # for make graph chart & visualize data
import pandas as pd # for import & make matrix

## Importing the dataset

In [None]:
# Load the CSV file into a DataFrame.
# pandas uses the first row of the file as the column names.
DATASET_CSV_PATH = '/content/drive/MyDrive/Learn_Machinelearning_Udemy/01.preprocessing_data/01_preprocessing_data.csv'
dataset = pd.read_csv(DATASET_CSV_PATH)

# iloc is indexed as [rows, columns] (think of the dataset as a 2-D matrix);
# a bare ':' selects everything along that axis.

# Matrix of features (the columns used to make a prediction):
# all rows, every column except the last one.
feature_frame = dataset.iloc[:, :-1]
x_feature = feature_frame.values

# Dependent variable (what the ML model should predict):
# all rows, last column only.
label_series = dataset.iloc[:, -1]
y_dependant = label_series.values

In [None]:
# Show the full DataFrame as loaded from the CSV.
print(dataset)

In [None]:
# Show the feature matrix (every column except the last).
print(x_feature)

In [None]:
# Show the dependent-variable vector (the last column).
print(y_dependant)

## Taking care of missing data

In [None]:
# Replace every missing value with the mean of the other values in the same column.
# NOTE(review): the mean is computed here over all rows, before the train/test
# split further down, so rows that end up in the test set also influence it.

# SimpleImputer comes from scikit-learn, one of the most commonly used
# libraries in data science & ML.
from sklearn.impute import SimpleImputer

# The numeric columns (age and salary) sit at index 1 and 2;
# slice(1, 3) runs from index 1 up to, but not including, index 3.
numeric_cols = slice(1, 3)

# missing_values : which cell value counts as "missing" (np.nan marks an empty cell)
# strategy       : what to put in place of a missing value (here: the column mean)
var_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the column means and fill the gaps in a single call,
# writing the result back into the original matrix.
x_feature[:, numeric_cols] = var_imputer.fit_transform(x_feature[:, numeric_cols])

In [None]:
# Show the feature matrix after the missing values in columns 1-2 were imputed.
print(x_feature)

## Encoding categorical data

In [None]:
# We have 2 columns of data written as strings (country and purchased).
# ML models can't work with string values, so we must encode them as numbers.

### Encoding the Independent Variable

In [None]:
# Encode the country column (column index 0).
# A first idea is to map france = 0, spain = 1, germany = 2, but then the
# model could interpret the column as an ordered number.
# One-hot encoding avoids that: each category gets its own 0/1 column, so
# 3 categories become 3 columns.
# OneHotEncoder orders the new columns by sorted category value, so with
# these three countries: france = 100, germany = 010, spain = 001.

# One-hot encoding needs 2 classes:
# ColumnTransformer applies a transformer to selected columns only
from sklearn.compose import ColumnTransformer
# OneHotEncoder performs the actual encoding
from sklearn.preprocessing import OneHotEncoder

# transformers     : list of (name, transformer, column indexes) tuples
# remainder        : 'passthrough' keeps the columns that are not transformed
# sparse_threshold : 0 forces a dense result. With the default threshold a
#                    column with many categories can make ColumnTransformer
#                    return a SciPy sparse matrix, which np.array() would wrap
#                    as a 0-d object array instead of a 2-D matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Fit & transform the matrix and store it back in the same variable.
# The encoded columns are placed first, followed by the passthrough columns.
# np.array() keeps the result a plain numpy array for the later steps.
x_feature = np.array(ct.fit_transform(x_feature))

In [None]:
# Show the feature matrix after one-hot encoding (the encoded columns come first).
print(x_feature)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]



### Encoding the Dependent Variable

In [None]:
# Encode the "purchased" column (the dependent variable).
# It has only 2 categories, so one-hot encoding is not needed;
# mapping each category to an integer label is enough.

# import the label encoder class
from sklearn.preprocessing import LabelEncoder

# create the label encoder object
le = LabelEncoder()

# fit the encoder on the labels and replace them with their integer codes
# no np.array() wrapper is needed here: the result is already a numpy array
y_dependant = le.fit_transform(y_dependant) 

In [None]:
# Show the label vector after label encoding.
print(y_dependant)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [None]:
# training set : the data used to train the ML model
# test set     : the data used to measure the accuracy of the ML model
# scikit-learn provides the helper that splits the dataset
from sklearn.model_selection import train_test_split

# test_size    : share of the rows that goes to the test set, on a 0-1 scale
#                (0.2 here, so the training set receives the remaining 0.8)
# random_state : seed for the shuffling done before the split; a fixed seed
#                makes the same rows land in each set on every run
TEST_FRACTION = 0.2
SPLIT_SEED = 1

# The function takes the feature matrix and the label vector separately
# (not the complete dataset) and returns 4 pieces, in this order:
# x train, x test, y train, y test.
x_train, x_test, y_train, y_test = train_test_split(
    x_feature,
    y_dependant,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)


In [None]:
# Show the training features produced by the split.
print(x_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [None]:
# Show the test features produced by the split.
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [None]:
# Show the training labels produced by the split.
print(y_train)

[0 1 0 0 1 1 0 1]


In [None]:
# Show the test labels produced by the split.
print(y_test)

[0 1]


## Feature Scaling

In [None]:
# When to apply feature scaling is still debated.
# This tutorial recommends scaling AFTER splitting the dataset: the test set
# is supposed to be brand-new data that is only seen after the model is
# deployed, whereas fitting a scaler on the whole dataset would let the test
# rows take part in defining the scale.
# Logically, the test set must follow the model's scale, not determine it.
# Both train & test data must be scaled, but only the training data is
# allowed to determine the scale.

# Feature scaling puts all features on the same scale so that no feature
# dominates the others.
# 2 common scaling techniques:
# standardisation : (x - mean(x)) / standard deviation of x, values mostly fall between -3 and +3
# normalisation   : (x - min(x)) / (max(x) - min(x)), values fall between 0 and 1
# The tutorial suggests normalisation when most features are normally
# distributed, and standardisation as the choice that works well all the time.

# Only the non-categorical variables are scaled: the country column is
# already encoded as 3 one-hot columns, and scaling those would lose the
# information they carry.

# This tutorial uses standardisation.
from sklearn.preprocessing import StandardScaler

# create the scaler object
sc = StandardScaler()

# The numeric columns are the last 2, i.e. everything from column index 3 on.
numeric_part = slice(3, None)

# Learn the scale (mean and standard deviation) from the training set only...
sc.fit(x_train[:, numeric_part])
# ...then apply that same scale to the training set
x_train[:, numeric_part] = sc.transform(x_train[:, numeric_part])
# ...and to the test set.
x_test[:, numeric_part] = sc.transform(x_test[:, numeric_part])


In [None]:
# Show the training features after the numeric columns (index 3 onward) were standardised.
print(x_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [None]:
# Show the test features after scaling with the scaler that was fitted on the training set.
print(x_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
