In [53]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [54]:
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# SCALER

In [55]:
x = tips[['tip','total_bill']]

minmax = MinMaxScaler()
stdsclr = StandardScaler()
robstsclr = RobustScaler()

## **1. MIN MAX SCALER**
- hati-hati untuk outliers yang terlalu jauh

In [56]:
minmax.fit(x)
x_minmax = minmax.transform(x)
x_minmax = pd.DataFrame(x_minmax, columns=['minmax tip','minmax total_bill'])
display(x.head())
display(x_minmax.head())

Unnamed: 0,tip,total_bill
0,1.01,16.99
1,1.66,10.34
2,3.5,21.01
3,3.31,23.68
4,3.61,24.59


Unnamed: 0,minmax tip,minmax total_bill
0,0.001111,0.291579
1,0.073333,0.152283
2,0.277778,0.375786
3,0.256667,0.431713
4,0.29,0.450775


## **2. StandardScaler**

In [57]:
stdsclr.fit(x)
x_stdsclr = stdsclr.transform(x)
x_stdsclr = pd.DataFrame(x_stdsclr, columns=['minmax tip','minmax total_bill'])
display(x.head())
display(x_stdsclr.head())

Unnamed: 0,tip,total_bill
0,1.01,16.99
1,1.66,10.34
2,3.5,21.01
3,3.31,23.68
4,3.61,24.59


Unnamed: 0,minmax tip,minmax total_bill
0,-1.439947,-0.314711
1,-0.969205,-1.063235
2,0.363356,0.13778
3,0.225754,0.438315
4,0.44302,0.540745


## **3. Robust Scaler**

In [58]:
robstsclr.fit(x)
x_robstsclr = robstsclr.transform(x)
x_robstsclr = pd.DataFrame(x_robstsclr, columns=['minmax tip','minmax total_bill'])
display(x.head())
display(x_robstsclr.head())

Unnamed: 0,tip,total_bill
0,1.01,16.99
1,1.66,10.34
2,3.5,21.01
3,3.31,23.68
4,3.61,24.59


Unnamed: 0,minmax tip,minmax total_bill
0,-1.2096,-0.074675
1,-0.7936,-0.691558
2,0.384,0.298237
3,0.2624,0.545918
4,0.4544,0.630334


# ENCODING

## 1. One Hot Encoding

In [59]:
#CARA 1
tips_dummy = pd.get_dummies(tips, columns=['sex','smoker','day','time'])
tips_dummy.head()

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,2,False,True,False,True,False,False,False,True,False,True
1,10.34,1.66,3,True,False,False,True,False,False,False,True,False,True
2,21.01,3.5,3,True,False,False,True,False,False,False,True,False,True
3,23.68,3.31,2,True,False,False,True,False,False,False,True,False,True
4,24.59,3.61,4,False,True,False,True,False,False,False,True,False,True


In [60]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [61]:
transform = ColumnTransformer([
    ('encoders', OneHotEncoder(),['sex','smoker','day','time'])
])
transform

In [62]:
tips_encode = pd.DataFrame(transform.fit_transform(tips),columns=transform.get_feature_names_out())
tips_encode.head()

new_tips = pd.concat([tips[['tip','total_bill','size']],tips_encode],axis=1)
new_tips.head()

Unnamed: 0,tip,total_bill,size,encoders__sex_Female,encoders__sex_Male,encoders__smoker_No,encoders__smoker_Yes,encoders__day_Fri,encoders__day_Sat,encoders__day_Sun,encoders__day_Thur,encoders__time_Dinner,encoders__time_Lunch
0,1.01,16.99,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.66,10.34,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,3.5,21.01,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,3.31,23.68,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,3.61,24.59,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [63]:
transform_new = ColumnTransformer([
    ('encoders', OneHotEncoder(),['sex','smoker','day','time'])
],remainder='passthrough')
transform_new

tips_encode_new = pd.DataFrame(transform_new.fit_transform(tips),columns=transform_new.get_feature_names_out())
tips_encode_new.head()

Unnamed: 0,encoders__sex_Female,encoders__sex_Male,encoders__smoker_No,encoders__smoker_Yes,encoders__day_Fri,encoders__day_Sat,encoders__day_Sun,encoders__day_Thur,encoders__time_Dinner,encoders__time_Lunch,remainder__total_bill,remainder__tip,remainder__size
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,16.99,1.01,2.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,10.34,1.66,3.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,21.01,3.5,3.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,23.68,3.31,2.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,24.59,3.61,4.0


In [64]:
transform.get_feature_names_out()

array(['encoders__sex_Female', 'encoders__sex_Male',
       'encoders__smoker_No', 'encoders__smoker_Yes', 'encoders__day_Fri',
       'encoders__day_Sat', 'encoders__day_Sun', 'encoders__day_Thur',
       'encoders__time_Dinner', 'encoders__time_Lunch'], dtype=object)

## 2. Ordinal Encoding

In [65]:
import category_encoders as ce

In [66]:
tips['day'].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [67]:
ordinal_map = [
    {
        'col': 'day',
        'mapping':{
            'Thur': 1,
            'Fri' : 2,
            'Sat':3,
            'Sun':4 
        }
    }
]

ordinal_enc = ce.OrdinalEncoder(cols='day',mapping=ordinal_map)
x=ordinal_enc.fit_transform(tips['day'])
x

Unnamed: 0,day
0,4
1,4
2,4
3,4
4,4
...,...
239,3
240,3
241,3
242,3


In [68]:

tips_new = pd.concat([tips[['total_bill', 'tip', 'sex', 'smoker', 'time', 'size']],x],axis=1)
tips_new

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day
0,16.99,1.01,Female,No,Dinner,2,4
1,10.34,1.66,Male,No,Dinner,3,4
2,21.01,3.50,Male,No,Dinner,3,4
3,23.68,3.31,Male,No,Dinner,2,4
4,24.59,3.61,Female,No,Dinner,4,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Dinner,3,3
240,27.18,2.00,Female,Yes,Dinner,2,3
241,22.67,2.00,Male,Yes,Dinner,2,3
242,17.82,1.75,Male,No,Dinner,2,3


## Binary Encoding

In [69]:
BE = ce.BinaryEncoder(cols=['day'])
dfBE = BE.fit_transform(tips['day'])
dfBE

Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
239,0,1,0
240,0,1,0
241,0,1,0
242,0,1,0


In [70]:

tips_new = pd.concat([tips[['total_bill', 'tip', 'sex', 'smoker', 'time', 'size']],dfBE],axis=1)
tips_new

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_0,day_1,day_2
0,16.99,1.01,Female,No,Dinner,2,0,0,1
1,10.34,1.66,Male,No,Dinner,3,0,0,1
2,21.01,3.50,Male,No,Dinner,3,0,0,1
3,23.68,3.31,Male,No,Dinner,2,0,0,1
4,24.59,3.61,Female,No,Dinner,4,0,0,1
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Dinner,3,0,1,0
240,27.18,2.00,Female,Yes,Dinner,2,0,1,0
241,22.67,2.00,Male,Yes,Dinner,2,0,1,0
242,17.82,1.75,Male,No,Dinner,2,0,1,0


# MISSING VALUE

## SIMPLE IMPUTER

In [71]:
from sklearn.impute import SimpleImputer
import numpy as np

In [77]:
df = pd.DataFrame({
    'X1': np.random.randint(1,100,5),
    'X2': [5,2,np.nan,15,np.nan],
    'X3': [5,15,3,np.nan,15],
    'X4': np.random.rand(5),
    'X5': ['A','B',np.nan,'X','Z']
})
df2= df.copy()
df3=df.copy()
df

Unnamed: 0,X1,X2,X3,X4,X5
0,70,5.0,5.0,0.533036,A
1,23,2.0,15.0,0.024045,B
2,93,,3.0,0.288584,
3,95,15.0,,0.961746,X
4,1,,15.0,0.29567,Z


In [78]:
imp_mean = SimpleImputer(strategy='median')

df[['X2','X3']]= imp_mean.fit_transform(df[['X2','X3']])

df

Unnamed: 0,X1,X2,X3,X4,X5
0,70,5.0,5.0,0.533036,A
1,23,2.0,15.0,0.024045,B
2,93,5.0,3.0,0.288584,
3,95,15.0,10.0,0.961746,X
4,1,5.0,15.0,0.29567,Z


In [79]:
imp_mean = SimpleImputer(strategy='constant',fill_value='C')

df[['X5']]= imp_mean.fit_transform(df[['X5']])

df

Unnamed: 0,X1,X2,X3,X4,X5
0,70,5.0,5.0,0.533036,A
1,23,2.0,15.0,0.024045,B
2,93,5.0,3.0,0.288584,C
3,95,15.0,10.0,0.961746,X
4,1,5.0,15.0,0.29567,Z


In [80]:
#Itertative Imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp_it = IterativeImputer(max_iter=15,random_state=100)

df2[['X2','X3']]= imp_it.fit_transform(df2[['X2','X3']])
df2

Unnamed: 0,X1,X2,X3,X4,X5
0,70,5.0,5.0,0.533036,A
1,23,2.0,15.0,0.024045,B
2,93,7.338502,3.0,0.288584,
3,95,15.0,9.481986,0.961746,X
4,1,7.329417,15.0,0.29567,Z


In [82]:
#KNN IMPUTERS
from sklearn.impute import KNNImputer
imp_KNN = KNNImputer(n_neighbors=10) #default:5

df3[['X2','X3']]= imp_KNN.fit_transform(df3[['X2','X3']])
df3

Unnamed: 0,X1,X2,X3,X4,X5
0,70,5.0,5.0,0.533036,A
1,23,2.0,15.0,0.024045,B
2,93,7.333333,3.0,0.288584,
3,95,15.0,9.5,0.961746,X
4,1,7.333333,15.0,0.29567,Z


KNN (K-Nearest Neighbors) imputer and Iterative imputer are both effective techniques for handling missing values in a dataset. Here are some considerations for when to use each of these imputation methods:

KNN Imputer:

Suitable for: Numeric and categorical data.
How it works: KNN imputation is based on the principle of finding similar instances (rows) in the dataset and using their values to impute missing values. It computes the distances between instances based on the available features and uses the values of the nearest neighbors to fill in the missing values.

When to use it:
- When the dataset has a relatively small number of missing values.
- When the missingness is not too high in the dataset.
- When the missing values are likely to have similar values to their nearest neighbors.
- When the dataset contains both numeric and categorical features.


Iterative Imputer:

Suitable for: Numeric data.
How it works: Iterative imputation involves building a model to predict the missing values based on the other features in the dataset. It iteratively estimates missing values by using the available features as predictors. In each iteration, the missing values are filled in using the predictions from the model.

When to use it:
- When the dataset has a moderate to high percentage of missing values.
- When the missingness in the dataset is not too dependent on the values of other features.
- When the dataset contains numeric features that have complex relationships with each other.
- When you want to incorporate the predictive power of other features in imputing missing values.