In [1]:
import yaml
import src.utils as utils
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as scs
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

## 1. Load All Preprocessed Dataset

In [3]:
# Each split is read from a pair of serialized files (x_* and y_*).
# NOTE(review): these are .pkl files; presumably loaded through pickle/joblib
# inside utils.deserialize_data, so only load files from a trusted source.
x_train, y_train = map(utils.deserialize_data, (
    "data/processed/x_train_prep.pkl",
    "data/processed/y_train_prep.pkl",
))

x_valid, y_valid = map(utils.deserialize_data, (
    "data/processed/x_valid_prep.pkl",
    "data/processed/y_valid_prep.pkl",
))

x_test, y_test = map(utils.deserialize_data, (
    "data/processed/x_test_prep.pkl",
    "data/processed/y_test_prep.pkl",
))

In [4]:
# Re-join every x_* / y_* pair into a single frame per split.
# axis=1 is forwarded to utils.combine_dataframe (presumably a column-wise
# concatenation; confirm in src.utils).
train_set, valid_set, test_set = (
    utils.combine_dataframe([predictors, target], axis=1)
    for predictors, target in (
        (x_train, y_train),
        (x_valid, y_valid),
        (x_test, y_test),
    )
)

In [15]:
# Load the project configuration; later cells read its
# "predictors_feature_engineered" entry to pick the columns to process.
params = utils.load_params("config/params.yaml")

## 2. Outliers Removal

In [5]:
def quartile_based_outliers_removal(target_dataframe, columns_name):
    """Drop rows whose values lie outside the 1.5 * IQR fences of the given columns.

    Columns are processed one after another in the order given. Each column's
    quartiles are computed on the rows that survived the filters of the
    previous columns, so the result can depend on the order of
    ``columns_name``.

    Parameters
    ----------
    target_dataframe : pandas.DataFrame
        Input data. It is deep-copied first and never modified.
    columns_name : iterable of str
        Names of the columns to filter on.

    Returns
    -------
    pandas.DataFrame
        Rows whose value lies within ``[q1 - 1.5 * iqr, q3 + 1.5 * iqr]``
        (bounds included) for every listed column. Original index labels
        are preserved.
    """
    filtered = target_dataframe.copy(deep=True)

    for column_name in columns_name:
        q1 = filtered[column_name].quantile(0.25)
        q3 = filtered[column_name].quantile(0.75)
        iqr = q3 - q1

        lower_limit = q1 - 1.5 * iqr
        upper_limit = q3 + 1.5 * iqr

        # The fences are inclusive: a value sitting exactly on a fence is not
        # an outlier. With strict comparisons a column whose IQR is 0 (for
        # example a constant column) would have lower == upper and every row
        # would be discarded. Series.between includes both bounds by default.
        within_fences = filtered[column_name].between(lower_limit, upper_limit)
        filtered = filtered[within_fences]

    return filtered


In [9]:
# Build an outlier-free copy of the training set; train_set itself is not
# modified because the helper works on a deep copy.
train_set_rmout = quartile_based_outliers_removal(train_set, params["predictors_feature_engineered"])

In [10]:
# Summary statistics of the training set before outlier removal.
train_set.describe()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,categori
count,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0
mean,0.198582,0.197794,0.204098,0.201734,0.197794,52.613869,79.198582,35.289204,11.631206,31.662727,19.402679,0.895981
std,0.39909,0.398493,0.4032,0.401453,0.398493,14.840764,24.329396,12.227305,4.920135,14.356701,9.071623,0.305405
min,0.0,0.0,0.0,0.0,0.0,14.0,13.0,3.0,2.0,7.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,44.0,64.0,26.0,9.0,22.0,13.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,54.0,79.0,35.0,11.0,29.0,18.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,62.0,93.0,43.0,14.0,37.0,25.0,1.0
max,1.0,1.0,1.0,1.0,1.0,100.0,174.0,82.0,44.0,151.0,65.0,1.0


In [11]:
# Summary statistics after outlier removal, for comparison with train_set.
train_set_rmout.describe()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,categori
count,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0
mean,0.206393,0.168037,0.222831,0.2,0.20274,51.610046,77.344292,35.223744,10.865753,29.604566,18.712329,0.892237
std,0.404901,0.37407,0.416336,0.400183,0.402224,13.734239,21.964097,11.466196,3.688669,10.875974,7.907624,0.310222
min,0.0,0.0,0.0,0.0,0.0,18.0,21.0,3.0,2.0,9.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,44.0,63.0,27.0,8.0,22.0,13.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,53.0,79.0,35.0,11.0,28.0,18.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,61.0,91.0,43.0,13.0,35.0,24.0,1.0
max,1.0,1.0,1.0,1.0,1.0,87.0,134.0,67.0,21.0,59.0,40.0,1.0


In [12]:
# NOTE(review): this rebinds x_train / y_train (first assigned when the
# preprocessed data was loaded) to the outlier-removed rows. Re-running the
# earlier cell that builds train_set after this one would silently use these
# values instead of the loaded ones; consider dedicated names if downstream
# cells allow it.
x_train, y_train = utils.split_predictor_target(train_set_rmout, params)

In [13]:
# at this point we have:
# 1. train_set, the untouched training data
# 2. train_set_rmout, the training data with outliers removed

## 3. Feature Scaling

#### 3.1. Min-Max Normalization

In [14]:
def minmax_norm(target_dataframe, columns_name):
    """Fit one ``MinMaxScaler`` per column and return the scaled copy of the data.

    Parameters
    ----------
    target_dataframe : pandas.DataFrame
        Input data. It is deep-copied first and never modified.
    columns_name : iterable of str
        Names of the columns to scale; each gets its own scaler, created
        with scikit-learn's default settings.

    Returns
    -------
    tuple
        ``(scalers, scaled_dataframe)``, in that order. ``scalers`` is a dict
        mapping ``"mms_<column name>"`` to the fitted scaler for that column.
        ``scaled_dataframe`` is the copy with the listed columns replaced by
        their scaled values.
    """
    scaled_frame = target_dataframe.copy(deep=True)
    scalers = {}

    for column_name in columns_name:
        scaler = MinMaxScaler()
        # The scaler expects a 2-D input, hence the single-column reshape.
        column_values = np.array(scaled_frame[column_name]).reshape(-1, 1)
        scaled_frame[column_name] = scaler.fit_transform(column_values)
        scalers["mms_{}".format(column_name)] = scaler

    return scalers, scaled_frame

In [19]:
# minmax_norm returns (scalers, scaled_dataframe) in that order, so the scaler
# dict must be unpacked first and the scaled frame second.
minmax_scaler_object, train_set_minmax = minmax_norm(train_set, params["predictors_feature_engineered"])

In [20]:
# minmax_norm returns (scalers, scaled_dataframe) in that order, so the scaler
# dict must be unpacked first and the scaled frame second.
rmout_minmax_scaler_object, train_set_rmout_minmax = minmax_norm(train_set_rmout, params["predictors_feature_engineered"])

In [21]:
# at this point we have 4 versions of the training data:
# 1. train_set
# 2. train_set_rmout (outliers removed)
# 3. train_set_minmax (min-max scaled)
# 4. train_set_rmout_minmax (outliers removed, then min-max scaled)

# we also have 2 more objects:
# 1. the min-max scaler object for train_set, called minmax_scaler_object
# 2. the min-max scaler object for train_set_rmout, called rmout_minmax_scaler_object

#### 3.2. Standardization