In [1]:
import src.utils as utils

## 1. Load Preprocessed Dataset

In [2]:
# Load the two preprocessed training artifacts through the project helper.
# NOTE(review): the .pkl extension suggests pickle files — only load artifacts
# produced by this project's own pipeline.
x_train = utils.deserialize_data(
    "data/processed/x_train_prep.pkl",
)
y_train = utils.deserialize_data(
    "data/processed/y_train_prep.pkl",
)

In [3]:
# Merge x_train and y_train into a single frame so later row filtering is
# applied to both at once (axis=1 is presumably column-wise concatenation —
# confirm in src.utils).
train_set = utils.combine_dataframe(
    [x_train, y_train],
    axis=1,
)

In [4]:
# Project configuration; the cells below read the predictor column list
# ("predictors_feature_engineered") and the scaler output paths from it.
params = utils.load_params(
    "config/params.yaml",
)

## 2. Outlier Removal

In [5]:
def quartile_based_outliers_removal(target_dataframe, columns_name, iqr_multiplier=1.5):
    """Remove rows that fall outside the IQR fences of the given columns.

    Columns are processed one after another: for each column the quartiles are
    computed on the rows that survived the previous columns, and rows outside
    ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]`` are dropped.

    Parameters
    ----------
    target_dataframe : pandas.DataFrame
        Data to filter. It is never modified; the function works on a deep copy.
    columns_name : iterable of str
        Names of the columns to check for outliers.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the IQR.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the rows that lie within the fences of
        every listed column. The original index labels are preserved.

    Notes
    -----
    The fences are inclusive. With strict comparisons a column whose IQR is 0
    (e.g. a constant column) would have ``lower == upper`` and every row would
    be discarded. Rows with NaN in a checked column are dropped because NaN
    compares as False against both fences.
    """
    filtered = target_dataframe.copy(deep=True)

    for column_name in columns_name:
        q1 = filtered[column_name].quantile(0.25)
        q3 = filtered[column_name].quantile(0.75)
        iqr = q3 - q1

        lower_limit = q1 - iqr_multiplier * iqr
        upper_limit = q3 + iqr_multiplier * iqr

        # Values sitting exactly on a fence are not outliers, so keep them.
        within_fences = (filtered[column_name] >= lower_limit) & (filtered[column_name] <= upper_limit)
        filtered = filtered[within_fences]

    return filtered


In [6]:
# Build the outlier-free variant of the training set. Only the predictor
# columns listed in the config are checked; train_set itself is not modified
# because the function filters a copy.
train_set_rmout = quartile_based_outliers_removal(
    train_set,
    params["predictors_feature_engineered"],
)

In [7]:
# At this point we have two training sets:
# 1. train_set: the original, untouched data
# 2. train_set_rmout: train_set with outliers removed

## 3. Feature Scaling

#### 3.1. Min-Max Normalization

In [8]:
# Min-max scaling of the untouched training set. The helper's result is
# unpacked into the scaled frame and the scaler; the recorded cell output
# shows the scaler being serialized to the path taken from params["minmax_path"].
minmax_scaler = utils.create_minmax_scaler_object()
train_set_minmax, minmax_scaler = utils.fit_transform_scaler(
    minmax_scaler,
    train_set,
    params["predictors_feature_engineered"],
    params["minmax_path"],
)

Serialized models/minmax.pkl


In [9]:
# Min-max scaling of the outlier-free training set, using a separate scaler
# object that is stored under its own path (params["rmout_minmax_path"]).
rmout_minmax_scaler = utils.create_minmax_scaler_object()
train_set_rmout_minmax, rmout_minmax_scaler = utils.fit_transform_scaler(
    rmout_minmax_scaler,
    train_set_rmout,
    params["predictors_feature_engineered"],
    params["rmout_minmax_path"],
)

Serialized models/rmout_minmax.pkl


In [10]:
# Up to this point, we have 4 training sets:
# 1. train_set
# 2. train_set_rmout
# 3. train_set_minmax
# 4. train_set_rmout_minmax

# We also have 2 scaler objects:
# 1. minmax_scaler
# 2. rmout_minmax_scaler

#### 3.2. Standardization

In [11]:
# Standardization of the untouched training set. The helper's result is
# unpacked into the scaled frame and the scaler; the scaler's output path
# comes from params["std_path"].
std_scaler = utils.create_standard_scaler_object()
train_set_std, std_scaler = utils.fit_transform_scaler(
    std_scaler,
    train_set,
    params["predictors_feature_engineered"],
    params["std_path"],
)

Serialized models/std.pkl


In [12]:
# Standardization of the outlier-free training set, with its own scaler
# object stored under params["rmout_std_path"].
rmout_std_scaler = utils.create_standard_scaler_object()
train_set_rmout_std, rmout_std_scaler = utils.fit_transform_scaler(
    rmout_std_scaler,
    train_set_rmout,
    params["predictors_feature_engineered"],
    params["rmout_std_path"],
)

Serialized models/rmout_std.pkl


In [13]:
# Standardization applied on top of the min-max scaled training set.
# NOTE(review): if both helpers wrap per-column affine scalers over the same
# columns, this output should be numerically equivalent to train_set_std —
# confirm that this stacked variant is intentional.
minmax_std_scaler = utils.create_standard_scaler_object()
train_set_minmax_std, minmax_std_scaler = utils.fit_transform_scaler(
    minmax_std_scaler,
    train_set_minmax,
    params["predictors_feature_engineered"],
    params["minmax_std_path"],
)

Serialized models/minmax_std.pkl


In [14]:
# Standardization applied on top of the outlier-free, min-max scaled set.
# NOTE(review): as with the non-rmout stacked variant, this may be equivalent
# to train_set_rmout_std if both scalers are per-column affine — confirm.
rmout_minmax_std_scaler = utils.create_standard_scaler_object()
train_set_rmout_minmax_std, rmout_minmax_std_scaler = utils.fit_transform_scaler(
    rmout_minmax_std_scaler,
    train_set_rmout_minmax,
    params["predictors_feature_engineered"],
    params["rmout_minmax_std_path"],
)

Serialized models/rmout_minmax_std.pkl


In [15]:
# Finally, at this point we have 8 training sets:
# 1. train_set
# 2. train_set_rmout
# 3. train_set_std
# 4. train_set_minmax
# 5. train_set_rmout_std
# 6. train_set_rmout_minmax
# 7. train_set_minmax_std
# 8. train_set_rmout_minmax_std

# We also have 6 scaler objects:
# 1. minmax_scaler
# 2. std_scaler
# 3. rmout_minmax_scaler
# 4. rmout_std_scaler
# 5. minmax_std_scaler
# 6. rmout_minmax_std_scaler