In [1]:
import src.utils as utils

## 1. Import Params

In [29]:
# Location of the project's YAML configuration. Note that this is a file path
# (despite the "_dir" suffix) and it is relative, so it resolves against the
# kernel's current working directory.
config_dir = "config/config.yaml"

In [30]:
# Load the configuration once. Later cells index `config` by string key to get
# the dataset paths, the scaler artifact paths, and the predictor column list.
config = utils.load_yaml(config_dir)

## 2. Load Preprocessed Dataset

In [4]:
# Deserialize the preprocessed x_* / y_* splits from the paths stored in the
# config. The generator is consumed left to right, so the files are read in
# train -> valid -> test order, x before y for each split.
x_train, y_train, x_valid, y_valid, x_test, y_test = (
    utils.deserialize_data(config[path_key])
    for path_key in (
        "x_train_prep_path",
        "y_train_prep_path",
        "x_valid_prep_path",
        "y_valid_prep_path",
        "x_test_prep_path",
        "y_test_prep_path",
    )
)

In [5]:
# Merge each split's x and y objects into one frame per split (axis=1,
# presumably a column-wise join -- confirm against utils.combine_dataframe).
train_set, valid_set, test_set = (
    utils.combine_dataframe([x_part, y_part], axis=1)
    for x_part, y_part in (
        (x_train, y_train),
        (x_valid, y_valid),
        (x_test, y_test),
    )
)

## 2. Feature Engineering For Training Set

### 2.1. Outliers Removal

In [6]:
# Fit the IQR outlier rule on the training set for the engineered predictor
# columns and apply it in the same step (presumably dropping the flagged rows).
# `iqr_data` is the fitted state; it is reused later to filter the validation
# and test sets so that no statistics are taken from those splits.
train_set_rmout, iqr_data = utils.fit_transform_iqr_outliers_removal(
    train_set,
    config["predictors_feature_engineered"]
)

### 2.2. Feature Scaling

#### 2.2.1. Min-Max Normalization

In [7]:
# Min-max scaling fitted on the full training set (no outlier removal).
# The helper hands back the scaled frame plus the scaler, and the cell output
# shows the scaler being serialized to config["minmax_path"].
train_set_minmax, minmax_scaler = utils.fit_transform_scaler(
    utils.create_minmax_scaler_object(),
    train_set,
    config["predictors_feature_engineered"],
    config["minmax_path"],
)

Serialized models/minmax.pkl


In [8]:
# Min-max scaling fitted on the outlier-filtered training set. A separate
# scaler instance is used so its fitted state and artifact
# (config["rmout_minmax_path"]) stay independent of the unfiltered variant.
train_set_rmout_minmax, rmout_minmax_scaler = utils.fit_transform_scaler(
    utils.create_minmax_scaler_object(),
    train_set_rmout,
    config["predictors_feature_engineered"],
    config["rmout_minmax_path"],
)

Serialized models/rmout_minmax.pkl


In [9]:
# At this point we have 4 training sets:
# 1. train_set
# 2. train_set_rmout
# 3. train_set_minmax
# 4. train_set_rmout_minmax

# We also have 2 scaler objects:
# 1. minmax_scaler
# 2. rmout_minmax_scaler

#### 2.2.2. Standardization

In [10]:
# Standardization fitted on the full training set (no outlier removal, no
# min-max step). The scaler is serialized to config["std_path"] per the
# cell output.
train_set_std, std_scaler = utils.fit_transform_scaler(
    utils.create_standard_scaler_object(),
    train_set,
    config["predictors_feature_engineered"],
    config["std_path"],
)

Serialized models/std.pkl


In [11]:
# Standardization fitted on the outlier-filtered training set; the scaler
# artifact goes to config["rmout_std_path"].
train_set_rmout_std, rmout_std_scaler = utils.fit_transform_scaler(
    utils.create_standard_scaler_object(),
    train_set_rmout,
    config["predictors_feature_engineered"],
    config["rmout_std_path"],
)

Serialized models/rmout_std.pkl


In [12]:
# Standardization stacked on top of the min-max-scaled training set; the
# scaler artifact goes to config["minmax_std_path"].
# NOTE(review): if both scalers are plain per-feature affine transforms (e.g.
# scikit-learn MinMaxScaler followed by StandardScaler), the result should
# match train_set_std -- confirm this variant adds information.
train_set_minmax_std, minmax_std_scaler = utils.fit_transform_scaler(
    utils.create_standard_scaler_object(),
    train_set_minmax,
    config["predictors_feature_engineered"],
    config["minmax_std_path"],
)

Serialized models/minmax_std.pkl


In [13]:
# Full chain on the training set: outlier removal -> min-max -> standardization.
# The scaler artifact goes to config["rmout_minmax_std_path"].
# NOTE(review): as with the unfiltered variant, standardizing min-max-scaled
# data may be numerically equivalent to train_set_rmout_std -- confirm.
train_set_rmout_minmax_std, rmout_minmax_std_scaler = utils.fit_transform_scaler(
    utils.create_standard_scaler_object(),
    train_set_rmout_minmax,
    config["predictors_feature_engineered"],
    config["rmout_minmax_std_path"],
)

Serialized models/rmout_minmax_std.pkl


In [14]:
# Finally, at this point we have 8 training sets:
# 1. train_set
# 2. train_set_rmout
# 3. train_set_std
# 4. train_set_minmax
# 5. train_set_rmout_std
# 6. train_set_rmout_minmax
# 7. train_set_minmax_std
# 8. train_set_rmout_minmax_std

# We also have 6 scaler objects:
# 1. minmax_scaler
# 2. std_scaler
# 3. rmout_minmax_scaler
# 4. rmout_std_scaler
# 5. minmax_std_scaler
# 6. rmout_minmax_std_scaler

## 3. Feature Engineering For Validation Set

### 3.1. Outliers Removal

In [15]:
# Filter the validation set using `iqr_data`, the IQR state produced by the
# training-set fit, rather than statistics computed on validation data.
valid_set_rmout = utils.transform_iqr_outliers_removal(valid_set, iqr_data)

### 3.2. Feature Scaling

#### 3.2.1. Min-Max Normalization

In [16]:
# Min-max scale the unfiltered validation set using the artifact at
# config["minmax_path"], the path the training-set min-max fit serialized to.
valid_set_minmax = utils.transform_using_scaler(
    valid_set,
    config["predictors_feature_engineered"],
    config["minmax_path"]
)

In [17]:
# Min-max scale the outlier-filtered validation set using the artifact at
# config["rmout_minmax_path"] (fitted on the outlier-filtered training set).
valid_set_rmout_minmax = utils.transform_using_scaler(
    valid_set_rmout,
    config["predictors_feature_engineered"],
    config["rmout_minmax_path"]
)

#### 3.2.2. Standardization

In [18]:
# Standardize the unfiltered validation set using the artifact at
# config["std_path"] (fitted on the unfiltered training set).
valid_set_std = utils.transform_using_scaler(
    valid_set,
    config["predictors_feature_engineered"],
    config["std_path"]
)

In [19]:
# Standardize the outlier-filtered validation set using the artifact at
# config["rmout_std_path"] (fitted on the outlier-filtered training set).
valid_set_rmout_std = utils.transform_using_scaler(
    valid_set_rmout,
    config["predictors_feature_engineered"],
    config["rmout_std_path"]
)

In [20]:
# Standardize the already min-max-scaled validation set using the artifact at
# config["minmax_std_path"] (fitted on the min-max-scaled training set).
# The input must be valid_set_minmax, not valid_set, to mirror the training chain.
valid_set_minmax_std = utils.transform_using_scaler(
    valid_set_minmax,
    config["predictors_feature_engineered"],
    config["minmax_std_path"]
)

In [21]:
# Last step of the full chain for validation data: standardize the
# outlier-filtered, min-max-scaled set using the artifact at
# config["rmout_minmax_std_path"].
valid_set_rmout_minmax_std = utils.transform_using_scaler(
    valid_set_rmout_minmax,
    config["predictors_feature_engineered"],
    config["rmout_minmax_std_path"]
)

## 4. Feature Engineering For Test Set

### 4.1. Outliers Removal

In [22]:
# Filter the test set using `iqr_data`, the IQR state produced by the
# training-set fit.
# NOTE(review): if this drops rows, the evaluation set for the "rmout" variants
# differs from the full test set -- confirm that is intended.
test_set_rmout = utils.transform_iqr_outliers_removal(test_set, iqr_data)

### 4.2. Feature Scaling

#### 4.2.1. Min-Max Normalization

In [23]:
# Min-max scale the unfiltered test set using the artifact at
# config["minmax_path"], the path the training-set min-max fit serialized to.
test_set_minmax = utils.transform_using_scaler(
    test_set,
    config["predictors_feature_engineered"],
    config["minmax_path"]
)

In [24]:
# Min-max scale the outlier-filtered test set using the artifact at
# config["rmout_minmax_path"] (fitted on the outlier-filtered training set).
test_set_rmout_minmax = utils.transform_using_scaler(
    test_set_rmout,
    config["predictors_feature_engineered"],
    config["rmout_minmax_path"]
)

#### 4.2.2. Standardization

In [25]:
# Standardize the unfiltered test set using the artifact at
# config["std_path"] (fitted on the unfiltered training set).
test_set_std = utils.transform_using_scaler(
    test_set,
    config["predictors_feature_engineered"],
    config["std_path"]
)

In [26]:
# Standardize the outlier-filtered test set using the artifact at
# config["rmout_std_path"] (fitted on the outlier-filtered training set).
test_set_rmout_std = utils.transform_using_scaler(
    test_set_rmout,
    config["predictors_feature_engineered"],
    config["rmout_std_path"]
)

In [27]:
# Standardize the already min-max-scaled test set using the artifact at
# config["minmax_std_path"] (fitted on the min-max-scaled training set).
# The input must be test_set_minmax, not test_set, to mirror the training chain.
test_set_minmax_std = utils.transform_using_scaler(
    test_set_minmax,
    config["predictors_feature_engineered"],
    config["minmax_std_path"]
)

In [28]:
# Last step of the full chain for test data: standardize the outlier-filtered,
# min-max-scaled set using the artifact at config["rmout_minmax_std_path"].
test_set_rmout_minmax_std = utils.transform_using_scaler(
    test_set_rmout_minmax,
    config["predictors_feature_engineered"],
    config["rmout_minmax_std_path"]
)

## 5. Serialize Dataset