## 1. Load Required Libraries

In [128]:
import src.util as utils
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

## 2. Load Configuration File

In [129]:
config = utils.load_config()

## 3. Load Dataset

In [130]:
def load_dataset(config_data: dict):
    # Load every set of data
    x_train = utils.pickle_load(config_data["train_set_path"][0])
    y_train = utils.pickle_load(config_data["train_set_path"][1])

    x_valid = utils.pickle_load(config_data["valid_set_path"][0])
    y_valid = utils.pickle_load(config_data["valid_set_path"][1])

    x_test = utils.pickle_load(config_data["test_set_path"][0])
    y_test = utils.pickle_load(config_data["test_set_path"][1])

    # Concatenate x and y each set
    train_set = pd.concat([x_train, y_train], axis = 1)
    valid_set = pd.concat([x_valid, y_valid], axis = 1)
    test_set = pd.concat([x_test, y_test], axis = 1)

    # Return 3 set of data
    return train_set, valid_set, test_set

In [131]:
train_set, valid_set, test_set = load_dataset(config)

## 4. Balancing Train Label

In [132]:
x_rus, y_rus = RandomUnderSampler(random_state = 42).fit_resample(
    train_set.drop(columns = config["label"]),
    train_set[config["label"]]
)
train_set_bal = pd.concat([x_rus, y_rus], axis = 1)

## 5. Removing Outliers

In [133]:
def remove_outliers(set_data):
    set_data = set_data.copy()
    list_of_set_data = list()

    for col_name in set_data.columns[:-1]:
        q1 = set_data[col_name].quantile(0.25)
        q3 = set_data[col_name].quantile(0.75)
        iqr = q3 - q1
        set_data_cleaned = set_data[~((set_data[col_name] < (q1 - 1.5 * iqr)) | (set_data[col_name] > (q3 + 1.5 * iqr)))].copy()
        list_of_set_data.append(set_data_cleaned.copy())
    
    set_data_cleaned = pd.concat(list_of_set_data)
    count_duplicated_index = set_data_cleaned.index.value_counts()
    used_index_data = count_duplicated_index[count_duplicated_index == (set_data.shape[1]-1)].index
    set_data_cleaned = set_data_cleaned.loc[used_index_data].drop_duplicates()

    return set_data_cleaned

In [134]:
train_set_bal_cleaned = remove_outliers(train_set_bal)

In [135]:
train_set_bal_cleaned

Unnamed: 0,Temperature[C],Humidity[%],Pressure[hPa],PM1.0,TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,Fire Alarm
0,28.170,43.68,937.321,1.85,125,400,12793,20595,0
24184,19.584,49.22,938.715,1.58,1159,400,12933,19435,1
14473,24.590,53.35,938.699,1.79,1243,415,12906,19414,1
14472,24.570,53.35,938.740,1.57,1212,405,12898,19435,1
14471,12.353,51.77,938.771,1.57,1124,408,12877,19442,1
...,...,...,...,...,...,...,...,...,...
7517,26.220,46.59,937.422,1.74,75,400,12784,20638,0
7795,18.664,46.07,937.508,2.25,88,400,12826,20683,0
7642,18.882,47.74,937.418,1.35,177,431,12764,20540,0
7583,26.538,39.05,937.565,2.24,57,427,12850,20728,0


## 6. Dump Trainset

In [136]:
utils.pickle_dump(train_set_bal_cleaned[config["predictors"]], config["train_feng_set_path"][0])
utils.pickle_dump(train_set_bal_cleaned[config["label"]], config["train_feng_set_path"][1])

utils.pickle_dump(valid_set[config["predictors"]], config["valid_feng_set_path"][0])
utils.pickle_dump(valid_set[config["label"]], config["valid_feng_set_path"][1])

utils.pickle_dump(test_set[config["predictors"]], config["test_feng_set_path"][0])
utils.pickle_dump(test_set[config["label"]], config["test_feng_set_path"][1])