## 1. Load Required Libraries

In [1]:
import src.util as utils
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

## 2. Load Configuration File

In [2]:
config = utils.load_config()

## 3. Load Dataset

In [3]:
def load_dataset(config_data: dict):
    # Load every set of data
    x_train = utils.pickle_load(config_data["train_set_path"][0])
    y_train = utils.pickle_load(config_data["train_set_path"][1])

    x_valid = utils.pickle_load(config_data["valid_set_path"][0])
    y_valid = utils.pickle_load(config_data["valid_set_path"][1])

    x_test = utils.pickle_load(config_data["test_set_path"][0])
    y_test = utils.pickle_load(config_data["test_set_path"][1])

    # Concatenate x and y each set
    train_set = pd.concat([x_train, y_train], axis = 1)
    valid_set = pd.concat([x_valid, y_valid], axis = 1)
    test_set = pd.concat([x_test, y_test], axis = 1)

    # Return 3 set of data
    return train_set, valid_set, test_set

In [4]:
train_set, valid_set, test_set = load_dataset(config)

## 4. Balancing Train Label

In [5]:
x_rus, y_rus = RandomUnderSampler(random_state = 42).fit_resample(
    train_set.drop(columns = config["label"]),
    train_set[config["label"]]
)
train_set_bal = pd.concat([x_rus, y_rus], axis = 1)

## 5. Removing Outliers

In [6]:
def remove_outliers(set_data):
    set_data = set_data.copy()
    list_of_set_data = list()

    for col_name in set_data.columns[:-1]:
        q1 = set_data[col_name].quantile(0.25)
        q3 = set_data[col_name].quantile(0.75)
        iqr = q3 - q1
        set_data_cleaned = set_data[~((set_data[col_name] < (q1 - 1.5 * iqr)) | (set_data[col_name] > (q3 + 1.5 * iqr)))].copy()
        list_of_set_data.append(set_data_cleaned.copy())
    
    set_data_cleaned = pd.concat(list_of_set_data)
    count_duplicated_index = set_data_cleaned.index.value_counts()
    used_index_data = count_duplicated_index[count_duplicated_index == (set_data.shape[1]-1)].index
    set_data_cleaned = set_data_cleaned.loc[used_index_data].drop_duplicates()

    return set_data_cleaned

In [7]:
train_set_bal_cleaned = remove_outliers(train_set_bal)

In [8]:
train_set_bal_cleaned

Unnamed: 0,Temperature[C],Humidity[%],Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,CNT,Fire Alarm
53451,28.170,43.68,937.321,1.85,1.93,12.76,1.991,0.045,125,400,12793,20595,2309,0
6038,-8.501,53.28,939.662,1.46,1.51,10.02,1.563,0.035,142,400,13186,20131,6038,1
8829,-5.129,52.19,939.286,0.39,0.40,2.65,0.414,0.009,342,400,13073,19915,8829,1
45860,24.610,52.85,938.691,2.02,2.10,13.91,2.170,0.049,1352,415,12946,19393,20866,1
33650,20.470,53.19,939.318,0.45,0.47,3.12,0.487,0.011,339,400,13091,19923,8656,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53706,28.770,42.23,937.317,1.80,1.87,12.41,1.935,0.044,163,429,12771,20558,2564,0
26094,16.330,50.47,939.751,0.17,0.18,1.19,0.186,0.004,13,400,13050,19951,1100,0
54937,24.550,44.47,937.394,1.92,1.99,13.21,2.060,0.047,0,400,13420,21217,3795,0
53444,25.650,48.38,937.334,1.95,2.03,13.43,2.094,0.047,142,412,12792,20580,2302,0


In [9]:
train_set_bal_cleaned.isnull().sum()

Temperature[C]    0
Humidity[%]       0
Pressure[hPa]     0
PM1.0             0
PM2.5             0
NC0.5             0
NC1.0             0
NC2.5             0
TVOC[ppb]         0
eCO2[ppm]         0
Raw H2            0
Raw Ethanol       0
CNT               0
Fire Alarm        0
dtype: int64

## 6. Dump Trainset

In [10]:
utils.pickle_dump(train_set_bal_cleaned[config["predictors"]], config["train_feng_set_path"][0])
utils.pickle_dump(train_set_bal_cleaned[config["label"]], config["train_feng_set_path"][1])

utils.pickle_dump(valid_set[config["predictors"]], config["valid_feng_set_path"][0])
utils.pickle_dump(valid_set[config["label"]], config["valid_feng_set_path"][1])

utils.pickle_dump(test_set[config["predictors"]], config["test_feng_set_path"][0])
utils.pickle_dump(test_set[config["label"]], config["test_feng_set_path"][1])