In [1]:
import pandas as pd
import src.util as utils
from imblearn.over_sampling import SMOTEN
from sklearn.preprocessing import StandardScaler

## Load Configuration File

In [2]:
config = utils.load_config()

## Load Dataset

In [3]:
def load_dataset(config_data: dict):
    x_train = utils.pickle_load(config_data["train_set_path"][0])
    y_train = utils.pickle_load(config_data["train_set_path"][1])
    x_valid = utils.pickle_load(config_data["valid_set_path"][0])
    y_valid = utils.pickle_load(config_data["valid_set_path"][1])
    x_test = utils.pickle_load(config_data["test_set_path"][0])
    y_test = utils.pickle_load(config_data["test_set_path"][1])

    return x_train, x_valid, x_test, y_train, y_valid, y_test

In [4]:
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(config)

## Imputation

In [7]:
def fill_missing_values(df):
    data = df.copy()
    for col in data.columns:
        if data[col].isna().sum() > 1:
            mean_value = data[col].mean()
            data[col].fillna(value=mean_value, inplace=True)
    return data

In [8]:
X_train_imp = fill_missing_values(x_train)
X_valid_imp = fill_missing_values(x_valid)
X_test_imp = fill_missing_values(x_test)

## Column Encoding

In [9]:
def column_encoder(df):
    data = pd.get_dummies(df)

    return data

In [10]:
X_train_imp_encode = column_encoder(X_train_imp)
X_valid_imp_encode = column_encoder(X_valid_imp)
X_test_imp_encode = column_encoder(X_test_imp)

## Scaling Data

In [11]:
def scaler_model(df, config):
    scaler = StandardScaler()
    scaler.fit(df)
    utils.pickle_dump(scaler, config["standard_scaler"]) 

def scale_data(df, config):
    scaler = utils.pickle_load(config["standard_scaler"])
    set_x_scaled = scaler.transform(df)

    return set_x_scaled

In [15]:
scaler_model(X_test_imp_encode)
X_train_imp_encode_scaled = scale_data(X_train_imp_encode, config)
X_valid_imp_encode_scaled = scale_data(X_valid_imp_encode, config)
X_test_imp_encode_scaled = scale_data(X_test_imp_encode, config)

## Oversampling

In [16]:
print("train:", X_train_imp_encode_scaled.shape)
print("valid:", X_valid_imp_encode_scaled.shape)
print("test:", X_test_imp_encode_scaled.shape)

train: (7000, 23)
valid: (1500, 23)
test: (1500, 23)


In [13]:
def resample_data(set_x, set_y):
    sm = SMOTEN(k_neighbors=20, n_jobs=-1)
    X_train_res, y_train_res = sm.fit_resample(set_x, set_y)

    return X_train_res, y_train_res

In [14]:
X_train_imp_encode_scaled_bal, y_train_bal = resample_data(
    X_train_imp_encode_scaled, 
    y_train
)
print("train:", X_train_imp_encode_scaled_bal.shape)
print("valid:", X_valid_imp_encode_scaled.shape)
print("test:", X_test_imp_encode_scaled.shape)



train: (9614, 23)
valid: (1500, 23)
test: (1500, 23)


In [17]:
X_train_imp_encode_scaled_bal

array([[ 0.89639623, -1.48727652, -0.42677835, ..., -0.67028006,
         1.89397887, -0.34675493],
       [ 0.89639623,  0.67236993,  3.25234539, ...,  1.49191369,
        -0.52798899, -0.34675493],
       [ 0.13374445,  0.67236993,  1.41278352, ...,  1.49191369,
        -0.52798899, -0.34675493],
       ...,
       [ 0.89639623, -1.48727652, -0.42677835, ..., -0.67028006,
        -0.52798899, -0.34675493],
       [ 0.02741608,  0.67236993, -0.42677835, ..., -0.67028006,
        -0.52798899, -0.34675493],
       [ 0.89639623, -1.48727652, -0.42677835, ..., -0.67028006,
        -0.52798899, -0.34675493]])