In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import src.util as util
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

## 1. Import Configuration File

In [2]:
# Load the project configuration through the project's util helper.
# Later cells read dataset paths and the label-encoder path from this mapping.
config = util.load_config()

## 2. Load Dataset

In [3]:
def load_dataset(config_data: dict) -> tuple:
    """Load the train, validation and test splits with x and y joined.

    Parameters
    ----------
    config_data : dict
        Configuration mapping. The keys ``"train_set_path"``,
        ``"valid_set_path"`` and ``"test_set_path"`` must each hold a sequence
        whose item 0 is the pickle path of the split's x data and whose item 1
        is the pickle path of the split's y data.

    Returns
    -------
    tuple
        ``(train_set, valid_set, test_set)``. Each element is the column-wise
        concatenation (``axis=1``) of that split's x and y objects.
    """
    def _load_split(path_key: str) -> pd.DataFrame:
        # Load the x/y pair of one split and join them column-wise.
        x_part = util.pickle_load(config_data[path_key][0])
        y_part = util.pickle_load(config_data[path_key][1])
        return pd.concat([x_part, y_part], axis=1)

    # Return the three splits in the fixed order train, valid, test.
    return (
        _load_split("train_set_path"),
        _load_split("valid_set_path"),
        _load_split("test_set_path"),
    )

In [4]:
# Build the three splits (x and y joined column-wise) from the configured pickle paths.
train_set, valid_set, test_set = load_dataset(config)

## 3. Join Categories

In [5]:
def join_label_categori(set_data, config_data):
    """Check that the label column is present and return a copy of the data.

    Parameters
    ----------
    set_data : pd.DataFrame
        Data set that is expected to contain the label column.
    config_data : dict
        Configuration mapping; ``config_data["label"]`` names the label column.

    Returns
    -------
    pd.DataFrame
        A copy of ``set_data``. No categories are merged or renamed here.

    Raises
    ------
    RuntimeError
        If the label column is not among the columns of ``set_data``.
    """
    # Guard clause: refuse data that lacks the configured label column.
    available_columns = set_data.columns.to_list()
    if config_data["label"] not in available_columns:
        raise RuntimeError("Kolom label tidak terdeteksi pada set data yang diberikan!")

    # Hand back an independent copy so the caller's frame is never shared.
    return set_data.copy()

## 4. Handling Missing Values

In [6]:
def categorical_nan_detector(set_data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``set_data`` with missing categorical values labelled.

    Every column selected by ``select_dtypes(include=["object"])`` has its
    missing entries replaced by the string ``"UNKNOWN"``. Other columns and the
    input frame itself are left untouched.

    Parameters
    ----------
    set_data : pd.DataFrame
        Data set to clean.

    Returns
    -------
    pd.DataFrame
        The cleaned copy.
    """
    filled = set_data.copy()

    # Only object-dtype columns are treated as categorical here.
    for name in filled.select_dtypes(include=["object"]).columns:
        filled[name] = filled[name].fillna("UNKNOWN")

    return filled



In [7]:
def numereical_nan_detector(set_data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``set_data`` with numeric NaNs replaced by the median.

    Each ``float64`` / ``int64`` column that contains at least one missing
    value is filled with that column's own median, computed on the data set
    that was passed in. A column made up entirely of NaN stays unchanged
    because its median is NaN as well.

    Parameters
    ----------
    set_data : pd.DataFrame
        Data set to clean.

    Returns
    -------
    pd.DataFrame
        The cleaned copy; the input frame is not modified.
    """
    # Work on a copy so the caller's frame is left untouched.
    set_data = set_data.copy()

    numerical_columns = set_data.select_dtypes(include=['float64', 'int64']).columns
    for column in numerical_columns:
        if set_data[column].isnull().any():
            median = set_data[column].median()
            # Assign the filled column back explicitly. A chained
            # ``set_data[column].fillna(..., inplace=True)`` acts on a
            # temporary object and does not update the frame under pandas
            # Copy-on-Write.
            set_data[column] = set_data[column].fillna(median)

    return set_data


In [8]:
# Impute the training split: categorical NaNs first, then numeric NaNs.
train_set = train_set.pipe(categorical_nan_detector).pipe(numereical_nan_detector)

In [9]:
# Impute the validation split itself. Passing train_set here would replace
# valid_set with imputed training data and discard the real validation rows.
valid_set = valid_set.pipe(categorical_nan_detector).pipe(numereical_nan_detector)

In [10]:
# Impute the test split itself. Passing train_set here would replace
# test_set with imputed training data and discard the real test rows.
test_set = test_set.pipe(categorical_nan_detector).pipe(numereical_nan_detector)

## 5. Encoding Categorical Features

In [11]:
def le_fit_transform(data_sets: list, config: dict) -> list:
    """Fit one LabelEncoder per categorical column and encode every data set.

    The categorical columns are the object-dtype columns of the first data
    set. Each encoder is fitted on that column's values from all data sets
    together, then used to transform the column in each data set. The dict of
    fitted encoders (keyed by column name) is pickled to
    ``config["le_encoder_path"]``.

    Parameters
    ----------
    data_sets : list
        DataFrames to encode. Every data set must contain the categorical
        columns of the first one.
    config : dict
        Configuration mapping providing ``"le_encoder_path"``.

    Returns
    -------
    list
        Encoded copies of the data sets, in the same order as the input. The
        input frames themselves are not modified.
    """
    # Work on copies so the caller's frames are left untouched, like the
    # copy-first NaN helpers in this notebook.
    encoded_sets = [ds.copy() for ds in data_sets]

    # The first data set decides which columns count as categorical; an empty
    # input simply yields no columns and an empty result.
    categorical_cols = (
        encoded_sets[0].select_dtypes(include=['object']).columns.tolist()
        if encoded_sets else []
    )

    le_encoders = {col: LabelEncoder() for col in categorical_cols}

    for col in categorical_cols:
        # Fit on the values of every data set together so that transform
        # never meets a category the encoder has not seen.
        all_data = pd.concat([ds[col] for ds in encoded_sets], axis=0)
        le_encoders[col].fit(all_data)
        for ds in encoded_sets:
            ds[col] = le_encoders[col].transform(ds[col])

    # Persist the fitted encoders so they can be reloaded later.
    util.pickle_dump(le_encoders, config["le_encoder_path"])

    return encoded_sets

In [12]:
def le_transform(data: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Encode the object-dtype columns of ``data`` with the saved encoders.

    Parameters
    ----------
    data : pd.DataFrame
        Data set to encode.
    config : dict
        Configuration mapping; ``config["le_encoder_path"]`` is the pickle
        holding a dict of fitted encoders keyed by column name.

    Returns
    -------
    pd.DataFrame
        An encoded copy of ``data``; the input frame is not modified.

    Raises
    ------
    KeyError
        If an object-dtype column of ``data`` has no saved encoder.
    """
    le_encoders = util.pickle_load(config["le_encoder_path"])

    # Work on a copy so the caller's frame is left untouched, like the
    # copy-first NaN helpers in this notebook.
    encoded = data.copy()

    # Only columns that are still object dtype need encoding.
    categorical_cols = encoded.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        encoded[col] = le_encoders[col].transform(encoded[col])

    return encoded


In [13]:
# Fit the label encoders on all three splits together and keep the encoded sets that are returned.
train_set, valid_set, test_set = le_fit_transform([train_set, valid_set, test_set], config)

In [14]:
# Run every split through the saved encoders, in train/valid/test order.
# NOTE(review): the splits were already encoded by le_fit_transform in the
# previous cell, so there are presumably no object columns left to
# transform here - confirm whether this cell is still needed.
train_set, valid_set, test_set = (
    le_transform(split, config) for split in (train_set, valid_set, test_set)
)

## 6. Dump Dataset

In [20]:
# Separate the "Profit" column (y) from the remaining columns (x) of the training split.
y_train = train_set["Profit"]
x_train = train_set.drop(columns=["Profit"])

In [None]:
# Persist the processed training split (x and y pickled separately).
util.pickle_dump(x_train, "data/processed/x_train_feng.pkl")
util.pickle_dump(y_train, "data/processed/y_train_feng.pkl")

# Validation split: everything except "Profit" is x, the "Profit" column is y.
util.pickle_dump(valid_set.drop(columns = "Profit"), "data/processed/x_valid_feng.pkl")
util.pickle_dump(valid_set.Profit, "data/processed/y_valid_feng.pkl")

# Test split: same x/y separation as the validation split.
util.pickle_dump(test_set.drop(columns = "Profit"), "data/processed/x_test_feng.pkl")
util.pickle_dump(test_set.Profit, "data/processed/y_test_feng.pkl")