In [1]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import src.utils as utils
from src.utils import Pipeline as ModPipe
from src.utils import OutliersRemoval
from src.utils import IamHere

In [2]:
def join_cat(set_data, params):
    """Merge two target categories into one new label.

    Every value of the target column equal to ``params["target_categories"][1]``
    or ``params["target_categories"][2]`` ends up as
    ``params["target_categories_new"][1]``. The input frame is not modified;
    a changed copy is returned.

    Parameters
    ----------
    set_data : pandas.DataFrame
        Data set that must contain the target column.
    params : dict
        Configuration holding ``"target"`` (name of the label column),
        ``"target_categories"`` and ``"target_categories_new"``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``set_data`` with the target categories merged.

    Raises
    ------
    RuntimeError
        If the target column is missing from ``set_data``.
    """
    target_col = params["target"]
    if target_col not in set_data.columns:
        raise RuntimeError("Kolom label tidak terdeteksi pada set data yang diberikan!")

    first_old = params["target_categories"][1]
    second_old = params["target_categories"][2]
    merged_label = params["target_categories_new"][1]

    set_data = set_data.copy()
    # Assign the result back instead of calling ``replace(..., inplace=True)``
    # on an attribute-accessed column: that chained in-place call does not
    # update the parent frame under pandas Copy-on-Write. The column is also
    # looked up through ``params["target"]`` (the name that was just checked)
    # rather than a hard-coded attribute name.
    # The two replacements stay sequential to keep the original semantics.
    set_data[target_col] = (
        set_data[target_col]
        .replace(first_old, second_old)
        .replace(second_old, merged_label)
    )
    return set_data
def nan_detector(set_data):
    """Return a new frame in which every ``-1`` value is replaced by NaN.

    The frame passed in is left unchanged.
    """
    cleaned = set_data.replace(to_replace=-1, value=np.nan)
    return cleaned

# Path of the project configuration file and the configuration loaded from it
# via utils.load_yaml.
config_dir = "config/config.yaml"
config = utils.load_yaml(config_dir)

# Read the x_* / y_* parts of the train, validation and test splits from the
# paths stored in the configuration.
x_train = utils.deserialize_data(config["x_train_path"])
y_train = utils.deserialize_data(config["y_train_path"])
x_valid = utils.deserialize_data(config["x_valid_path"])
y_valid = utils.deserialize_data(config["y_valid_path"])
x_test = utils.deserialize_data(config["x_test_path"])
y_test = utils.deserialize_data(config["y_test_path"])

# Put predictors and target of each split into a single frame
# (presumably a column-wise concatenation; utils.combine_dataframe is not visible here).
train_set = utils.combine_dataframe([x_train, y_train], axis = 1)
valid_set = utils.combine_dataframe([x_valid, y_valid], axis = 1)
test_set = utils.combine_dataframe([x_test, y_test], axis = 1)

# Merge target categories; join_cat raises RuntimeError when the target column is missing.
train_set = join_cat(train_set, config)
valid_set = join_cat(valid_set, config)
test_set = join_cat(test_set, config)

# Keep only the configured predictor columns followed by the target column.
train_set = train_set[config["predictors"] + [config["target"]]]
valid_set = valid_set[config["predictors"] + [config["target"]]]
test_set = test_set[config["predictors"] + [config["target"]]]

# Turn the -1 sentinel into NaN so the imputers in the pipeline can handle it.
train_set = nan_detector(train_set)
valid_set = nan_detector(valid_set)
test_set = nan_detector(test_set)

# Split each frame back into predictors and target
# (utils.split_predictor_target is not visible here; presumably driven by config["predictors"] / config["target"]).
x_train, y_train = utils.split_predictor_target(train_set, config)
x_valid, y_valid = utils.split_predictor_target(valid_set, config)
x_test, y_test = utils.split_predictor_target(test_set, config)

# The label encoder is fitted on the training labels only and then applied to every split.
le = LabelEncoder()
le.fit(y_train)

y_train = le.transform(y_train)
y_valid = le.transform(y_valid)
y_test = le.transform(y_test)

# Column groups are taken by position from config["predictors"]:
# index 0 -> categorical, indices 1-3 -> mean-imputed numerics, index 4 onwards -> median-imputed numerics.
numeric_mean_features = config["predictors"][1:4]
numeric_median_features = config["predictors"][4:]
# NOTE(review): the first positional argument of make_column_selector is a regex
# pattern, so this selects every column whose name matches predictors[0], not
# only the column with exactly that name - confirm this is intended.
categoric_features = make_column_selector(config["predictors"][0])

# Categorical branch: one-hot encoding (unknown categories are ignored, dense output).
# The IamHere steps wrap each stage with a pickle path under log/; they are the
# "iam here" logging mentioned in the to-do list (implementation lives in src.utils).
categorical_transformer = Pipeline(
    steps = [
        ('log_categoricraw', IamHere('log/0_categoric_raw.pkl')),
        ('OneHotEncoder', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)),
        ('log_categoricencoded', IamHere('log/1_categoric_encoded.pkl')),
    ]
)

# Numeric branch 1: mean imputation followed by standard scaling.
numeric_mean_transformer = Pipeline(
    steps = [
        ('NumericImputer', SimpleImputer(strategy = 'mean')),
        ('log_numericimputed', IamHere('log/2_numeric_mean_imputed.pkl')),
        ('NumericStandardScaler', StandardScaler()),
        ('log_numericscaled', IamHere('log/3_numeric_mean_scaled.pkl'))
    ]
)

# Numeric branch 2: median imputation followed by standard scaling.
numeric_median_transformer = Pipeline(
    steps = [
        ('NumericImputer', SimpleImputer(strategy = 'median')),
        ('log_numericimputed', IamHere('log/4_numeric_median_imputed.pkl')),
        ('NumericStandardScaler', StandardScaler()),
        ('log_numericscaled', IamHere('log/5_numeric_median_scaled.pkl'))
    ]
)

# Route each column group to its branch; n_jobs = -1 asks scikit-learn to use all processors.
preprocessor = ColumnTransformer(
    transformers = [
        ('categoric_features', categorical_transformer, categoric_features),
        ('numeric_mean_feature', numeric_mean_transformer, numeric_mean_features),
        ('numeric_median_feature', numeric_median_transformer, numeric_median_features)
    ],
    n_jobs = -1,
    verbose = True
)

# ModPipe is src.utils.Pipeline (imported under an alias), not sklearn's Pipeline.
# Presumably it is needed because OutliersRemoval drops rows - TODO confirm in src/utils.py.
outlier_removal_transformer = ModPipe(steps=[
    ('log_combined', IamHere('log/6_combined.pkl')),
    ('OutliersRemover', OutliersRemoval()),
    ('log_outliersremoved', IamHere('log/7_outliers_removed.pkl')),
])

# Full model: preprocessing -> outlier removal -> random forest with a fixed seed.
pipeline = ModPipe(
    steps = [
        ('preprocessor', preprocessor),
        ('outliers', outlier_removal_transformer),
        ('classifier', RandomForestClassifier(random_state = 42))
    ]
)

In [3]:
# Fit the whole pipeline (preprocessing, outlier removal, classifier) on the training split.
pipeline.fit(x_train, y_train)

In [5]:
# Predict encoded labels for the validation split.
# NOTE(review): the recorded output holds 236 predictions while the recorded
# y_valid holds 272 labels, so rows appear to be dropped at predict time
# (see to-do item 1); these predictions are therefore not aligned with y_valid.
pipeline.predict(x_valid)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1])

In [6]:
# Display the label-encoded validation targets for a visual comparison with the predictions.
y_valid

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1])

In [None]:
"""
To do:
1. how can outlier removal be skipped when the pipeline is used on online data? Otherwise some online samples may be deleted because they are detected as outliers
2. clean up the "IamHere" logging steps
3. give it documentation
4. adding cross validation mechanism
5. adding threshold tuning
6. adding pytest
7. adding api and its backend
8. adding docker
9. clean up config.yaml
10. clean up utils.py
Good luck!
"""