# Flow Modelling for Preparing the PY Script

Notebook ini digunakan untuk eksperimentasi script sebelum dibawa ke bentuk script .py.

# 1. Utils

In [576]:
import yaml
import joblib

In [577]:
# Relative path of the YAML configuration file read by load_config().
config_path = r"../config/config.yaml"

In [578]:
def load_config()->dict:
    """
    Load the project configuration from the YAML file at ``config_path``.

    Returns:
        The parsed content of the YAML file (annotated as a dict).

    Raises:
        RuntimeError: if the file at ``config_path`` does not exist. The
            underlying ``FileNotFoundError`` is attached as ``__cause__``.
    """
    try:
        # Explicit encoding so parsing does not depend on the platform default.
        with open(config_path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f)
    except FileNotFoundError as fe:
        # Chain explicitly so the traceback shows the real cause.
        raise RuntimeError("Parameter File Not Found!") from fe

def load_pickle(file_path: str):
    """
    Load an object from ``file_path`` using joblib.

    Args:
        file_path: location of the serialized file.

    Returns:
        Whatever object ``joblib.load`` reconstructs from the file.

    Raises:
        RuntimeError: if ``file_path`` does not exist. The underlying
            ``FileNotFoundError`` is attached as ``__cause__``.
    """
    # NOTE(review): joblib.load unpickles arbitrary objects; only use it on
    # files produced by this project, never on untrusted input.
    try:
        with open(file_path, "rb") as f:
            return joblib.load(f)
    except FileNotFoundError as fe:
        # Name the missing file; the previous message was copied from
        # load_config and pointed at the wrong kind of file.
        raise RuntimeError(f"Pickle file not found: {file_path}") from fe

def dump_pickle(file, file_path: str):
    """
    Serialize ``file`` to ``file_path`` with joblib.

    Returns whatever ``joblib.dump`` returns.
    """
    written = joblib.dump(file, file_path)
    return written

In [579]:
# Notebook-level configuration; functions below that read the global `config` rely on it.
config = load_config()

In [580]:
config

{'raw_dataset_path': '../1_data/raw_data/5year.arff',
 'raw_df_path': '../1_data/processed_data/raw_df.pkl',
 'train_set_path': {'x_train': '../1_data/processed_data/x_train.pkl',
  'y_train': '../1_data/processed_data/y_train.pkl'},
 'test_set_path': {'x_test': '../1_data/processed_data/x_test.pkl',
  'y_test': '../1_data/processed_data/y_test.pkl'},
 'prep_rus_path': {'x_rus': '../1_data/processed_data/x_rus.pkl',
  'y_rus': '../1_data/processed_data/y_rus.pkl'},
 'prep_sm_path': {'x_sm': '../1_data/processed_data/x_sm.pkl',
  'y_sm': '../1_data/processed_data/y_sm.pkl'},
 'prep_test_path': {'x_test': '../1_data/processed_data/x_test_prep.pkl',
  'y_test': '../1_data/processed_data/y_test.pkl'},
 'production_model_path': '../4_models/model_prod.pkl',
 'training_log_path': '../5_log/training_log.json',
 'scaler_path': '../4_models/std_scaler_prod.pkl',
 'float_columns': ['Attr21',
  'Attr13',
  'Attr27',
  'Attr34',
  'Attr24',
  'Attr35',
  'Attr51',
  'Attr6',
  'Attr56',
  'Attr49'

# 2. Data Collections

In [581]:
import pandas as pd
import copy
# from src.utils import load_config, dump_pickle
from scipy.io import arff
from sklearn.model_selection import train_test_split

In [582]:
def read_raw_df(raw_data_path: str):
    """
    Read an ARFF file into a pandas DataFrame.

    ``arff.loadarff`` returns a ``(data, meta)`` pair; only the data records
    are turned into the DataFrame, the metadata is discarded.
    """
    records, _meta = arff.loadarff(raw_data_path)
    return pd.DataFrame(records)

In [583]:
def cleansing(dataframe: pd.DataFrame)->pd.DataFrame:
    """
    Return a copy of ``dataframe`` whose ``class`` column is cast to int.

    Values in ``class`` may be bytes (e.g. ``b"1"``) or strings; bytes are
    decoded explicitly before the integer cast. The previous
    ``.replace("b", '')`` only replaced cells whose whole value was ``"b"``,
    so it never stripped a bytes prefix.

    Rows with missing values are not dropped here.

    Args:
        dataframe: input frame containing a ``class`` column.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    temp = dataframe.copy(deep=True)
    temp["class"] = (
        temp["class"]
        .map(lambda value: value.decode("utf-8") if isinstance(value, bytes) else value)
        .astype(int)
    )
    return temp

In [584]:
def split_data(dataframe: pd.DataFrame) -> tuple:
    """
    Split ``dataframe`` into train and test parts (80/20, stratified on the label).

    The label column name is read from the global ``config["label"]``.

    Returns:
        ``(x_train, x_test, y_train, y_test)``.
    """
    frame = copy.deepcopy(dataframe)

    features = frame.drop(columns=config["label"])
    target = frame[config["label"]]

    parts = train_test_split(
        features,
        target,
        test_size=0.2,
        stratify=target,
        random_state=42,
    )
    x_train, x_test, y_train, y_test = parts

    return x_train, x_test, y_train, y_test

In [585]:
def imputation_selections(x_train: pd.DataFrame, x_test: pd.DataFrame, fill_value: float = -9999999999) -> tuple:
    """
    Keep only the configured predictor columns and fill missing values.

    The predictor list is read from the global ``config["predictors"]``.

    Args:
        x_train: training features.
        x_test: test features.
        fill_value: value used to replace missing entries (defaults to the
            sentinel the pipeline has always used).

    Returns:
        ``(x_train, x_test)`` as new DataFrames; the inputs are not modified.
    """
    predictors = config["predictors"]

    # Selecting a list of columns and calling fillna without inplace both
    # return new objects, so no defensive copy or in-place mutation is needed.
    x_train = x_train[predictors].fillna(fill_value)
    x_test = x_test[predictors].fillna(fill_value)

    return x_train, x_test

In [586]:
def int_to_float(dataframe: pd.DataFrame)->pd.DataFrame:
    """
    Return a copy of ``dataframe`` with the columns listed in the global
    ``config["float_columns"]`` cast to ``float64``.
    """
    converted = copy.deepcopy(dataframe)
    as_float = converted[config["float_columns"]].astype("float64")
    converted[config["float_columns"]] = as_float
    return converted

In [587]:
def main():
    """
    Run the data-collection pipeline.

    Reads the raw dataset, stores it, cleans it, splits it into train/test,
    selects and imputes the predictors, casts the configured columns to float
    and writes the four resulting objects to the paths given in the config.
    """
    config = load_config()

    raw_df = read_raw_df(config["raw_dataset_path"])
    dump_pickle(raw_df, config["raw_df_path"])

    clean_df = cleansing(raw_df)
    x_train, x_test, y_train, y_test = split_data(clean_df)
    x_train, x_test = imputation_selections(x_train, x_test)
    x_train = int_to_float(x_train)
    x_test = int_to_float(x_test)

    # (object, config section, key) in the order they are written to disk.
    outputs = (
        (x_train, "train_set_path", "x_train"),
        (y_train, "train_set_path", "y_train"),
        (x_test, "test_set_path", "x_test"),
        (y_test, "test_set_path", "y_test"),
    )
    for obj, section, key in outputs:
        dump_pickle(obj, config[section][key])

In [588]:
# Run the data-collection pipeline defined in the cell above.
main()

# 3. Data Preprocessing

In [589]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import copy
# from src.utils import load_config, load_pickle, dump_pickle
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [590]:
def std_scaler_fit(x_train: pd.DataFrame):
    """
    Create a ``StandardScaler``, fit it on ``x_train`` and return it.
    """
    fitted = StandardScaler()
    fitted.fit(x_train)
    return fitted

In [591]:
def std_scaler_transform(features: pd.DataFrame, scaler: object) -> pd.DataFrame:
    """
    Transform ``features`` with an already fitted scaler.

    Args:
        features: frame to transform; it is copied first, so the caller's
            object is not touched.
        scaler: fitted object exposing ``feature_names_in_`` and
            ``transform`` (e.g. the result of ``std_scaler_fit``).

    Returns:
        A DataFrame with the scaled values, the scaler's feature names as
        columns and the same index as ``features``.
    """
    col_names = scaler.feature_names_in_

    feat = features.copy(deep=True)
    scaled = scaler.transform(feat)

    # Keep the original index so the result stays aligned with the matching
    # target series (previously the index was reset to 0..n-1).
    return pd.DataFrame(scaled, columns=col_names, index=features.index)

In [592]:
def rus_balancer(x_train, y_train):
    """
    Balance the classes with ``RandomUnderSampler`` (``random_state=42``).

    Returns:
        ``(x_rus, y_rus)``: the resampled features and labels.
    """
    resampled_x, resampled_y = RandomUnderSampler(random_state=42).fit_resample(x_train, y_train)
    return resampled_x, resampled_y

In [593]:
def sm_balancer(x_train, y_train):
    """
    Balance the classes with ``SMOTE`` (``random_state=42``).

    Returns:
        ``(x_sm, y_sm)``: the resampled features and labels.
    """
    resampled_x, resampled_y = SMOTE(random_state=42).fit_resample(x_train, y_train)
    return resampled_x, resampled_y

In [594]:
def main():
    """
    Run the preprocessing pipeline.

    Loads the train/test sets, fits and stores a standard scaler, scales both
    feature sets, builds an under-sampled and a SMOTE-balanced training set
    and writes the results to the paths given in the config.
    """
    config = load_config()

    x_train = load_pickle(config["train_set_path"]["x_train"])
    x_test = load_pickle(config["test_set_path"]["x_test"])

    y_train = load_pickle(config["train_set_path"]["y_train"])
    # Loaded as in the original flow; it is not used further in this function.
    y_test = load_pickle(config["test_set_path"]["y_test"])

    # Standardize: fit on train only, persist the scaler, apply to both sets.
    scaler = std_scaler_fit(x_train)
    dump_pickle(scaler, config["scaler_path"])
    x_train_scaled = std_scaler_transform(x_train, scaler)
    x_test_scaled = std_scaler_transform(x_test, scaler)

    # Class balancing: random under-sampling, then SMOTE.
    x_rus, y_rus = rus_balancer(x_train_scaled, y_train)
    x_sm, y_sm = sm_balancer(x_train_scaled, y_train)

    # (object, config section, key) in the order they are written to disk.
    artifacts = (
        (x_rus, "prep_rus_path", "x_rus"),
        (y_rus, "prep_rus_path", "y_rus"),
        (x_sm, "prep_sm_path", "x_sm"),
        (y_sm, "prep_sm_path", "y_sm"),
        (x_test_scaled, "prep_test_path", "x_test"),
    )
    for obj, section, key in artifacts:
        dump_pickle(obj, config[section][key])

In [595]:
# Run the preprocessing pipeline defined in the cell above.
main()

# 3. Modelling

In [596]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from datetime import datetime
from tqdm import tqdm
import yaml
import joblib
import json
import pandas as pd
import copy
import hashlib

# from src.utils import load_config, load_pickle, dump_pickle

In [597]:
def time_stamp():
    """Return the current local date and time as a ``datetime`` object."""
    current = datetime.now()
    return current

In [598]:
def create_log_template():
    """
    Build an empty training-log record.

    Returns:
        A dict mapping every log field name to its own fresh empty list.
    """
    field_names = (
        "model_name",
        "model_uid",
        "training_time",
        "training_date",
        "performance",
        "f1_score_avg",
        "precision_avg",
        "recall_avg",
        "data_configurations",
    )
    # A comprehension (not dict.fromkeys) so each field gets a distinct list.
    return {name: [] for name in field_names}

In [599]:
def training_log_updater(current_log, log_path):
    """
    Append ``current_log`` to the JSON log stored at ``log_path``.

    The file holds a JSON list of log records. If it does not exist yet, the
    log starts out empty and the file is created on write.

    Args:
        current_log: the record to append; a shallow copy is stored.
        log_path: path of the JSON log file.

    Returns:
        The full list of records, including the one just appended.
    """
    current_log = current_log.copy()

    try:
        with open(log_path, "r") as file:
            last_log = json.load(file)
    except FileNotFoundError:
        # No log yet: start from an empty history instead of writing "[]"
        # and reading it straight back.
        last_log = []

    last_log.append(current_log)

    # Serialize before opening for writing so a serialization error cannot
    # leave a truncated log file behind.
    payload = json.dumps(last_log)
    with open(log_path, "w") as file:
        file.write(payload)

    return last_log

In [600]:
def train_eval_model(list_of_model, prefix_model_name, x_train, y_train, data_configuration_name, x_valid, y_valid, log_path):
    """
    Fit every model in ``list_of_model``, evaluate it and log the results.

    Works on a deep copy of ``list_of_model``, so the caller's entries are
    left untouched. Each entry is fitted on ``(x_train, y_train)``, scored on
    ``(x_valid, y_valid)`` with ``classification_report`` and given a
    ``model_uid`` (MD5 of the start and finish timestamps). The collected
    record is appended to the JSON log at ``log_path``.

    Returns:
        ``(training_log, list_of_model)``: the updated log and the copied
        entries holding the fitted models and their uids.
    """
    trained_entries = copy.deepcopy(list_of_model)
    run_log = create_log_template()

    for entry in tqdm(trained_entries):
        full_name = "-".join((prefix_model_name, entry["model_name"]))

        # Time only the fit call.
        started_at = time_stamp()
        entry["model_object"].fit(x_train, y_train)
        ended_at = time_stamp()
        seconds_taken = (ended_at - started_at).total_seconds()

        predictions = entry["model_object"].predict(x_valid)
        report = classification_report(y_valid, predictions, output_dict = True)
        macro = report["macro avg"]

        # uid derived from the two timestamps of this training run.
        uid_source = str(started_at) + str(ended_at)
        uid = hashlib.md5(uid_source.encode()).hexdigest()
        entry["model_uid"] = uid

        run_log["model_name"].append(full_name)
        run_log["model_uid"].append(uid)
        run_log["training_time"].append(seconds_taken)
        run_log["training_date"].append(str(started_at))
        run_log["performance"].append(report)
        run_log["f1_score_avg"].append(macro["f1-score"])
        run_log["precision_avg"].append(macro["precision"])
        run_log["recall_avg"].append(macro["recall"])
        run_log["data_configurations"].append(data_configuration_name)

    updated_log = training_log_updater(run_log, log_path)

    return updated_log, trained_entries

In [601]:
def training_log_to_df(training_log):
    """
    Flatten a list of training-log records into one sorted DataFrame.

    Args:
        training_log: iterable of dict records (column name -> list of
            values), as returned by ``training_log_updater``.

    Returns:
        A DataFrame with all records stacked, sorted by ``f1_score_avg``
        (descending) then ``training_time`` (ascending), with a fresh
        0..n-1 index. An empty log yields an empty DataFrame.
    """
    # Build all frames first and concatenate once: concatenating inside the
    # loop re-copies the accumulated frame on every iteration. No progress
    # bar is shown, since this step is no longer an incremental loop.
    frames = [pd.DataFrame(log) for log in training_log]
    if not frames:
        return pd.DataFrame()

    training_res = pd.concat(frames)
    training_res = training_res.sort_values(
        ["f1_score_avg", "training_time"], ascending=[False, True]
    )
    return training_res.reset_index(drop=True)

In [602]:
def get_best_model(training_log_df, list_of_model):
    """
    Pick the best model according to the training log.

    The best entry is the row with the highest ``f1_score_avg`` (ties broken
    by the lowest ``training_time``). Its ``model_uid`` is looked up in
    ``list_of_model``; the matching fitted object is what gets used for
    production later.

    Args:
        training_log_df: DataFrame with at least ``model_uid``,
            ``f1_score_avg`` and ``training_time`` columns.
        list_of_model: dict mapping a data-configuration name to a list of
            entries with ``model_uid`` and ``model_object`` keys.

    Returns:
        The matching model object. If no entry matches, the model stored at
        the global ``config["production_model_path"]`` is loaded and returned
        instead.
    """
    best_model_info = training_log_df.sort_values(
        ["f1_score_avg", "training_time"], ascending=[False, True]
    ).iloc[0]
    best_uid = best_model_info["model_uid"]

    # Return on the first match instead of continuing to scan the remaining
    # configurations (the old `break` only left the inner loop).
    for models in list_of_model.values():
        for model_data in models:
            if model_data["model_uid"] == best_uid and model_data["model_object"] is not None:
                return model_data["model_object"]

    # The best logged model is not among the given ones: fall back to the
    # model already stored for production.
    model_object = load_pickle(config["production_model_path"])
    print("The best model not found in your list of model.")

    return model_object

In [603]:
def main():
    """
    Run the modelling pipeline.

    Loads the balanced training sets and the test set, trains the same set of
    candidate models on the under-sampled and on the SMOTE data, converts the
    training log to a DataFrame, picks the best model and stores it at the
    production model path.
    """
    config = load_config()

    # load preprocessed pickle of dataset
    x_rus = load_pickle(config["prep_rus_path"]["x_rus"])
    y_rus = load_pickle(config["prep_rus_path"]["y_rus"])

    x_sm = load_pickle(config["prep_sm_path"]["x_sm"])
    y_sm = load_pickle(config["prep_sm_path"]["y_sm"])

    x_test = load_pickle(config["prep_test_path"]["x_test"])
    y_test = load_pickle(config["prep_test_path"]["y_test"])

    # initiate models
    lgr_baseline = LogisticRegression()
    dct_baseline = DecisionTreeClassifier()
    rfc_baseline = RandomForestClassifier()
    knn_baseline = KNeighborsClassifier()
    xgb_baseline = XGBClassifier(seed=42)
    xgb_tuning = XGBClassifier(
        max_depth=6,
        min_child_weight=1,
        gamma=0.0,
        reg_alpha=1e-05,
        seed=42,
    )

    # (estimator, name suffix) for every candidate, in training order.
    candidates = (
        (lgr_baseline, ""),
        (dct_baseline, ""),
        (rfc_baseline, ""),
        (knn_baseline, ""),
        (xgb_baseline, ""),
        (xgb_tuning, "_tuning"),
    )

    # Same candidate entries for each data configuration; train_eval_model
    # deep-copies the list it receives, so sharing estimator objects is fine.
    list_of_model = {
        configuration: [
            {
                "model_name": estimator.__class__.__name__ + suffix,
                "model_object": estimator,
                "model_uid": "",
            }
            for estimator, suffix in candidates
        ]
        for configuration in ("undersampling", "smote")
    }

    # Train on the under-sampled data first, then on the SMOTE data.
    training_sets = (
        ("undersampling", x_rus, y_rus),
        ("smote", x_sm, y_sm),
    )
    for configuration, x_fit, y_fit in training_sets:
        training_log, trained_models = train_eval_model(
            list_of_model[configuration],
            "baseline_model",
            x_fit,
            y_fit,
            configuration,
            x_test,
            y_test,
            config["training_log_path"],
        )
        list_of_model[configuration] = copy.deepcopy(trained_models)

    # log to df (training_log is the one returned by the last training call)
    training_res = training_log_to_df(training_log)

    # get best model
    model = get_best_model(training_res, list_of_model)

    print(model)

    # save best model
    dump_pickle(model, config["production_model_path"])

In [612]:
# for i in range(1,10):
#     main()

In [None]:
# Run the modelling pipeline defined above.
main()

# 4. MAIN 

In [605]:
# import data_collection
# import praprocessing
# import modelling
from sklearn.metrics import classification_report
import pandas as pd

# from utils import load_config, load_pickle

In [606]:
def train():
    """
    Run the three pipeline stages in order: data collection, preprocessing,
    then modelling, by calling each module's ``main()``.

    NOTE(review): ``data_collections``, ``data_preprocessing`` and
    ``modelling`` are not imported anywhere in the visible notebook (the
    commented-out imports in the cell above use different names), so calling
    this as-is would presumably raise NameError. Confirm the module names.
    """

    data_collections.main()

    data_preprocessing.main()

    modelling.main()

In [607]:
# Entry point: load the stored production model and the datasets used to evaluate it.
# The names assigned here (model, x_train, y_train, x_test, y_test) are read by a later cell.
if __name__ == "__main__":

    config = load_config()

    # Retraining is disabled here; the already stored production model is used.
    # train()

    model = load_pickle(config["production_model_path"])
    
    # SMOTE-balanced training data.
    x_train = load_pickle(config["prep_sm_path"]["x_sm"])
    y_train = load_pickle(config["prep_sm_path"]["y_sm"])

    # Scaled test features and the test labels.
    x_test = load_pickle(config["prep_test_path"]["x_test"])
    y_test = load_pickle(config["prep_test_path"]["y_test"])

In [610]:
def evaluate_model_testing(model,fitur_train, y_train, fitur_test, y_test):
    """
    Print a classification report for ``model`` on the test set.

    Args:
        model: fitted estimator exposing ``predict``.
        fitur_train: training features; unused, kept so existing calls work.
        y_train: training labels; unused, kept so existing calls work.
        fitur_test: test features passed to ``model.predict``.
        y_test: true test labels.

    Returns:
        None; the report is printed.
    """
    # The class probabilities used to be computed here and then discarded;
    # that dead call is removed, so models without predict_proba work too.
    predictions = model.predict(fitur_test)

    print("Testing Report")
    print(classification_report(y_test, predictions))

In [611]:
# Evaluate the loaded production model on the test set.
evaluate_model_testing(model,x_train,y_train,x_test,y_test)

Testing Report
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1100
           1       0.61      0.74      0.67        82

    accuracy                           0.95      1182
   macro avg       0.80      0.85      0.82      1182
weighted avg       0.95      0.95      0.95      1182

