In [1]:
from omegaconf import DictConfig,OmegaConf
conf = OmegaConf.load('config/config.yaml')
pic_=conf['config']['pic_']
tracking_uri_=conf['config']['tracking_uri']

In [2]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle5 as pickle
from catboost import Pool
from sklearn.ensemble import StackingClassifier, VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,StratifiedKFold

from sklearn.metrics import accuracy_score, f1_score,cohen_kappa_score,roc_auc_score,log_loss
import mlflow

### Get Feature from data corresponded with the model

In [3]:
##load_feature
fs=[]
with open('ml_output/04_05_modeling/feature_selection/fs.pickle', 'rb') as handle:
    fs_=pickle.load(handle)
    
    catboost_params=fs_['catboost-wo_artificial']
    fs_cat_wo_na=catboost_params['params.feature_name'].replace('[','').replace(']','').replace('\'','').replace(' ','').split(',')
    fs.extend(fs_cat_wo_na)
    catboost_params=fs_['catboost-w_artificial']
    fs_cat_w_a=catboost_params['params.feature_name'].replace('[','').replace(']','').replace('\'','').replace(' ','').split(',')
    fs.extend(fs_cat_w_a)
    
    lgbm_params=fs_['lightgbm-wo_artificial']
    fs_lgbm_wo_na=lgbm_params['params.feature_name'].replace('[','').replace(']','').replace('\'','').replace(' ','').split(',')
    fs.extend(fs_lgbm_wo_na)
    lgbm_params=fs_['lightgbm-w_artificial']
    fs_lgbm_w_a=lgbm_params['params.feature_name'].replace('[','').replace(']','').replace('\'','').replace(' ','').split(',')
    fs.extend(fs_lgbm_w_a)
    
    rf_params=fs_['randomforest-wo_artificial']
    fs_rf_wo_na=rf_params['params.feature_name'].replace('[','').replace(']','').replace('\'','').replace(' ','').split(',')
    fs.extend(fs_rf_wo_na)
    rf_params=fs_['randomforest-w_artificial']
    fs_rf_w_a=rf_params['params.feature_name'].replace('[','').replace(']','').replace('\'','').replace(' ','').split(',')
    fs.extend(fs_rf_w_a)
    
    fs=list(dict.fromkeys(fs))

### Get model

In [4]:
##Load model
model=[]
with open('ml_output/04_05_modeling/tuning/model_tuned.pickle', 'rb') as handle:
    model=pickle.load(handle)

### Load data

In [5]:
##Load data
data_sample=pd.read_csv("gs://bps-gcp-bucket/MLST2023/preprocessing/sample_"+str(pic_) +".csv",sep=',')
data_sample=data_sample
X=data_sample[fs]
y=data_sample[['nama_valid']]
    
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.2,stratify=y)

### Running Experiment

In [6]:
#sk_fold=StratifiedKFold(n_splits=5,shuffle=False)
experiment_name = "Modeling and Stacking"
    ## check if the experiment already exists
if not mlflow.get_experiment_by_name(experiment_name):
        mlflow.create_experiment(name=experiment_name) 
experiment = mlflow.get_experiment_by_name(experiment_name)
mlflow.set_tracking_uri(tracking_uri_)

In [7]:
def clf_estimate(clf,f_select,params):
    global valid_x, valid_y,experiment
    pred_y=clf.predict(valid_x[f_select])
    preds_proba_y=clf.predict_proba(valid_x[f_select])
    f1_micro_=f1_score(valid_y, pred_y,average='micro')
    log_loss_=log_loss(valid_y,preds_proba_y)
    roc_auc_score_=roc_auc_score(valid_y, preds_proba_y, average="weighted", multi_class="ovr")
    cohen_kappa_score_=cohen_kappa_score(valid_y, pred_y)
    with mlflow.start_run(experiment_id = experiment.experiment_id,
                          run_name=params):
        mlflow.log_metric("f1_score", np.mean(f1_micro_))
        mlflow.log_metric("log_loss", np.mean( log_loss_))
        mlflow.log_metric("roc_auc", np.mean(roc_auc_score_))
        mlflow.log_metric("cohen_kappa", np.mean(cohen_kappa_score_))
        mlflow.log_param('classifier',params)
        mlflow.log_param('feature',f_select)
        if params[:1]=='c':
            mlflow.catboost.log_model(clf, params)
        elif params[:1]=='l':
            mlflow.lightgbm.log_model(clf, params)
        else:
            mlflow.sklearn.log_model(clf,params)

In [8]:
params='catboost_wo_na'
clf_estimate(model[params],fs_cat_wo_na,params)

params='catboost_w_a'
clf_estimate(model[params],fs_cat_w_a,params)

params='lightgbm_wo_na'
clf_estimate(model[params],fs_lgbm_wo_na,params)

params='lightgbm_w_a'
clf_estimate(model[params],fs_lgbm_w_a,params)

params='lightgbm_wo_na'
clf_estimate(model[params],fs_lgbm_wo_na,params)

params='randomforest_wo_na'
clf_estimate(model[params],fs_rf_wo_na,params)

params='randomforest_w_a'
clf_estimate(model[params],fs_rf_w_a,params)



In [9]:
cat_wo_na_transform=FunctionTransformer(lambda X: X[fs_cat_wo_na])
cat_w_a_transform=FunctionTransformer(lambda X: X[fs_cat_w_a])
lgbm_wo_na_transform=FunctionTransformer(lambda X: X[fs_lgbm_wo_na])
lgbm_w_a_transform=FunctionTransformer(lambda X: X[fs_lgbm_w_a])
rf_wo_na_transform=FunctionTransformer(lambda X: X[fs_rf_wo_na])
rf_w_a_transform=FunctionTransformer(lambda X: X[fs_rf_w_a])


cat_wona_pipe = Pipeline([('transform_cat_wo', cat_wo_na_transform), ('catboost_wo_na', model['catboost_wo_na'])])
cat_wa_pipe = Pipeline([('transform_cat_wa', cat_w_a_transform), ('catboost_w_a', model['catboost_w_a'])])

lgbm_wona_pipe = Pipeline([('transform_lgbm_wo', lgbm_wo_na_transform), ('lightgbm_wo_na', model['lightgbm_wo_na'])])
lgbm_wa_pipe = Pipeline([('transform_lgbm_wa', lgbm_w_a_transform), ('lightgbm_w_a', model['lightgbm_w_a'])])

rf_wona_pipe = Pipeline([('transform_rf_wo', cat_wo_na_transform), ('randomforest_wo_na', model['randomforest_wo_na'])])
rf_wa_pipe = Pipeline([('transform_rf_wa', cat_w_a_transform), ('randomforest_w_a', model['randomforest_w_a'])])


In [10]:
sclf_wona=StackingClassifier(estimators=[('cat_wona_pipe',cat_wona_pipe),
                              ('lgbm_wona_pipe',lgbm_wona_pipe),
                              ('rf_wona_pipe',rf_wona_pipe)
                             ],final_estimator=LogisticRegression(),cv=3)
sclf_wona.fit(train_x,train_y)
params='stacking_wo_na'
clf_estimate(sclf_wona,fs,params)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
sclf_wa=StackingClassifier(estimators=[('cat_wa_pipe',cat_wa_pipe),
                              ('lgbm_wa_pipe',lgbm_wa_pipe),
                              ('rf_wa_pipe',rf_wa_pipe)
                             ],final_estimator=LogisticRegression(),cv=3)
sclf_wa.fit(train_x,train_y)
params='stacking_wa'
clf_estimate(sclf_wa,fs,params)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
sclf_combined=StackingClassifier(estimators=[
                              ('cat_wona_pipe',cat_wona_pipe),
                              ('lgbm_wona_pipe',lgbm_wona_pipe),
                              ('rf_wona_pipe',rf_wona_pipe),
                              ('cat_wa_pipe',cat_wa_pipe),
                              ('lgbm_wa_pipe',lgbm_wa_pipe),
                              ('rf_wa_pipe',rf_wa_pipe)
                             ],final_estimator=LogisticRegression(),cv=3)
sclf_combined.fit(train_x,train_y)
params='stacking_combined'
clf_estimate(sclf_combined,fs,params)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:

eclf_wo_na = VotingClassifier(estimators=[('cat_wona_pipe',cat_wona_pipe),
                              ('lgbm_wona_pipe',lgbm_wona_pipe),
                              ('rf_wona_pipe',rf_wona_pipe)
                             ], voting='soft')
eclf_wo_na.fit(train_x,train_y)
clf_estimate(eclf_wo_na,fs,'voting_wo_na')


eclf_wa = VotingClassifier(estimators=[('cat_wa_pipe',cat_wa_pipe),
                              ('lgbm_wa_pipe',lgbm_wa_pipe),
                              ('rf_wa_pipe',rf_wa_pipe)
                             ], voting='soft')
eclf_wa.fit(train_x,train_y)
clf_estimate(eclf_wa,fs,'voting_wa')


eclf_comb = VotingClassifier(estimators=[('cat_wa_pipe',cat_wa_pipe),
                              ('lgbm_wa_pipe',lgbm_wa_pipe),
                              ('rf_wa_pipe',rf_wa_pipe),
                                       ('cat_wona_pipe',cat_wona_pipe),
                              ('lgbm_wona_pipe',lgbm_wona_pipe),
                              ('rf_wona_pipe',rf_wona_pipe)
                             ], voting='soft')
eclf_comb.fit(train_x,train_y)
clf_estimate(eclf_comb,fs,'voting_comb')




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


