In [5]:
# Suffix of the preprocessed sample file to load (sample_<pic_>.csv).
pic_ = "F"
# Address of the MLflow tracking server that receives the runs.
tracking_uri_ = "http://34.128.104.38:5000"

In [6]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle5 as pickle
from catboost import Pool
from sklearn.ensemble import StackingClassifier, VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split,StratifiedKFold

from sklearn.metrics import accuracy_score, f1_score,cohen_kappa_score,roc_auc_score,log_loss
import mlflow

In [7]:
##load_feature
def _split_feature_names(raw):
    """Strip '[', ']', single quotes and spaces from `raw`, then split on commas.

    Presumably `raw` is a stringified Python list of column names
    (e.g. "['a', 'b']") -- TODO confirm against the feature-selection step.
    Note that spaces *inside* a name are removed as well.
    """
    cleaned = raw
    for unwanted in ('[', ']', '\'', ' '):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.split(',')


# NOTE(review): pickle.load executes arbitrary code from the file; only load
# artifacts produced by a trusted pipeline.
with open('model/feature_selection/fs.pickle', 'rb') as handle:
    fs_ = pickle.load(handle)

# One entry per model family, each carrying its selected feature names.
catboost_params = fs_['catboost-wo_artificial']
lgbm_params = fs_['lightgbm-wo_artificial']
rf_params = fs_['randomforest-wo_artificial']

fs_cat = _split_feature_names(catboost_params['params.feature_name'])
fs_lgb = _split_feature_names(lgbm_params['params.feature_name'])
fs_rf = _split_feature_names(rf_params['params.feature_name'])

# Union of the three selections, de-duplicated while keeping first-seen order.
fs = list(dict.fromkeys(fs_cat + fs_lgb + fs_rf))

In [8]:
##Load model
# `model` is later indexed by 'catboost', 'lightgbm' and 'randomforest'.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# artifacts produced by a trusted pipeline.
model = []  # placeholder, replaced by the unpickled object below
with open('model/tuning/model_tuned.pickle', 'rb') as handle:
    model = pickle.load(handle)

In [None]:
##Load data
# Read the preprocessed sample selected by `pic_` and keep only the union of
# the selected features (`fs`) as predictors; the target is 'nama_valid'.
data_sample = pd.read_csv(
    "gs://bps-gcp-bucket/MLST2023/preprocessing/sample_" + str(pic_) + ".csv",
    sep=',',
)
X = data_sample[fs]
y = data_sample[['nama_valid']]

# Stratified 80/20 hold-out split. The seed is fixed so that the validation
# set -- and therefore every metric logged to MLflow -- is reproducible
# across kernel restarts instead of changing on each run.
SPLIT_SEED = 42
train_x, valid_x, train_y, valid_y = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED
)
#sk_fold=StratifiedKFold(n_splits=5,shuffle=False)
experiment_name = "Modeling and Stacking"
# The tracking URI must be set BEFORE any experiment lookup/creation.
# Otherwise the experiment is resolved against MLflow's default store and the
# resulting experiment_id is then used against the remote server, where it may
# not exist or may belong to a different experiment.
mlflow.set_tracking_uri(tracking_uri_)
## check if the experiment already exists
if not mlflow.get_experiment_by_name(experiment_name):
    mlflow.create_experiment(name=experiment_name)
experiment = mlflow.get_experiment_by_name(experiment_name)
# Score containers: one dict per metric, one (initially empty) list per model
# key. Keys: cat=catboost, lgb=lightgbm, rf=randomforest, sta=stacking,
# vot=voting. Each comprehension builds fresh, independent lists.
_MODEL_KEYS = ('cat', 'lgb', 'rf', 'sta', 'vot')

f1_micro_ = {model_key: [] for model_key in _MODEL_KEYS}
log_loss_ = {model_key: [] for model_key in _MODEL_KEYS}
roc_auc_score_ = {model_key: [] for model_key in _MODEL_KEYS}
cohen_kappa_score_ = {model_key: [] for model_key in _MODEL_KEYS}


# Estimators as loaded from the tuning artifact.
cat_ = model['catboost']
lgb_ = model['lightgbm']
rf_ = model['randomforest']

# (name, estimator) pairs shared by both ensembles below.
estimators = [
    ('catboost', cat_),
    ('lightgbm', lgb_),
    ('randomforest', rf_),
]

# Stacking: base estimators combined by a logistic-regression meta-learner.
clf = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression())

# Soft voting over the same base estimators.
eclf = VotingClassifier(estimators=estimators, voting='soft')

def _evaluate_and_log(key, name, classifier_label, estimator, log_model,
                      features=None, fit=False):
    """Score one classifier on the hold-out split and record it as an MLflow run.

    Reads the notebook-level `train_x`, `train_y`, `valid_x`, `valid_y`,
    `experiment` and the four metric dicts (`f1_micro_`, `log_loss_`,
    `roc_auc_score_`, `cohen_kappa_score_`).

    Parameters
    ----------
    key : str
        Entry of the metric dicts to append to ('cat', 'lgb', 'rf', 'sta', 'vot').
    name : str
        Used both as the MLflow run name and as the model artifact path.
    classifier_label : str
        Value logged for the 'classifier' run parameter.
    estimator : object
        Classifier exposing `predict` and `predict_proba` (and `fit` when
        `fit=True`).
    log_model : callable
        MLflow flavor logger, called as `log_model(estimator, name)`.
    features : list of str or None
        Columns fed to the estimator; None means every column of the split.
    fit : bool
        When True, fit the estimator on the training split before scoring.

    Returns
    -------
    tuple
        `(predicted labels, predicted probabilities)` for the validation split.
    """
    train_inputs = train_x if features is None else train_x[features]
    valid_inputs = valid_x if features is None else valid_x[features]

    if fit:
        estimator.fit(train_inputs, train_y)

    predicted = estimator.predict(valid_inputs)
    probabilities = estimator.predict_proba(valid_inputs)

    f1_micro_[key].append(f1_score(valid_y, predicted, average='micro'))
    log_loss_[key].append(log_loss(valid_y, probabilities))
    roc_auc_score_[key].append(
        roc_auc_score(valid_y, probabilities, average="weighted", multi_class="ovr")
    )
    cohen_kappa_score_[key].append(cohen_kappa_score(valid_y, predicted))

    with mlflow.start_run(experiment_id=experiment.experiment_id, run_name=name):
        log_model(estimator, name)
        # Each list holds a single hold-out score here; np.mean is kept so the
        # logging stays valid if several scores are ever appended per key.
        mlflow.log_metric("f1_score", np.mean(f1_micro_[key]))
        mlflow.log_metric("log_loss", np.mean(log_loss_[key]))
        mlflow.log_metric("roc_auc", np.mean(roc_auc_score_[key]))
        mlflow.log_metric("cohen_kappa", np.mean(cohen_kappa_score_[key]))
        mlflow.log_param('classifier', classifier_label)

    return predicted, probabilities


# (key, run/artifact name, 'classifier' param, estimator, model logger,
#  feature subset, refit?)
# The three base models are scored as loaded, each on its own selected
# features; the two ensembles are fitted on every column of train_x first.
# NOTE(review): if the loaded base models were tuned on rows that overlap
# valid_x, their scores here are optimistic -- confirm.
_EVALUATION_PLAN = [
    ('cat', 'catboost', 'catboost', cat_, mlflow.catboost.log_model, fs_cat, False),
    ('lgb', 'lightgbm', 'lightgbm', lgb_, mlflow.lightgbm.log_model, fs_lgb, False),
    ('rf', 'randomforest', 'randomforest', rf_, mlflow.sklearn.log_model, fs_rf, False),
    ('sta', 'stacking', 'stacking cat-lgbm-rf', clf, mlflow.sklearn.log_model, None, True),
    ('vot', 'voting', 'voting cat-lgbm-rf', eclf, mlflow.sklearn.log_model, None, True),
]

# Run in the listed order; pred_y / preds_proba_y end up holding the
# predictions of the last entry (the voting classifier).
for _step in _EVALUATION_PLAN:
    pred_y, preds_proba_y = _evaluate_and_log(*_step)
    

