# Load required libs

In [1]:
%cd ..
%pwd
import sys
sys.path.append('./src')
from src.data_processor import RawDataProcessor
from src.problem_config import create_prob_config
from src.drift_detector import ks_drift_detect
prob_config = create_prob_config("phase-3", "prob-1")

c:\VENV\api_prediction


In [2]:
from mlflow.models.signature import infer_signature
import mlflow

def log_model_to_tracker_lgbm(model, metrics, desc):
    MLFLOW_TRACKING_URI = 'http://192.168.88.113:5000'
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment("phase-3_prob-1_lgbm")
    MLFLOW_MODEL_PREFIX = "model"
    mlflow.start_run(description=desc)
    mlflow.log_metrics(metrics)
    mlflow.log_params(model.get_params())
    signature = infer_signature(test_x.astype(np.float64), predictions)
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=MLFLOW_MODEL_PREFIX,
        signature=signature,
        pip_requirements ='src/requirements.txt'
        #registered_model_name="phase-1_prob-1_model-1"
    )

    experimentid = mlflow.active_run().info.run_id
    mlflow.end_run()
    return experimentid

# Prepare datasets

In [4]:
import pandas as pd
import numpy as np
import pickle

training_data = pd.read_parquet(prob_config.raw_data_path)
training_data = training_data.drop_duplicates()

training_data, category_index = RawDataProcessor.build_category_features(
            training_data, prob_config.categorical_cols
        )

# Store the category_index
with open(prob_config.category_index_path, "wb") as f:
    pickle.dump(category_index, f)

In [3]:
#conflict_labels = training_data[training_data.duplicated(prob_config.feature_cols, keep=False)].sort_values(by=prob_config.feature_cols)
#conflict_labels.groupby(prob_config.feature_cols).apply(lambda x: tuple(x.index))

In [5]:
from sklearn.model_selection import train_test_split

#training_data.drop_duplicates(subset=prob_config.feature_cols, keep=False, inplace=True)
target_col = prob_config.target_col
train_x0 = training_data.drop([target_col], axis=1)
train_y0 = training_data[[target_col]]

train, dev = train_test_split(training_data, test_size=0.1, random_state=123)

train_x = train.drop(["label"], axis=1)
train_y = train[[target_col]]
test_x = dev.drop(["label"], axis=1)
test_y = dev[[target_col]]

In [6]:
labels_dict = {}
labels_unq = train_y0['label'].unique()
labels_unq.sort()
for i in range(len(labels_unq)):
    labels_dict[labels_unq[i]] = i

inv_labels_dict = {v: k for k, v in labels_dict.items()}
model_classes_path = prob_config.data_path / 'classes.npy'
np.save(model_classes_path, labels_unq)

In [21]:
''' import AutoML class from flaml package '''
from flaml import AutoML
automl = AutoML()

settings = {
    "time_budget": 600,  # total running time in seconds
    "metric": 'roc_auc', 
    "estimator_list": ['lgbm'],#lgbm, xgboost
    "task": 'classification',  # task type
    "log_file_name": 'prob1_experiment.log',  # flaml log file
    "seed": 123,    # random seed
}

automl.fit(X_train=train_x, y_train=train_y['label'], **settings)

[flaml.automl.logger: 08-24 22:10:43] {1679} INFO - task = classification
[flaml.automl.logger: 08-24 22:10:43] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 08-24 22:10:43] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 08-24 22:10:43] {1900} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 08-24 22:10:43] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 08-24 22:10:43] {2344} INFO - Estimated sufficient time budget=3278s. Estimated necessary time budget=3s.
[flaml.automl.logger: 08-24 22:10:43] {2391} INFO -  at 0.5s,	estimator lgbm's best error=0.1597,	best estimator lgbm's best error=0.1597
[flaml.automl.logger: 08-24 22:10:43] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 08-24 22:10:43] {2391} INFO -  at 0.6s,	estimator lgbm's best error=0.1385,	best estimator lgbm's best error=0.1385
[flaml.automl.logger: 08-24 22:10:43] {2218} INFO - iteration 2, current learner lgbm
[

In [22]:
print('Best hyperparmeter config:', automl.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best hyperparmeter config: {'n_estimators': 511, 'num_leaves': 61, 'min_child_samples': 20, 'learning_rate': 0.0775767904075595, 'log_max_bin': 8, 'colsample_bytree': 0.667293102702816, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.062435110376239486}
Best accuracy on validation data: 0.925
Training duration of best run: 5.64 s


In [23]:
from lightgbm import LGBMClassifier

#model0 = LGBMClassifier(objective="binary", random_state=123, is_unbalance=True, num_leaves=34)
model0 = LGBMClassifier(objective="binary", random_state=123, is_unbalance=True, **automl.best_config)
model0.fit(train_x0, train_y0['label'])

In [7]:

#fix_label = pd.DataFrame(model0.predict(conflict_labels[prob_config.feature_cols].drop_duplicates()))
#train_x_new = pd.DataFrame(np.concatenate((train_x, conflict_labels[prob_config.feature_cols].drop_duplicates())), columns=train_x.columns)
#train_y_new = pd.DataFrame(np.concatenate((train_y, fix_label)), columns=train_y.columns)

#model1 = LGBMClassifier(objective="binary", random_state=123, is_unbalance=True)
#model1.fit(train_x_new, train_y_new, verbose=False)

In [46]:
from sklearn.metrics import roc_auc_score
predictions = model1.predict_proba(test_x.astype(np.float64))[:,1]
#predictions = d4p.gbt_classification_prediction(nClasses=2).compute(test_x, daal_model)
#predictions = llvm_model.predict(test_x)
auc_score = roc_auc_score(test_y, predictions)
metrics = {"test_auc": auc_score}
print(f"metrics: {metrics}")

metrics: {'test_auc': 0.9975518059888039}


In [27]:
data21 = pd.read_parquet('data/phase-2/prob-1/raw_train.parquet')

data21 = RawDataProcessor.apply_category_features(
    raw_df=data21,
    categorical_cols=prob_config.categorical_cols,
    category_index=category_index,
)

data21.drop_duplicates(inplace=True)

In [47]:
roc_auc_score(data21[[target_col]], model1.predict_proba(data21.drop([target_col], axis=1))[:,1])

0.9975144388997476

In [15]:
#conflict_labels2 = data21[data21.duplicated(prob_config.feature_cols, keep=False)].sort_values(by=prob_config.feature_cols)
#conflict_labels2.groupby(prob_config.feature_cols).apply(lambda x: tuple(x.index))

In [29]:
unswnb15 = pd.read_csv('C:/VENV/MLOPS Competition/UNSW-NB15/UNSW_NB15_training-set.csv')
unswnb15_t = pd.read_csv('C:/VENV/MLOPS Competition/UNSW-NB15/UNSW_NB15_testing-set.csv')

def label_mod(x):
    if x in ['Exploits', 'Normal']:
        return x
    elif (x == 'DoS') or (x == 'Fuzzers'):
        return 'Denial of Service'
    elif (x == 'Reconnaissance'):
        return 'Information Gathering'
    elif (x == 'Worms') or (x == 'Shellcode'):
        return 'Malware'
    elif (x == 'Generic'):
        return 'Other'
    else:
        return np.nan
    
fets_dict = {'dur': 'feature1', 'proto': 'feature2', 'service': 'feature3', 'state': 'feature4', \
                                    'spkts': 'feature5', 'dpkts': 'feature6', 'sbytes': 'feature7', 'dbytes': 'feature8',\
                                        'sttl': 'feature9', 'dttl': 'feature10', 'sload': 'feature11', 'dload': 'feature12',\
                                            'sloss': 'feature13', 'dloss': 'feature14', 'sinpkt': 'feature15', 'dinpkt': 'feature16',\
                                                'sjit': 'feature17', 'djit': 'feature18', 'swin': 'feature19', 'stcpb': 'feature20', \
                                                    'dtcpb': 'feature21', 'dwin': 'feature22', 'tcprtt': 'feature23', 'synack': 'feature24',\
                                                        'ackdat': 'feature25', 'smean': 'feature26', 'dmean': 'feature27', 'ct_ftp_cmd': 'feature37',\
                                                            'response_body_len': 'feature29','ct_srv_src': 'feature30', 'ct_state_ttl': 'feature31', 'ct_dst_ltm': 'feature32', \
                                                                'ct_src_dport_ltm': 'feature33', 'ct_dst_sport_ltm': 'feature34', 'ct_dst_src_ltm': 'feature35',\
                                                                    'is_ftp_login': 'feature36', 'trans_depth': 'feature28', 'ct_flw_http_mthd': 'feature38', 'ct_src_ltm': 'feature39',\
                                                                    'ct_srv_dst':'feature40', 'is_sm_ips_ports':'feature41'}

unswnb15 = unswnb15.rename(columns=fets_dict)

unswnb15 = RawDataProcessor.apply_category_features(
    raw_df=unswnb15,
    categorical_cols=prob_config.categorical_cols,
    category_index=category_index,
)

# feature 20 / feature 21 (swap?)
# feature 23 / feature 24
# rate -> feature ?

unswnb15["attack_cat"] = unswnb15["attack_cat"].apply(label_mod)
unswnb15 = unswnb15.dropna()

unswnb15_t = unswnb15_t.rename(columns=fets_dict)
unswnb15_t = RawDataProcessor.apply_category_features(
    raw_df=unswnb15_t,
    categorical_cols=prob_config.categorical_cols,
    category_index=category_index,
)
unswnb15_t["attack_cat"] = unswnb15_t["attack_cat"].apply(label_mod)
unswnb15_t = unswnb15_t.dropna()

In [30]:
train_x_new = pd.DataFrame(np.concatenate((train_x0, data21[prob_config.feature_cols], \
                                           unswnb15[prob_config.feature_cols], unswnb15_t[prob_config.feature_cols])), columns=train_x.columns)
train_y_new = pd.DataFrame(np.concatenate((train_y0, data21[[prob_config.target_col]], \
                                           unswnb15[['label']], unswnb15_t[['label']])), columns=train_y.columns)


In [44]:
''' import AutoML class from flaml package '''
from flaml import AutoML
automl = AutoML()

settings = {
    "time_budget": 600,  # total running time in seconds
    "metric": 'roc_auc', 
    "estimator_list": ['xgboost'],#lgbm, xgboost
    "task": 'classification',  # task type
    "log_file_name": 'prob1_experiment.log',  # flaml log file
    "seed": 123,    # random seed
}

automl.fit(X_train=train_x_new, y_train=train_y_new['label'], **settings)

[flaml.automl.logger: 08-24 22:51:04] {1679} INFO - task = classification
[flaml.automl.logger: 08-24 22:51:04] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 08-24 22:51:05] {1788} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 08-24 22:51:05] {1900} INFO - List of ML learners in AutoML Run: ['xgboost']
[flaml.automl.logger: 08-24 22:51:05] {2218} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 08-24 22:51:05] {2344} INFO - Estimated sufficient time budget=33969s. Estimated necessary time budget=34s.
[flaml.automl.logger: 08-24 22:51:05] {2391} INFO -  at 2.9s,	estimator xgboost's best error=0.0403,	best estimator xgboost's best error=0.0403
[flaml.automl.logger: 08-24 22:51:05] {2218} INFO - iteration 1, current learner xgboost
[flaml.automl.logger: 08-24 22:51:05] {2391} INFO -  at 3.0s,	estimator xgboost's best error=0.0335,	best estimator xgboost's best error=0.0335




[flaml.automl.logger: 08-24 22:51:05] {2218} INFO - iteration 2, current learner xgboost
[flaml.automl.logger: 08-24 22:51:05] {2391} INFO -  at 3.1s,	estimator xgboost's best error=0.0335,	best estimator xgboost's best error=0.0335
[flaml.automl.logger: 08-24 22:51:05] {2218} INFO - iteration 3, current learner xgboost
[flaml.automl.logger: 08-24 22:51:06] {2391} INFO -  at 3.2s,	estimator xgboost's best error=0.0335,	best estimator xgboost's best error=0.0335




[flaml.automl.logger: 08-24 22:51:06] {2218} INFO - iteration 4, current learner xgboost




[flaml.automl.logger: 08-24 22:51:06] {2391} INFO -  at 3.4s,	estimator xgboost's best error=0.0157,	best estimator xgboost's best error=0.0157
[flaml.automl.logger: 08-24 22:51:06] {2218} INFO - iteration 5, current learner xgboost
[flaml.automl.logger: 08-24 22:51:06] {2391} INFO -  at 3.5s,	estimator xgboost's best error=0.0157,	best estimator xgboost's best error=0.0157
[flaml.automl.logger: 08-24 22:51:06] {2218} INFO - iteration 6, current learner xgboost




[flaml.automl.logger: 08-24 22:51:06] {2391} INFO -  at 4.0s,	estimator xgboost's best error=0.0157,	best estimator xgboost's best error=0.0157
[flaml.automl.logger: 08-24 22:51:06] {2218} INFO - iteration 7, current learner xgboost
[flaml.automl.logger: 08-24 22:51:07] {2391} INFO -  at 4.2s,	estimator xgboost's best error=0.0156,	best estimator xgboost's best error=0.0156




[flaml.automl.logger: 08-24 22:51:07] {2218} INFO - iteration 8, current learner xgboost
[flaml.automl.logger: 08-24 22:51:07] {2391} INFO -  at 4.4s,	estimator xgboost's best error=0.0156,	best estimator xgboost's best error=0.0156




[flaml.automl.logger: 08-24 22:51:07] {2218} INFO - iteration 9, current learner xgboost
[flaml.automl.logger: 08-24 22:51:07] {2391} INFO -  at 4.6s,	estimator xgboost's best error=0.0156,	best estimator xgboost's best error=0.0156




[flaml.automl.logger: 08-24 22:51:07] {2218} INFO - iteration 10, current learner xgboost
[flaml.automl.logger: 08-24 22:51:07] {2391} INFO -  at 4.8s,	estimator xgboost's best error=0.0156,	best estimator xgboost's best error=0.0156




[flaml.automl.logger: 08-24 22:51:07] {2218} INFO - iteration 11, current learner xgboost




[flaml.automl.logger: 08-24 22:51:07] {2391} INFO -  at 5.1s,	estimator xgboost's best error=0.0156,	best estimator xgboost's best error=0.0156
[flaml.automl.logger: 08-24 22:51:07] {2218} INFO - iteration 12, current learner xgboost
[flaml.automl.logger: 08-24 22:51:08] {2391} INFO -  at 5.3s,	estimator xgboost's best error=0.0153,	best estimator xgboost's best error=0.0153




[flaml.automl.logger: 08-24 22:51:08] {2218} INFO - iteration 13, current learner xgboost
[flaml.automl.logger: 08-24 22:51:08] {2391} INFO -  at 5.5s,	estimator xgboost's best error=0.0153,	best estimator xgboost's best error=0.0153
[flaml.automl.logger: 08-24 22:51:08] {2218} INFO - iteration 14, current learner xgboost




[flaml.automl.logger: 08-24 22:51:08] {2391} INFO -  at 5.9s,	estimator xgboost's best error=0.0150,	best estimator xgboost's best error=0.0150
[flaml.automl.logger: 08-24 22:51:08] {2218} INFO - iteration 15, current learner xgboost




[flaml.automl.logger: 08-24 22:51:09] {2391} INFO -  at 6.4s,	estimator xgboost's best error=0.0147,	best estimator xgboost's best error=0.0147
[flaml.automl.logger: 08-24 22:51:09] {2218} INFO - iteration 16, current learner xgboost




[flaml.automl.logger: 08-24 22:51:09] {2391} INFO -  at 6.8s,	estimator xgboost's best error=0.0147,	best estimator xgboost's best error=0.0147
[flaml.automl.logger: 08-24 22:51:09] {2218} INFO - iteration 17, current learner xgboost




[flaml.automl.logger: 08-24 22:51:10] {2391} INFO -  at 7.2s,	estimator xgboost's best error=0.0147,	best estimator xgboost's best error=0.0147
[flaml.automl.logger: 08-24 22:51:10] {2218} INFO - iteration 18, current learner xgboost




[flaml.automl.logger: 08-24 22:51:10] {2391} INFO -  at 7.9s,	estimator xgboost's best error=0.0147,	best estimator xgboost's best error=0.0147
[flaml.automl.logger: 08-24 22:51:10] {2218} INFO - iteration 19, current learner xgboost




[flaml.automl.logger: 08-24 22:51:11] {2391} INFO -  at 8.7s,	estimator xgboost's best error=0.0147,	best estimator xgboost's best error=0.0147
[flaml.automl.logger: 08-24 22:51:11] {2218} INFO - iteration 20, current learner xgboost




[flaml.automl.logger: 08-24 22:51:12] {2391} INFO -  at 9.3s,	estimator xgboost's best error=0.0121,	best estimator xgboost's best error=0.0121
[flaml.automl.logger: 08-24 22:51:12] {2218} INFO - iteration 21, current learner xgboost




[flaml.automl.logger: 08-24 22:51:13] {2391} INFO -  at 10.4s,	estimator xgboost's best error=0.0117,	best estimator xgboost's best error=0.0117
[flaml.automl.logger: 08-24 22:51:13] {2218} INFO - iteration 22, current learner xgboost




[flaml.automl.logger: 08-24 22:51:13] {2391} INFO -  at 11.0s,	estimator xgboost's best error=0.0117,	best estimator xgboost's best error=0.0117
[flaml.automl.logger: 08-24 22:51:13] {2218} INFO - iteration 23, current learner xgboost




[flaml.automl.logger: 08-24 22:51:14] {2391} INFO -  at 12.0s,	estimator xgboost's best error=0.0117,	best estimator xgboost's best error=0.0117
[flaml.automl.logger: 08-24 22:51:14] {2218} INFO - iteration 24, current learner xgboost




[flaml.automl.logger: 08-24 22:51:16] {2391} INFO -  at 13.3s,	estimator xgboost's best error=0.0117,	best estimator xgboost's best error=0.0117
[flaml.automl.logger: 08-24 22:51:16] {2218} INFO - iteration 25, current learner xgboost




[flaml.automl.logger: 08-24 22:51:16] {2391} INFO -  at 13.7s,	estimator xgboost's best error=0.0117,	best estimator xgboost's best error=0.0117
[flaml.automl.logger: 08-24 22:51:16] {2218} INFO - iteration 26, current learner xgboost




[flaml.automl.logger: 08-24 22:51:19] {2391} INFO -  at 17.1s,	estimator xgboost's best error=0.0117,	best estimator xgboost's best error=0.0117
[flaml.automl.logger: 08-24 22:51:19] {2218} INFO - iteration 27, current learner xgboost




[flaml.automl.logger: 08-24 22:51:22] {2391} INFO -  at 19.4s,	estimator xgboost's best error=0.0082,	best estimator xgboost's best error=0.0082
[flaml.automl.logger: 08-24 22:51:22] {2218} INFO - iteration 28, current learner xgboost




[flaml.automl.logger: 08-24 22:51:24] {2391} INFO -  at 21.2s,	estimator xgboost's best error=0.0082,	best estimator xgboost's best error=0.0082
[flaml.automl.logger: 08-24 22:51:24] {2218} INFO - iteration 29, current learner xgboost




[flaml.automl.logger: 08-24 22:51:27] {2391} INFO -  at 24.8s,	estimator xgboost's best error=0.0082,	best estimator xgboost's best error=0.0082
[flaml.automl.logger: 08-24 22:51:27] {2218} INFO - iteration 30, current learner xgboost




[flaml.automl.logger: 08-24 22:51:36] {2391} INFO -  at 33.4s,	estimator xgboost's best error=0.0034,	best estimator xgboost's best error=0.0034
[flaml.automl.logger: 08-24 22:51:36] {2218} INFO - iteration 31, current learner xgboost




[flaml.automl.logger: 08-24 22:51:38] {2391} INFO -  at 36.0s,	estimator xgboost's best error=0.0034,	best estimator xgboost's best error=0.0034
[flaml.automl.logger: 08-24 22:51:38] {2218} INFO - iteration 32, current learner xgboost




[flaml.automl.logger: 08-24 22:52:05] {2391} INFO -  at 62.6s,	estimator xgboost's best error=0.0034,	best estimator xgboost's best error=0.0034
[flaml.automl.logger: 08-24 22:52:05] {2218} INFO - iteration 33, current learner xgboost




[flaml.automl.logger: 08-24 22:52:09] {2391} INFO -  at 66.6s,	estimator xgboost's best error=0.0034,	best estimator xgboost's best error=0.0034
[flaml.automl.logger: 08-24 22:52:09] {2218} INFO - iteration 34, current learner xgboost




[flaml.automl.logger: 08-24 22:52:18] {2391} INFO -  at 75.6s,	estimator xgboost's best error=0.0034,	best estimator xgboost's best error=0.0034
[flaml.automl.logger: 08-24 22:52:18] {2218} INFO - iteration 35, current learner xgboost




[flaml.automl.logger: 08-24 22:52:20] {2391} INFO -  at 78.0s,	estimator xgboost's best error=0.0034,	best estimator xgboost's best error=0.0034
[flaml.automl.logger: 08-24 22:52:20] {2218} INFO - iteration 36, current learner xgboost




[flaml.automl.logger: 08-24 22:52:38] {2391} INFO -  at 95.2s,	estimator xgboost's best error=0.0024,	best estimator xgboost's best error=0.0024
[flaml.automl.logger: 08-24 22:52:38] {2218} INFO - iteration 37, current learner xgboost




[flaml.automl.logger: 08-24 22:52:42] {2391} INFO -  at 99.3s,	estimator xgboost's best error=0.0024,	best estimator xgboost's best error=0.0024
[flaml.automl.logger: 08-24 22:52:42] {2218} INFO - iteration 38, current learner xgboost




[flaml.automl.logger: 08-24 22:52:52] {2391} INFO -  at 110.1s,	estimator xgboost's best error=0.0024,	best estimator xgboost's best error=0.0024
[flaml.automl.logger: 08-24 22:52:52] {2218} INFO - iteration 39, current learner xgboost




[flaml.automl.logger: 08-24 22:53:10] {2391} INFO -  at 127.3s,	estimator xgboost's best error=0.0021,	best estimator xgboost's best error=0.0021
[flaml.automl.logger: 08-24 22:53:10] {2218} INFO - iteration 40, current learner xgboost




[flaml.automl.logger: 08-24 22:53:53] {2391} INFO -  at 170.2s,	estimator xgboost's best error=0.0021,	best estimator xgboost's best error=0.0021
[flaml.automl.logger: 08-24 22:53:53] {2218} INFO - iteration 41, current learner xgboost




[flaml.automl.logger: 08-24 22:54:03] {2391} INFO -  at 180.3s,	estimator xgboost's best error=0.0021,	best estimator xgboost's best error=0.0021
[flaml.automl.logger: 08-24 22:54:03] {2218} INFO - iteration 42, current learner xgboost




[flaml.automl.logger: 08-24 22:54:11] {2391} INFO -  at 188.3s,	estimator xgboost's best error=0.0021,	best estimator xgboost's best error=0.0021
[flaml.automl.logger: 08-24 22:54:11] {2218} INFO - iteration 43, current learner xgboost




[flaml.automl.logger: 08-24 22:54:29] {2391} INFO -  at 207.1s,	estimator xgboost's best error=0.0021,	best estimator xgboost's best error=0.0021
[flaml.automl.logger: 08-24 22:54:29] {2218} INFO - iteration 44, current learner xgboost




[flaml.automl.logger: 08-24 22:54:43] {2391} INFO -  at 220.3s,	estimator xgboost's best error=0.0018,	best estimator xgboost's best error=0.0018
[flaml.automl.logger: 08-24 22:54:43] {2218} INFO - iteration 45, current learner xgboost




[flaml.automl.logger: 08-24 22:55:00] {2391} INFO -  at 237.3s,	estimator xgboost's best error=0.0018,	best estimator xgboost's best error=0.0018
[flaml.automl.logger: 08-24 22:55:00] {2218} INFO - iteration 46, current learner xgboost




[flaml.automl.logger: 08-24 22:55:06] {2391} INFO -  at 243.2s,	estimator xgboost's best error=0.0018,	best estimator xgboost's best error=0.0018
[flaml.automl.logger: 08-24 22:55:06] {2218} INFO - iteration 47, current learner xgboost




[flaml.automl.logger: 08-24 22:55:21] {2391} INFO -  at 259.1s,	estimator xgboost's best error=0.0018,	best estimator xgboost's best error=0.0018
[flaml.automl.logger: 08-24 22:55:21] {2218} INFO - iteration 48, current learner xgboost




[flaml.automl.logger: 08-24 22:55:35] {2391} INFO -  at 272.6s,	estimator xgboost's best error=0.0018,	best estimator xgboost's best error=0.0018
[flaml.automl.logger: 08-24 22:55:35] {2218} INFO - iteration 49, current learner xgboost




[flaml.automl.logger: 08-24 22:55:45] {2391} INFO -  at 282.9s,	estimator xgboost's best error=0.0018,	best estimator xgboost's best error=0.0018
[flaml.automl.logger: 08-24 22:55:45] {2218} INFO - iteration 50, current learner xgboost




[flaml.automl.logger: 08-24 22:55:55] {2391} INFO -  at 293.0s,	estimator xgboost's best error=0.0018,	best estimator xgboost's best error=0.0018
[flaml.automl.logger: 08-24 22:55:55] {2218} INFO - iteration 51, current learner xgboost




[flaml.automl.logger: 08-24 22:56:11] {2391} INFO -  at 308.2s,	estimator xgboost's best error=0.0018,	best estimator xgboost's best error=0.0018
[flaml.automl.logger: 08-24 22:56:11] {2218} INFO - iteration 52, current learner xgboost




[flaml.automl.logger: 08-24 22:56:16] {2391} INFO -  at 313.6s,	estimator xgboost's best error=0.0018,	best estimator xgboost's best error=0.0018
[flaml.automl.logger: 08-24 22:56:16] {2218} INFO - iteration 53, current learner xgboost




[flaml.automl.logger: 08-24 22:57:01] {2391} INFO -  at 358.4s,	estimator xgboost's best error=0.0017,	best estimator xgboost's best error=0.0017
[flaml.automl.logger: 08-24 22:57:01] {2218} INFO - iteration 54, current learner xgboost




[flaml.automl.logger: 08-24 22:57:51] {2391} INFO -  at 408.5s,	estimator xgboost's best error=0.0017,	best estimator xgboost's best error=0.0017
[flaml.automl.logger: 08-24 22:57:51] {2218} INFO - iteration 55, current learner xgboost




[flaml.automl.logger: 08-24 22:58:14] {2391} INFO -  at 431.2s,	estimator xgboost's best error=0.0017,	best estimator xgboost's best error=0.0017
[flaml.automl.logger: 08-24 22:58:14] {2218} INFO - iteration 56, current learner xgboost




[flaml.automl.logger: 08-24 22:59:39] {2391} INFO -  at 516.8s,	estimator xgboost's best error=0.0017,	best estimator xgboost's best error=0.0017
[flaml.automl.logger: 08-24 22:59:39] {2218} INFO - iteration 57, current learner xgboost




[flaml.automl.logger: 08-24 22:59:46] {2391} INFO -  at 523.6s,	estimator xgboost's best error=0.0017,	best estimator xgboost's best error=0.0017
[flaml.automl.logger: 08-24 22:59:46] {2218} INFO - iteration 58, current learner xgboost




[flaml.automl.logger: 08-24 23:00:31] {2391} INFO -  at 569.0s,	estimator xgboost's best error=0.0016,	best estimator xgboost's best error=0.0016




[flaml.automl.logger: 08-24 23:01:12] {2627} INFO - retrain xgboost for 40.4s
[flaml.automl.logger: 08-24 23:01:12] {2630} INFO - retrained model: XGBClassifier(base_score=None, booster=None, callbacks=[],
              colsample_bylevel=0.6957061218486587, colsample_bynode=None,
              colsample_bytree=0.6621805817811603, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy='lossguide',
              importance_type=None, interaction_constraints=None,
              learning_rate=0.38871152537031983, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=0, max_leaves=1271,
              min_child_weight=0.00424075605371196, missing=nan,
              monotone_constraints=None, n_estimators=300, n_jobs=-1,
              num_parallel_tree=None, predictor=None, random_state=None, ...)
[flaml.automl.logger: 08-24 2

In [45]:
from xgboost import XGBClassifier
#model1 = LGBMClassifier(objective="binary", random_state=123, is_unbalance=True)
model1 = XGBClassifier(objective="binary:logistic", random_state=123, **automl.best_config)
model1.fit(train_x_new, train_y_new['label'])

In [218]:
run_description = """
### Header
LGBM model, First Base Model Prob1
Model: LGBM
    """
log_model_to_tracker_lgbm(model1, metrics, run_description)

'83ada593deb44d00a1d7237bf7c20266'

# Drift Detect

In [10]:
# Save referent for drift detection.
X_baseline = train_x0.sample(1000)
X_baseline_df = pd.DataFrame(X_baseline, columns=prob_config.drift_cols)
X_baseline_df.to_parquet(prob_config.driff_ref_path, index=False)

# Online Data

# Test Model Performance

In [None]:
%%timeit -n 10
model0.predict(test_x.sample(2000))

5.31 ms ± 1.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
import lleaves
model_path = ".venv/phase2_1_lgbm.txt"
model0.booster_.save_model(filename=model_path)
llvm_model = lleaves.Model(model_file=model_path)
llvm_model.compile()

In [None]:
%%timeit -n 10
llvm_model.predict(test_x.sample(2000))

4.38 ms ± 166 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
import daal4py as d4p
daal_model = d4p.get_gbt_model_from_lightgbm(model0.booster_)

In [None]:
%%timeit -n 10
daal_prediction = d4p.gbt_classification_prediction(nClasses=2, resultsToEvaluate="computeClassLabels|computeClassProbabilities").compute(test_x.sample(2000), daal_model)

3.77 ms ± 299 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
import onnxruntime as rt
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes  # noqa
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm  # noqa
from skl2onnx.common.data_types import FloatTensorType
import numpy
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier

  tys = obj.typeStr or ''
  if getattr(obj, 'isHomogeneous', False):
  return getattr(obj, attribute)


In [None]:
update_registered_converter(
    LGBMClassifier, 'LightGbmLGBMClassifier',
    calculate_linear_classifier_output_shapes, convert_lightgbm,
    options={'nocl': [True, False], 'zipmap': [True, False, 'columns']})

In [None]:
model_onnx = convert_sklearn(
    model0, 'pipeline_lightgbm',
    [('input', FloatTensorType([None, 41]))],
    target_opset={'': 12, 'ai.onnx.ml': 2})

# And save.
with open(".venv/pipeline_lightgbm1.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

In [None]:
print("predict", model0.predict(test_x.to_numpy()[:5].astype(numpy.float32)))
print("predict_proba", model0.predict_proba(test_x.to_numpy()[:1].astype(numpy.float32)))

predict [1 1 0 1 1]
predict_proba [[0.01256021 0.98743979]]


In [None]:
sess = rt.InferenceSession(".venv/pipeline_lightgbm1.onnx")

pred_onx = sess.run(None, {"input": test_x.to_numpy()[:5].astype(numpy.float32)})
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1][:1])

predict [1 1 0 1 1]
predict_proba [{0: 0.012560248374938965, 1: 0.987439751625061}]


In [None]:
%%timeit
pred_onx = sess.run(None, {"input": test_x.sample(2000).to_numpy().astype(numpy.float32)})

4.86 ms ± 128 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
import mlflow
import pathlib
MLFLOW_TRACKING_URI = 'http://localhost:5000'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
model_uri = str(pathlib.Path("models:/", "phase-2_prob-1_model", "1").as_posix())
model0_ref = mlflow.pyfunc.load_model(model_uri)