# Load required libs

In [1]:
%cd ..
%pwd
import sys
sys.path.append('./src')
from src.data_processor import RawDataProcessor
from src.problem_config import create_prob_config
from src.drift_detector import ks_drift_detect
prob_config = create_prob_config("phase-2", "prob-1")

c:\VENV\api_prediction


In [2]:
from mlflow.models.signature import infer_signature
import mlflow

def log_model_to_tracker_lgbm(model, metrics, desc):
    MLFLOW_TRACKING_URI = 'http://127.0.0.1:5000'
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment("phase-2_prob-1_lgbm")
    MLFLOW_MODEL_PREFIX = "model"
    mlflow.start_run(description=desc)
    mlflow.log_metrics(metrics)
    mlflow.log_params(model.get_params())
    signature = infer_signature(test_x.astype(np.float64), predictions)
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=MLFLOW_MODEL_PREFIX,
        signature=signature,
        pip_requirements ='src/requirements.txt'
        #registered_model_name="phase-1_prob-1_model-1"
    )

    experimentid = mlflow.active_run().info.run_id
    mlflow.end_run()
    return experimentid

# Prepare datasets

In [4]:
import pandas as pd
import numpy as np
import pickle

training_data = pd.read_parquet(prob_config.raw_data_path)

training_data, category_index = RawDataProcessor.build_category_features(
            training_data, prob_config.categorical_cols
        )

target_col = prob_config.target_col
train_x0 = training_data.drop([target_col], axis=1)
train_y0 = training_data[[target_col]]

# Store the category_index
with open(prob_config.category_index_path, "wb") as f:
    pickle.dump(category_index, f)

In [62]:
from sklearn.model_selection import train_test_split

train, dev = train_test_split(training_data, test_size=0.1, random_state=123)

train_x = train.drop(["label"], axis=1)
train_y = train[[target_col]]
test_x = dev.drop(["label"], axis=1)
test_y = dev[[target_col]]

In [63]:
from lightgbm import LGBMClassifier

model0 = LGBMClassifier(objective="binary", random_state=123, is_unbalance=True)
model0.fit(train_x, train_y, verbose=False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [94]:
from sklearn.metrics import roc_auc_score
predictions = model0.predict_proba(test_x.astype(np.float64))[:,1]
#predictions = d4p.gbt_classification_prediction(nClasses=2).compute(test_x, daal_model)
#predictions = llvm_model.predict(test_x)
auc_score = roc_auc_score(test_y, predictions)
metrics = {"test_auc": auc_score}
print(f"metrics: {metrics}")

metrics: {'test_auc': 0.9902415454678807}


In [40]:
run_description = """
### Header
LGBM model, First Base Model Prob1
Model: LGBM
    """
log_model_to_tracker_lgbm(model0, metrics, run_description)



'a41c2c67b7484680ba61f540ef0ec034'

# Drift Detect

In [38]:
# Save referent for drift detection.
#X_baseline_df = pd.DataFrame(X_baseline, columns=prob_config.drift_cols)
#X_baseline_df.to_parquet(prob_config.driff_ref_path, index=False)

# Model drift

In [13]:
from sklearn.cluster import MiniBatchKMeans
import joblib

# N là số lượng cluster
N = 19000 * len(np.unique(train_y))
# Train clustering model cho data đã có label
kmeans = MiniBatchKMeans(n_clusters=N, random_state=0, n_init='auto').fit(train_x)

#joblib.dump(kmeans, 'data/captured_data/phase-1/prob-1/kmeans.cpk')
#kmeans = joblib.load('data/captured_data/phase-1/prob-1/kmeans.cpk')

# Tạo 1 mảng ánh xạ cluster với 1 label mới (do các data drift thuộc cùng 1 cluster sẽ có label giống nhau)
new_labels = []

# Duyệt từng cluster
for  i  in  range(N):
	# Lấy các label của các data point thuộc cluster i
	mask = (kmeans.labels_ == i)
	cluster_labels = train_y[mask]

	if  len(cluster_labels) == 0:
		# Nếu cluster i rỗng thì xác định cluster i ánh xạ với 1 label mặc định (ở đây lựa chọn là 0)
		new_labels.append(0)
	else:
		# Tìm label mới cho cả cụm cluster trong trường hợp cụm cluster khác rỗng
		#if  isinstance(train_y.flatten()[0], float):
			# Nếu là bài toán Regression thì lấy giá trị trung bình của các label thuộc cluster
		#	new_labels.append(np.mean(cluster_labels.flatten()))
		#else:
			# Nếu là bài toán Classification thì lấy label xuất hiện nhiều nhất trong cluster
			new_labels.append(np.bincount(cluster_labels.to_numpy().flatten()).argmax())

# Ánh xạ lại label cho data drift dựa trên kết quả predict cluster ở trên
y_drift_propagated = [new_labels[c] for  c  in  kmeans.labels_]

In [15]:
from sklearn.metrics import roc_auc_score
y_drift_test_propagated = [ new_labels[c] for c in kmeans.predict(train_x)]
propagated_auc = {"test_auc": roc_auc_score(train_y, y_drift_propagated)}
propagated_auc

{'test_auc': 0.9857373418231327}

# Online Data

In [14]:
import pandas as pd
import redis
import pickle
# Load data for problem 1
rc1 = redis.Redis(host='localhost', db=1, port=6379,  socket_keepalive=True)

#captured_x = {}
#for key in rc1.keys():
#    captured_data = pickle.loads(rc1.get(key))
#    #captured_x = pd.concat([captured_x, captured_data])
#    captured_x[key] = captured_data

#captured_x.drop_duplicates(inplace=True, ignore_index=True)
#captured_x = apply_category_features(
#    raw_df=captured_x[train_x.columns],
#    categorical_cols=prob_config.categorical_cols,
#    category_index=category_index,
#)

# Test Model Performance

In [None]:
%%timeit -n 10
model0.predict(test_x.sample(2000))

5.31 ms ± 1.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
import lleaves
model_path = ".venv/phase2_1_lgbm.txt"
model0.booster_.save_model(filename=model_path)
llvm_model = lleaves.Model(model_file=model_path)
llvm_model.compile()

In [None]:
%%timeit -n 10
llvm_model.predict(test_x.sample(2000))

4.38 ms ± 166 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
import daal4py as d4p
daal_model = d4p.get_gbt_model_from_lightgbm(model0.booster_)

In [None]:
%%timeit -n 10
daal_prediction = d4p.gbt_classification_prediction(nClasses=2, resultsToEvaluate="computeClassLabels|computeClassProbabilities").compute(test_x.sample(2000), daal_model)

3.77 ms ± 299 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
import onnxruntime as rt
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes  # noqa
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm  # noqa
from skl2onnx.common.data_types import FloatTensorType
import numpy
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier

  tys = obj.typeStr or ''
  if getattr(obj, 'isHomogeneous', False):
  return getattr(obj, attribute)


In [None]:
update_registered_converter(
    LGBMClassifier, 'LightGbmLGBMClassifier',
    calculate_linear_classifier_output_shapes, convert_lightgbm,
    options={'nocl': [True, False], 'zipmap': [True, False, 'columns']})

In [None]:
model_onnx = convert_sklearn(
    model0, 'pipeline_lightgbm',
    [('input', FloatTensorType([None, 41]))],
    target_opset={'': 12, 'ai.onnx.ml': 2})

# And save.
with open(".venv/pipeline_lightgbm1.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

In [None]:
print("predict", model0.predict(test_x.to_numpy()[:5].astype(numpy.float32)))
print("predict_proba", model0.predict_proba(test_x.to_numpy()[:1].astype(numpy.float32)))

predict [1 1 0 1 1]
predict_proba [[0.01256021 0.98743979]]


In [None]:
sess = rt.InferenceSession(".venv/pipeline_lightgbm1.onnx")

pred_onx = sess.run(None, {"input": test_x.to_numpy()[:5].astype(numpy.float32)})
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1][:1])

predict [1 1 0 1 1]
predict_proba [{0: 0.012560248374938965, 1: 0.987439751625061}]


In [None]:
%%timeit
pred_onx = sess.run(None, {"input": test_x.sample(2000).to_numpy().astype(numpy.float32)})

4.86 ms ± 128 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
import mlflow
import pathlib
MLFLOW_TRACKING_URI = 'http://localhost:5000'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
model_uri = str(pathlib.Path("models:/", "phase-2_prob-1_model", "1").as_posix())
model0_ref = mlflow.pyfunc.load_model(model_uri)