# Load required libs

In [3]:
%cd ..
%pwd

c:\VENV\api_prediction


'c:\\VENV\\api_prediction'

In [4]:
import sys
sys.path.append('./src')
from src.data_processor import RawDataProcessor
from src.problem_config import create_prob_config 
prob_config = create_prob_config("phase-2", "prob-1")

In [5]:
from mlflow.models.signature import infer_signature
import mlflow

def log_model_to_tracker_lgbm(model, metrics, desc):
    MLFLOW_TRACKING_URI = 'http://localhost:5000'
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment("phase-2_prob-1_lgbm")
    MLFLOW_MODEL_PREFIX = "model"
    mlflow.start_run(description=desc)
    mlflow.log_metrics(metrics)
    mlflow.log_params(model.get_params())
    signature = infer_signature(test_x.astype(np.float64), predictions)
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=MLFLOW_MODEL_PREFIX,
        signature=signature,
        pip_requirements ='src/requirements.txt'
        #registered_model_name="phase-1_prob-1_model-1"
    )

    experimentid = mlflow.active_run().info.run_id
    mlflow.end_run()
    return experimentid

# Prepare datasets

In [6]:
import pandas as pd
import numpy as np
import pickle

training_data = pd.read_parquet(prob_config.raw_data_path)

training_data, category_index = RawDataProcessor.build_category_features(
            training_data, prob_config.categorical_cols
        )

target_col = prob_config.target_col
train_x = training_data.drop([target_col], axis=1)
train_y = training_data[[target_col]]

# Store the category_index
with open(prob_config.category_index_path, "wb") as f:
    pickle.dump(category_index, f)

In [5]:
from lightgbm import LGBMClassifier

model0 = LGBMClassifier(objective="binary", random_state=123)
model0.fit(train_x, train_y, verbose=False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [6]:
sample = training_data.sample(1000)

test_x = sample.drop([target_col], axis=1)
test_y = sample[[target_col]]

In [7]:
from sklearn.metrics import roc_auc_score
predictions = model0.predict_proba(test_x.astype(np.float64))[:,1]
#predictions = d4p.gbt_classification_prediction(nClasses=2).compute(test_x, daal_model)
#predictions = model0.predict(test_x)
auc_score = roc_auc_score(test_y, predictions)
metrics = {"test_auc": auc_score}
print(f"metrics: {metrics}")

metrics: {'test_auc': 0.9935781577033099}


In [25]:
%%timeit -n 10
model0.predict(test_x)

2.27 ms ± 305 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
import lleaves
model_path = prob_config.data_path / "phase2_1_lgbm.txt"
llvm_model_path = prob_config.data_path / "phase2_1_lleaves"
model0.booster_.save_model(filename=model_path)
llvm_model = lleaves.Model(model_file=model_path)
llvm_model.compile(cache=llvm_model_path)

In [27]:
%%timeit -n 10
llvm_model.predict(test_x)

2.86 ms ± 232 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
import daal4py as d4p
daal_model = d4p.get_gbt_model_from_lightgbm(model0.booster_)

In [55]:
%%timeit -n 10
daal_prediction = d4p.gbt_classification_prediction(nClasses=2, resultsToEvaluate="computeClassLabels|computeClassProbabilities").compute(test_x, daal_model)

2.48 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [37]:
import onnxruntime as rt
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes  # noqa
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm  # noqa
from skl2onnx.common.data_types import FloatTensorType
import numpy
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier

In [38]:
update_registered_converter(
    LGBMClassifier, 'LightGbmLGBMClassifier',
    calculate_linear_classifier_output_shapes, convert_lightgbm,
    options={'nocl': [True, False], 'zipmap': [True, False, 'columns']})

In [44]:
model_onnx = convert_sklearn(
    model0, 'pipeline_lightgbm',
    [('input', FloatTensorType([None, 41]))],
    target_opset={'': 12, 'ai.onnx.ml': 2})

# And save.
with open("pipeline_lightgbm.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

In [49]:
print("predict", model0.predict(test_x.to_numpy()[:5].astype(numpy.float32)))
print("predict_proba", model0.predict_proba(test_x.to_numpy()[:1].astype(numpy.float32)))

predict [1 1 1 1 0]
predict_proba [[0.17720758 0.82279242]]


In [45]:
sess = rt.InferenceSession("pipeline_lightgbm.onnx")

pred_onx = sess.run(None, {"input": test_x.to_numpy()[:5].astype(numpy.float32)})
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1][:1])

predict [1 1 1 1 0]
predict_proba [{0: 0.17720752954483032, 1: 0.8227924704551697}]


In [54]:
%%timeit
pred_onx = sess.run(None, {"input": test_x.to_numpy().astype(numpy.float32)})

1.92 ms ± 111 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
run_description = """
### Header
LGBM model, First Base Model
Model: LGBM
    """
log_model_to_tracker_lgbm(model0, metrics, run_description)

2023/07/05 16:47:38 INFO mlflow.tracking.fluent: Experiment with name 'phase-2_prob-1_lgbm' does not exist. Creating a new experiment.


'c08077d9b63a4ab6b3782a090d7fb3ea'

In [19]:
import mlflow
import pathlib
MLFLOW_TRACKING_URI = 'http://localhost:5000'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
model_uri = str(pathlib.Path("models:/", "phase-2_prob-1_model", "2").as_posix())
model0_ref = mlflow.pyfunc.load_model(model_uri)

 - numpy (current: 1.23.0, required: numpy==1.23.5)
 - pyarrow (current: 6.0.1, required: pyarrow==11.0.0)
 - pandas (current: 1.5.3, required: pandas==2.0.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [22]:
model0_ref._model_impl.predict_proba(test_x)

array([[9.99900175e-01, 9.98248478e-05],
       [1.28688555e-02, 9.87131144e-01],
       [7.01833056e-04, 9.99298167e-01],
       ...,
       [1.12157821e-03, 9.98878422e-01],
       [9.03452717e-05, 9.99909655e-01],
       [9.99874004e-01, 1.25996093e-04]])

In [15]:
model0_ref._model_impl.lgb_model.

<mlflow.lightgbm._LGBModelWrapper at 0x1c60c4bc2e0>

# Drift Detect

In [132]:
# Old KS Drift from alibi_detect
from alibi_detect.cd import KSDrift
X_baseline = train_x.sample(100)
cd = KSDrift(p_val=0.05, x_ref=X_baseline.to_numpy())

In [169]:
x=train_x.sample(1000).to_numpy()

In [172]:
%%timeit
cd.predict(x)

21.6 ms ± 372 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [38]:
# Save referent for drift detection.
#X_baseline_df = pd.DataFrame(X_baseline, columns=prob_config.drift_cols)
#X_baseline_df.to_parquet(prob_config.driff_ref_path, index=False)

# Model drift

In [4]:
from sklearn.cluster import MiniBatchKMeans
import joblib

# N là số lượng cluster
N = 19000 * len(np.unique(train_y))
# Train clustering model cho data đã có label
#kmeans = MiniBatchKMeans(n_clusters=N, random_state=0, n_init='auto').fit(train_x)

#joblib.dump(kmeans, 'data/captured_data/phase-1/prob-1/kmeans.cpk')
kmeans = joblib.load('data/captured_data/phase-1/prob-1/kmeans.cpk')

# Tạo 1 mảng ánh xạ cluster với 1 label mới (do các data drift thuộc cùng 1 cluster sẽ có label giống nhau)
new_labels = []

# Duyệt từng cluster
for  i  in  range(N):
	# Lấy các label của các data point thuộc cluster i
	mask = (kmeans.labels_ == i)
	cluster_labels = train_y[mask]

	if  len(cluster_labels) == 0:
		# Nếu cluster i rỗng thì xác định cluster i ánh xạ với 1 label mặc định (ở đây lựa chọn là 0)
		new_labels.append(0)
	else:
		# Tìm label mới cho cả cụm cluster trong trường hợp cụm cluster khác rỗng
		#if  isinstance(train_y.flatten()[0], float):
			# Nếu là bài toán Regression thì lấy giá trị trung bình của các label thuộc cluster
		#	new_labels.append(np.mean(cluster_labels.flatten()))
		#else:
			# Nếu là bài toán Classification thì lấy label xuất hiện nhiều nhất trong cluster
			new_labels.append(np.bincount(cluster_labels.to_numpy().flatten()).argmax())

# Ánh xạ lại label cho data drift dựa trên kết quả predict cluster ở trên
y_drift_propagated = [new_labels[c] for  c  in  kmeans.labels_]

In [5]:
from sklearn.metrics import roc_auc_score
y_drift_test_propagated = [ new_labels[c] for c in kmeans.predict(train_x)]
propagated_auc = {"test_auc": roc_auc_score(train_y, y_drift_propagated)}
propagated_auc

{'test_auc': 0.8626996177558589}

# Online Data

In [2]:
import pandas as pd
import redis
import pickle
# Load data for problem 1
rc1 = redis.Redis(host='localhost', db=1, port=6379,  socket_keepalive=True)

captured_x = pd.DataFrame()
for key in rc1.keys():
    captured_data = pickle.loads(rc1.get(key))
    captured_x = pd.concat([captured_x, captured_data])

#captured_x.drop_duplicates(inplace=True, ignore_index=True)
#captured_x = apply_category_features(
#    raw_df=captured_x[train_x.columns],
#    categorical_cols=prob_config.categorical_cols,
#    category_index=category_index,
#)

In [7]:
allkey = rc1.keys()

# Test API

In [19]:
rows = []
test = pickle.loads(rc1.get(allkey[12]))

for index, row in test.iterrows():
    rows.append(row.to_list())

data = {
  "id": "123",
  "rows": rows,
  "columns": test.columns.to_list()
}

import requests
#response = requests.post('http://14.225.205.204:5040/phase-2/prob-1/predict', json=data)
response = requests.post('http://localhost:8000/phase-2/prob-1/predict', json=data)
print(response.text)

{"id": "123", "predictions": [0.9998940907040825, 0.9998837423846397, 0.9998839077678122, 0.9925387737871727, 0.9998312531011813, 0.9947252168982391, 0.9998837423846397, 0.00014542207962149096, 0.9989706337320569, 0.9875498308556642, 0.99343781620919, 0.5357602162788402, 0.44005802147545464, 0.999611156961625, 0.9998691604210737, 0.00013624585501300376, 0.7581794698315378, 0.7834442915282387, 0.9541812699131945, 0.9966554863330027, 0.00011172515012575173, 0.9866436954532017, 0.9998865364025179, 0.00014582498847773515, 0.9909732504502361, 0.9974129847335662, 0.9994365767562283, 0.9987575701735351, 0.9998837423846397, 0.00016191065769012583, 0.9986278872817316, 0.9998837423846397, 0.9978634960763247, 0.9995141548690593, 0.6643903933638151, 0.9998837423846397, 0.9998837423846397, 8.025772588206463e-05, 0.9989620739872076, 0.00013624585501300376, 8.767557777107723e-05, 0.9993847284322537, 0.6307952253193835, 0.9664884488409891, 0.9935544561922129, 0.9998735434462112, 0.00015151533312950224

In [108]:
from aiocache import Cache
from aiocache.serializers import PickleSerializer

cacherequest = Cache(Cache.REDIS, endpoint="localhost", port=6379, db=1, serializer=PickleSerializer())
await cacherequest.clear()

True