# Load required libs

In [1]:
%cd ..
%pwd

c:\VENV\api_prediction


'c:\\VENV\\api_prediction'

In [2]:
import sys
sys.path.append('./src')
from src.data_processor import RawDataProcessor
from src.problem_config import create_prob_config 
prob_config = create_prob_config("phase-2", "prob-1")

In [3]:
from mlflow.models.signature import infer_signature
import mlflow

def log_model_to_tracker_lgbm(model, metrics, desc):
    MLFLOW_TRACKING_URI = 'http://127.0.0.1:5000'
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment("phase-2_prob-1_lgbm")
    MLFLOW_MODEL_PREFIX = "model"
    mlflow.start_run(description=desc)
    mlflow.log_metrics(metrics)
    mlflow.log_params(model.get_params())
    signature = infer_signature(test_x.astype(np.float64), predictions)
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=MLFLOW_MODEL_PREFIX,
        signature=signature,
        pip_requirements ='src/requirements.txt'
        #registered_model_name="phase-1_prob-1_model-1"
    )

    experimentid = mlflow.active_run().info.run_id
    mlflow.end_run()
    return experimentid

# Prepare datasets

In [4]:
import pandas as pd
import numpy as np
import pickle

training_data = pd.read_parquet(prob_config.raw_data_path)

training_data, category_index = RawDataProcessor.build_category_features(
            training_data, prob_config.categorical_cols
        )

target_col = prob_config.target_col
train_x = training_data.drop([target_col], axis=1)
train_y = training_data[[target_col]]

# Store the category_index
with open(prob_config.category_index_path, "wb") as f:
    pickle.dump(category_index, f)

In [39]:
from lightgbm import LGBMClassifier

model0 = LGBMClassifier(objective="binary", random_state=123)
model0.fit(train_x, train_y, verbose=False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [8]:
sample = training_data.sample(1000)

test_x = sample.drop([target_col], axis=1)
test_y = sample[[target_col]]

In [9]:
from sklearn.metrics import roc_auc_score
predictions = model0.predict_proba(test_x.astype(np.float64))[:,1]
#predictions = d4p.gbt_classification_prediction(nClasses=2).compute(test_x, daal_model)
#predictions = llvm_model.predict(test_x)
auc_score = roc_auc_score(test_y, predictions)
metrics = {"test_auc": auc_score}
print(f"metrics: {metrics}")

metrics: {'test_auc': 0.9951577733560488}


In [10]:
%%timeit -n 10
model0.predict(train_x)

76.8 ms ± 3.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [48]:
import lleaves
model_path = prob_config.data_path / "phase2_1_lgbm.txt"
llvm_model_path = prob_config.data_path / "phase2_1_lleaves"
model0.booster_.save_model(filename=model_path)
llvm_model = lleaves.Model(model_file=model_path)
llvm_model.compile(cache=llvm_model_path)

In [39]:
%%timeit -n 10
llvm_model.predict(train_x)

26.8 ms ± 555 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
import daal4py as d4p
daal_model = d4p.get_gbt_model_from_lightgbm(model0.booster_)

In [55]:
%%timeit -n 10
daal_prediction = d4p.gbt_classification_prediction(nClasses=2, resultsToEvaluate="computeClassLabels|computeClassProbabilities").compute(test_x, daal_model)

2.48 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [37]:
import onnxruntime as rt
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes  # noqa
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm  # noqa
from skl2onnx.common.data_types import FloatTensorType
import numpy
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier

In [38]:
update_registered_converter(
    LGBMClassifier, 'LightGbmLGBMClassifier',
    calculate_linear_classifier_output_shapes, convert_lightgbm,
    options={'nocl': [True, False], 'zipmap': [True, False, 'columns']})

In [44]:
model_onnx = convert_sklearn(
    model0, 'pipeline_lightgbm',
    [('input', FloatTensorType([None, 41]))],
    target_opset={'': 12, 'ai.onnx.ml': 2})

# And save.
with open("pipeline_lightgbm.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

In [49]:
print("predict", model0.predict(test_x.to_numpy()[:5].astype(numpy.float32)))
print("predict_proba", model0.predict_proba(test_x.to_numpy()[:1].astype(numpy.float32)))

predict [1 1 1 1 0]
predict_proba [[0.17720758 0.82279242]]


In [45]:
sess = rt.InferenceSession("pipeline_lightgbm.onnx")

pred_onx = sess.run(None, {"input": test_x.to_numpy()[:5].astype(numpy.float32)})
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1][:1])

predict [1 1 1 1 0]
predict_proba [{0: 0.17720752954483032, 1: 0.8227924704551697}]


In [54]:
%%timeit
pred_onx = sess.run(None, {"input": test_x.to_numpy().astype(numpy.float32)})

1.92 ms ± 111 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [40]:
run_description = """
### Header
LGBM model, First Base Model Prob1
Model: LGBM
    """
log_model_to_tracker_lgbm(model0, metrics, run_description)



'a41c2c67b7484680ba61f540ef0ec034'

In [20]:
import mlflow
import pathlib
MLFLOW_TRACKING_URI = 'http://localhost:5000'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
model_uri = str(pathlib.Path("models:/", "phase-2_prob-1_model", "1").as_posix())
model0_ref = mlflow.pyfunc.load_model(model_uri)

# Drift Detect

In [22]:
# Old KS Drift from alibi_detect
from alibi_detect.cd import KSDrift
X_baseline = train_x.sample(500)
cd = KSDrift(p_val=0.05, x_ref=X_baseline.to_numpy())

In [25]:
x=train_x.sample(1000).to_numpy()

In [26]:
%%timeit
cd.predict(x)

21.7 ms ± 345 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [38]:
# Save referent for drift detection.
#X_baseline_df = pd.DataFrame(X_baseline, columns=prob_config.drift_cols)
#X_baseline_df.to_parquet(prob_config.driff_ref_path, index=False)

# Model drift

In [4]:
from sklearn.cluster import MiniBatchKMeans
import joblib

# N là số lượng cluster
N = 19000 * len(np.unique(train_y))
# Train clustering model cho data đã có label
#kmeans = MiniBatchKMeans(n_clusters=N, random_state=0, n_init='auto').fit(train_x)

#joblib.dump(kmeans, 'data/captured_data/phase-1/prob-1/kmeans.cpk')
kmeans = joblib.load('data/captured_data/phase-1/prob-1/kmeans.cpk')

# Tạo 1 mảng ánh xạ cluster với 1 label mới (do các data drift thuộc cùng 1 cluster sẽ có label giống nhau)
new_labels = []

# Duyệt từng cluster
for  i  in  range(N):
	# Lấy các label của các data point thuộc cluster i
	mask = (kmeans.labels_ == i)
	cluster_labels = train_y[mask]

	if  len(cluster_labels) == 0:
		# Nếu cluster i rỗng thì xác định cluster i ánh xạ với 1 label mặc định (ở đây lựa chọn là 0)
		new_labels.append(0)
	else:
		# Tìm label mới cho cả cụm cluster trong trường hợp cụm cluster khác rỗng
		#if  isinstance(train_y.flatten()[0], float):
			# Nếu là bài toán Regression thì lấy giá trị trung bình của các label thuộc cluster
		#	new_labels.append(np.mean(cluster_labels.flatten()))
		#else:
			# Nếu là bài toán Classification thì lấy label xuất hiện nhiều nhất trong cluster
			new_labels.append(np.bincount(cluster_labels.to_numpy().flatten()).argmax())

# Ánh xạ lại label cho data drift dựa trên kết quả predict cluster ở trên
y_drift_propagated = [new_labels[c] for  c  in  kmeans.labels_]

In [5]:
from sklearn.metrics import roc_auc_score
y_drift_test_propagated = [ new_labels[c] for c in kmeans.predict(train_x)]
propagated_auc = {"test_auc": roc_auc_score(train_y, y_drift_propagated)}
propagated_auc

{'test_auc': 0.8626996177558589}

# Online Data

In [5]:
import pandas as pd
import redis
import pickle
# Load data for problem 1
rc1 = redis.Redis(host='192.168.88.113', db=1, port=6379,  socket_keepalive=True)

captured_x = pd.DataFrame()
for key in rc1.keys():
    captured_data = pickle.loads(rc1.get(key))
    captured_x = pd.concat([captured_x, captured_data])

#captured_x.drop_duplicates(inplace=True, ignore_index=True)
#captured_x = apply_category_features(
#    raw_df=captured_x[train_x.columns],
#    categorical_cols=prob_config.categorical_cols,
#    category_index=category_index,
#)

In [6]:
allkey = rc1.keys()

In [7]:
rc1test= redis.Redis(host='localhost', db=1, port=6379,  socket_keepalive=True)

In [35]:
rc1test.keys()

[]

In [14]:
rc1test.get(rc1test.keys()[0])

b'\x80\x04\x95\x12\x01\x00\x00\x00\x00\x00\x00\x8c\x11pandas.core.frame\x94\x8c\tDataFrame\x94\x93\x94)\x81\x94}\x94(\x8c\x04_mgr\x94\x8c\x1epandas.core.internals.managers\x94\x8c\x0cBlockManager\x94\x93\x94\x8c\x16pandas._libs.internals\x94\x8c\x0f_unpickle_block\x94\x93\x94\x8c\x15numpy.core.multiarray\x94\x8c\x0c_reconstruct\x94\x93\x94\x8c\x05numpy\x94\x8c\x07ndarray\x94\x93\x94K\x00\x85\x94C\x01b\x94\x87\x94R\x94(K\x01K&M\xe8\x03\x86\x94h\x0f\x8c\x05dtype\x94\x93\x94\x8c\x02f8\x94\x89\x88\x87\x94R\x94(K\x03\x8c\x01<\x94NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x89B\x80\xa3\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00`\x91\xf1@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80U@\x00\x00\x00\x00\x00\xd0\x99@\x00\x00\x00\x00\x00`\x81@\x00\x00\x00\x00\x00 v@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@d@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc0p@\x00\x00\x00\x00\x00@d@\x00\x0

# Test API

In [55]:
rows = []
test = pickle.loads(rc1.get(allkey[8]))

for index, row in test.iterrows():
    rows.append(row.to_list())

data = {
  "id": "123",
  "rows": rows,
  "columns": test.columns.to_list()
}

import requests
#response = requests.post('http://14.225.205.204:80/phase-2/prob-1/predict', json=data)
#response = requests.post('http://27.71.25.203:8000/phase-2/prob-1/predict', json=data)
response = requests.post('http://localhost:8000/phase-2/prob-1/predict', json=data)
print(response.text)

{"id":"123","predictions":[0.999905894469594,0.9982137953368204,0.9998837423846397,0.005662271130951562,0.9983902821774042,0.3885123168416646,0.36683862845946397,0.6957572396545334,0.9998837423846397,0.99993561982375,0.00015151533312950143,0.004935131920503036,0.9951420882731894,0.00015151533312950143,0.4975790326460494,0.9998837423846397,0.9996784712482033,0.8757181376222133,0.10762285196123844,0.9998837423846397,0.9984604031529982,0.0003030946270267483,0.9998691905148397,0.00009221100547946161,0.9996784712482033,0.5263126545496319,0.9992373621840459,0.00015048398050289722,0.00011722781910257276,0.9986857564508228,0.2920324548003542,0.00023495320623125377,0.0022348806287717164,0.9998837423846397,0.999905894469594,0.997997023317774,0.31111151404362075,0.00014589227975874084,0.9892269421457419,0.24373161657753298,0.14697461736920883,0.9989785591318632,0.999905894469594,0.00015347951935021917,0.9998837423846397,0.9984799636066983,0.9998837423846397,0.9645611089465718,0.9998837423846397,0

In [57]:
for i in range(10):
    response = requests.post('http://localhost:8000/phase-2/prob-1/predict', json=data)

In [34]:
from aiocache import Cache
from aiocache.serializers import PickleSerializer

cacherequest = Cache(Cache.REDIS, endpoint="localhost", port=6379, db=1, serializer=PickleSerializer())
await cacherequest.clear()

True