In [27]:
import joblib, pickle

from s2and_ext.my_models import LightGBMWrapper
from s2and_ext.my_featurization import get_matrices, featurizing_function

from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from skl2onnx.common.data_types import FloatTensorType
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm

import lightgbm as lgb

### Conversion to ONNX

In [28]:
# Unwrap the underlying LGBMClassifier from the LightGBMWrapper.
# NOTE: joblib.load unpickles arbitrary Python objects -- only load model
# files that come from a trusted source.
model = joblib.load('models/lightgbm_sparse_0_23.joblib').model

# Build the train/validation/test feature matrices. NaNs are kept
# (remove_nan=False), so the matrices can contain missing values.
X_train, y_train, X_val, y_val, X_test, y_test = get_matrices(
    datasets=['aminer', 'pubmed', 'zbmath', 'kisti', 'arnetminer'],
    featurizing_function=featurizing_function,
    remove_nan=False,
    default_embeddings=True,
    external_emb_dir=None)

# Register the onnxmltools LightGBM converter with skl2onnx so that
# convert_sklearn knows how to handle lgb.LGBMClassifier.
update_registered_converter(
    lgb.LGBMClassifier, 'LightGbmLGBMClassifier',
    calculate_linear_classifier_output_shapes, convert_lightgbm,
    options={'nocl': [True, False], 'zipmap': [True, False, 'columns']})

# Take the input width from the feature matrix instead of hard-coding it, so
# the exported graph keeps matching the data if the featurization changes.
n_features = X_train.shape[1]

model_onnx = convert_sklearn(
    model, 'lightgbm',
    [('input', FloatTensorType([None, n_features]))],
    target_opset=12)

# And save.
with open("models/lightgbm_sparse_0_23.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

Loaded dataset from pickle...
Processed aminer
Loaded dataset from pickle...
Processed pubmed
Loaded dataset from pickle...
Processed zbmath
Loaded dataset from pickle...
Processed kisti
Loaded dataset from pickle...
Processed arnetminer
Nan values for each feature :
[  1085      0      0   2431   2431      0      0    519 434278]


### ONNX Inference

`sess.run` returns a list with two elements. The first element is an array containing the predicted label (0 or 1) for each input. The second element is a list containing, for each input, a dictionary that maps each label to its probability. The output that must be used is the probability of label 0. In the following code cell, an example output of `sess.run` is printed.

In [29]:
import onnxruntime as rt
import numpy as np

# Deliberately not wrapped in try/except: if the session cannot be created,
# nothing below can run, so fail right here with the real error instead of a
# confusing NameError/AttributeError further down.
sess = rt.InferenceSession("models/lightgbm_sparse_0_23.onnx")

# The ONNX graph was exported with a float tensor input, so cast once and
# reuse the result for both calls below.
X_train_f32 = X_train.astype(np.float32)
pred_onx = sess.run(None, {"input": X_train_f32})

# pred_onx[1] holds one {label: probability} mapping per row; keep the
# probability of label 0 to line up with predict_proba(...)[:, 0].
y_onnx = np.asarray([pred[0] for pred in pred_onx[1]])
y_target = model.predict_proba(X_train)[:,0]

# Show the raw structure of the session output for the first five rows.
print(sess.run(None, {"input": X_train_f32[:5,:]}))

[array([0, 0, 1, 1, 1], dtype=int64), [{0: 0.7047421336174011, 1: 0.2952578663825989}, {0: 0.5081993937492371, 1: 0.49180060625076294}, {0: 0.22425144910812378, 1: 0.7757485508918762}, {0: 0.360037624835968, 1: 0.639962375164032}, {0: 0.3745957612991333, 1: 0.6254042387008667}]]


### Testing

In [30]:
# Compare the LightGBM and ONNX label-0 probabilities on the training set,
# to 5 decimal places.
try:
    np.testing.assert_array_almost_equal(y_target, y_onnx, decimal=5)
    print('Test passed')
except AssertionError:
    # Only a genuine numerical mismatch counts as a failed test. Any other
    # error (undefined name, shape problem, interrupt) propagates instead of
    # being misreported as "Test failed".
    print('Test failed')

Test failed


In [31]:
# Re-run the same comparison without a guard so that numpy's full mismatch
# report (number of mismatched elements, max absolute/relative difference)
# is displayed.
np.testing.assert_array_almost_equal(
    y_target,
    y_onnx,
    decimal=5,
)

AssertionError: 
Arrays are not almost equal to 5 decimals

Mismatched elements: 1 / 440487 (0.000227%)
Max absolute difference: 0.02116809
Max relative difference: 0.02300601
 x: array([0.70474, 0.5082 , 0.22425, ..., 0.01489, 0.30098, 0.1226 ])
 y: array([0.70474, 0.5082 , 0.22425, ..., 0.01489, 0.30098, 0.1226 ])

In [None]:
# Repeat the ONNX-vs-LightGBM comparison on the held-out test set.
X_test_f32 = X_test.astype(np.float32)
pred_onx = sess.run(None, {"input": X_test_f32})

# Keep the probability of label 0 from each per-row {label: probability} dict.
y_onnx = np.asarray([pred[0] for pred in pred_onx[1]])
y_target = model.predict_proba(X_test)[:,0]

try:
    np.testing.assert_array_almost_equal(y_target, y_onnx, decimal=5)
    print('Test passed')
except AssertionError:
    # Only a genuine numerical mismatch counts as a failed test; any other
    # error propagates instead of being misreported as "Test failed".
    print('Test failed')

Test passed
