In [128]:
import pickle
import numpy as np
import random
import time

import matplotlib
import sklearn
from matplotlib import pyplot as plt

In [129]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [130]:
import mlflow
import mlflow.sklearn
import joblib

In [131]:
# Run configuration: MLflow experiment name and the seed shared by the stochastic steps.
experiment = "inference"
seed = 0

# Seed the global RNGs up front. The cells that corrupt the data draw from `random`,
# so without this the injected NaNs/outliers (and every metric) differ on each run.
random.seed(seed)
np.random.seed(seed)

In [132]:
# Select the MLflow experiment named by `experiment`; the log output below shows it being created on first use.
mlflow.set_experiment(experiment)


2024/11/21 20:23:21 INFO mlflow.tracking.fluent: Experiment with name 'inference' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/cunning/studying/ISSII/lab3/mlruns/569937853582641813', creation_time=1732209801191, experiment_id='569937853582641813', last_update_time=1732209801191, lifecycle_stage='active', name='inference', tags={}>

In [133]:
# Read the semicolon-separated dataset from the working directory.
dataset = pd.read_csv('dataset.csv', sep=';')

In [134]:
# Preview the first rows of the raw data.
dataset.head()

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
0,True,171.81,104.86,104.95,4.52,2.89,112.83
1,True,171.46,103.36,103.66,3.77,2.99,113.09
2,True,172.69,104.48,103.5,4.4,2.94,113.16
3,True,171.36,103.91,103.94,3.62,3.01,113.51
4,True,171.73,104.28,103.46,4.04,3.48,112.54


In [135]:
# Show column dtypes and non-null counts.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   is_genuine    1500 non-null   bool   
 1   diagonal      1500 non-null   float64
 2   height_left   1500 non-null   float64
 3   height_right  1500 non-null   float64
 4   margin_low    1463 non-null   float64
 5   margin_up     1500 non-null   float64
 6   length        1500 non-null   float64
dtypes: bool(1), float64(6)
memory usage: 71.9 KB


In [136]:
# Target: the `is_genuine` flag as a NumPy array.
y = dataset['is_genuine'].values
# Features: every remaining column.
x = dataset.drop(columns='is_genuine')

In [137]:
# Blank out 2000 randomly chosen cells to simulate missing measurements.
# randrange(n) draws uniformly from 0..n-1. The previous randint(0, n) - 1 could
# yield -1, which wraps around to the last row/column and samples it twice as often.
n_rows, n_cols = x.shape
for i in range(2000):
  raw = random.randrange(n_rows)
  col = random.randrange(n_cols)
  x.iloc[raw, col] = np.nan

In [138]:
# Inject gross outliers by adding a large random offset to 10 randomly chosen cells.
# randrange(n) keeps the index inside 0..n-1; randint(0, n) - 1 could produce -1 and
# silently hit the last row/column twice as often as any other.
# NOTE(review): a cell that is already NaN stays NaN, so fewer than 10 outliers may land.
for i in range(10):
  raw = random.randrange(x.shape[0])
  col = random.randrange(x.shape[1])
  x.iloc[raw, col] = x.iloc[raw, col] + random.randint(1000, 100000)


In [139]:
def plot_scores(optimizer, plot_path):
    """Plot the mean CV test score against ``param_C`` and save the figure.

    The curve is drawn with a logarithmic x-axis and a shaded band of one
    standard deviation around the mean; the figure is written to ``plot_path``
    and then closed.

    Args:
        optimizer: Object exposing ``cv_results_`` with the keys ``param_C``,
            ``mean_test_score`` and ``std_test_score``.
        plot_path: Destination handed to ``plt.savefig``.
    """
    cv_results = optimizer.cv_results_
    c_values = cv_results['param_C']
    mean_scores = cv_results['mean_test_score']
    score_spread = cv_results['std_test_score']

    plt.semilogx(c_values, mean_scores)
    plt.fill_between(c_values, mean_scores - score_spread, mean_scores + score_spread, alpha=0.3)
    plt.savefig(plot_path)
    plt.close()  # release the figure's resources

In [140]:
# Columns holding the numeric measurements used as model features.
feature_columns = ['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up', 'length']
features = x[feature_columns]

# Per-column statistics for the sigma rule.
mean = features.mean()
std = features.std()
threshold = 3

# Keep values strictly inside mean +/- threshold * std; every other cell
# (outliers and cells that are already NaN) ends up NaN in `df`.
below_upper = features < mean + threshold * std
above_lower = features > mean - threshold * std
df = x[below_upper & above_lower]

In [141]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle to `seed`.
x_train, x_test, y_train, y_test = train_test_split(df,y, test_size=0.3, random_state=seed)

In [142]:
# Fill the gaps in the training features with the per-column training means.
train_features = x_train[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up', 'length']]
X_real_mean_train = train_features.fillna(x_train.mean())

In [143]:
# Impute missing test values with the TRAINING means. Statistics taken from the
# held-out split would leak test information into the evaluation, and they are not
# available when a single unseen sample has to be scored.
X_real_mean_test = x_test[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up', 'length']].fillna(x_train.mean())

In [144]:
# Load the previously trained model from disk.
# NOTE: unpickling can execute code embedded in the file, so "best_model.pkl" must
# come from a trusted source (presumably this project's own training step).
with open("best_model.pkl", "rb") as f:
    regression_model = pickle.load(f)  # read straight from the file object
regression_model

In [145]:
# Number of timed repetitions averaged into each reported inference time.
num_runs = 100

In [146]:
# Open the MLflow run that records the metrics of the original pickled model.
mlflow.start_run(run_name="source model")

<ActiveRun: >

In [147]:
# Average the wall-clock time of `num_runs` predictions with the pickled model.
# Only `predict` is timed so the figure is comparable with the ONNX benchmark, which
# times prediction alone; `score` would also time the metric computation.
# perf_counter is the high-resolution clock intended for short durations.
total_time = 0
for _ in range(num_runs):
    start_time = time.perf_counter()
    y_pred = regression_model.predict(X_real_mean_test)
    total_time += time.perf_counter() - start_time
# Accuracy is computed once, outside the timed region.
accuracy = accuracy_score(y_test, y_pred)
average_time = total_time / num_runs
inference_time = average_time

In [148]:
# Report the source-model results and record them on the active MLflow run.
run_metrics = {"accuracy": accuracy, "inference time": inference_time}
print(f"Accuracy: {accuracy}")
print(f"Inference time: {inference_time}")
for metric_name, metric_value in run_metrics.items():
    mlflow.log_metric(metric_name, metric_value)

Accuracy: 0.9555555555555556
Inference time: 0.0007443523406982422


In [149]:
# Close the "source model" MLflow run.
mlflow.end_run()

## ONNX

In [150]:
from skl2onnx import convert_sklearn  # noqa: E402
from skl2onnx.common.data_types import FloatTensorType 

In [151]:
# Declare a single float input named "float_input" with 6 columns and an
# unspecified first dimension, then convert the scikit-learn model to ONNX.
n_features = 6
initial_type = [("float_input", FloatTensorType([None, n_features]))]
onx = convert_sklearn(regression_model, initial_types=initial_type)

# Persist the serialized graph for onnxruntime.
serialized_model = onx.SerializeToString()
with open("logreg_iris.onnx", "wb") as onnx_file:
    onnx_file.write(serialized_model)

In [152]:
import onnxruntime as rt

In [153]:
# Open the MLflow run that records the metrics of the ONNX export.
mlflow.start_run(run_name="onnx model")

<ActiveRun: >

In [154]:
# Load the exported ONNX file, offering onnxruntime every execution provider it reports as available.
sess = rt.InferenceSession("logreg_iris.onnx", providers=rt.get_available_providers())

In [155]:
# Show the name and shape of the session's first input and first output.
first_input = sess.get_inputs()[0]
first_output = sess.get_outputs()[0]
print(f"input name='{first_input.name}' and shape={first_input.shape}")
print(f"output name='{first_output.name}' and shape={first_output.shape}")

input name='float_input' and shape=[None, 6]
output name='output_label' and shape=[None]


In [156]:
# Names of the session's first input and first output (the predicted label).
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name

# Average the wall-clock time of `num_runs` ONNX predictions.
# `np.float32` rather than `numpy.float32`: the notebook imports numpy only under
# the alias `np`, so the bare `numpy` name raises NameError on a fresh kernel.
total_time = 0
for _ in range(num_runs):
    start_time = time.perf_counter()  # high-resolution clock for sub-millisecond timings
    y_onx = sess.run([label_name], {input_name: X_real_mean_test.values.astype(np.float32)})[0]
    total_time += time.perf_counter() - start_time
accuracy = accuracy_score(y_test, y_onx)
inference_time = total_time/num_runs

In [157]:
# Report the ONNX results (8 decimal places) and record them on the active MLflow run.
run_metrics = {"accuracy": accuracy, "inference time": inference_time}
print(f"Accuracy: {accuracy:.8f}")
print(f"Inference time: {inference_time:.8f} ")
for metric_name, metric_value in run_metrics.items():
    mlflow.log_metric(metric_name, metric_value)

Accuracy: 0.95555556
Inference time: 0.00015585 


In [158]:
# Close the "onnx model" MLflow run.
mlflow.end_run()

## BentoML

In [159]:
import bentoml

In [160]:
# Open the MLflow run for the BentoML-served model.
mlflow.start_run(run_name="bentoml model")

<ActiveRun: >

In [161]:
# Request payload: test features and labels as plain (JSON-serialisable) lists.
t = {"X_test": X_real_mean_test.values.tolist(),
     "y_test": y_test.tolist()}

# Open one HTTP client and reuse it for every request instead of rebuilding it
# on each of the `num_runs` iterations; the context manager still closes it
# when the loop finishes or fails.
results = []
with bentoml.SyncHTTPClient("http://localhost:3000") as client:
   for _ in range(num_runs):
      result = client.predict(
         input_sample=t
      )
      results.append(result)

In [162]:
# Accuracy is taken from the first response; inference time is the mean of the
# per-request 'inference_time' values returned by the service.
accuracy = results[0]['accuracy']
inference_time = sum(result['inference_time'] for result in results)/num_runs
print("Accuracy: "+ str(accuracy))
print("Inference time: " + str(inference_time))

# Record the metrics on the active MLflow run, as the source-model and ONNX
# sections do; otherwise the "bentoml model" run is closed with nothing logged.
mlflow.log_metric("accuracy", accuracy)
mlflow.log_metric("inference time", inference_time)

Accuracy: 0.9555555555555556
Inference time: 0.0001166534423828125


In [163]:
# Close the "bentoml model" MLflow run.
mlflow.end_run()