In [37]:
# !pip install onnxruntime
# !pip install onnx
# !pip install skl2onnx
# !pip install pandas
# !pip install numpy
# !pip install scikit-learn

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from skl2onnx import convert_sklearn

## Load the dataset

In [39]:
# Load the training data. Forward slashes keep the relative path portable:
# the previous "..\data\..." spelling only resolved on Windows and relied on
# the invalid escape sequences "\d" and "\s" being passed through unchanged.
data = pd.read_csv("../data/synth_data_for_training.csv")

# The 'checked' column is the target; every other column is a feature.
y = data['checked']
X = data.drop(['checked'], axis=1)
# Cast the features to float32, the dtype later fed to the ONNX sessions.
X = X.astype(np.float32)

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


## Group the feature columns by name prefix

In [40]:
# Feature column names, in dataframe order.
all_cols = X.columns.tolist()

# Column-name prefixes that place a feature in the "bad" group.
group_features_bad_keywords = {
    "persoonlijke_eigenschappen_nl_",
    "persoonlijke_eigenschappen_spreektaal",
    "persoonlijke_eigenschappen_taaleis_",
    "persoon_geslacht_vrouw",
    "persoon_leeftijd_bij_onderzoek",
    "relatie_kind_",
    "relatie_partner_",
    "adres_",
    "belemmering_",
}

# Column-name prefixes that place a feature in the "neutral" group.
group_features_neutral_keywords = {
    "afspraak_aantal_woorden",
    "contact_soort_e_mail__uitgaand_",
    "deelname_act_reintegratieladder_werk_re_integratie",
}

# str.startswith accepts a tuple of prefixes and is true if any of them matches.
_bad_prefixes = tuple(group_features_bad_keywords)
_neutral_prefixes = tuple(group_features_neutral_keywords)

group_features_bad = [col for col in all_cols if col.startswith(_bad_prefixes)]

group_features_neutral = [col for col in all_cols if col.startswith(_neutral_prefixes)]

# Union of the bad and neutral groups. This used to be a set, whose iteration
# order for strings changes between interpreter sessions (hash randomization),
# so the column order handed to the model was not reproducible. dict.fromkeys
# removes duplicates just like a set but keeps first-seen order.
group_features_combined = list(dict.fromkeys(group_features_bad + group_features_neutral))

# "Good" features are all columns outside the bad group (neutral columns are
# therefore included). A set makes each membership test O(1).
_bad_lookup = set(group_features_bad)
group_features_good = [col for col in all_cols if col not in _bad_lookup]

In [None]:
def train_model(cols):
    """Fit and evaluate a gradient-boosting pipeline restricted to ``cols``.

    The pipeline still takes the full feature matrix as input; its first step
    keeps only the requested columns and drops the rest. The function reads
    the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        cols: Iterable of column names from ``X_train`` to train on. Order and
            duplicates are irrelevant: columns are always used once each, in
            dataframe order, so passing a set gives a reproducible model.

    Returns:
        The fitted ``Pipeline``. The test-set accuracy is printed as a side
        effect.

    Raises:
        ValueError: If ``cols`` contains a name that is not a column of
            ``X_train``.
    """
    all_features = X_train.columns.tolist()

    # Name -> position lookup; the first occurrence wins, like list.index.
    position_of = {}
    for position, name in enumerate(all_features):
        position_of.setdefault(name, position)

    unknown = [col for col in cols if col not in position_of]
    if unknown:
        raise ValueError(f"Columns not present in X_train: {unknown}")

    # Sorting (and de-duplicating) the positions makes the selected column
    # order independent of the iteration order of ``cols``.
    indices = sorted({position_of[col] for col in cols})

    colTransformer = ColumnTransformer(
        transformers=[("select_features", "passthrough", indices)],
        remainder="drop",
    )
    selector = VarianceThreshold()
    classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

    # NOTE: the step is named 'scaler' although it is a VarianceThreshold
    # selector; the name is kept so existing references to the step still work.
    pipeline = Pipeline(steps=[("select", colTransformer), ('scaler', selector), ('classification', classifier)])

    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out test split.
    y_pred = pipeline.predict(X_test)
    original_accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy of the original model: ', original_accuracy)

    return pipeline

In [47]:
# Train one pipeline per feature group: first on the combined bad + neutral
# columns, then on the "good" columns. Each run prints its banner followed by
# the accuracy reported by train_model.
training_runs = (
    ("\nTraining the 'bad' model", group_features_combined),
    ("\nTraining the 'good' model", group_features_good),
)

trained_pipelines = []
for banner, feature_cols in training_runs:
    print(banner)
    trained_pipelines.append(train_model(feature_cols))

bad_model, good_model = trained_pipelines


Training the 'bad' model
Accuracy of the original model:  0.9003795066413662

Training the 'good' model
Accuracy of the original model:  0.931056293485136


In [48]:
# Convert both pipelines to ONNX. Each graph takes a single float input named
# 'data' with one column per training feature and any number of rows.
n_input_features = X_train.shape[1]
onnx_model_bad, onnx_model_good = (
    convert_sklearn(
        fitted_pipeline,
        initial_types=[('data', FloatTensorType((None, n_input_features)))],
        target_opset=12,
    )
    for fitted_pipeline in (bad_model, good_model)
)

# Check that the converted models reproduce the accuracy of the originals.
# Both sessions are fed the same float32 copy of the test features.
onnx_feed = {'data': X_test.values.astype(np.float32)}

sess_bad = rt.InferenceSession(onnx_model_bad.SerializeToString())
sess_good = rt.InferenceSession(onnx_model_good.SerializeToString())

y_pred_onnx_bad = sess_bad.run(None, onnx_feed)
y_pred_onnx_good = sess_good.run(None, onnx_feed)

# The first output of each session holds the predicted labels.
accuracy_onnx_model_bad = accuracy_score(y_test, y_pred_onnx_bad[0])
print('Accuracy of the bad ONNX model: ', accuracy_onnx_model_bad)

accuracy_onnx_model_good = accuracy_score(y_test, y_pred_onnx_good[0])
print('Accuracy of the good ONNX model: ', accuracy_onnx_model_good)

Accuracy of the bad ONNX model:  0.9003795066413662
Accuracy of the good ONNX model:  0.931056293485136


In [49]:
# Write both ONNX models to disk: the 'bad' model as model_1, the 'good' one
# as model_2.
model_1_path = "../model/model_1.onnx"
model_2_path = "../model/model_2.onnx"
onnx.save(onnx_model_bad, model_1_path)
onnx.save(onnx_model_good, model_2_path)

# Reload the saved files into fresh inference sessions.
new_session_1 = rt.InferenceSession(model_1_path)
new_session_2 = rt.InferenceSession(model_2_path)

# Predict on the test split with the reloaded models, using one shared
# float32 copy of the features.
reloaded_feed = {'data': X_test.values.astype(np.float32)}
y_pred_onnx2_1 = new_session_1.run(None, reloaded_feed)
y_pred_onnx2_2 = new_session_2.run(None, reloaded_feed)

# The first output of each session holds the predicted labels.
accuracy_onnx_model_1 = accuracy_score(y_test, y_pred_onnx2_1[0])
print('Accuracy of the ONNX model 1: ', accuracy_onnx_model_1)

accuracy_onnx_model_2 = accuracy_score(y_test, y_pred_onnx2_2[0])
print('Accuracy of the ONNX model 2: ', accuracy_onnx_model_2)

Accuracy of the ONNX model 1:  0.9003795066413662
Accuracy of the ONNX model 2:  0.931056293485136


In [None]:
# Select data based on variance (not the final version yet, for now just for testing)
# selector = VarianceThreshold()

In [None]:
# Define a gradient boosting classifier
# classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

In [None]:
# Create a pipeline object with our selector and classifier
# NOTE: You can create custom pipeline objects but they must be registered to onnx or it will not recognise them
# Because of this we recommend using the onnx known objects as defined in the documentation
# pipeline = Pipeline(steps=[('feature selection', selector), ('classification', classifier)])

In [None]:
# # Let's train a simple model
# pipeline.fit(X_train, y_train)

# # Let's evaluate the model
# y_pred = pipeline.predict(X_test)
# original_accuracy = accuracy_score(y_test, y_pred)
# print('Accuracy of the original model: ', original_accuracy)

Accuracy of the original model:  0.9456040480708412


In [None]:
# # Let's convert the model to ONNX
# onnx_model = convert_sklearn(
#     pipeline, initial_types=[('X', FloatTensorType((None, X.shape[1])))],
#     target_opset=12)

# # Let's check the accuracy of the converted model
# sess = rt.InferenceSession(onnx_model.SerializeToString())
# y_pred_onnx =  sess.run(None, {'X': X_test.values.astype(np.float32)})

# accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
# print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9456040480708412


In [None]:
# # Let's save the model
# onnx.save(onnx_model, "model/gboost.onnx")

# # Let's load the model
# new_session = rt.InferenceSession("model/gboost.onnx")

# # Let's predict the target
# y_pred_onnx2 =  new_session.run(None, {'X': X_test.values.astype(np.float32)})

# accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
# print('Accuracy of the ONNX model: ', accuracy_onnx_model)


Accuracy of the ONNX model:  0.9456040480708412
