In [28]:
# !pip install onnxruntime
# !pip install onnx
# !pip install skl2onnx
# !pip install pandas
# !pip install numpy
# !pip install scikit-learn



In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn

# Load Dataset 


In [30]:
# Let's load the dataset
def load_data(data_path='../data/synth_data_for_training.csv', target='checked',
              test_size=0.25, random_state=42):
    """Read the training CSV and split it into train and test partitions.

    Called without arguments it behaves exactly as before; the parameters only
    expose the values that used to be hard-coded inside the function.

    Args:
        data_path: CSV file to read.
        target: Name of the label column; every other column is used as a feature.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The feature frames are cast to float32.
    """
    data = pd.read_csv(data_path)

    # Separate the label from the features; features are cast to float32 up front
    y = data[target]
    X = data.drop(columns=[target]).astype(np.float32)

    # Split the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

# Loading and Running the Model Under Test

In [31]:
# for running tests, use this
def load_onnx_model(model_path):
    """Open an ONNX Runtime inference session for the model file at ``model_path``.

    ``model_path`` is converted with ``str()`` before it is handed to
    ``rt.InferenceSession``, so path-like objects are accepted as well as strings.
    """
    return rt.InferenceSession(str(model_path))

In [51]:
def predict(sess, X) -> np.ndarray:
    """Run the session on ``X`` and return its first output as a float32 array.

    Args:
        sess: Object exposing ``get_inputs()`` and ``run()`` (an ONNX Runtime
            inference session in this notebook).
        X: Feature matrix. Objects with a ``.values`` attribute (e.g. a
            DataFrame) are converted through it; anything else goes through
            ``np.asarray``. Either way the data is fed to the model as float32.

    Returns:
        The first model output as float32. When that output is 2-D with exactly
        two columns, only the second column is returned (presumably the
        positive-class score -- confirm against the exported model).
    """
    # Named input_name rather than `input` so the builtin is not shadowed.
    input_name = sess.get_inputs()[0].name

    if hasattr(X, "values"):
        features = X.values.astype("float32")
    else:
        features = np.asarray(X, dtype="float32")

    outputs = sess.run(None, {input_name: features})
    predictions = outputs[0]

    if predictions.ndim == 2 and predictions.shape[1] == 2:
        predictions = predictions[:, 1]

    # Cast on every path so callers always receive float32, not only in the 1-D case.
    return predictions.astype("float32")

In [33]:
def predict_labels(sess, X) -> np.ndarray:
    """Turn the output of ``predict`` into 0/1 labels using a 0.5 cut-off.

    Values greater than or equal to 0.5 become 1, everything else becomes 0.
    """
    scores = predict(sess, X)
    at_or_above_cutoff = scores >= 0.5
    return at_or_above_cutoff.astype(int)

# Partitioning Tests

In [34]:
# age, gender, and language proxies

# gender

def test_gender_accuracy_similar(model_path):
    """Assert that label accuracy is similar for the two gender groups.

    The test split is divided on ``persoon_geslacht_vrouw`` (1 = "women",
    0 = "other"), accuracy is computed per group, and the absolute gap must not
    exceed 0.18. Rows with a missing gender value are ignored, and each group
    must contain more than 40 rows.

    Args:
        model_path: Path of the ONNX model to evaluate.
    """
    _, features, _, targets = load_data()
    sess = load_onnx_model(model_path)

    GENDER_COL = "persoon_geslacht_vrouw"
    assert GENDER_COL in features.columns, f"{GENDER_COL} not in dataset"

    # Keep features and targets aligned while dropping rows without a gender value.
    has_gender = features[GENDER_COL].notna()
    features, targets = features[has_gender], targets[has_gender]

    gender = features[GENDER_COL]
    is_woman = gender == 1
    is_other = gender == 0

    # Group-size sanity checks; remove if not needed
    assert is_woman.sum() > 40, "Not enough women!"
    assert is_other.sum() > 40, "Not enough others!"

    predicted = predict_labels(sess, features.values.astype("float32"))

    women_accuracy = accuracy_score(targets[is_woman], predicted[is_woman])
    other_accuracy = accuracy_score(targets[is_other], predicted[is_other])
    difference = abs(women_accuracy - other_accuracy)

    print(f"[Partition gender] women_accuracy={women_accuracy:.3f}, other_accuracy={other_accuracy:.3f}, diff={difference:.3f}")

    tolerance = 0.18
    assert difference<=tolerance, (f"{difference:.3f} > {tolerance:.3f}. Difference is too large.")


In [None]:
# age, gender, and language proxies

# gender

def test_gender_rate(model_path):
    """Assert that the share of positive predictions is similar for both gender groups.

    The test split is divided on ``persoon_geslacht_vrouw`` (1 = "women",
    0 = "other"); the mean predicted label of each group is compared and the
    absolute gap must not exceed 0.22. Rows with a missing gender value are
    ignored, and each group must contain more than 40 rows.

    Args:
        model_path: Path of the ONNX model to evaluate.
    """
    _, features, _, _ = load_data()
    sess = load_onnx_model(model_path)

    GENDER_COL = "persoon_geslacht_vrouw"
    assert GENDER_COL in features.columns, f"{GENDER_COL} not in dataset"

    # Rows without a gender value cannot be assigned to either group.
    features = features[features[GENDER_COL].notna()]

    gender = features[GENDER_COL]
    women, other = gender == 1, gender == 0

    # Group-size sanity checks; remove if not needed
    assert women.sum() > 40, "Not enough women!"
    assert other.sum() > 40, "Not enough others!"

    predicted = predict_labels(sess, features)

    women_rate = predicted[women].mean()
    other_rate = predicted[other].mean()
    difference = abs(women_rate - other_rate)

    print(f"[Partition gender] women_rate={women_rate:.3f}, other_rate={other_rate:.3f}, diff={difference:.3f}")

    tolerance = 0.22
    assert difference<=tolerance, (f"{difference:.3f} > {tolerance:.3f}. Difference is too large.")


In [36]:
# age, gender, and language proxies

# age

def test_age_accuracy_similar(model_path):
    _ , X_test, _ , y_test = load_data()
    sess = load_onnx_model(model_path)

    AGE_COL = "persoon_leeftijd_bij_onderzoek"

    assert AGE_COL in X_test.columns, f"{AGE_COL} not in dataset"

    filter_empty = X_test[AGE_COL].notna()
    X_test = X_test[filter_empty]
    y_test = y_test[filter_empty]

    age = X_test[AGE_COL]

    age_groups = {
        "young": age < 30,
        "middle": (age >= 30) & (age < 60),
        "senior": age >= 60,
    }


    y_pred = predict_labels(sess, X_test)

    accuracies = {}
    for name, filtered in age_groups.items():
        if filtered.sum() < 30:
            continue
        accuracy = accuracy_score(y_test[filtered], y_pred[filtered])
        accuracies[name] = accuracy
        print(f"[Partition age] {name:>6}: accuracy={accuracy:.3f}, number={filtered.sum()}")

    accuracy_list = list(accuracies.values())
    max_difference = max(accuracy_list) - min(accuracy_list)


    tolerance = 0.20
    assert max_difference<=tolerance, (f"{max_difference:.3f} > {tolerance:.3f}. Difference is too large.")




In [37]:
# LANGUAGE_COLS = [
#     "persoonlijke_eigenschappen_nl_lezen3",
#     "persoonlijke_eigenschappen_nl_lezen4",
#     "persoonlijke_eigenschappen_nl_schrijven0",
#     "persoonlijke_eigenschappen_nl_schrijven1",
#     "persoonlijke_eigenschappen_nl_schrijven2",
#     "persoonlijke_eigenschappen_nl_schrijven3",
#     "persoonlijke_eigenschappen_nl_spreken1",
#     "persoonlijke_eigenschappen_nl_spreken2",
#     "persoonlijke_eigenschappen_nl_spreken3",
# ]

In [38]:
# age, gender, and language proxies

# language

def test_language_accuracy_similar(model_path):
    """Assert that label accuracy is similar for the two spoken-language groups.

    The test split is divided on ``persoonlijke_eigenschappen_spreektaal_anders``
    (0 is reported as "dutch", 1 as "non_dutch"), accuracy is computed per
    group, and the absolute gap must not exceed 0.20. Rows with a missing value
    in that column are ignored, and each group must contain more than 40 rows.

    Args:
        model_path: Path of the ONNX model to evaluate.
    """
    _, features, _, targets = load_data()
    sess = load_onnx_model(model_path)

    LANGUAGE_COL = "persoonlijke_eigenschappen_spreektaal_anders"
    assert LANGUAGE_COL in features.columns, f"{LANGUAGE_COL} not in dataset"

    # Keep features and targets aligned while dropping rows without a value.
    has_language = features[LANGUAGE_COL].notna()
    features, targets = features[has_language], targets[has_language]

    language_flag = features[LANGUAGE_COL]
    dutch = language_flag == 0
    non_dutch = language_flag == 1

    # Group-size sanity checks; remove if not needed
    assert dutch.sum() > 40, "Not enough dutch speaking people!"
    assert non_dutch.sum() > 40, "Not enough non_dutch speaking people!"

    predicted = predict_labels(sess, features)

    dutch_accuracy = accuracy_score(targets[dutch], predicted[dutch])
    non_dutch_accuracy = accuracy_score(targets[non_dutch], predicted[non_dutch])
    difference = abs(dutch_accuracy - non_dutch_accuracy)

    print(f"[Partition language] dutch_accuracy={dutch_accuracy:.3f}, non_dutch_accuracy={non_dutch_accuracy:.3f}, diff={difference:.3f}")

    tolerance = 0.20
    assert difference<=tolerance, (f"{difference:.3f} > {tolerance:.3f}. Difference is too large.")

In [59]:
# age, gender, and language proxies

# language

def test_language_rate(model_path):
    """Assert that the share of positive predictions is similar for both language groups.

    The test split is divided on ``persoonlijke_eigenschappen_taaleis_voldaan``;
    the mean predicted label of each group is compared and the absolute gap
    must not exceed 0.20. Rows with a missing value in that column are ignored,
    and each group must contain more than 40 rows.

    NOTE(review): rows with value 0 are reported as "dutch" and rows with
    value 1 as "non_dutch". The column name ("taaleis voldaan") suggests the
    labels may be the other way round -- confirm the encoding. The absolute
    difference that is asserted does not depend on the labelling.

    Args:
        model_path: Path of the ONNX model to evaluate.
    """
    _, features, _, _ = load_data()
    sess = load_onnx_model(model_path)

    LANGUAGE_COL = "persoonlijke_eigenschappen_taaleis_voldaan"
    assert LANGUAGE_COL in features.columns, f"{LANGUAGE_COL} not in dataset"

    # Rows without a value cannot be assigned to either group.
    features = features[features[LANGUAGE_COL].notna()]

    language_flag = features[LANGUAGE_COL]
    dutch, non_dutch = language_flag == 0, language_flag == 1

    # Group-size sanity checks; remove if not needed
    assert dutch.sum() > 40, "Not enough dutch speaking people!"
    assert non_dutch.sum() > 40, "Not enough non-dutch speaking people!"

    predicted = predict_labels(sess, features)

    dutch_rate = predicted[dutch].mean()
    non_dutch_rate = predicted[non_dutch].mean()
    difference = abs(dutch_rate - non_dutch_rate)

    print(f"[Partition language] dutch_rate={dutch_rate:.3f}, non_dutch_rate={non_dutch_rate:.3f}, diff={difference:.3f}")

    tolerance = 0.20
    assert difference<=tolerance, (f"{difference:.3f} > {tolerance:.3f}. Difference is too large.")

# Metamorphic Data Augmentation

In [None]:
# ============================================================
# Metamorphic Data Augmentation: Gender Flipping
# ============================================================

# Lookup table that swaps the two values of the binary gender indicator
gender_flip_map = {1: 0, 0: 1}


def flip_gender_all(val):
    """Return the swapped gender code for 0 or 1; any other value is returned unchanged."""
    if val in gender_flip_map:
        return gender_flip_map[val]
    return val


In [43]:
def flip_gender_test(model_path):
    """Report how the model's predictions react when the gender column is flipped.

    Predictions are made on the test split and on a copy of it in which
    ``persoon_geslacht_vrouw`` has been passed through ``flip_gender_all``.
    The accuracy on the flipped copy and the number of predictions that differ
    between the two runs are printed; nothing is asserted.

    Args:
        model_path: Path of the ONNX model to evaluate.
    """
    _, X_test, _, y_test = load_data()
    sess = load_onnx_model(model_path)

    # Work on a copy so the original test set stays available for the baseline run
    gender_col = 'persoon_geslacht_vrouw'
    X_test_flipped = X_test.copy()
    X_test_flipped[gender_col] = X_test_flipped[gender_col].apply(flip_gender_all)

    # Baseline predictions vs. predictions on the gender-flipped copy
    baseline_labels = predict_labels(sess, X_test)
    flipped_labels = predict_labels(sess, X_test_flipped)

    # Report
    print("Accuracy after gender flip:", accuracy_score(y_test, flipped_labels))
    print("Changed predictions:", np.sum(baseline_labels != flipped_labels))


In [55]:
# ============================================================
# Metamorphic Data Augmentation: Language Flipping
# ============================================================

# Lookup table that swaps the two values of the binary language indicator
language_flip_map = {1: 0, 0: 1}


def flip_language_all(val):
    """Return the swapped language code for 0 or 1; any other value is returned unchanged."""
    if val in language_flip_map:
        return language_flip_map[val]
    return val

In [58]:
def flip_language_test(model_path):
    """Report how the model's predictions react when the language column is flipped.

    Predictions are made on the test split and on a copy of it in which
    ``persoonlijke_eigenschappen_taaleis_voldaan`` has been passed through
    ``flip_language_all``. The accuracy on the flipped copy and the number of
    predictions that differ between the two runs are printed; nothing is
    asserted.

    Args:
        model_path: Path of the ONNX model to evaluate.
    """
    _, X_test, _, y_test = load_data()
    sess = load_onnx_model(model_path)

    # Work on a copy so the original test set stays available for the baseline run
    language_col = "persoonlijke_eigenschappen_taaleis_voldaan"
    X_test_flipped = X_test.copy()
    X_test_flipped[language_col] = X_test_flipped[language_col].apply(flip_language_all)

    # Baseline predictions vs. predictions on the language-flipped copy
    baseline_labels = predict_labels(sess, X_test)
    flipped_labels = predict_labels(sess, X_test_flipped)

    # Report
    print("Accuracy after language flip:", accuracy_score(y_test, flipped_labels))
    print("Changed predictions:", np.sum(baseline_labels != flipped_labels))


# Testing the models!

In [47]:
# Forward slashes work on every platform (Windows included) and avoid the
# invalid "\m" escape sequences that the backslash form put into the literals.
model_1 = "../model/model_1.onnx"
model_2 = "../model/model_2.onnx"

In [60]:
# Run every check against both models: each check is executed for model_1
# first and then for model_2 before moving on to the next check.
for check in (
    test_gender_accuracy_similar,
    test_gender_rate,
    test_age_accuracy_similar,
    test_language_accuracy_similar,
    test_language_rate,
    flip_gender_test,
    flip_language_test,
):
    for candidate_model in (model_1, model_2):
        check(candidate_model)

[Partition gender] women_accuracy=0.907, other_accuracy=0.894, diff=0.013
[Partition gender] women_accuracy=0.941, other_accuracy=0.922, diff=0.018
[Partition gender] women_rate=0.023, other_rate=0.038, diff=0.015
[Partition gender] women_rate=0.059, other_rate=0.074, diff=0.015
[Partition age]  young: accuracy=0.762, number=105
[Partition age] middle: accuracy=0.903, number=2531
[Partition age] senior: accuracy=0.916, number=526
[Partition age]  young: accuracy=0.867, number=105
[Partition age] middle: accuracy=0.938, number=2531
[Partition age] senior: accuracy=0.913, number=526
[Partition language] dutch_accuracy=0.900, non_dutch_accuracy=0.901, diff=0.000
[Partition language] dutch_accuracy=0.930, non_dutch_accuracy=0.933, diff=0.003
[Partition language] dutch_rate=0.052, non_dutch_rate=0.011, diff=0.040
[Partition language] dutch_rate=0.102, non_dutch_rate=0.038, diff=0.064
Accuracy after gender flip: 0.9003795066413662
Changed predictions: 0
Accuracy after gender flip: 0.93105629

In [None]:
# # ============================================================
# # Metamorphic Data Augmentation: Gender Flipping
# # ============================================================

# # Gender transformation map (same as above)
# gender_flip_map = {
#     1: 2,
#     2: 1,
#     3: 5,
#     5: 3
# }

# def flip_gender_all(val):
#     """Flip gender if possible, otherwise return original."""
#     return gender_flip_map.get(val, val)


# # --------------------------
# # Create flipped training set
# # --------------------------
# X_train_flipped = X_train.copy()
# X_train_flipped["Attribute9"] = X_train_flipped["Attribute9"].apply(flip_gender_all)

# # y_train does not change
# y_train_flipped = y_train.copy()

# # --------------------------
# # Augment the training data
# # --------------------------
# X_train_aug = pd.concat([X_train, X_train_flipped], ignore_index=True)
# y_train_aug = pd.concat([y_train, y_train_flipped], ignore_index=True)

# print("Original training size:", len(X_train))
# print("Augmented training size:", len(X_train_aug))

In [None]:
# clf_aug = DecisionTreeClassifier()
# clf_aug.fit(X_train_aug, y_train_aug)

In [None]:
# # Create a copy of test set
# X_test_flipped = X_test.copy()

# # Apply to Attribute9 (column 8)
# X_test_flipped['Attribute9'] = X_test_flipped['Attribute9'].apply(flip_gender_all)

# # Predict with flipped gender
# y_pred_original = clf_aug.predict(X_test)
# y_pred_flipped = clf_aug.predict(X_test_flipped)

# # Compare
# print("Accuracy after gender flip:", metrics.accuracy_score(y_test, y_pred_flipped))
# print("Changed predictions:", np.sum(y_pred_original != y_pred_flipped))