# Training a bad and a good model

- Both models are random forest classifiers with identical hyperparameters, so that the only difference between them is the data (features) used for training.

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

In [2]:
import pandas as pd

In [8]:
# The CSV path is relative, so the file must sit in the notebook's working directory.
csv_path = "investigation_train_large_checked.csv"
df = pd.read_csv(csv_path)

Sensitive and proxy features that must not be used by the good model

In [9]:
# Columns removed from the feature matrix of the "good" model, grouped by the
# reason they are excluded. The groups are concatenated in the order listed.

# Direct sensitive attributes
_DIRECT_SENSITIVE = [
    "persoon_geslacht_vrouw",
    "persoon_leeftijd_bij_onderzoek",
]

# Strong proxies (address-derived columns)
_ADDRESS_PROXIES = [
    "adres_recentste_wijk_charlois",
    "adres_recentste_wijk_delfshaven",
    "adres_recentste_wijk_feijenoord",
    "adres_recentste_wijk_ijsselmonde",
    "adres_recentste_wijk_kralingen_c",
    "adres_recentste_wijk_noord",
    "adres_recentste_wijk_other",
    "adres_recentste_wijk_prins_alexa",
    "adres_recentste_wijk_stadscentru",
    "adres_recentste_buurt_groot_ijsselmonde",
    "adres_recentste_buurt_nieuwe_westen",
    "adres_recentste_buurt_other",
    "adres_recentste_buurt_oude_noorden",
    "adres_recentste_buurt_vreewijk",
    "adres_recentste_plaats_other",
    "adres_recentste_plaats_rotterdam",
    "adres_aantal_verschillende_wijken",
    "adres_dagen_op_adres",
    "adres_unieke_wijk_ratio",
]

# Household & children (also holds the partner and other-relation columns)
_HOUSEHOLD_AND_CHILDREN = [
    "relatie_kind_basisschool_kind",
    "relatie_kind_heeft_kinderen",
    "relatie_kind_huidige_aantal",
    "relatie_kind_jongvolwassen",
    "relatie_kind_leeftijd_verschil_ouder_eerste_kind",
    "relatie_kind_tiener",
    "relatie_kind_volwassen",
    "relatie_overig_actueel_vorm__gemachtigde",
    "relatie_overig_actueel_vorm__kostendeler",
    "relatie_overig_actueel_vorm__onderhoudsplichtige",
    "relatie_overig_actueel_vorm__ouders_verzorgers",
    "relatie_overig_actueel_vorm_other",
    "relatie_overig_bewindvoerder",
    "relatie_overig_historie_vorm__andere_inwonende",
    "relatie_overig_historie_vorm__gemachtigde",
    "relatie_overig_historie_vorm__kostendeler",
    "relatie_overig_historie_vorm__onderhoudsplichtige",
    "relatie_overig_kostendeler",
    "relatie_partner_aantal_partner___partner__gehuwd_",
    "relatie_partner_aantal_partner___partner__ongehuwd_",
    "relatie_partner_huidige_partner___partner__gehuwd_",
    "relatie_partner_totaal_dagen_partner",
]

# Language & integration
_LANGUAGE_AND_INTEGRATION = [
    "persoonlijke_eigenschappen_nl_begrijpen3",
    "persoonlijke_eigenschappen_nl_lezen3",
    "persoonlijke_eigenschappen_nl_lezen4",
    "persoonlijke_eigenschappen_nl_schrijven0",
    "persoonlijke_eigenschappen_nl_schrijven1",
    "persoonlijke_eigenschappen_nl_schrijven2",
    "persoonlijke_eigenschappen_nl_schrijven3",
    "persoonlijke_eigenschappen_nl_schrijvenfalse",
    "persoonlijke_eigenschappen_nl_spreken1",
    "persoonlijke_eigenschappen_nl_spreken2",
    "persoonlijke_eigenschappen_nl_spreken3",
]

SENSITIVE_AND_PROXY_FEATURES = (
    _DIRECT_SENSITIVE
    + _ADDRESS_PROXIES
    + _HOUSEHOLD_AND_CHILDREN
    + _LANGUAGE_AND_INTEGRATION
)

Create training and test sets for both the bad and the good model. The good model does not use the sensitive and proxy features.

In [10]:
# The target column and the two columns dropped alongside it are never features.
non_feature_columns = ["checked", "Ja", "Nee"]
all_features = df.drop(columns=non_feature_columns)
target = df["checked"]

# Bad model: every remaining column is a feature.
X_bad, y_bad = all_features, target

# Good model: same target, but without the sensitive/proxy columns.
X_good = all_features.drop(columns=SENSITIVE_AND_PROXY_FEATURES)
y_good = target

# Both feature sets are split with the same size, seed and stratification.
split_options = dict(test_size=0.2, random_state=42, stratify=target)

X_train_bad, X_test_bad, y_train_bad, y_test_bad = train_test_split(
    X_bad, y_bad, **split_options
)
X_train_good, X_test_good, y_train_good, y_test_good = train_test_split(
    X_good, y_good, **split_options
)

Train bad random forest classifier

In [11]:
# Hyperparameters of the "bad" forest, which is fitted on X_train_bad.
bad_rf_settings = {
    "n_estimators": 1000,
    "max_depth": None,
    "min_samples_leaf": 5,
    "class_weight": None,
    "random_state": 42,
    "n_jobs": -1,
}
bad_rf = RandomForestClassifier(**bad_rf_settings)

# Last expression of the cell, so the fitted estimator is displayed.
bad_rf.fit(X_train_bad, y_train_bad)

Train good random forest classifier

In [12]:
# Hyperparameters of the "good" forest, which is fitted on X_train_good.
good_rf_settings = {
    "n_estimators": 1000,
    "max_depth": None,
    "min_samples_leaf": 5,
    "class_weight": None,
    "random_state": 42,
    "n_jobs": -1,
}
good_rf = RandomForestClassifier(**good_rf_settings)

# Last expression of the cell, so the fitted estimator is displayed.
good_rf.fit(X_train_good, y_train_good)

In [13]:
# Hard class predictions, plus column 1 of predict_proba as the score used for AUC.
y_pred_bad = bad_rf.predict(X_test_bad)
y_proba_bad = bad_rf.predict_proba(X_test_bad)[:, 1]

accuracy_bad = accuracy_score(y_test_bad, y_pred_bad)
auc_bad = roc_auc_score(y_test_bad, y_proba_bad)

print("Accuracy:", accuracy_bad)
print("AUC:", auc_bad)

Accuracy: 0.8774615384615385
AUC: 0.9555934581117974


In [14]:
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, classification_report

# Imbalance-aware metrics for the bad model on its test split.
auc_bad = roc_auc_score(y_test_bad, y_proba_bad)
balanced_accuracy_bad = balanced_accuracy_score(y_test_bad, y_pred_bad)
report_bad = classification_report(y_test_bad, y_pred_bad)

print("AUC:", auc_bad)
print("Balanced accuracy:", balanced_accuracy_bad)
print(report_bad)

AUC: 0.9555934581117974
Balanced accuracy: 0.5933319173040605
              precision    recall  f1-score   support

       False       0.87      1.00      0.93     22099
        True       0.98      0.19      0.31      3901

    accuracy                           0.88     26000
   macro avg       0.93      0.59      0.62     26000
weighted avg       0.89      0.88      0.84     26000



In [15]:
# Hard class predictions, plus column 1 of predict_proba as the score used for AUC.
y_pred_good = good_rf.predict(X_test_good)
y_proba_good = good_rf.predict_proba(X_test_good)[:, 1]

accuracy_good = accuracy_score(y_test_good, y_pred_good)
auc_good = roc_auc_score(y_test_good, y_proba_good)

print("Accuracy:", accuracy_good)
print("AUC:", auc_good)

Accuracy: 0.8706923076923077
AUC: 0.8744379522416424


In [16]:
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, classification_report

# Imbalance-aware metrics for the good model on its test split.
auc_good = roc_auc_score(y_test_good, y_proba_good)
balanced_accuracy_good = balanced_accuracy_score(y_test_good, y_pred_good)
report_good = classification_report(y_test_good, y_pred_good)

print("AUC:", auc_good)
print("Balanced accuracy:", balanced_accuracy_good)
print(report_good)

AUC: 0.8744379522416424
Balanced accuracy: 0.5746788307223539
              precision    recall  f1-score   support

       False       0.87      1.00      0.93     22099
        True       0.92      0.15      0.26      3901

    accuracy                           0.87     26000
   macro avg       0.89      0.57      0.59     26000
weighted avg       0.88      0.87      0.83     26000



### Export to ONNX

In [17]:
!pip install skl2onnx onnx onnxruntime

Collecting skl2onnx
  Downloading skl2onnx-1.19.1-py3-none-any.whl.metadata (3.8 kB)
Collecting onnx
  Downloading onnx-1.20.1-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading skl2onnx-1.19.1-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnx-1.20.1-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.5/17.5 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnx

In [18]:
import numpy as np
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [21]:
# Declare one float input tensor named "float_input": the batch dimension is left
# dynamic (None) and the width is the number of columns the good model was fitted on.
initial_type = [
    ("float_input", FloatTensorType([None, X_train_good.shape[1]]))
]

# zipmap is disabled for the classifier; the exported model then reports its
# probabilities output as a plain [None, 2] float tensor.
onnx_good = convert_sklearn(
    good_rf,
    initial_types=initial_type,
    options={RandomForestClassifier: {"zipmap": False}}
)

# "wb" mode creates the file if it is missing and overwrites it otherwise.
with open("rf_good.onnx", "wb") as f:
    f.write(onnx_good.SerializeToString())

In [22]:
# Declare one float input tensor named "float_input": the batch dimension is left
# dynamic (None) and the width is the number of columns the bad model was fitted on.
initial_type = [
    ("float_input", FloatTensorType([None, X_train_bad.shape[1]]))
]

# zipmap is disabled for the classifier, matching the export of the good model.
onnx_bad = convert_sklearn(
    bad_rf,
    initial_types=initial_type,
    options={RandomForestClassifier: {"zipmap": False}}
)

# "wb" mode creates the file if it is missing and overwrites it otherwise.
with open("rf_bad.onnx", "wb") as f:
    f.write(onnx_bad.SerializeToString())


In [28]:
# google.colab is only importable inside a Colab runtime.
from google.colab import files

# Download both exported models, good model first.
for onnx_path in ("rf_good.onnx", "rf_bad.onnx"):
    files.download(onnx_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Test the exported ONNX model

In [24]:
# This is the first cell that uses onnxruntime, so it imports it itself; relying on
# an import from a later cell raises NameError after Restart & Run All.
import onnxruntime as rt

sess = rt.InferenceSession("rf_good.onnx", providers=["CPUExecutionProvider"])

# Print name, shape and element type of every model input ...
for inp in sess.get_inputs():
    print(inp.name, inp.shape, inp.type)

# ... and of every model output.
for out in sess.get_outputs():
    print(out.name, out.shape, out.type)

float_input [None, 261] tensor(float)
label [None] tensor(int64)
probabilities [None, 2] tensor(float)


In [25]:
import onnxruntime as rt

sess = rt.InferenceSession("rf_good.onnx", providers=["CPUExecutionProvider"])

# Read the tensor names from the model instead of hard-coding them, so the cell
# keeps working if the exported input/output names change.
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
proba_name = sess.get_outputs()[1].name

# The exported model declares a float tensor input, so cast the test frame to float32.
X_test_np = X_test_good.to_numpy(dtype=np.float32)

# Request the outputs by name so the unpacking order (labels, probabilities) is explicit.
pred_onnx, proba_onnx = sess.run(
    [label_name, proba_name],
    {input_name: X_test_np}
)


In [27]:
# Reference probabilities from the scikit-learn model on the same test rows.
sklearn_proba_good = good_rf.predict_proba(X_test_good)

# Displays True when the ONNX probabilities match within atol=1e-6
# (numpy's default rtol also applies).
np.allclose(sklearn_proba_good, proba_onnx, atol=1e-6)


True