# ML Model Testing

Here several models are trained on the data and the predictions on noised test data are recorded

In [92]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, r2_score
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.experimental import enable_hist_gradient_boosting  # Required in older versions
from sklearn.ensemble import HistGradientBoostingRegressor
from tab_err.api import high_level
from tab_err import error_type, error_mechanism

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload




In [93]:
# Read in iris data
iris_data = datasets.load_iris()
iris = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)
iris["target"] = iris_data.target


# Read in california housing data
california_data = datasets.fetch_california_housing()
california = pd.DataFrame(california_data.data, columns=california_data.feature_names)
california["target"] = california_data.target


### Do CV and add errors just before the predict step

In [94]:
# Method that returns a list of the unaltered data accuracy and the altered data accuracy

def evaluate_on_dirty_data(data, error_rate, model, evaluation_function, n_splits=5, error_types_to_exclude = None, error_mechanisms_to_exclude = None, seed=None):

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    clean_acc = []
    dirty_acc = []

    for train_idx, test_idx in kf.split(data):
        # Split the data
        df_train, df_test = data.iloc[train_idx], data.iloc[test_idx]
        x_train, y_train = df_train.drop(columns=["target"]), df_train["target"]
        x_test, y_test = df_test.drop(columns=["target"]), df_test["target"]

        # Fit the model
        model.fit(x_train, y_train)  # Logistic regression

        # Predict on the clean test
        y_pred_clean = model.predict(x_test)
        acc_clean = evaluation_function(y_test, y_pred_clean)
        clean_acc.append(acc_clean)

        # Use high level api on x_test - perturb dataset
        x_test_perturbed, error_mask = high_level.create_errors(
            x_test,
            error_rate=error_rate,
            error_types_to_exclude=error_types_to_exclude,
            error_mechanisms_to_exclude=error_mechanisms_to_exclude,
            seed=seed
            )

        # Predict on the dirty test
        y_pred_dirty = model.predict(x_test_perturbed)
        acc_dirty = evaluation_function(y_test, y_pred_dirty)
        dirty_acc.append(acc_dirty)

    return clean_acc, dirty_acc

def print_acc(acc_list, result_name):
    print(
    "Accuracies of ", result_name, " test data: ",
    acc_list,
    f"Mean accuracy: { (sum(acc_list) / len(acc_list)):.4f}" if acc_list else "Mean accuracy: None"
    )

#### Models and accuracy eval on iris dataset

In [None]:
# Test on iris dataset -- try leaving out different error mechanisms
error_rate = 0.5
iris_model_1 = LogisticRegression(max_iter=200, random_state=42)
folds= 5

clean_acc, dirty_acc = evaluate_on_dirty_data(iris, error_rate=error_rate, evaluation_function=accuracy_score, model=iris_model_1, n_splits=folds, error_types_to_exclude=[error_type.MissingValue()], seed=42)
clean_acc, no_ear_acc = evaluate_on_dirty_data(iris, error_rate=error_rate, evaluation_function=accuracy_score, model=iris_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.EAR()], seed=42)
clean_acc, no_enar_acc = evaluate_on_dirty_data(iris, error_rate=error_rate, evaluation_function=accuracy_score, model=iris_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.ENAR()], seed=42)
clean_acc, no_ecar_acc = evaluate_on_dirty_data(iris, error_rate=error_rate, evaluation_function=accuracy_score, model=iris_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.ECAR()], seed=42)
clean_acc, no_enar_enar_acc = evaluate_on_dirty_data(iris, error_rate=error_rate, evaluation_function=accuracy_score, model=iris_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.EAR(), error_mechanism.ENAR()], seed=42)
clean_acc, no_enar_ecar_acc = evaluate_on_dirty_data(iris, error_rate=error_rate, evaluation_function=accuracy_score, model=iris_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.ENAR(), error_mechanism.ECAR()], seed=42)
clean_acc, no_enar_ecar_acc = evaluate_on_dirty_data(iris, error_rate=error_rate, evaluation_function=accuracy_score, model=iris_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.EAR(), error_mechanism.ECAR()], seed=42)

print(f"With a constant error rate of {error_rate}, we observe the following accuracies using {folds}-fold CV")
print_acc(clean_acc, "clean")
print_acc(dirty_acc, "all error mechanisms")
print_acc(no_ear_acc, "no EAR")
print_acc(no_enar_acc, "no ENAR")
print_acc(no_ecar_acc, "no ECAR")
print_acc(no_enar_enar_acc, "no EAR or ENAR")
print_acc(no_enar_ecar_acc, "no ECAR or ENAR")
print_acc(no_enar_ecar_acc, "no EAR or ECAR")

  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self

With a constant error rate of 0.5, we observe the following accuracies using 5-fold CV
Accuracies of  clean  test data:  [1.0, 1.0, 0.9333333333333333, 0.9666666666666667, 0.9666666666666667] Mean accuracy: 0.9733
Accuracies of  dirty  test data:  [0.7333333333333333, 0.8333333333333334, 0.7333333333333333, 0.6333333333333333, 0.6666666666666666] Mean accuracy: 0.7200
Accuracies of  no EAR  test data:  [0.8, 0.7666666666666667, 0.6666666666666666, 0.7333333333333333, 0.7333333333333333] Mean accuracy: 0.7400
Accuracies of  no ENAR  test data:  [0.8333333333333334, 0.8, 0.6333333333333333, 0.6333333333333333, 0.8] Mean accuracy: 0.7400
Accuracies of  no ECAR  test data:  [0.8666666666666667, 0.8, 0.7, 0.6666666666666666, 0.7333333333333333] Mean accuracy: 0.7533
Accuracies of  no EAR or ENAR  test data:  [0.8333333333333334, 0.7666666666666667, 0.7333333333333333, 0.7333333333333333, 0.7333333333333333] Mean accuracy: 0.7600
Accuracies of  no ECAR or ENAR  test data:  [0.666666666666666

  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)


#### Models and accuracy eval on California housing dataset

In [None]:
# Ridge regression
error_rate = 0.5
cali_model_1 = Ridge(max_iter=200, random_state=42)
folds= 5

clean_acc, dirty_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_1, n_splits=folds, error_types_to_exclude=[error_type.MissingValue()], seed=42)
clean_acc, no_ear_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.EAR()], seed=42)
clean_acc, no_enar_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.ENAR()], seed=42)
clean_acc, no_ecar_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.ECAR()], seed=42)
clean_acc, no_enar_enar_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.EAR(), error_mechanism.ENAR()], seed=42)
clean_acc, no_enar_ecar_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.ENAR(), error_mechanism.ECAR()], seed=42)
clean_acc, no_enar_ecar_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_1, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.EAR(), error_mechanism.ECAR()], seed=42)

print(f"With a constant error rate of {error_rate}, we observe the following accuracies using {folds}-fold CV")
print_acc(clean_acc, "clean")
print_acc(dirty_acc, "all error mechanisms")
print_acc(no_ear_acc, "no EAR")
print_acc(no_enar_acc, "no ENAR")
print_acc(no_ecar_acc, "no ECAR")
print_acc(no_enar_enar_acc, "no EAR or ENAR")
print_acc(no_enar_ecar_acc, "no ECAR or ENAR")
print_acc(no_enar_ecar_acc, "no EAR or ECAR")

  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self

With a constant error rate of 0.5, we observe the following accuracies using 5-fold CV
Accuracies of  clean  test data:  [0.5758549611440127, 0.6137452779808752, 0.6085441736956012, 0.6212426773157897, 0.5875365240278356] Mean accuracy: 0.6014
Accuracies of  dirty  test data:  [-24012.138278902028, -23800.625597317212, -24236.513399196407, -23919.731876125996, -23344.745891297684] Mean accuracy: -23862.7510
Accuracies of  no EAR  test data:  [-28534.83008604784, -27555.971249304297, -29174.867343406902, -28984.68410083725, -27168.690131348572] Mean accuracy: -28283.8086
Accuracies of  no ENAR  test data:  [-17354.495741492407, -16621.30962328753, -17326.452923542463, -17498.421149450372, -16307.481913758482] Mean accuracy: -17021.6323
Accuracies of  no ECAR  test data:  [-22826.686786637056, -22378.016347388122, -23593.57986257111, -22915.38730840147, -22057.57088501962] Mean accuracy: -22754.2482
Accuracies of  no EAR or ENAR  test data:  [-15723.74380132568, -15100.05445181844, -1569

  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)


In [None]:
# Ridge regression
error_rate = 0.5
cali_model_2 = HistGradientBoostingRegressor(max_iter=200, random_state=42)
folds= 5

clean_acc, dirty_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_2, n_splits=folds, error_types_to_exclude=[error_type.MissingValue()], seed=42)
clean_acc, no_ear_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_2, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.EAR()], seed=42)
clean_acc, no_enar_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_2, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.ENAR()], seed=42)
clean_acc, no_ecar_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_2, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.ECAR()], seed=42)
clean_acc, no_enar_enar_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_2, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.EAR(), error_mechanism.ENAR()], seed=42)
clean_acc, no_enar_ecar_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_2, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.ENAR(), error_mechanism.ECAR()], seed=42)
clean_acc, no_enar_ecar_acc = evaluate_on_dirty_data(california, error_rate=error_rate, evaluation_function=r2_score, model=cali_model_2, error_types_to_exclude=[error_type.MissingValue()], error_mechanisms_to_exclude=[error_mechanism.EAR(), error_mechanism.ECAR()], seed=42)

print(f"With a constant error rate of {error_rate}, we observe the following accuracies using {folds}-fold CV")
print_acc(clean_acc, "clean")
print_acc(dirty_acc, "all error mechanisms")
print_acc(no_ear_acc, "no EAR")
print_acc(no_enar_acc, "no ENAR")
print_acc(no_ecar_acc, "no ECAR")
print_acc(no_enar_enar_acc, "no EAR or ENAR")
print_acc(no_enar_ecar_acc, "no ECAR or ENAR")
print_acc(no_enar_ecar_acc, "no EAR or ECAR")

  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self

With a constant error rate of 0.5, we observe the following accuracies using 5-fold CV
Accuracies of  clean  test data:  [0.8422041369921434, 0.8491206511667679, 0.8332179758206206, 0.8557720826558177, 0.8392724090642296] Mean accuracy: 0.8439
Accuracies of  dirty  test data:  [-0.5597232198790107, -0.4147506666903509, -0.43900616986344865, -0.3813742206293833, -0.40755861641784485] Mean accuracy: -0.4405
Accuracies of  no EAR  test data:  [-0.6633132940150168, -0.5897910979063063, -0.6338385268010025, -0.5353534039864964, -0.5549726248679576] Mean accuracy: -0.5955
Accuracies of  no ENAR  test data:  [-0.33837434087841145, -0.328112150074253, -0.2614584293325657, -0.14632619727224094, -0.20354099537457504] Mean accuracy: -0.2556
Accuracies of  no ECAR  test data:  [-0.5043541811379451, -0.40275398022050135, -0.45377174181538726, -0.3426877010111393, -0.39653884330536937] Mean accuracy: -0.4200
Accuracies of  no EAR or ENAR  test data:  [-0.38057676015870023, -0.27072058514272324, -0.2

  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
