In [1]:
import json
import numpy as np
import pandas as pd

import anonypy

# Load Datasets

In [2]:
with open('datasets/datasets.json') as f:
    meta_datasets = json.load(f)

dataset_o = {}
dataset_k = {}
dataset_l = {}
dataset_t = {}

for d in meta_datasets['datasets']:
    df = pd.read_csv(d['path'], delimiter=';')
    for name in d['categorical']:
        df[name] = df[name].astype('category')
    dataset_o[d['name']] = df
    
    p = anonypy.Preserver(df, d['feature columns'], d['sensitive column'])

    # K-anonymity
    rows = p.anonymize_k_anonymity(k=5)
    dataset_k[d['name']] = pd.DataFrame(rows)

    # L-diversity
    unique = df[d['sensitive column']].nunique()
    rows = p.anonymize_l_diversity(k=5, l=min(unique, 3))
    dataset_l[d['name']] = pd.DataFrame(rows)

    # T-closeness
    rows = p.anonymize_t_closeness(k=5, p=0.2)
    dataset_t[d['name']] = pd.DataFrame(rows)

# Utils Functions

In [3]:
def pre_processing_raw(meta, datasets, max_categories=5):
    # load dataset from dict
    dataset = datasets[meta['name']]

    # Remove ID if in dataframe (id does not provide any info)
    filterID = dataset.filter(['ID'])
    dataset.drop(filterID, inplace=True, axis=1)

    # Define categorical fields
    #categorical = set(meta['categorical'])
    #if meta['sensitive column'] in  categorical:
    #    categorical.remove(meta['sensitive column'])
    #categorical = list(categorical)
    categorical = set(dataset.select_dtypes(['category']).columns)
    if meta['sensitive column'] in  categorical:
        categorical.remove(meta['sensitive column'])
    categorical = list(categorical)

    # Define numerical fields
    numerical = set(dataset.columns)
    numerical = numerical.difference(categorical)
    if meta['sensitive column'] in  numerical:
        numerical.remove(meta['sensitive column'])
    numerical = list(numerical)

    # Apply min max scaling on numerical
    if len(numerical) > 0:
        scaler = MinMaxScaler()
        dataset[numerical] = scaler.fit_transform(dataset[numerical])

    # Apply One-hot Encoding in categorial features
    transformer = make_column_transformer((OneHotEncoder(max_categories=max_categories,
        handle_unknown='ignore', sparse_output=False), categorical))
    # Transforming
    transformed = transformer.fit_transform(dataset)
    # Transformating back
    transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
    # One-hot encoding removed an index. Let's put it back:
    transformed_df.index = dataset.index
    # Joining tables
    dataset = pd.concat([dataset, transformed_df], axis=1)
    # Dropping old categorical columns
    dataset.drop(categorical, axis=1, inplace=True)

    # Apply label encoding in target feature
    label_encoder = LabelEncoder() 
    dataset[meta['sensitive column']] = label_encoder.fit_transform(dataset[meta['sensitive column']])

    # Split Dataset
    y = dataset[meta['sensitive column']]
    X = dataset.drop(meta['sensitive column'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    return X_train, X_test, y_train, y_test


In [4]:
def pre_processing_anonymity(meta, datasets, max_categories=5):
    # load dataset from dict
    dataset = datasets[meta['name']]

    # Remove ID if in dataframe (id does not provide any info)
    filterID = dataset.filter(['ID'])
    dataset.drop(filterID, inplace=True, axis=1)

    objects = set(dataset.select_dtypes(['object']).columns)
    lists = [c for c in objects if isinstance(dataset.iloc[0][c], list)]

    categorical = set(dataset.select_dtypes(['object']).columns)
    if meta['sensitive column'] in  categorical:
        categorical.remove(meta['sensitive column'])
    categorical = list(categorical)

    numerical = set(dataset.columns)
    numerical = numerical.difference(categorical)
    if meta['sensitive column'] in  numerical:
        numerical.remove(meta['sensitive column'])
    numerical = list(numerical)

    # FIX list cell into string
    for column in objects:
        dataset[column] = [','.join(map(str, l)) for l in dataset[column]]

    # Apply min max scaling on numerical
    if len(numerical) > 0:
        scaler = MinMaxScaler()
        dataset[numerical] = scaler.fit_transform(dataset[numerical])

    # Apply One-hot Encoding in categorial features
    transformer = make_column_transformer((OneHotEncoder(max_categories=7, handle_unknown='ignore', sparse_output=False), categorical))
    # Transforming
    transformed = transformer.fit_transform(dataset)
    # Transformating back
    transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
    # One-hot encoding removed an index. Let's put it back:
    transformed_df.index = dataset.index
    # Joining tables
    dataset = pd.concat([dataset, transformed_df], axis=1)
    # Dropping old categorical columns
    dataset.drop(categorical, axis=1, inplace=True)

    # Apply label encoding in target feature
    label_encoder = LabelEncoder() 
    dataset[meta['sensitive column']] = label_encoder.fit_transform(dataset[meta['sensitive column']])

    # Fix y class distribution
    threshold = 2
    value_counts = dataset[meta['sensitive column']].value_counts()
    to_remove = value_counts[value_counts <= threshold].index
    dataset[meta['sensitive column']] = dataset[meta['sensitive column']].replace(to_remove, np.nan)
    mode = dataset[meta['sensitive column']].mode()[0]
    dataset[meta['sensitive column']] = dataset[meta['sensitive column']].fillna(mode)

    # Split Dataset
    y = dataset[meta['sensitive column']]
    X = dataset.drop(meta['sensitive column'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    return X_train, X_test, y_train, y_test

# HyperParameter Training

In [5]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb


from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, matthews_corrcoef


optuna.logging.set_verbosity(optuna.logging.WARNING)


def lr_objective(trial, X_train, y_train):
    C = trial.suggest_categorical('C', [0.001, 0.01, 0.1, 1, 10])
    l1_ratio = trial.suggest_categorical('l1_ratio', [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0])
    
    clf = LogisticRegression(random_state=42, solver='saga', 
    penalty='elasticnet', max_iter=5000, C=C, l1_ratio=l1_ratio)
    score = cross_val_score(clf, X_train, y_train, n_jobs=-1, 
    cv=3, scoring=make_scorer(matthews_corrcoef))
    return score.mean()


def train_lr(X_train, X_test, y_train, y_test):
    # Running the Optuna optimization
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: lr_objective(trial, X_train, y_train), n_trials=100)

    # Get the best model and hyperparameters
    best_params_optuna = study.best_params

    # Train with the best parameters from Optuna
    best_clf = LogisticRegression(random_state=42, solver='saga', 
    penalty='elasticnet', max_iter=5000, **best_params_optuna)
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)
    mcc = matthews_corrcoef(y_test, y_pred)
    
    return mcc


def knn_objective(trial, X_train, y_train):
    n_neighbors = trial.suggest_int('n_neighbors', 1, 31, step=2)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan'])

    clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=metric)
    score = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=3, scoring=make_scorer(matthews_corrcoef))
    return score.mean()


def train_knn(X_train, X_test, y_train, y_test):
    # Running the Optuna optimization
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: knn_objective(trial, X_train, y_train), n_trials=100)

    # Get the best model and hyperparameters
    best_params_optuna = study.best_params
    
    # Train with the best parameters from Optuna
    best_clf = KNeighborsClassifier(**best_params_optuna)
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)
    mcc = matthews_corrcoef(y_test, y_pred)
    
    return mcc


def mlp_objective(trial, X_train, y_train):
    n_layers = trial.suggest_int('n_layers', 1, 3)
    layers = []
    for i in range(n_layers):
        layers.append(trial.suggest_int(f'n_units_{i}', 10, 100, step=10))
    alpha = trial.suggest_categorical('alpha', [0.0001, 0.001, 0.01, 0.1, 1])
    learning_rate_init = trial.suggest_categorical('learning_rate_init', [0.0001, 0.001, 0.01])

    clf = MLPClassifier(random_state=42, max_iter=10000, 
    hidden_layer_sizes=tuple(layers), alpha=alpha, learning_rate_init=learning_rate_init)
    score = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=3, scoring=make_scorer(matthews_corrcoef))
    return score.mean()


def train_mlp(X_train, X_test, y_train, y_test):
    # Running the Optuna optimization
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: mlp_objective(trial, X_train, y_train), n_trials=100)

    # Get the best model and hyperparameters
    best_params_optuna = study.best_params
    
    # Train with the best parameters from Optuna
    alpha = best_params_optuna['alpha']
    learning_rate_init = best_params_optuna['learning_rate_init']
    layers = []
    for i in range(best_params_optuna['n_layers']):
        layers.append(best_params_optuna[f'n_units_{i}'])
    best_clf = MLPClassifier(random_state=42, max_iter=10000, 
    hidden_layer_sizes=tuple(layers), alpha=alpha, learning_rate_init=learning_rate_init)
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)
    mcc = matthews_corrcoef(y_test, y_pred)
    
    return mcc


def rf_objective(trial, X_train, y_train):
    n_estimators = trial.suggest_int('n_estimators', 100, 500, step=25)
    max_depth = trial.suggest_int('max_depth', 5, 30, step=5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    clf = RandomForestClassifier(random_state=42, n_estimators=n_estimators, max_depth=max_depth, max_features=max_features)
    score = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=3, scoring=make_scorer(matthews_corrcoef))
    return score.mean()


def train_rf(X_train, X_test, y_train, y_test):
    # Running the Optuna optimization
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: rf_objective(trial, X_train, y_train), n_trials=100)

    # Get the best model and hyperparameters
    best_params_optuna = study.best_params
    
    # Train with the best parameters from Optuna
    best_clf = RandomForestClassifier(random_state=42, **best_params_optuna)
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)
    mcc = matthews_corrcoef(y_test, y_pred)
    
    return mcc


def xgb_objective(trial, X_train, y_train):
    n_estimators = trial.suggest_int('n_estimators', 100, 500, step=25)
    max_depth = trial.suggest_int('max_depth', 5, 30, step=5)
    learning_rate = trial.suggest_categorical('learning_rate', [0.001, 0.01, 0.1])

    if len(np.unique(y_train)) > 2:
        eval_metric = 'merror'
        objective = 'multi:softprob'
    else:
        eval_metric = 'error'
        objective = 'binary:hinge'

    clf = xgb.XGBClassifier(random_state=42, tree_method='hist',
    eval_metric=eval_metric, verbosity = 0, n_estimators=n_estimators, 
    max_depth=max_depth, learning_rate=learning_rate)
    score = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=3, scoring=make_scorer(matthews_corrcoef))
    return score.mean()


def train_xgb(X_train, X_test, y_train, y_test):
    # Running the Optuna optimization
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: xgb_objective(trial, X_train, y_train), n_trials=100)

    # Get the best model and hyperparameters
    best_params_optuna = study.best_params
    
    # Train with the best parameters from Optuna
    best_clf = xgb.XGBClassifier(random_state=42, tree_method='hist', 
    verbosity = 0, **best_params_optuna)
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)
    mcc = matthews_corrcoef(y_test, y_pred)
    
    return mcc

# Baseline

In [6]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.compose import make_column_transformer

for d in meta_datasets['datasets']:
    X_train, X_test, y_train, y_test = pre_processing_raw(d, dataset_o)

    mcc = train_lr(X_train, X_test, y_train, y_test)
    print(f'{d['name']} LR: {mcc}')

    mcc = train_knn(X_train, X_test, y_train, y_test)
    print(f'{d['name']} KNN: {mcc}')

    mcc = train_mlp(X_train, X_test, y_train, y_test)
    print(f'{d['name']} MLP: {mcc}')

    mcc = train_rf(X_train, X_test, y_train, y_test)
    print(f'{d['name']} RF: {mcc}')

    mcc = train_xgb(X_train, X_test, y_train, y_test)
    print(f'{d['name']} XGB: {mcc}')

The Adult Dataset LR: 0.4632863497480889
The Adult Dataset KNN: 0.49197616146819856
The Adult Dataset MLP: 0.509842523974503
The Adult Dataset RF: 0.49999255400108633
The Adult Dataset XGB: 0.5083101421321792
California Housing Prices LR: 0.7022925589661282
California Housing Prices KNN: 0.8143199784457996
California Housing Prices MLP: 0.9233934879922826
California Housing Prices RF: 0.9760151309088261
California Housing Prices XGB: 0.9745188230956336
Contraceptive Method Choice LR: 0.21222540748684532
Contraceptive Method Choice KNN: 0.19722624565837857
Contraceptive Method Choice MLP: 0.2778350610817005
Contraceptive Method Choice RF: 0.2582613029328623
Contraceptive Method Choice XGB: 0.2899781331132844
The Mammographic Mass LR: 0.7130399572296933
The Mammographic Mass KNN: 0.7106753812636165
The Mammographic Mass MLP: 0.7118373275236021
The Mammographic Mass RF: 0.7003269622932883
The Mammographic Mass XGB: 0.6762078179487477


# k-anonymity

In [7]:
for d in meta_datasets['datasets']:
    X_train, X_test, y_train, y_test = pre_processing_anonymity(d, dataset_k)

    mcc = train_lr(X_train, X_test, y_train, y_test)
    print(f'{d['name']} LR: {mcc}')

    mcc = train_knn(X_train, X_test, y_train, y_test)
    print(f'{d['name']} KNN: {mcc}')

    mcc = train_mlp(X_train, X_test, y_train, y_test)
    print(f'{d['name']} MLP: {mcc}')
    
    mcc = train_rf(X_train, X_test, y_train, y_test)
    print(f'{d['name']} RF: {mcc}')

    mcc = train_xgb(X_train, X_test, y_train, y_test)
    print(f'{d['name']} XGB: {mcc}')

Categorical ['education', 'marital-status', 'age', 'native-country', 'workclass', 'race', 'sex', 'occupation']
Numerical []
The Adult Dataset LR: 0.4178016139218438
The Adult Dataset KNN: 0.43322415184053126
The Adult Dataset MLP: 0.43295118396082094
The Adult Dataset RF: 0.4321073889938239
The Adult Dataset XGB: 0.4464555940182314
Categorical ['median_income', 'longitude', 'housing_median_age', 'median_house_value', 'latitude']
Numerical ['mean_bedrooms', 'households', 'population', 'mean_rooms']
California Housing Prices LR: 0.19407010718577372
California Housing Prices KNN: 0.18951229229105493
California Housing Prices MLP: 0.19854954700208877
California Housing Prices RF: 0.2535213725115397




California Housing Prices XGB: 0.2510697101649099
Categorical ['age', 'children', 'Weducation']
Numerical ['Heducation', 'solindex', 'religion', 'working', 'exposure', 'occupation']
Contraceptive Method Choice LR: 0.20736389680117945
Contraceptive Method Choice KNN: 0.20126691397046004
Contraceptive Method Choice MLP: 0.1892432032071628
Contraceptive Method Choice RF: 0.24583349821535896
Contraceptive Method Choice XGB: 0.2511683345835261
Categorical ['age', 'bi_rads_assessment', 'density', 'margin', 'shape']
Numerical []
The Mammographic Mass LR: 0.6795060570120499
The Mammographic Mass KNN: 0.6262259766564466
The Mammographic Mass MLP: 0.6628348160686296
The Mammographic Mass RF: 0.7376223909876124
The Mammographic Mass XGB: 0.6744642653455278


# l-diversity

In [8]:
for d in meta_datasets['datasets']:
    X_train, X_test, y_train, y_test = pre_processing_anonymity(d, dataset_l)
    
    mcc = train_lr(X_train, X_test, y_train, y_test)
    print(f'{d['name']} LR: {mcc}')

    mcc = train_knn(X_train, X_test, y_train, y_test)
    print(f'{d['name']} KNN: {mcc}')

    mcc = train_mlp(X_train, X_test, y_train, y_test)
    print(f'{d['name']} MLP: {mcc}')
    
    mcc = train_rf(X_train, X_test, y_train, y_test)
    print(f'{d['name']} RF: {mcc}')

    mcc = train_xgb(X_train, X_test, y_train, y_test)
    print(f'{d['name']} XGB: {mcc}')

Categorical ['education', 'marital-status', 'age', 'native-country', 'workclass', 'race', 'sex', 'occupation']
Numerical []
The Adult Dataset LR: 0.4050408056957064
The Adult Dataset KNN: 0.4097687313676189
The Adult Dataset MLP: 0.4084350863242584
The Adult Dataset RF: 0.4134880801896997
The Adult Dataset XGB: 0.4170277478293139
Categorical ['median_income', 'longitude', 'housing_median_age', 'median_house_value', 'latitude']
Numerical ['mean_bedrooms', 'households', 'population', 'mean_rooms']
California Housing Prices LR: 0.27374683171897035
California Housing Prices KNN: 0.2513540179423702
California Housing Prices MLP: 0.28240554964302966
California Housing Prices RF: 0.3300490786778223
California Housing Prices XGB: 0.3209542402103175
Categorical ['age', 'children', 'Weducation']
Numerical ['Heducation', 'solindex', 'religion', 'working', 'exposure', 'occupation']
Contraceptive Method Choice LR: 0.2624760351057159
Contraceptive Method Choice KNN: 0.26302700481576236
Contraceptive

# t-closeness

In [9]:
for d in meta_datasets['datasets']:
    X_train, X_test, y_train, y_test = pre_processing_anonymity(d, dataset_t)
    
    mcc = train_lr(X_train, X_test, y_train, y_test)
    print(f'{d['name']} LR: {mcc}')

    mcc = train_knn(X_train, X_test, y_train, y_test)
    print(f'{d['name']} KNN: {mcc}')

    mcc = train_mlp(X_train, X_test, y_train, y_test)
    print(f'{d['name']} MLP: {mcc}')
    
    mcc = train_rf(X_train, X_test, y_train, y_test)
    print(f'{d['name']} RF: {mcc}')

    mcc = train_xgb(X_train, X_test, y_train, y_test)
    print(f'{d['name']} XGB: {mcc}')

Categorical ['education', 'marital-status', 'age', 'native-country', 'workclass', 'race', 'sex', 'occupation']
Numerical []
The Adult Dataset LR: 0.3761728626277607
The Adult Dataset KNN: 0.41064171459351484
The Adult Dataset MLP: 0.39626494897130515
The Adult Dataset RF: 0.39751978000683424
The Adult Dataset XGB: 0.39770116889923746
Categorical ['median_income', 'longitude', 'housing_median_age', 'median_house_value', 'latitude']
Numerical ['mean_bedrooms', 'households', 'population', 'mean_rooms']
California Housing Prices LR: 0.3123444285515015
California Housing Prices KNN: 0.3175393695046432
California Housing Prices MLP: 0.33740247622179964
California Housing Prices RF: 0.3496349148069025
California Housing Prices XGB: 0.35981043508793986
Categorical ['age', 'children', 'Weducation']
Numerical ['Heducation', 'solindex', 'religion', 'working', 'exposure', 'occupation']
Contraceptive Method Choice LR: 0.26008312132637684
Contraceptive Method Choice KNN: 0.21133917067125352
Contrace