In [1]:
import pandas as pd
import numpy as np
from anjana.anonymity import k_anonymity_inner, k_anonymity, l_diversity, t_closeness, alpha_k_anonymity
from masking_effects_on_xai_techniques.hierarchy import generate_hierarchy

In [2]:
path = "../data/adult/"

In [3]:
data = pd.read_csv(path + "data.csv")
len_data = len(data)

# Clean the data
data['income'] = data['income'].str.rstrip('.')
data.dropna(inplace=True)
data.drop(columns=['education-num', 'fnlwgt'], inplace=True)

len_clean_data = len(data)
print(f"Dropped {len_data - len_clean_data} rows")
data.to_csv(path + "clean.csv", index=False)

Dropped 1221 rows


In [24]:
hierarchies = {
    "age": dict(pd.read_csv("../hierarchies/age.csv", header=None)),
    "education": dict(pd.read_csv("../hierarchies/education.csv", header=None)),
    "marital-status": dict(pd.read_csv("../hierarchies/marital.csv", header=None)),
    "occupation": dict(pd.read_csv("../hierarchies/occupation.csv", header=None)),
    "sex": dict(pd.read_csv("../hierarchies/sex.csv", header=None)),
    "native-country": dict(pd.read_csv("../hierarchies/country.csv", header=None)),
    "workclass": dict(pd.read_csv("../hierarchies/workclass.csv", header=None)),
    "relationship": dict(pd.read_csv("../hierarchies/relationship.csv", header=None)),
    "race": dict(pd.read_csv("../hierarchies/race.csv", header=None)),
    "capital-gain": generate_hierarchy(data['capital-gain'], 8),
    "capital-loss": generate_hierarchy(data['capital-loss'], 6),
    "hours-per-week": generate_hierarchy(data['hours-per-week'], 5),
}

In [28]:
len(data['hours-per-week'].unique())

96

In [5]:
# Check with mina what should be identified as quai identifiers
quasi_ident = list(data.columns)
quasi_ident.remove('race')
quasi_ident.remove('income')

#quasi_ident.remove('capital-gain')
#quasi_ident.remove('capital-loss')
#quasi_ident.remove('hours-per-week')

ident = ['race'] # Making race quasi identifier make k much larger
sens_att = 'income'

In [6]:
all_counts_and_features = []
for feat, items in hierarchies.items():
    for item in items.values():
        length = len(item.unique())
        all_counts_and_features.append((length, feat))
sorted_data = sorted(
    all_counts_and_features, 
    key=lambda x: x[0], 
    reverse=True
)
ordered_features = [feat for count, feat in sorted_data]

for feat, items in hierarchies.items():
    lengths = [f"{len(item.unique()):>4}" for item in items.values()]
    out = ", ".join(lengths)
    print(f"{feat:<15}: {out}")

print("Order in which features will be generalized:")
print(ordered_features)

age            :   74,   14,    8,    5,    3,    2,    1
education      :   16,    5,    3,    1
marital-status :    7,    2,    1
occupation     :   15,    3,    1
sex            :    2,    1
native-country :   42,    7,    1
workclass      :    9,    4,    1
relationship   :    6,    1
race           :    5,    1
capital-gain   :  122,    5,    4,    4,    4,    3,    3,    2,    1
capital-loss   :   98,    6,    5,    4,    3,    2,    1
hours-per-week :   96,    5,    4,    3,    2,    1
Order in which features will be generalized:
['capital-gain', 'capital-loss', 'hours-per-week', 'age', 'native-country', 'education', 'occupation', 'age', 'workclass', 'age', 'marital-status', 'native-country', 'relationship', 'capital-loss', 'age', 'education', 'race', 'capital-gain', 'capital-loss', 'hours-per-week', 'workclass', 'capital-gain', 'capital-gain', 'capital-gain', 'capital-loss', 'hours-per-week', 'age', 'education', 'occupation', 'capital-gain', 'capital-gain', 'capital-loss', 'hou

In [10]:
# Check with mina if this k is representative of used k in litterature
k = 10
supp_level = 20 # Select the suppression limit allowed
anon_df, supp_n, hiear = k_anonymity_inner(
    data, ident, quasi_ident, k, supp_level, hierarchies
)
max_supp_n = int(round(len(data) * supp_level / 100, 0))
print(f"Max rows that can be suppressed: {max_supp_n}")
print(f"Rows suppressed                : {supp_n}")
print(f"% of allowed rows suppressed   : {round(supp_n / max_supp_n * 100, 1)}%")
print(f"Generalization level           : {sum(hiear.values())}")
hiear

Max rows that can be suppressed: 9524
Rows suppressed                : 7663
% of allowed rows suppressed   : 80.5%
Generalization level           : 12


{'age': 3,
 'workclass': 1,
 'education': 1,
 'marital-status': 1,
 'occupation': 1,
 'relationship': 0,
 'sex': 0,
 'capital-gain': 1,
 'capital-loss': 1,
 'hours-per-week': 1,
 'native-country': 2}

In [23]:
k_list = [2 ** n for n in range(1,9)]
annon_data = []
for k in k_list: 
    anon_df = k_anonymity_inner(
        data, ident, quasi_ident, k, supp_level, hierarchies
    )
    annon_data

[2, 4, 8, 16, 32, 64, 128, 256]

In [None]:
for t in np.linspace(0.1,1.0,10):
    t = round(t, 1) 
    print(f"---- {t} ----")
    if(df.empty):
        print("Skipping") 
        continue
    t_closeness(
        data, ident, quasi_ident,sens_att, k, t, supp_level, hierarchies
    ).to_csv(f"../data/t_closeness/{t}.csv", index=False)

In [None]:
for alpha in np.linspace(0.1,1.0,10):
    alpha = round(alpha, 1) 
    print(f"---- {alpha} ----")
    df = alpha_k_anonymity(
        data, ident, quasi_ident,sens_att, k, alpha, supp_level, hierarchies
    )
    if(df.empty):
        print("Skipping") 
        continue
    df.to_csv(f"../data/alpha_k_anonymity/{alpha}.csv", index=False)

In [None]:
for l in range(1,11):
    print(f"---- {l} ----")
    df = l_diversity(
        data, ident, quasi_ident,sens_att, k, l, supp_level, hierarchies
    )
    if(df.empty):
        print("Skipping") 
        continue
    df.to_csv(f"../data/l_diversity/{l}.csv", index=False)