In [1]:
import pandas as pd
import numpy as np
from anjana.anonymity import k_anonymity_inner, k_anonymity, l_diversity, t_closeness, alpha_k_anonymity

In [3]:
# Load the raw dataset and record its size so the cleaning loss can be reported.
raw_data = pd.read_csv('../data/data.csv')
len_data = len(raw_data)

# Cleaning, expressed as one non-mutating chain:
#   1. strip the trailing '.' that some 'income' labels carry,
#   2. drop every row containing a missing value,
#   3. drop the 'education-num' and 'fnlwgt' columns.
data = (
    raw_data
    .assign(income=raw_data['income'].str.rstrip('.'))
    .dropna()
    .drop(columns=['education-num', 'fnlwgt'])
)

len_clean_data = len(data)
print(f"Dropped {len_data - len_clean_data} rows")

# Persist the cleaned table (without the index column).
data.to_csv("../data/clean.csv", index=False)

Dropped 1221 rows


In [3]:
# Generalization hierarchies: one header-less CSV per feature.
# `dict(DataFrame)` maps each column label (0, 1, 2, ... because header=None)
# to that column as a Series, so hierarchies[feature][i] is the i-th column
# of the feature's file.
# NOTE(review): presumably column 0 holds the raw values and later columns
# hold progressively coarser generalizations — confirm against the CSV files.
_hierarchy_sources = (
    ("age", "../hierarchies/age.csv"),
    ("education", "../hierarchies/education.csv"),
    ("marital-status", "../hierarchies/marital.csv"),
    ("occupation", "../hierarchies/occupation.csv"),
    ("sex", "../hierarchies/sex.csv"),
    ("native-country", "../hierarchies/country.csv"),
    ("workclass", "../hierarchies/workclass.csv"),
    ("relationship", "../hierarchies/relationship.csv"),
    ("race", "../hierarchies/race.csv"),
)

# Insertion order follows the table above, so iteration order is unchanged.
hierarchies = {
    feature: dict(pd.read_csv(csv_path, header=None))
    for feature, csv_path in _hierarchy_sources
}

In [4]:
# Column roles passed to the anonymization calls.
# TODO: check with Mina which columns should be treated as quasi-identifiers.

# Sensitive attribute.
sens_att = 'income'

# 'race' is listed as an identifier rather than a quasi-identifier:
# making it a quasi-identifier makes k much larger.
ident = ['race']

quasi_ident = [
    'age',
    'education',
    'marital-status',
    'occupation',
    'sex',
    'native-country',
    # Makes k decrease a LOT.
    'workclass',
    # Would make more sense to drop this one altogether: the algorithms
    # generalize it to '*', but it increases k a lot.
    'relationship',
    # 'race' is intentionally left out here (see `ident` above).
]

In [5]:
# One (distinct-value count, feature) pair per hierarchy level of every feature,
# in the iteration order of `hierarchies` and of each feature's levels.
all_counts_and_features = [
    (len(level.unique()), feature)
    for feature, levels in hierarchies.items()
    for level in levels.values()
]

# Descending sort on the count only; the sort is stable, so pairs with equal
# counts keep the order in which they were collected above.
sorted_data = sorted(
    all_counts_and_features,
    key=lambda pair: pair[0],
    reverse=True,
)
ordered_features = [feature for _count, feature in sorted_data]

# Per-feature summary: distinct-value count of each hierarchy level,
# right-aligned to width 4.
for feat, items in hierarchies.items():
    level_sizes = ", ".join(
        f"{len(level.unique()):>4}" for level in items.values()
    )
    print(f"{feat:<15}: {level_sizes}")

print("Order in which features will be generalized:")
print(ordered_features)

age            :   74,   14,    8,    5,    3,    2,    1
education      :   16,    5,    3,    1
marital-status :    7,    2,    1
occupation     :   15,    3,    1
sex            :    2,    1
native-country :   42,    7,    1
workclass      :    9,    4,    1
relationship   :    6,    1
race           :    5,    1
Order in which features will be generalized:
['age', 'native-country', 'education', 'occupation', 'age', 'workclass', 'age', 'marital-status', 'native-country', 'relationship', 'age', 'education', 'race', 'workclass', 'age', 'education', 'occupation', 'age', 'marital-status', 'sex', 'age', 'education', 'marital-status', 'occupation', 'sex', 'native-country', 'workclass', 'relationship', 'race']


In [6]:
# TODO: check with Mina whether this k is representative of the values used
# in the literature.
k = 10
supp_level = 20  # Suppression limit allowed, as a percentage of the rows

# NOTE(review): judging by how the results are used below, the call returns
# the anonymized frame, the number of suppressed rows, and a
# {feature: generalization level} dict — confirm against the anjana docs.
anon_df, supp_n, hiear = k_anonymity_inner(
    data, ident, quasi_ident, k, supp_level, hierarchies
)

# Suppression budget in rows: supp_level percent of the dataset, rounded.
max_supp_n = int(round(len(data) * supp_level / 100, 0))

# The budget is 0 rows when supp_level is 0 (or the data is empty); guard the
# ratio so the report does not crash with ZeroDivisionError in that case.
if max_supp_n:
    supp_share = f"{round(supp_n / max_supp_n * 100, 1)}%"
else:
    supp_share = "n/a"

print(f"Max rows that can be suppressed: {max_supp_n}")
print(f"Rows suppressed                : {supp_n}")
print(f"% of allowed rows suppressed   : {supp_share}")
print(f"Generalization level           : {sum(hiear.values())}")

# Last expression of the cell: display the per-feature levels.
hiear

Max rows that can be suppressed: 9524
Rows suppressed                : 8580
% of allowed rows suppressed   : 90.1%
Generalization level           : 6


{'age': 2,
 'education': 1,
 'marital-status': 0,
 'occupation': 1,
 'sex': 0,
 'native-country': 1,
 'workclass': 1,
 'relationship': 0}

In [13]:
# Sweep the t-closeness threshold over ten evenly spaced values from 0.1 to
# 1.0 and write one CSV per threshold.
for t in np.linspace(0.1, 1.0, 10):
    # Round to one decimal so the printed label and the file name stay short.
    t = round(t, 1)
    print(f"---- {t} ----")

    # Compute the result for *this* t before inspecting it. Previously
    # `df.empty` was tested before `df` was assigned in this cell, so the
    # check either raised NameError on a fresh kernel or looked at a stale
    # frame left behind by another cell.
    # NOTE(review): assumes t_closeness returns a DataFrame, as the original
    # `.to_csv(...)` call on its result implies.
    df = t_closeness(
        data, ident, quasi_ident, sens_att, k, t, supp_level, hierarchies
    )
    if df.empty:
        print("Skipping")
        continue
    df.to_csv(f"../data/t_closeness/{t}.csv", index=False)

---- 0.1 ----
---- 0.2 ----
---- 0.3 ----
---- 0.4 ----
---- 0.5 ----
---- 0.6 ----
---- 0.7 ----
---- 0.8 ----
The data verifies t-closeness with t=0.7388386567967009
---- 0.9 ----
The data verifies t-closeness with t=0.7388386567967009
---- 1.0 ----
The data verifies t-closeness with t=0.7388386567967009


In [17]:
# Sweep alpha over ten evenly spaced values from 0.1 to 1.0 (each rounded to
# one decimal) and write one CSV per value; empty results are skipped.
for alpha in (round(step, 1) for step in np.linspace(0.1, 1.0, 10)):
    print(f"---- {alpha} ----")
    df = alpha_k_anonymity(
        data, ident, quasi_ident, sens_att, k, alpha, supp_level, hierarchies
    )
    if df.empty:
        # Nothing to save for this alpha.
        print("Skipping")
    else:
        df.to_csv(f"../data/alpha_k_anonymity/{alpha}.csv", index=False)

SyntaxError: 'return' outside function (3403059694.py, line 1)

In [20]:
# Sweep l over 1..10 and write one CSV per value; empty results are skipped.
for l_value in range(1, 11):
    print(f"---- {l_value} ----")
    df = l_diversity(
        data, ident, quasi_ident, sens_att, k, l_value, supp_level, hierarchies
    )
    if df.empty:
        # Nothing to save for this l.
        print("Skipping")
    else:
        df.to_csv(f"../data/l_diversity/{l_value}.csv", index=False)

---- 1 ----
The data verifies l-diversity with l=1
---- 2 ----
---- 3 ----
l-diversity cannot be achieved for l=3
Skipping
---- 4 ----


KeyboardInterrupt: 