In [1]:
import pandas as pd
import numpy as np

# Define column names
columns = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
           "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
           "hours_per_week", "native_country", "income"]

# Load the dataset
df = pd.read_csv("adult.csv")

In [2]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
# Select Quasi-Identifiers (QIDs) for anonymization
quasi_identifiers = ["age", "workclass", "education", "marital_status", "occupation", "race", "sex", "native_country", "hours_per_week"]
bins = [0, 25, 40, 60, 100]
labels = ["0-25", "26-40", "41-60", "61-100"]
df["age"] = pd.cut(df["age"], bins=bins, labels=labels)
df["education"] = df["education"].replace({
    "Preschool": "Low",
    "1st-4th": "Low", "5th-6th": "Low", "7th-8th": "Low",
    "9th": "Middle", "10th": "Middle", "11th": "Middle", "12th": "Middle",
    "HS-grad": "Middle",
    "Some-college": "High", "Assoc-voc": "High", "Assoc-acdm": "High",
    "Bachelors": "Higher", "Masters": "Higher", "Doctorate": "Higher", "Prof-school": "Higher"
})

In [4]:
df["workclass"] = df["workclass"].replace({
    "Private": "Employed",
    "Self-emp-not-inc": "Self-Employed",
    "Self-emp-inc": "Self-Employed",
    "Federal-gov": "Government",
    "Local-gov": "Government",
    "State-gov": "Government",
    "Without-pay": "Unemployed",
    "Never-worked": "Unemployed"
})

In [5]:
df["marital_status"] = df["marital_status"].replace({
    "Married-civ-spouse": "Married",
    "Married-AF-spouse": "Married",
    "Divorced": "Separated",
    "Separated": "Separated",
    "Widowed": "Widowed",
    "Never-married": "Single"
})

In [6]:
north_america = ["United-States", "Canada", "Mexico"]
south_america = ["Columbia", "Ecuador", "Peru", "Guatemala"]
asia = ["India", "China", "Japan", "Philippines", "Vietnam"]
europe = ["Germany", "England", "Italy", "France", "Greece"]
df["native_country"] = df["native_country"].apply(
    lambda x: "North America" if x in north_america else
              "South America" if x in south_america else
              "Asia" if x in asia else
              "Europe" if x in europe else "Other")
bins = [0, 20, 40, 60, 100]
labels = ["0-20", "21-40", "41-60", "61-100"]
df["hours_per_week"] = pd.cut(df["hours_per_week"], bins=bins, labels=labels)

In [7]:
k = 5  # Minimum number of records per group
grouped = df.groupby(quasi_identifiers, observed=False)  # Apply groupby with observed=False
df_anonymized = grouped.filter(lambda x: len(x) >= k)

print(f"Original dataset size: {len(df)}, After anonymization: {len(df_anonymized)}")

Original dataset size: 32561, After anonymization: 23006


In [8]:
df_anonymized.to_csv("adult_anonymized_fulldomain.csv", index=False)
print("K-Anonymized dataset saved as 'adult_anonymized_fulldomain.csv'")

K-Anonymized dataset saved as 'adult_anonymized_fulldomain.csv'


In [10]:
df_anonymized

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,26-40,Government,77516,Higher,13,Single,Adm-clerical,Not-in-family,White,Male,2174,0,21-40,North America,<=50K
2,26-40,Employed,215646,Middle,9,Separated,Handlers-cleaners,Not-in-family,White,Male,0,0,21-40,North America,<=50K
3,41-60,Employed,234721,Middle,7,Married,Handlers-cleaners,Husband,Black,Male,0,0,21-40,North America,<=50K
5,26-40,Employed,284582,Higher,14,Married,Exec-managerial,Wife,White,Female,0,0,21-40,North America,<=50K
7,41-60,Self-Employed,209642,Middle,9,Married,Exec-managerial,Husband,White,Male,0,0,41-60,North America,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,0-25,Employed,310152,High,10,Single,Protective-serv,Not-in-family,White,Male,0,0,21-40,North America,<=50K
32556,26-40,Employed,257302,High,12,Married,Tech-support,Wife,White,Female,0,0,21-40,North America,<=50K
32557,26-40,Employed,154374,Middle,9,Married,Machine-op-inspct,Husband,White,Male,0,0,21-40,North America,>50K
32558,41-60,Employed,151910,Middle,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,21-40,North America,<=50K
