In [346]:
import pandas as pd
import numpy as np
import math


In [347]:
# Define column names based on the description from 'adult.names'
column_names = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country", "salary"
]

adult_data = pd.read_csv('adult.data', header=None, names=column_names, na_values=" ?")

adult_test = pd.read_csv('adult.test', header=None, names=column_names, skiprows=1, na_values=" ?") # skipping the first line bc it was not important


In [348]:
# Replace NaN in the 'occupation' column with 'Unknown'
adult_data.fillna({'occupation':'Unknown'}, inplace=True)

In [349]:
print(adult_data.isnull().sum())

age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation           0
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
salary               0
dtype: int64


In [350]:
print(adult_data.head())
print(adult_test.head())


   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  salary  
0          2174             0              40   United-States   <=50

In [351]:
with open('adult.names', 'r') as f:
    names_content = f.read()

# Print the content of the 'adult.names' file
# print(names_content)

In [352]:
print(adult_data['salary'])

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: salary, Length: 32561, dtype: object


In [353]:
adult_data['age'].sort_values(ascending=False)

24043    90
31696    90
18277    90
12451    90
5406     90
         ..
12678    17
22107    17
12838    17
3605     17
3618     17
Name: age, Length: 32561, dtype: int64

Generalize the age based on the hierarchy in hierarchy.txt

In [354]:
def generalize_age(age, level):
    if level == 1:
        # Level 1: No generalization, return the precise age
        return age
    elif level == 2:
        # Level 2: Group into small age ranges
        if 17 <= age <= 19:
            return '17-19'
        elif 20 <= age <= 29:
            return '20-29'
        elif 30 <= age <= 39:
            return '30-39'
        elif 40 <= age <= 49:
            return '40-49'
        elif 50 <= age <= 59:
            return '50-59'
        elif 60 <= age <= 69:
            return '60-69'
        elif 70 <= age <= 79:
            return '70-79'
        elif 80 <= age <= 89:
            return '80-89'
        elif age >= 90:
            return '90+'
    elif level == 3:
        # Level 3: Group into medium ranges
        if 17 <= age <= 29:
            return '17-29'
        elif 30 <= age <= 49:
            return '30-49'
        elif 50 <= age <= 69:
            return '50-69'
        elif age >= 70:
            return '70+'
    elif level == 4:
        # Level 4: Group into broad categories
        if 17 <= age <= 29:
            return 'Young Adult (17-29)'
        elif 30 <= age <= 49:
            return 'Middle Age (30-49)'
        elif 50 <= age <= 69:
            return 'Senior (50-69)'
        elif age >= 70:
            return 'Elderly (70+)'
    elif level == 5:
        # Level 5: Generalize to "All Ages"
        return 'All Ages'
    
    return 'All Ages'

age = 91
generalized_age = generalize_age(age, 2)
print(f'Generalized age at level 2: {generalized_age}')

age = 35
generalized_age = generalize_age(age, 3)
print(f'Generalized age at level 3: {generalized_age}')


Generalized age at level 2: 90+
Generalized age at level 3: 30-49


In [355]:
adult_data['education'].unique()


array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

Education generaliztion

In [356]:
def generalize_education(education, level):
    education = education.strip()
    
    if level == 1:
        return education
    
    elif level == 2:
        if education in ['Preschool', '1st-4th', '5th-6th']:
            return 'Primary School'
        elif education in ['7th-8th', '9th']:
            return 'Middle School'
        elif education in ['10th', '11th', '12th', 'HS-grad']:
            return 'High School'
        elif education in ['Assoc-voc', 'Prof-school', 'Some-college']:
            return 'Vocational School'
        elif education in ['Assoc-acdm', 'Bachelors']:
            return 'Undergraduate School'
        elif education in ['Masters', 'Doctorate']:
            return 'Graduate School'
    
    elif level == 3:
        if education in ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', 'HS-grad']:
            return 'Compulsory School'
        elif education in ['Assoc-voc', 'Prof-school', 'Some-college', 'Assoc-acdm', 'Bachelors']:
            return 'Basic Degree'
        elif education in ['Masters', 'Doctorate']:
            return 'Advanced Degree'
    
    elif level == 4:
        if education == 'Compulsory School':
            return 'Compulsory School'
        elif education in ['Basic Degree', 'Advanced Degree']:
            return 'Advanced School'
    
    elif level == 5:
        return 'Education'

    return 'Education'

education = 'Bachelors'
generalized_education = generalize_education(education, 3)
print(f'Generalized education at level 3: {generalized_education}')


Generalized education at level 3: Basic Degree


Marital Status Generalization

In [357]:
def generalize_marital_status(status, level):
    status = status.strip()
    
    if level == 1:
        return status
    
    elif level == 2:
        if status in ['Separated', 'Married-spouse-absent']:
            return 'Separated-Married'
        elif status in ['Never-married', 'Widowed', 'Divorced']:
            return 'Single'
        elif status in ['Married-civ-spouse', 'Married-AF-spouse']:
            return 'Married-Together'
    
    elif level == 3:
        if status in ['Separated', 'Married-spouse-absent', 'Married-civ-spouse', 'Married-AF-spouse']:
            return 'Married'
        elif status in ['Never-married', 'Widowed', 'Divorced']:
            return 'Not-Married'
    
    elif level == 4:
        return '*'
    
    return '*'

marital_status = 'Married-civ-spouse'
generalized_status = generalize_marital_status(marital_status, 3)
print(f'Generalized marital status at level 3: {generalized_status}')


Generalized marital status at level 3: Married


Race Generalization

In [358]:
def generalize_race(race, level):
    race = race.strip()
    
    if level == 1:
        return race
    
    elif level == 2:
        if race == 'White':
            return 'Caucasian'
        elif race == 'Asian-Pac-Islander':
            return 'Asian'
        elif race == 'Amer-Indian-Eskimo':
            return 'Indigenous'
        elif race == 'Black':
            return 'African Descent'
        elif race == 'Other':
            return 'Other'
    
    elif level == 3:
        if race in ['White', 'Black']:
            return 'Western Origin'
        elif race in ['Asian-Pac-Islander', 'Amer-Indian-Eskimo']:
            return 'Eastern Origin'
        elif race == 'Other':
            return 'Other'
    
    elif level == 4:
        if race in ['White', 'Black']:
            return 'Western Origin'
        else:
            return 'Non-Western Origin'
    
    elif level == 5:
        return 'Race'
    return 'Race'
race = 'Asian-Pac-Islander'
generalized_race = generalize_race(race, 3)
print(f'Generalized race at level 3: {generalized_race}')

Generalized race at level 3: Eastern Origin


In [359]:
# create a diffent dataset with the generalized data
generalized_data = adult_data.copy()
generalized_data # will be generalized in the next steps

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [360]:
def generalize_QIs(data, educationLevel, maritalStatusLevel, raceLevel, ageLevel):
    data['age'] = data['age'].apply(lambda x : generalize_age(x, ageLevel)) 
    data['education'] = data['education'].apply(lambda x : generalize_education(x, educationLevel))
    data['marital_status'] = data['marital_status'].apply(lambda x : generalize_marital_status(x, maritalStatusLevel))
    data['race'] = data['race'].apply(lambda x : generalize_race(x, raceLevel))
    return data

In [361]:

def check_k_anonymity_le50(data, k1):
    """
    Check if the dataset meets the k-anonymity requirement.
    
    :param data: The dataset to check.
    :param k1: The k-anonymity level for users with salaries ≤ 50K.
    :param k2: The k-anonymity level for users with salaries > 50K.
    :return: True if the dataset meets the k-anonymity requirement, False otherwise.
    """
    # Print unique values in the salary column for debugging
    # print("Unique salary values:", data['salary'].unique())
    
    # Split the dataset into two parts based on the salary
    data_le_50k = data[data['salary'] == ' <=50K']
    
    # Print the number of rows in each part for debugging
    # print("Number of rows with salary <= 50K:", len(data_le_50k))
    # print("Number of rows with salary > 50K:", len(data_gt_50k))
    
    # Group by QIs for each part
    grouped_le_50k = data_le_50k.groupby(['age', 'education', 'marital_status', 'race'])
    
    # Check k-anonymity for users with salaries ≤ 50K
    for _, group in grouped_le_50k:
        if len(group) < k1:
            return False, data_le_50k
    
    
    
    return True, data_le_50k

In [362]:

def check_k_anonymity_gt50(data, k2):
    """
    Check if the dataset meets the k-anonymity requirement.
    
    :param data: The dataset to check.
    :param k1: The k-anonymity level for users with salaries ≤ 50K.
    :param k2: The k-anonymity level for users with salaries > 50K.
    :return: True if the dataset meets the k-anonymity requirement, False otherwise.
    """
    # Print unique values in the salary column for debugging
    # print("Unique salary values:", data['salary'].unique())
    
    # Split the dataset into two parts based on the salary
    data_gt_50k = data[data['salary'] == ' >50K']
    
    # Print the number of rows in each part for debugging
    # print("Number of rows with salary <= 50K:", len(data_le_50k))
    # print("Number of rows with salary > 50K:", len(data_gt_50k))
    
    # Group by QIs for each part
    grouped_gt_50k = data_gt_50k.groupby(['age', 'education', 'marital_status', 'race'])
    
    
    # Check k-anonymity for users with salaries > 50K
    for _, group in grouped_gt_50k:
        if len(group) < k2:
            return False, data_gt_50k
    
    return True, data_gt_50k

Generalizing the data with salary of <=50k while meeting the k-anonymity requirements

In [363]:
# Max level of generalization for Race, Education, Marital Status, Age is 5, 5, 4, 5 respectively.
# Also attributes can have different levels of generalization in the same dataset
generalized_data = adult_data.copy()
k1 = 10
k2 = 5

found = False
global gen_levels
for i in range(1, 6):
    for j in range(1, 6):
        for k in range(1, 5):
            for l in range(1, 6):
                generalized_data = adult_data.copy()
                generalized_data = generalize_QIs(generalized_data, i, j, k, l)
                
                # Check if the dataset meets the k-anonymity requirement
                is_k_anonymous_le_50k, table_le_50k= check_k_anonymity_le50(generalized_data, k1)

                if is_k_anonymous_le_50k:
                    table_le_50k.to_csv(f'hw1-1-generalized_data_le_50k_{i}_{j}_{k}_{l}.csv', index=False)
                    print(f'Generalization levels for <=50k dataset: Race={k}, Education={i}, Marital Status={j}, Age={l}')
                    gen_levels_le_50k = [i, j, k, l]
                    generalization_levels_le50k = {
                        'age': l,
                        'education': i, 
                        'marital_status': j, 
                        'race': k 
                    }
                    found = True
                    break
            if found:
                break
        if found:
            break
    if found:
        break

Generalization levels for <=50k dataset: Race=4, Education=2, Marital Status=3, Age=5


Generalizing the data with salary of >50k while meeting the k-anonymity requirements

In [364]:
found = False
for j in range(1, 6):
    for l in range(1, 6):
        for k in range(1, 5):
            for i in range(1, 6):
                generalized_data = adult_data.copy()
                generalized_data = generalize_QIs(generalized_data, i, j, k, l)
                
                # Check if the dataset meets the k-anonymity requirement
                is_k_anonymous_gt_50k, table_gt_50k= check_k_anonymity_gt50(generalized_data, k2)

                if is_k_anonymous_gt_50k:
                    table_gt_50k.to_csv(f'hw1-1-generalized_data_gt_50k_{i}_{j}_{k}_{l}.csv', index=False)
                    print(f'Generalization levels for >50k dataset: Race={k}, Education={i}, Marital Status={j}, Age={l}')
                    gen_levels_gt_50k = [i, j, k, l]
                    generalization_levels_gt50k = {
                        'age': l,
                        'education': i, 
                        'marital_status': j, 
                        'race': k 
                    }
                    found = True
                    break
            if found:
                break
        if found:
            break
    if found:
        break

Generalization levels for >50k dataset: Race=4, Education=4, Marital Status=2, Age=5


In [365]:
print(f'Is k-anonymous: {is_k_anonymous_le_50k} for <=50k dataset')

Is k-anonymous: True for <=50k dataset


In [366]:
print(f'Is k-anonymous: {is_k_anonymous_gt_50k} for >50k dataset')

Is k-anonymous: True for >50k dataset


In [367]:
print("Table for users with salaries ≤ 50K:")

table_le_50k

Table for users with salaries ≤ 50K:


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,All Ages,State-gov,77516,Undergraduate School,13,Not-Married,Adm-clerical,Not-in-family,Western Origin,Male,2174,0,40,United-States,<=50K
1,All Ages,Self-emp-not-inc,83311,Undergraduate School,13,Married,Exec-managerial,Husband,Western Origin,Male,0,0,13,United-States,<=50K
2,All Ages,Private,215646,High School,9,Not-Married,Handlers-cleaners,Not-in-family,Western Origin,Male,0,0,40,United-States,<=50K
3,All Ages,Private,234721,High School,7,Married,Handlers-cleaners,Husband,Western Origin,Male,0,0,40,United-States,<=50K
4,All Ages,Private,338409,Undergraduate School,13,Married,Prof-specialty,Wife,Western Origin,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32553,All Ages,Private,116138,Graduate School,14,Not-Married,Tech-support,Not-in-family,Non-Western Origin,Male,0,0,11,Taiwan,<=50K
32555,All Ages,Private,310152,Vocational School,10,Not-Married,Protective-serv,Not-in-family,Western Origin,Male,0,0,40,United-States,<=50K
32556,All Ages,Private,257302,Undergraduate School,12,Married,Tech-support,Wife,Western Origin,Female,0,0,38,United-States,<=50K
32558,All Ages,Private,151910,High School,9,Not-Married,Adm-clerical,Unmarried,Western Origin,Female,0,0,40,United-States,<=50K


In [368]:
# For table_le_50k
q_block_counts_le_50k = table_le_50k.groupby(['age', 'education', 'marital_status', 'race']).size().reset_index(name='counts')
print("\n Q* block counts for users with salaries ≤ 50K:")
print(q_block_counts_le_50k)



 Q* block counts for users with salaries ≤ 50K:
         age             education marital_status                race  counts
0   All Ages       Graduate School        Married  Non-Western Origin      40
1   All Ages       Graduate School        Married      Western Origin     269
2   All Ages       Graduate School    Not-Married  Non-Western Origin      24
3   All Ages       Graduate School    Not-Married      Western Origin     538
4   All Ages           High School        Married  Non-Western Origin     214
5   All Ages           High School        Married      Western Origin    4504
6   All Ages           High School    Not-Married  Non-Western Origin     267
7   All Ages           High School    Not-Married      Western Origin    6227
8   All Ages         Middle School        Married  Non-Western Origin      35
9   All Ages         Middle School        Married      Western Origin     577
10  All Ages         Middle School    Not-Married  Non-Western Origin      23
11  All Ages   

In [369]:

# For table_gt_50k
q_block_counts_gt_50k = table_gt_50k.groupby(['age', 'education', 'marital_status', 'race']).size().reset_index(name='counts')
print("\n Q* block counts for users with salaries > 50K:")
print(q_block_counts_gt_50k)


 Q* block counts for users with salaries > 50K:
        age  education     marital_status                race  counts
0  All Ages  Education   Married-Together  Non-Western Origin     282
1  All Ages  Education   Married-Together      Western Origin    6420
2  All Ages  Education  Separated-Married  Non-Western Origin       8
3  All Ages  Education  Separated-Married      Western Origin      92
4  All Ages  Education             Single  Non-Western Origin      47
5  All Ages  Education             Single      Western Origin     992


In [370]:
print("Table for users with salaries > 50K:")

table_gt_50k

Table for users with salaries > 50K:


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
7,All Ages,Self-emp-not-inc,209642,Education,9,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,45,United-States,>50K
8,All Ages,Private,45781,Education,14,Single,Prof-specialty,Not-in-family,Western Origin,Female,14084,0,50,United-States,>50K
9,All Ages,Private,159449,Education,13,Married-Together,Exec-managerial,Husband,Western Origin,Male,5178,0,40,United-States,>50K
10,All Ages,Private,280464,Education,10,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,80,United-States,>50K
11,All Ages,State-gov,141297,Education,13,Married-Together,Prof-specialty,Husband,Non-Western Origin,Male,0,0,40,India,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32539,All Ages,,287372,Education,16,Married-Together,Unknown,Husband,Western Origin,Male,0,0,10,United-States,>50K
32545,All Ages,Local-gov,111499,Education,12,Married-Together,Adm-clerical,Wife,Western Origin,Female,0,0,20,United-States,>50K
32554,All Ages,Private,321865,Education,14,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,40,United-States,>50K
32557,All Ages,Private,154374,Education,9,Married-Together,Machine-op-inspct,Husband,Western Origin,Male,0,0,40,United-States,>50K


In [371]:
# Verify group sizes for users with salaries ≤ 50K
if (q_block_counts_le_50k['counts'] >= k1).all():
    print("All Q* blocks for users with salaries ≤ 50K meet the k-anonymity requirement.")
else:
    print("Some Q* blocks for users with salaries ≤ 50K do not meet the k-anonymity requirement.")

# Verify group sizes for users with salaries > 50K
if (q_block_counts_gt_50k['counts'] >= k2).all():
    print("All Q* blocks for users with salaries > 50K meet the k-anonymity requirement.")
else:
    print("Some Q* blocks for users with salaries > 50K do not meet the k-anonymity requirement.")

All Q* blocks for users with salaries ≤ 50K meet the k-anonymity requirement.
All Q* blocks for users with salaries > 50K meet the k-anonymity requirement.


In [372]:
num_attr = len(table_gt_50k.columns)

In [373]:
num_attr

15

In [374]:
len(table_le_50k.columns)

15

In [375]:
len(generalized_data.columns)

15

Calculate the distortion


In [376]:
def calculate_distortion(gen_levels):
    # For all atributes: generalization level / max generalization level
    sum_of_generalizations = gen_levels['age']/5 + gen_levels['marital_status']/4 + gen_levels['education']/5 + gen_levels['age']/5
    sum_of_generalizations = sum_of_generalizations + (15 - 4) # Add the genralization level of the other attributes/columns which are 1's
    return sum_of_generalizations / num_attr


In [377]:
print(f'Distortion for data >50k: {calculate_distortion(generalization_levels_gt50k)}')

Distortion for data >50k: 0.9533333333333334


In [378]:
print(f'Distortion for data <=50k: {calculate_distortion(generalization_levels_le50k)}')

Distortion for data <=50k: 0.9433333333333334


On average, the attributes have been generalized to about 94.33% of their maximum generalization levels.

Calculate precision

In [379]:
table_gt_50k_len = len(table_gt_50k.index)
table_gt_50k_len

7841

In [380]:
table_le_50k_len = len(table_le_50k.index)
table_le_50k_len

24720

In [381]:
def calculate_precision(data, generalization_levels, hierarchy_depths):
    """
    Calculate the precision.
    
    :param data: pandas DataFrame of the dataset
    :param generalization_levels: Dictionary of generalization levels for each attribute
    :param hierarchy_depths: Dictionary of the depth of the value generalization hierarchy for each attribute
    :return: Precision value
    """
    N_A = len(generalization_levels)  # Number of QI attributes
    PT = len(data.index)  # Total number of records
    sum_generalization_height = 0

    # Calculate the total generalization height normalized by the depth of the hierarchy
    for attribute, level in generalization_levels.items():
        depth = hierarchy_depths[attribute]
        for _ in range(PT):  # The generalization level is uniform across all records
            sum_generalization_height += level / depth

    precision = 1 - (sum_generalization_height / (PT * N_A))
    return precision

hierarchy_depths = {
    'age': 5, 
    'education': 5,
    'marital_status': 4,
    'race': 5 
}

precision_value = calculate_precision(table_le_50k, generalization_levels_le50k, hierarchy_depths)
print(f'Precision for <=50k dataset: {precision_value}')


Precision for <=50k dataset: 0.2624999999989086


In [382]:
precision_value = calculate_precision(table_gt_50k, generalization_levels_gt50k, hierarchy_depths)
print(f'Precision for >50K dataset: {precision_value}')

Precision for >50K dataset: 0.22500000000035114


In [383]:
table_gt_50k

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
7,All Ages,Self-emp-not-inc,209642,Education,9,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,45,United-States,>50K
8,All Ages,Private,45781,Education,14,Single,Prof-specialty,Not-in-family,Western Origin,Female,14084,0,50,United-States,>50K
9,All Ages,Private,159449,Education,13,Married-Together,Exec-managerial,Husband,Western Origin,Male,5178,0,40,United-States,>50K
10,All Ages,Private,280464,Education,10,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,80,United-States,>50K
11,All Ages,State-gov,141297,Education,13,Married-Together,Prof-specialty,Husband,Non-Western Origin,Male,0,0,40,India,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32539,All Ages,,287372,Education,16,Married-Together,Unknown,Husband,Western Origin,Male,0,0,10,United-States,>50K
32545,All Ages,Local-gov,111499,Education,12,Married-Together,Adm-clerical,Wife,Western Origin,Female,0,0,20,United-States,>50K
32554,All Ages,Private,321865,Education,14,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,40,United-States,>50K
32557,All Ages,Private,154374,Education,9,Married-Together,Machine-op-inspct,Husband,Western Origin,Male,0,0,40,United-States,>50K


Calculate entropy and l-diversity

">50K" dataset didn't meet the l-diversity requirements when l = 3:

In [384]:
def calculate_entropy(group):
    """ Calculate entropy for a single group of records """
    if len(group) == 0:
        return 0
    value_counts = group.value_counts()
    probabilities = value_counts / value_counts.sum()
    entropy = -np.sum(probabilities * np.log(probabilities))
    return entropy

def check_l_diversity(data, l):
    """ Check if the dataset satisfies l-diversity """
    entropy_threshold = math.log(l)  # Define the minimum entropy threshold
    fails = 0  # Track number of failures
    
    # Group data by QIs and calculate entropy for each group
    grouped = data.groupby(['age', 'education', 'marital_status', 'race'])
    for name, group in grouped:
        entropy = calculate_entropy(group['occupation'])
        if entropy < entropy_threshold:
            print(f'Group {name} fails to meet the entropy l-diversity with entropy {entropy:.4f}')
            fails += 1
    if fails == 0:
        print("All groups meet the l-diversity requirement.")
        return True
    else:
        print(f"{fails} groups fail to meet the l-diversity requirement.")
        return False

l = 3
print("Checking >50K dataset for l-diversity:")
satisfied_l_diversity_gt_50k = check_l_diversity(table_gt_50k, l)


Checking >50K dataset for l-diversity:
Group ('All Ages', 'Education', 'Separated-Married', 'Non-Western Origin') fails to meet the entropy l-diversity with entropy 0.9003
1 groups fail to meet the l-diversity requirement.


The generalized dataset with salary <=50k meets the requirements of l-diversity:

In [385]:
print("Checking <=50K dataset for l-diversity:")
satisfied_l_diversity_le_50k = check_l_diversity(table_le_50k, l)

Checking <=50K dataset for l-diversity:
All groups meet the l-diversity requirement.


Increase the generalization level for a specific attribute, say 'education' in table_gt_50k

In [386]:
def apply_adjusted_generalization(data, generalization_levels, attribute, condition_value, generalization_function, condition_attribute='education', generalization_level=4):
    """
    Apply generalization based on a condition on the attribute.
    condition_value:  the value you're checking against
    """
    def generalized_value(row, attribute, condition_value, generalization_function, generalization_level):
        if row[condition_attribute] == condition_value and generalization_level > 0:
            return generalization_function(row[attribute], generalization_level)
        return row[attribute]
    
    # Apply generalization
    data[attribute] = data.apply(lambda row: generalized_value(row, attribute, condition_value, generalization_function, generalization_level), axis=1)
    
    # Update the generalization level
    generalization_levels[attribute] = generalization_level
    return data


# Apply adjusted generalization
generalized_data_gt_50k_adjusted = apply_adjusted_generalization(
    data=table_gt_50k.copy(), 
    generalization_levels=generalization_levels_gt50k, 
    attribute='marital_status', 
    condition_value='Non-Western Origin',  # The group condition (you can adjust as needed)
    generalization_function=generalize_education, 
    condition_attribute='race',  # This ensures that the generalization happens only for the 'Non-Western Origin' group
    generalization_level=generalization_levels_gt50k['marital_status'] + 1  # Increment education's generalization level
)


# generalized_data_gt_50k_adjusted = apply_adjusted_generalization(
#     data=table_gt_50k.copy(), 
#     generalization_levels=generalization_levels_gt50k, 
#     attribute='race', 
#     condition_value='Basic Degree',  # The group condition
#     generalization_function=generalize_race, 
#     condition_attribute='education',  # This ensures that the generalization happens for the 'Basic Degree' group in education
#     generalization_level=generalization_levels_gt50k['race'] + 1  # Increment race's generalization level
# )

# Check l-diversity again
print("Rechecking >50K dataset for l-diversity after adjustment:")
satisfied_l_diversity_gt_50k_adjusted = check_l_diversity(generalized_data_gt_50k_adjusted, l)
print(satisfied_l_diversity_gt_50k_adjusted)


Rechecking >50K dataset for l-diversity after adjustment:
All groups meet the l-diversity requirement.
True


In [387]:
# Save the adjusted dataset
generalized_data_gt_50k_adjusted.to_csv('hw1-1-generalized_data_gt_50k_adjusted_for_lDiversity.csv', index=False)


Calculate Recursive (c, ℓ)-diversity

In [390]:
def check_recursive_diversity(data, l, c, attribute_groups, detailed=False):
    all_diverse = True
    failed_groups = []
    for name, group in data.groupby(attribute_groups):
        occupation_counts = group['occupation'].value_counts()
        if len(occupation_counts) < l:
            if detailed:
                failed_groups.append(name)
            all_diverse = False
            continue
        
        sorted_counts = occupation_counts.sort_values(ascending=False)
        threshold = c * sorted_counts.iloc[l-1] if len(sorted_counts) >= l else 0
        if any(sorted_counts.iloc[:l-1] > threshold):
            if detailed:
                failed_groups.append(name)
            all_diverse = False

    return all_diverse, failed_groups

In [389]:
def auto_adjust_generalization(data, generalization_levels, l, c, attribute_groups, max_attempts=10, gt_data=False):
    attempts = 0
    while attempts < max_attempts:
        # Check diversity and get detailed information about failing groups
        diverse, failed_groups = check_recursive_diversity(data, l, c, attribute_groups, detailed=True)

        if diverse:
            print("All groups meet the recursive (c, l)-diversity requirement.")
            break
        else:
            print(f"Adjusting generalization levels due to failures in groups: {failed_groups}")
            no_more_adjustments = True  # Track whether we can adjust any further
            
            # Increase generalization for failed groups
            for group in failed_groups:
                group_conditions = dict(zip(attribute_groups, group))  # Map group attributes to values

                # Adjust the generalization based on the specific group that failed
                for attribute, value in group_conditions.items():
                    current_level = generalization_levels[attribute]
                    max_depth = hierarchy_depths[attribute]

                    print(f"Before Adjustment: Attribute: {attribute}, Current Level: {current_level}, Max Depth: {max_depth}")

                    # If we're not yet at the max generalization level, increase the level
                    if current_level < max_depth:
                        # Optionally adjust the increment based on difficulty of group generalization
                        new_generalization_level = current_level + 1

                        # Apply the adjustment to generalize more
                        data = apply_adjusted_generalization(
                            data,
                            generalization_levels,
                            attribute=attribute,
                            condition_value=value,
                            generalization_function=get_generalization_function(attribute),
                            condition_attribute=attribute,
                            generalization_level=new_generalization_level  # Increase generalization by 1 level
                        )
                        
                        generalization_levels[attribute] = new_generalization_level  # Update level
                        no_more_adjustments = False  # We made at least one adjustment

                        print(f"After Adjustment: Attribute: {attribute}, New Generalization Level: {new_generalization_level}")
                    else:
                        print(f"Attribute {attribute} is already at its maximum generalization level.")
            
            # If no attributes could be adjusted, break the loop
            if no_more_adjustments:
                print("No further adjustments possible. Exiting.")
                break
        
        attempts += 1
    
    if attempts == max_attempts:
        print("Maximum adjustment attempts reached, some groups may still fail the diversity requirements.")

def get_generalization_function(attribute):
    """
    Returns the correct generalization function based on the attribute.
    """
    if attribute == 'education':
        return generalize_education
    elif attribute == 'race':
        return generalize_race
    elif attribute == 'marital_status':
        return generalize_marital_status
    elif attribute == 'age':
        return generalize_age
    else:
        raise ValueError(f"Unknown attribute: {attribute}")

In [391]:
l_recursive_data_c_point5 = table_gt_50k.copy()
generalization_levels_recursive_data_c_point5 = {
    'age': 5,
    'education': 3,
    'marital_status': 4,
    'race': 3
}

In [392]:
# When k = 5 and l = 3, c = 0.5
auto_adjust_generalization(l_recursive_data_c_point5, generalization_levels_recursive_data_c_point5, l=3, c=0.5, attribute_groups=['education', 'race'])

Adjusting generalization levels due to failures in groups: [('Education', 'Non-Western Origin'), ('Education', 'Western Origin')]
Before Adjustment: Attribute: education, Current Level: 3, Max Depth: 5
After Adjustment: Attribute: education, New Generalization Level: 4
Before Adjustment: Attribute: race, Current Level: 3, Max Depth: 5
After Adjustment: Attribute: race, New Generalization Level: 4
Before Adjustment: Attribute: education, Current Level: 4, Max Depth: 5
After Adjustment: Attribute: education, New Generalization Level: 5
Before Adjustment: Attribute: race, Current Level: 4, Max Depth: 5
After Adjustment: Attribute: race, New Generalization Level: 5
Adjusting generalization levels due to failures in groups: [('Education', 'Non-Western Origin'), ('Education', 'Race')]
Before Adjustment: Attribute: education, Current Level: 5, Max Depth: 5
Attribute education is already at its maximum generalization level.
Before Adjustment: Attribute: race, Current Level: 5, Max Depth: 5
Att

In [393]:
l_recursive_data_c_1 = table_gt_50k.copy()
generalization_levels_recursive_data_c_1 = {
    'age': 5,
    'education': 3,
    'marital_status': 4,
    'race': 3
}

In [394]:
auto_adjust_generalization(l_recursive_data_c_1, generalization_levels_recursive_data_c_1, l=3, c=1, attribute_groups=['education', 'race'])

Adjusting generalization levels due to failures in groups: [('Education', 'Non-Western Origin'), ('Education', 'Western Origin')]
Before Adjustment: Attribute: education, Current Level: 3, Max Depth: 5
After Adjustment: Attribute: education, New Generalization Level: 4
Before Adjustment: Attribute: race, Current Level: 3, Max Depth: 5
After Adjustment: Attribute: race, New Generalization Level: 4
Before Adjustment: Attribute: education, Current Level: 4, Max Depth: 5
After Adjustment: Attribute: education, New Generalization Level: 5
Before Adjustment: Attribute: race, Current Level: 4, Max Depth: 5
After Adjustment: Attribute: race, New Generalization Level: 5
Adjusting generalization levels due to failures in groups: [('Education', 'Non-Western Origin'), ('Education', 'Race')]
Before Adjustment: Attribute: education, Current Level: 5, Max Depth: 5
Attribute education is already at its maximum generalization level.
Before Adjustment: Attribute: race, Current Level: 5, Max Depth: 5
Att

In [395]:
l_recursive_data_c_2 = table_gt_50k.copy()
generalization_levels_recursive_data_c_2 = {
    'age': 5,
    'education': 3,
    'marital_status': 4,
    'race': 3
}

In [333]:
# When k = 5 and l = 3, c = 2
auto_adjust_generalization(l_recursive_data_c_2, generalization_levels_recursive_data_c_2, l=3, c=2, attribute_groups=['education', 'race'])

Adjusting generalization levels due to failures in groups: [('Education', 'Non-Western Origin')]
Before Adjustment: Attribute: education, Current Level: 3, Max Depth: 5
After Adjustment: Attribute: education, New Generalization Level: 4
Before Adjustment: Attribute: race, Current Level: 3, Max Depth: 5
After Adjustment: Attribute: race, New Generalization Level: 4
Adjusting generalization levels due to failures in groups: [('Education', 'Non-Western Origin')]
Before Adjustment: Attribute: education, Current Level: 4, Max Depth: 5
After Adjustment: Attribute: education, New Generalization Level: 5
Before Adjustment: Attribute: race, Current Level: 4, Max Depth: 5
After Adjustment: Attribute: race, New Generalization Level: 5
Adjusting generalization levels due to failures in groups: [('Education', 'Race')]
Before Adjustment: Attribute: education, Current Level: 5, Max Depth: 5
Attribute education is already at its maximum generalization level.
Before Adjustment: Attribute: race, Current

Calculate distortion and precision when k=5

In [334]:
print(f'Distortion for data with salary >50k when c=0.5: {calculate_distortion(generalization_levels_recursive_data_c_point5)}')
print("precision for data with salary >50k when c=0.5: ", calculate_precision(l_recursive_data_c_point5, generalization_levels_recursive_data_c_point5, hierarchy_depths))

Distortion for data with salary >50k when c=0.5: 1.0
precision for data with salary >50k when c=0.5:  0.0


In [335]:
print(f'Distortion for data with salary >50k when c=1: {calculate_distortion(generalization_levels_recursive_data_c_1)}')
print("precision for data with salary >50k when c=1: ", calculate_precision(l_recursive_data_c_1, generalization_levels_recursive_data_c_1, hierarchy_depths))

Distortion for data with salary >50k when c=1: 1.0
precision for data with salary >50k when c=1:  0.0


In [336]:
print(f'Distortion for data with salary >50k when c=2: {calculate_distortion(generalization_levels_recursive_data_c_2)}')
print("precision for data with salary >50k when c=2: ", calculate_precision(l_recursive_data_c_2, generalization_levels_recursive_data_c_2, hierarchy_depths))

Distortion for data with salary >50k when c=2: 1.0
precision for data with salary >50k when c=2:  0.0


In [337]:
l_recursive_data_c_2

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
7,All Ages,Self-emp-not-inc,209642,Education,9,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,45,United-States,>50K
8,All Ages,Private,45781,Education,14,Single,Prof-specialty,Not-in-family,Western Origin,Female,14084,0,50,United-States,>50K
9,All Ages,Private,159449,Education,13,Married-Together,Exec-managerial,Husband,Western Origin,Male,5178,0,40,United-States,>50K
10,All Ages,Private,280464,Education,10,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,80,United-States,>50K
11,All Ages,State-gov,141297,Education,13,Married-Together,Prof-specialty,Husband,Race,Male,0,0,40,India,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32539,All Ages,,287372,Education,16,Married-Together,Unknown,Husband,Western Origin,Male,0,0,10,United-States,>50K
32545,All Ages,Local-gov,111499,Education,12,Married-Together,Adm-clerical,Wife,Western Origin,Female,0,0,20,United-States,>50K
32554,All Ages,Private,321865,Education,14,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,40,United-States,>50K
32557,All Ages,Private,154374,Education,9,Married-Together,Machine-op-inspct,Husband,Western Origin,Male,0,0,40,United-States,>50K


In [338]:
generalized_data_gt_50k_adjusted

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
7,All Ages,Self-emp-not-inc,209642,Education,9,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,45,United-States,>50K
8,All Ages,Private,45781,Education,14,Single,Prof-specialty,Not-in-family,Western Origin,Female,14084,0,50,United-States,>50K
9,All Ages,Private,159449,Education,13,Married-Together,Exec-managerial,Husband,Western Origin,Male,5178,0,40,United-States,>50K
10,All Ages,Private,280464,Education,10,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,80,United-States,>50K
11,All Ages,State-gov,141297,Education,13,Married-Together,Prof-specialty,Husband,Non-Western Origin,Male,0,0,40,India,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32539,All Ages,,287372,Education,16,Married-Together,Unknown,Husband,Western Origin,Male,0,0,10,United-States,>50K
32545,All Ages,Local-gov,111499,Education,12,Married-Together,Adm-clerical,Wife,Western Origin,Female,0,0,20,United-States,>50K
32554,All Ages,Private,321865,Education,14,Married-Together,Exec-managerial,Husband,Western Origin,Male,0,0,40,United-States,>50K
32557,All Ages,Private,154374,Education,9,Married-Together,Machine-op-inspct,Husband,Western Origin,Male,0,0,40,United-States,>50K
