In [1]:
import pandas as pd

# Read the raw survey responses from the Excel workbook
survey_data = pd.read_excel("data/private_dataG.xlsx")  # Replace with your actual file path

# Step 1: Strip the direct identifier ('name') before any further processing
anonymized_survey_data = survey_data.drop(labels=['name'], axis=1)

# Step 2: Generalize 'dob' to new age groups: "18-35", "36-65", "65+"
def broader_age_group(dob, reference_year=2024):
    """Map a date of birth to a coarse age bracket.

    Age is approximated as ``reference_year - dob.year``; month and day are
    ignored, so the result can be one year above the person's exact age.

    Args:
        dob: A datetime-like value exposing ``.year`` (e.g. ``pd.Timestamp``),
            or a missing value such as ``pd.NaT``.
        reference_year: Calendar year the age is measured against. Defaults
            to 2024, the value that was previously hard-coded.

    Returns:
        "18-35" for ages up to 35 (there is no lower bound, so younger ages
        land in this bracket too), "36-65" for ages 36 through 65, "65+" for
        anything older, or "Unknown" when ``dob`` is missing.
    """
    # NaT.year is NaN and every comparison against NaN is False, so without
    # this guard a missing birth date falls through to the "65+" branch.
    if pd.isna(dob):
        return "Unknown"

    age = reference_year - dob.year
    if age <= 35:
        return "18-35"
    if age <= 65:
        return "36-65"
    return "65+"

# Derive the coarse age bracket from each birth date, then discard the exact date
anonymized_survey_data = (
    anonymized_survey_data
    .assign(age_group=lambda frame: pd.to_datetime(frame['dob']).apply(broader_age_group))
    .drop(columns=['dob'])
)

# Step 3: Generalize 'zip' to regions
# Zip codes that are not keys of this lookup come out of .map() as NaN.
zip_to_region = {
    **dict.fromkeys((2100, 2200), 'Region A'),
    **dict.fromkeys((2300, 2400), 'Region B'),
}
anonymized_survey_data = (
    anonymized_survey_data
    .assign(region=lambda frame: frame['zip'].map(zip_to_region))
    .drop(columns=['zip'])
)

# Step 4: Simplify 'marital_status' to broader categories
# NOTE(review): 'Widowed' is grouped under 'Married' while 'Divorced' is grouped
# under 'Single' — confirm this split is intended.
marital_mapping = {
    **dict.fromkeys(('Never married', 'Divorced'), 'Single'),
    **dict.fromkeys(('Married/separated', 'Widowed'), 'Married'),
}
anonymized_survey_data = anonymized_survey_data.assign(
    marital_status=lambda frame: frame['marital_status'].map(marital_mapping)
)

# Step 5: Generalize 'education' into fewer categories
# Two output levels only; 'Not stated' is folded into 'Basic Education'.
education_mapping = {
    **dict.fromkeys(
        ('Primary education', 'Upper secondary education'),
        'Basic Education',
    ),
    **dict.fromkeys(
        (
            'Vocational Education and Training (VET)',
            'Short cycle higher education',
            'Vocational bachelors educations',
            'Bachelors programmes',
            'Masters programmes',
            'PhD programmes',
        ),
        'Higher Education',
    ),
    'Not stated': 'Basic Education',
}
anonymized_survey_data = anonymized_survey_data.assign(
    education=lambda frame: frame['education'].map(education_mapping)
)

# Step 6: Generalize 'citizenship' to 'Domestic' or 'Foreign'
# Any value that does not compare equal to 'Denmark' is labelled 'Foreign'.
anonymized_survey_data = anonymized_survey_data.assign(
    citizenship=lambda frame: frame['citizenship'].apply(
        lambda country: 'Foreign' if country != 'Denmark' else 'Domestic'
    )
)

# Show the first rows of the generalized frame as a quick sanity check
print("Anonymized Survey Data Sample:", anonymized_survey_data.head(), sep="\n")

# Define quasi-identifiers for k-anonymity and l-diversity analysis
quasi_identifiers = ['sex', 'age_group', 'region', 'marital_status', 'education', 'citizenship']

# 1. Calculate Disclosure Risk
# Every distinct combination of quasi-identifier values forms one group;
# grouped_counts holds the number of records in each group.
# NOTE(review): groupby skips rows with a missing key by default, so records whose
# quasi-identifiers are NaN (e.g. a value absent from one of the mappings) are not
# counted here — confirm none exist.
grouped_counts = anonymized_survey_data.groupby(quasi_identifiers).size()
unique_groups = (grouped_counts == 1).sum()
# Counts groups of size 1 AND size 2, so it always includes the unique groups above.
near_unique_groups = (grouped_counts <= 2).sum()

print("\nDisclosure Risk Analysis:")
print(f"Unique groups (potential re-identification risk): {unique_groups}")
# The label previously said "size 2" although the count covers sizes 1 and 2.
print(f"Near-unique groups (groups of size <= 2): {near_unique_groups}")

# 2. Calculate k-Anonymity
# k is the size of the smallest group.
min_k_value = grouped_counts.min()
print("\nk-Anonymity Analysis:")
print(f"Minimum k value: {min_k_value}")

# 3. Calculate l-Diversity
# Number of distinct sensitive values ('party') inside each quasi-identifier group;
# the minimum over all groups is reported.
l_diversity_counts = anonymized_survey_data.groupby(quasi_identifiers)['party'].nunique()
min_l_diversity = l_diversity_counts.min()

print("\nl-Diversity Analysis:")
print(f"Minimum l-diversity value for 'party': {min_l_diversity}")

Anonymized Survey Data Sample:
      sex  evote         education citizenship marital_status  party  \
0  Female      1  Higher Education    Domestic        Married  Green   
1    Male      0   Basic Education    Domestic        Married  Green   
2    Male      0  Higher Education    Domestic        Married    Red   
3    Male      0  Higher Education    Domestic        Married    Red   
4    Male      1  Higher Education     Foreign        Married  Green   

  age_group    region  
0     36-65  Region B  
1       65+  Region A  
2     36-65  Region A  
3     36-65  Region A  
4     18-35  Region A  

Disclosure Risk Analysis:
Unique groups (potential re-identification risk): 17
Near-unique groups (groups of size 2): 25

k-Anonymity Analysis:
Minimum k value: 1

l-Diversity Analysis:
Minimum l-diversity value for 'party': 1


In [2]:
# Keep only the quasi-identifier combinations that occur exactly once
unique_groups = grouped_counts.loc[grouped_counts.eq(1)]

print("Unique Groups (potential re-identification risk):", unique_groups, sep="\n")


Unique Groups (potential re-identification risk):
sex     age_group  region    marital_status  education         citizenship
Female  18-35      Region A  Married         Higher Education  Domestic       1
                   Region B  Married         Higher Education  Foreign        1
                             Single          Higher Education  Foreign        1
        36-65      Region A  Married         Basic Education   Domestic       1
                                             Higher Education  Foreign        1
                             Single          Basic Education   Domestic       1
                   Region B  Married         Basic Education   Foreign        1
                                             Higher Education  Foreign        1
                             Single          Higher Education  Foreign        1
        65+        Region A  Single          Basic Education   Domestic       1
                                             Higher Education  Domestic    

In [3]:
# Step 1: Identify high-risk groups (unique and near-unique records)
# Group by quasi-identifiers and keep the key combinations shared by at most two records.
grouped_counts = anonymized_survey_data.groupby(quasi_identifiers).size()
high_risk_groups = grouped_counts[grouped_counts <= 2].index

# Step 2: Apply local suppression to high-risk groups
# Flag every record whose quasi-identifier combination is one of the high-risk keys.
# MultiIndex.isin tests all rows in one vectorized call, replacing the per-row
# Python lambda and the temporary 'suppressed' helper column.
is_high_risk = pd.MultiIndex.from_frame(
    anonymized_survey_data[quasi_identifiers]
).isin(high_risk_groups)

# Blank out four of the six quasi-identifiers for the flagged records;
# 'sex' and 'marital_status' are left as they are.
suppressed_columns = ['age_group', 'region', 'education', 'citizenship']
anonymized_survey_data.loc[is_high_risk, suppressed_columns] = "Unknown"

# Recalculate Disclosure Risk, k-Anonymity, and l-Diversity after local suppression
grouped_counts = anonymized_survey_data.groupby(quasi_identifiers).size()
unique_groups = (grouped_counts == 1).sum()
# Counts groups of size 1 AND size 2, so it always includes the unique groups above.
near_unique_groups = (grouped_counts <= 2).sum()

print("\nRevised Disclosure Risk Analysis with Local Suppression:")
print(f"Unique groups (potential re-identification risk): {unique_groups}")
# The label previously said "size 2" although the count covers sizes 1 and 2.
print(f"Near-unique groups (groups of size <= 2): {near_unique_groups}")

# Recalculate minimum k-anonymity and l-diversity
min_k_value = grouped_counts.min()
print("\nk-Anonymity Analysis:")
print(f"Minimum k value: {min_k_value}")

l_diversity_counts = anonymized_survey_data.groupby(quasi_identifiers)['party'].nunique()
min_l_diversity = l_diversity_counts.min()
print("\nl-Diversity Analysis:")
print(f"Minimum l-diversity value for 'party': {min_l_diversity}")



Revised Disclosure Risk Analysis with Local Suppression:
Unique groups (potential re-identification risk): 0
Near-unique groups (groups of size 2): 0

k-Anonymity Analysis:
Minimum k value: 3

l-Diversity Analysis:
Minimum l-diversity value for 'party': 1
