# ITIS6220: Data Privacy
## Homework 1

In [1]:
# Load the data and libraries
import pandas as pd
import numpy as np

# Full dataset, including the directly identifying columns
adult = pd.read_csv('adult_with_pii.csv')

# Identifying view used as the auxiliary data for the linking attack
adult_pii = adult.loc[:, ['Name', 'DOB', 'SSN', 'Zip', 'Age']]

# "De-identified" view: only the Name and SSN columns are removed
adult_deid = adult.drop(['Name', 'SSN'], axis=1)

## Question 1 (20 points)

Using the dataframes `adult_pii` and `adult_deid`, write code to conduct a linking attack to recover the names of as many individuals in `adult_deid` as possible. Your solution should be parameterized by the set of columns to use in the attack.

In [27]:
def linking_attack(cols):
    """
    Link `adult_pii` to `adult_deid` on the given columns to recover names.

    Parameters:
        cols (list): Names of the columns, present in both dataframes, to join on.

    Returns:
        DataFrame: One row per pair of records (one from each dataframe) that
        agree on every column in `cols`, with `Name` followed by `cols`.
    """
    # Inner join: only pairs that agree on every linking column are kept
    linked = pd.merge(adult_pii, adult_deid, how="inner", on=cols)
    wanted_columns = ['Name'] + cols
    return linked[wanted_columns]

In [28]:
# TEST CASES for Question 1
# Each expected value is the number of rows the inner join produces for that
# set of linking columns (one row per matching pair of records), so a less
# selective key such as Zip alone yields more rows than Zip combined with DOB.
assert len(linking_attack(['Zip'])) == 43191
assert len(linking_attack(['Zip', 'DOB'])) == 32563
assert len(linking_attack(['Zip', 'Age'])) == 32755

## Question 2 (5 points)

How many individuals in this dataset are uniquely identified by their Zip code? How many are uniquely identified by their age?

Hint: note that the number of *unique ZIP codes* is **different** from the number of *individuals uniquely identified by ZIP code*.

Hint: you can use the `value_counts` method to count the number of occurrences of each value in a series.

In [5]:
def unique_zipcode():
    """
    Count the individuals in `adult` who are uniquely identified by Zip code.

    Returns:
        int: Number of Zip values that occur in exactly one row.
    """
    occurrences = adult["Zip"].value_counts()
    # A Zip code seen exactly once pins down a single individual
    singletons = occurrences[occurrences == 1]
    return len(singletons)

def unique_dob():
    """
    Count the individuals in `adult` who are uniquely identified by date of birth.

    NOTE(review): the Question 2 prompt asks about age, while this function and
    its test cases use the `DOB` column - confirm which column is intended.

    Returns:
        int: Number of DOB values that occur in exactly one row.
    """
    occurrences = adult["DOB"].value_counts()
    # A DOB seen exactly once pins down a single individual
    singletons = occurrences[occurrences == 1]
    return len(singletons)

In [6]:
# TEST CASES for Question 2
# These are sanity checks only: they verify the type and sign of each result,
# not the actual counts.

# Ensure each function returns a plain Python int
assert isinstance(unique_zipcode(), int), "unique_zipcode() should return an integer"
assert isinstance(unique_dob(), int), "unique_dob() should return an integer"

# Ensure the values are greater than 0 (i.e., at least one unique entry exists)
assert unique_zipcode() > 0, "unique_zipcode() should return a positive count"
assert unique_dob() > 0, "unique_dob() should return a positive count"

print("All test cases passed for Question 2!")


All test cases passed for Question 2!


## Question 3 (10 points)

Write code to determine the `Education-Num` of the individual named Ardyce Golby by performing a differencing attack. Your code should *only* use aggregate data to find Ardyce's education number.

In [7]:
def ardyce_education():
    """
    Recover Ardyce Golby's `Education-Num` with a differencing attack.

    Only two aggregate sums are used: one over every row of `adult`, and one
    over every row except those named "Ardyce Golby". Their difference is the
    value contributed by the excluded row(s).

    Returns:
        int: The difference between the two sums.
    """
    is_other_person = adult["Name"] != "Ardyce Golby"

    # Aggregate over everyone, then over everyone but the target
    sum_everyone = adult["Education-Num"].sum()
    sum_without_target = adult.loc[is_other_person, "Education-Num"].sum()

    # Cast to a plain int so the result has a consistent type
    return int(sum_everyone - sum_without_target)

In [8]:
# TEST CASE for Question 3
# The differencing attack is expected to yield an Education-Num of 12.
assert ardyce_education() == 12, "Test failed: The result should be 12"

print("All test cases passed for Question 3!")

All test cases passed for Question 3!


## Question 4 (20 points)

Implement a more efficient version of `is_k_anonymous`. The inefficient implementation, taken from the textbook, appears below.

**Hint**: use the `value_counts` or `groupby` methods, and make sure no count is less than $k$.

In [9]:
import pandas as pd

# Load the dataset
# NOTE(review): this repeats the pandas import and the `read_csv` call from the
# notebook's first cell and rebinds `adult` to a freshly read copy of the same
# file; it looks redundant - confirm it can be removed.
adult = pd.read_csv("adult_with_pii.csv")  # Ensure this file exists in your working directory

In [10]:
def is_k_anonymous(k, qis, df):
    """
    Returns True if df satisfies k-Anonymity for the quasi-identifiers qis.
    Returns False otherwise.

    A dataframe is k-anonymous when every combination of quasi-identifier
    values that occurs in it is shared by at least `k` rows.

    Parameters:
        k (int): The minimum number of occurrences required for k-anonymity.
        qis (list): List of quasi-identifier column names.
        df (DataFrame): The dataset to check.

    Returns:
        bool: True if every group has at least `k` rows (vacuously True for a
        dataframe with no rows), False otherwise.
    """
    # With no rows there is nobody to single out. Without this guard, `.min()`
    # of an empty Series is NaN and the comparison below would report False.
    if len(df) == 0:
        return True

    # dropna=False keeps rows whose quasi-identifiers are missing; by default
    # groupby discards them, so they would never be checked at all.
    group_sizes = df.groupby(qis, dropna=False).size()

    # The smallest group decides the outcome; convert numpy.bool_ to a plain bool
    return bool(group_sizes.min() >= k)

In [11]:
def generalize(df, depths):
    """
    Generalizes specified columns in the dataframe by replacing the least-significant digits.

    Generalizing a column to depth n divides each value by 10**n, truncates the
    quotient to an integer and multiplies back, which zeroes the n
    least-significant digits. Columns that have no entry in `depths` are
    returned unchanged instead of raising a KeyError.

    Parameters:
        df (DataFrame): The dataset to generalize.
        depths (dict): Dictionary specifying how much to generalize each column.

    Returns:
        DataFrame: The generalized dataset (the input dataframe is not modified).
    """
    def _generalize_column(column):
        # Leave columns without a depth specification untouched
        if column.name not in depths:
            return column
        step = 10 ** depths[column.name]
        return column.apply(lambda value: int(int(value / step) * step))

    return df.apply(_generalize_column)

In [12]:
def generalize_adult_age():
    """
    Build an Age-only view of `adult`, generalized to depth 1.

    Returns:
        DataFrame: A single `Age` column, passed through `generalize` with a
        depth of 1 (the last digit replaced by 0).
    """
    # Keep a one-column DataFrame (not a Series) so `generalize` can work per column
    age_only = adult[['Age']]
    return generalize(age_only, {'Age': 1})

In [13]:
# TEST CASE for Question 4
# Ages generalized to depth 1 are expected to form groups of at least 20 rows each.
assert is_k_anonymous(20, ['Age'], generalize_adult_age()), "Test failed: Age column is not 20-anonymous"

print("All test cases passed for Question 4!")

All test cases passed for Question 4!


## Question 5 (10 points)

Consider the definition of `generalize` below, taken from the textbook. The function takes a dataframe `df` and a dictionary `depths` that describes how much to generalize each column of `df`. Generalizing a column to a depth of $n$ replaces the $n$ least-significant digits of each number in that column by zeroes. For example, we could generalize column `A` by making its least-significant digit a 0 and column `B` by doing the same for 2 digits with the following depth specification:

In [14]:
# Example depth specification: column name -> number of trailing digits to zero.
# Column A loses its last digit, column B its last two digits.
depths = {'A': 1, 'B': 2}

In [15]:
def generalize(df, depths):
    """
    Zero out the least-significant digits of selected columns.

    Parameters:
        df (DataFrame): The dataset to generalize.
        depths (dict): Maps a column name to the number of trailing digits to
            replace with zeroes. Columns not listed are passed through as-is.

    Returns:
        DataFrame: The generalized dataset.
    """
    def _generalize_column(column):
        # Columns without a depth entry are returned untouched
        if column.name not in depths:
            return column
        step = 10 ** depths[column.name]
        # Divide, truncate to an integer, then scale back up
        return column.apply(lambda value: int(int(value / step) * step))

    return df.apply(_generalize_column)

Using the `generalize` function, generalize the `Age` column of the `adult` dataset to a depth of 1. Drop the other columns of the dataset. Your result should achieve $k$-Anonymity for $k=20$.

In [16]:
def generalize_adult_age():
    """
    Drop every column of `adult` except Age and generalize Age to depth 1.

    Returns:
        DataFrame: A single `Age` column whose last digit has been replaced by 0.
    """
    # Double brackets keep the selection a DataFrame, as `generalize` expects
    age_column = adult[['Age']]
    age_depths = {'Age': 1}
    return generalize(age_column, age_depths)

In [17]:
# TEST CASE for Question 5
# The Age-only dataframe generalized to depth 1 must satisfy k-Anonymity for k=20.
assert is_k_anonymous(20, ['Age'], generalize_adult_age()), "Test failed: Age column is not 20-anonymous"

print("All test cases passed for Question 5!")

All test cases passed for Question 5!


## Question 6 (10 points)

Using the `generalize` function, generalize the `Age` and `Zip` columns of the `adult` dataset in order to achieve $k$-Anonymity for $k=5$. Your result should drop other columns besides these two.

In [18]:
def generalize_adult_age_zip():
    """
    Keep only the Age and Zip columns of `adult` and generalize both,
    aiming for k-Anonymity with k=5.

    Returns:
        DataFrame: Age generalized to depth 2 and Zip generalized to depth 3.
    """
    quasi_identifiers = adult[['Age', 'Zip']]

    # Depth 2 zeroes the last two digits of Age, so with the `generalize`
    # defined in this notebook every two-digit age becomes 0.
    # Depth 3 zeroes the last three digits of Zip.
    chosen_depths = {'Age': 2, 'Zip': 3}

    return generalize(quasi_identifiers, chosen_depths)

In [19]:
generalized_df = generalize_adult_age_zip()
grouped_counts = generalized_df.groupby(['Age', 'Zip']).size()

# Any (Age, Zip) group with fewer than 5 rows would break 5-Anonymity
too_small = grouped_counts < 5
print("Number of groups with fewer than 5 members:", too_small.sum())

Number of groups with fewer than 5 members: 0


In [20]:
# TEST CASE for Question 6
# The generalized Age/Zip dataframe must satisfy k-Anonymity for k=5.
assert is_k_anonymous(5, ['Age', 'Zip'], generalize_adult_age_zip()), "Test failed: Age and Zip columns are not 5-anonymous"

print("All test cases passed for Question 6!")

All test cases passed for Question 6!
