<a href="https://colab.research.google.com/github/abojha/dataPrivacyLab/blob/main/lkc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the dataset from 'adult.csv'; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv('adult.csv')

In [None]:
# Preview the first rows of the loaded frame to see which columns are available
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
# Keep only the five quasi-identifier columns and the sensitive 'income' column.
columns = ['age', 'education', 'relationship', 'sex', 'race', 'income']
# .copy() makes the subset an independent frame. Later cells overwrite its
# columns one by one (df['age'] = ...); without the copy pandas may treat those
# writes as assignments into a slice of the loaded frame and emit
# SettingWithCopyWarning.
df = df[columns].copy()

In [None]:
# Preview the frame again after restricting it to the selected columns
df.head()

Unnamed: 0,age,education,relationship,sex,race,income
0,39,Bachelors,Not-in-family,Male,White,<=50K
1,50,Bachelors,Husband,Male,White,<=50K
2,38,HS-grad,Not-in-family,Male,White,<=50K
3,53,11th,Husband,Male,Black,<=50K
4,28,Bachelors,Wife,Female,Black,<=50K


In [None]:
# Define Quasi-Identifiers (QIs) and Sensitive Attribute.
# Both names are module-level globals: the privacy-check and enforcement
# functions defined in later cells read them directly instead of taking them
# as parameters, so this cell must run before any of those functions is called.
quasi_identifiers = ['age', 'education', 'relationship', 'sex', 'race']
sensitive_attribute = 'income'

In [None]:
from itertools import combinations
# Function to generate all subsets of quasi-identifiers of size ≤ L
def get_qid_subsets(qi_list, L):
    """Return every combination of quasi-identifiers holding 1 to ``L`` items.

    Args:
        qi_list: Sequence of quasi-identifier column names.
        L: Largest combination size to generate.

    Returns:
        A list of tuples, ordered by increasing size and, within one size, in
        the order ``itertools.combinations`` yields them. The list is empty
        when ``L`` is smaller than 1.
    """
    return [
        subset
        for size in range(1, L + 1)
        for subset in combinations(qi_list, size)
    ]

# Function to check LKC-Privacy
def verify_LKC_privacy(df, L, K, C, max_report=10):
    """Check ``df`` against the LKC-privacy conditions and print a report.

    For every combination of at most ``L`` columns taken from the module-level
    ``quasi_identifiers`` list, the rows are grouped on those columns and each
    group is tested for two conditions:

    * K-anonymity: the group holds at least ``K`` rows.
    * C-confidence: no value of the module-level ``sensitive_attribute``
      column makes up a share of the group strictly greater than ``C``.

    Args:
        df: DataFrame containing the quasi-identifier and sensitive columns.
        L: Largest number of quasi-identifiers combined in one grouping.
        K: Minimum allowed group size.
        C: Maximum allowed share of a single sensitive value within a group.
        max_report: Maximum number of violating groups listed per condition;
            ``None`` lists all of them.

    Returns:
        None. The verdict and the violating groups are written to stdout.
    """
    violating_k_groups = []
    violating_c_groups = []

    def describe(subset_columns, key):
        # Depending on the pandas version, grouping on a single column yields
        # either a scalar key or a 1-tuple; normalise so zip() pairs correctly.
        values = key if isinstance(key, tuple) else (key,)
        return ", ".join(f"{col}={val}" for col, val in zip(subset_columns, values))

    def report(lines):
        # List the violations, truncated to max_report entries.
        limit = len(lines) if max_report is None else max(int(max_report), 0)
        for line in lines[:limit]:
            print(f"   - {line}")
        hidden = len(lines) - limit
        if hidden > 0:
            print(f"   ... and {hidden} more")

    for qid_subset in get_qid_subsets(quasi_identifiers, L):
        for qid, group in df.groupby(list(qid_subset)):
            # Condition 1: K-Anonymity Check
            if len(group) < K:
                violating_k_groups.append((qid_subset, qid, len(group)))

            # Condition 2: C-Confidence Bound Check
            shares = group[sensitive_attribute].value_counts(normalize=True)
            for s, confidence in shares.items():
                if confidence > C:
                    violating_c_groups.append((qid_subset, qid, s, confidence))

    # Print Violations
    if not violating_k_groups:
        print(f"✅ The dataset satisfies L={L}, K={K} condition.")
    else:
        print(f"❌ The dataset does NOT satisfy L={L}, K={K}! Violating groups:")
        report([
            f"{describe(subset, key)}: group size {size} < K={K}"
            for subset, key, size in violating_k_groups
        ])

    if not violating_c_groups:
        print(f"✅ The dataset satisfies the C={C} confidence threshold.")
    else:
        print(f"❌ The dataset does NOT satisfy the C={C} confidence bound! Violating groups:")
        report([
            f"{describe(subset, key)}: {s} has confidence {confidence:.2f} > C={C}"
            for subset, key, s, confidence in violating_c_groups
        ])



In [None]:
# Count the quasi-identifier groups with fewer than k rows (0 means the data is k-anonymous)
def is_anony(df, k):
    """Count the groups that violate k-anonymity.

    Rows of ``df`` are grouped on the module-level ``quasi_identifiers``
    columns. Despite the predicate-style name, the return value is an integer:
    the number of groups holding fewer than ``k`` rows, so 0 means every
    group has at least ``k`` rows.
    """
    group_sizes = df.groupby(quasi_identifiers).size()
    too_small = group_sizes < k
    return int(too_small.sum())


In [None]:
# Number of quasi-identifier groups with fewer than 3 rows, measured before
# any generalization has been applied to the columns.
val = is_anony(df, 3)
val

4392

In [None]:
# Generalization functions used to k-anonymize each quasi-identifier column

def a_age(age):
    """Generalize a numeric age into one of three age bands.

    Returns "young" for ages below 30, "middle_aged" for ages from 30 up to
    (but not including) 50, and "old" for everything else.
    """
    if age < 30:
        return "young"
    return "middle_aged" if age < 50 else "old"

def a_education(education):
    """Generalize an education level into two coarse categories.

    "Masters" and "Bachelors" map to "Higher_Studies"; every other value maps
    to "Lower_Studies".

    NOTE(review): any other degree label present in the data (for example a
    doctorate, if the column contains one) also lands in "Lower_Studies" -
    confirm this is the intended hierarchy.
    """
    higher_levels = ("Masters", "Bachelors")
    return "Higher_Studies" if education in higher_levels else "Lower_Studies"

def a_sex(sex):
    """Suppress the sex attribute: every input maps to the same placeholder."""
    suppressed_label = "Any Sex"
    return suppressed_label

def a_race(race):
    """Suppress the race attribute: every input maps to the same placeholder."""
    suppressed_label = "Any Race"
    return suppressed_label

def a_relationship(relationship):
    """Generalize a relationship value into a family / non-family flag.

    "Husband", "Wife" and "Own-child" map to "My Family"; every other value
    maps to "Non-Family".
    """
    family_roles = ("Husband", "Wife", "Own-child")
    return "My Family" if relationship in family_roles else "Non-Family"

In [None]:
# Generalize the quasi-identifier columns in place, one column at a time.
# The mapping order is the order in which the columns are rewritten.
generalizers = {
    'age': a_age,
    'relationship': a_relationship,
    'education': a_education,
    'sex': a_sex,
    'race': a_race,
}
for column_name, generalize in generalizers.items():
    df[column_name] = df[column_name].apply(generalize)

In [None]:
# Re-count the quasi-identifier groups with fewer than 3 rows, now that the
# columns have been generalized; 0 means the frame is 3-anonymous.
val = is_anony(df, 3)
val

0

In [None]:
# Run the LKC-privacy check on the generalized frame.
#   L - max prior knowledge size
#   K - anonymity threshold
#   C - max confidence threshold
L, K, C = 3, 2, 0.5
verify_LKC_privacy(df, L, K, C)

✅ The dataset satisfies L=3, K=2 condition.
❌ The dataset does NOT satisfy the C=0.5 confidence bound! Violating groups:


In [None]:
# Function to enforce C-Distinctness (Optimized)
def enforce_C_distinctness(df, C, random_state=None):
    """Rebalance sensitive values so that, where possible, no value exceeds C.

    Rows are grouped on the module-level ``quasi_identifiers`` columns. In each
    group, any value of the module-level ``sensitive_attribute`` column whose
    share of the group is strictly greater than ``C`` has some of its rows
    reassigned to other sensitive values.

    How the rebalancing works:

    * The allowed number of rows per value is computed with integer counts
      (``floor(C * group_size)``), so float rounding cannot change how many
      rows are modified.
    * Each reassigned row goes to the value that is currently least frequent
      in the group, so the fix does not push another value above ``C``.
    * A row is only moved when that lowers the group's largest count. When the
      bound cannot be met (e.g. two sensitive values, an odd group size and
      ``C=0.5``) the group is balanced as far as possible and a warning is
      printed instead of silently swapping which value violates the bound.
    * Which rows are changed is chosen at random among the rows holding the
      violating value.

    Args:
        df: DataFrame to rebalance. It is modified in place.
        C: Maximum allowed share of one sensitive value within a group.
        random_state: Optional seed for the row selection. ``None`` uses
            NumPy's global random state.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Replacement candidates are the sensitive values present in the dataset;
    # missing values are excluded so NaN is never written as a replacement.
    all_values = list(df[sensitive_attribute].dropna().unique())
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for qid, group in df.groupby(quasi_identifiers):
        counts = group[sensitive_attribute].value_counts()
        # value_counts ignores missing values, matching how the share of a
        # value is computed with value_counts(normalize=True).
        total_records = int(counts.sum())
        if total_records == 0:
            continue

        # Largest row count whose share does not exceed C. The small epsilon
        # keeps a product such as 0.3 * 10 from flooring to 2 instead of 3.
        max_allowed = int(np.floor(C * total_records + 1e-9))
        live = {value: int(n) for value, n in counts.items()}
        reported = False

        for s in list(counts.index):
            if live[s] <= max_allowed:
                continue
            reported = True
            confidence = live[s] / total_records
            print(f"Fixing C-Distinctness violation for group {qid} - {s} has confidence {confidence:.2f}")

            # Plan the moves on the counters first: one target value per row.
            candidates = [v for v in all_values if v != s]
            targets = []
            while candidates and live[s] > max_allowed:
                target = min(candidates, key=lambda v: live.get(v, 0))
                if live.get(target, 0) + 1 >= live[s]:
                    # Moving one more row would only swap which value is the
                    # most frequent, not lower the maximum share.
                    break
                live[target] = live.get(target, 0) + 1
                live[s] -= 1
                targets.append(target)

            if targets:
                # Read the current values from df: rows of this group may have
                # been rewritten while an earlier value was being fixed.
                current = df.loc[group.index, sensitive_attribute]
                holders = current.index[current == s].to_numpy()
                change_indices = rng.choice(holders, len(targets), replace=False)
                df.loc[change_indices, sensitive_attribute] = targets

            # Recalculate confidence after modification
            updated_confidence = df.loc[group.index, sensitive_attribute].value_counts(normalize=True).max()
            print(f"  → After modification, new max confidence: {updated_confidence:.2f}")

        if reported and max(live.values()) > max_allowed:
            print(
                f"  ⚠ C={C} cannot be met for group {qid}: {total_records} records, "
                f"{len(all_values)} distinct sensitive values"
            )

    return df


In [None]:
# Apply C-Distinctness Enforcement.
# C is the confidence threshold assigned in the LKC-privacy check cell, so that
# cell must have been run first; the returned frame is rebound to df.
df = enforce_C_distinctness(df, C)

Fixing C-Distinctness violation for group ('middle_aged', 'Higher_Studies', 'My Family', 'Any Sex', 'Any Race') - >50K has confidence 0.70
  → After modification, new max confidence: 0.50
Fixing C-Distinctness violation for group ('middle_aged', 'Higher_Studies', 'Non-Family', 'Any Sex', 'Any Race') - <=50K has confidence 0.77
  → After modification, new max confidence: 0.50
Fixing C-Distinctness violation for group ('middle_aged', 'Lower_Studies', 'My Family', 'Any Sex', 'Any Race') - <=50K has confidence 0.64
  → After modification, new max confidence: 0.50
Fixing C-Distinctness violation for group ('middle_aged', 'Lower_Studies', 'Non-Family', 'Any Sex', 'Any Race') - <=50K has confidence 0.93
  → After modification, new max confidence: 0.50
Fixing C-Distinctness violation for group ('old', 'Higher_Studies', 'My Family', 'Any Sex', 'Any Race') - >50K has confidence 0.69
  → After modification, new max confidence: 0.50
Fixing C-Distinctness violation for group ('old', 'Higher_Studies

In [None]:
# Re-run the LKC-privacy check, this time after C-distinctness enforcement.
#   L - max prior knowledge size
#   K - anonymity threshold
#   C - max confidence threshold
L, K, C = 3, 2, 0.5
verify_LKC_privacy(df, L, K, C)

✅ The dataset satisfies L=3, K=2 condition.
❌ The dataset does NOT satisfy the C=0.5 confidence bound! Violating groups:
