In [1]:
import pandas as pd
import numpy as np  

# Load the raw, non-anonymised records from the Excel workbook.
private_data = pd.read_excel(io='private_dataM.xlsx')

In [2]:
# Work on an independent (deep) copy so `private_data` itself is never modified.
anon = private_data.copy(deep=True)

In [3]:
# Remove direct identifiers: the 'name' column is dropped from the working copy.
anon = anon.drop('name', axis='columns')

In [4]:
# generalise quasi-identifiers
# Age is very specific, so it is transformed to age bands. The rest of this cell
# merges every non-Danish citizenship into 'Other', merges divorced/widowed,
# and merges education levels.

# NOTE: ages are computed relative to the day the cell is run, so band
# membership can change between runs on different dates.
today = pd.Timestamp.today()
anon['dob'] = pd.to_datetime(anon['dob'], format='%d/%m/%Y')

# Age in completed years: calendar-year difference, minus one when this
# year's birthday has not happened yet. (Dividing the day count by 365
# ignores leap days and can report a person one year too old shortly
# before their birthday.)
birthday_pending = (
    (anon['dob'].dt.month > today.month)
    | ((anon['dob'].dt.month == today.month) & (anon['dob'].dt.day > today.day))
)
anon['age'] = today.year - anon['dob'].dt.year - birthday_pending.astype(int)

bins = [18, 29, 39, 49, 59, 69, 120]
labels = ['18-29','30-39','40-49','50-59','60-69', '70+']
# pd.cut intervals are right-closed by default, i.e. (18, 29], (29, 39], ...
# include_lowest=True closes the first interval on the left so that age 18
# lands in '18-29' instead of becoming NaN. Ages below 18 or above 120 still
# yield NaN.
anon['age_band'] = pd.cut(anon['age'], bins=bins, labels=labels, include_lowest=True)
anon = anon.drop(columns=['dob', 'age'])

# Citizenship: anything that is not exactly 'Denmark' is folded into 'Other'.
non_danish = anon['citizenship'].ne('Denmark')
anon.loc[non_danish, 'citizenship'] = 'Other'

# Marital status: 'Divorced' and 'Widowed' share one merged category.
divorced_or_widowed = anon['marital_status'].isin(['Divorced', 'Widowed'])
anon.loc[divorced_or_widowed, 'marital_status'] = 'Divorced/Widowed'

# Education: merged category -> the raw categories it absorbs.
# Values not listed here are left unchanged by the replace below.
education_groups = {
    'Secondary education': [
        'Upper secondary education',
        'Vocational Education and Training (VET)',
    ],
    'Vocational/Medium cycle': [
        'Vocational bachelors educations',
        'Short cycle higher education',
    ],
    'Higher education': [
        'Bachelors programmes',
        'Masters programmes',
        'PhD programmes',
    ],
    'Other/Not stated': [
        'Not stated',
        'Qualifying educational programmes',
    ],
}

# Flatten to the raw -> merged lookup used by Series.replace.
education_map = {
    raw_level: merged_level
    for merged_level, raw_levels in education_groups.items()
    for raw_level in raw_levels
}

anon['education'] = anon['education'].replace(education_map)

In [26]:
# suppress rare combinations with k-anonymity
# NOTE: this cell only *counts* the rows whose quasi-identifier combination is
# unique (group size 1); nothing is removed from `anon` here.
# TODO: improve this so we don't remove most of the dataset
# could remove zip & education, results in only 46 risky records

test = anon.copy()
qid = [ 'age_band', 'citizenship', 'sex', 'evote', 'party']
# 'age_band' is produced by pd.cut and is therefore categorical. observed=True
# restricts the grouping to combinations that actually occur, instead of
# relying on the deprecated observed=False default, which also emits
# zero-count rows for unobserved category combinations.
counts = test.groupby(qid, observed=True).size().reset_index(name='count')
# Attach each row's group size; a left merge keeps every row of `test`.
riskydf = test.merge(counts, on=qid, how='left')
# Rows that are the only member of their quasi-identifier group.
risky = riskydf[riskydf['count'] == 1]
print(len(test), len(risky))

200 25


In [37]:
# adding probabilistic anonymity (random value replacement) for party, evote, education and zip


def randomize_within_groups(df, columns_to_randomize, random=0.15, random_seed=1):
    """Return a copy of ``df`` with selected columns randomly perturbed.

    For each column named in ``columns_to_randomize`` every row is selected
    independently with probability ``random``; selected rows receive a value
    drawn uniformly from that column's list of possible values (the draw may
    equal the original value). Despite the function name, no grouping is
    performed: each column is processed over the whole frame.

    Two columns get additional treatment after the random replacement:

    * ``'party'``: every remaining ``'Invalid vote'`` is replaced by a random
      choice of ``'Red'`` / ``'Green'``.
    * ``'education'``: every remaining ``'Other/Not stated'`` is replaced by a
      random choice among the four concrete education levels.

    Progress information is printed for every processed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns_to_randomize : dict
        Maps column name -> list of values to draw replacements from.
        Columns missing from ``df`` are reported with a warning and skipped.
    random : float, default 0.15
        Per-row probability of replacing a value.
    random_seed : int or None, default 1
        Seed for the private random generator.

    Returns
    -------
    pandas.DataFrame
        The perturbed copy of ``df``.
    """
    # A private legacy generator produces the same draws as calling
    # np.random.seed(random_seed) followed by np.random.* functions, but does
    # not reseed NumPy's process-wide RNG as a hidden side effect.
    rng = np.random.RandomState(random_seed)

    df_anonymized = df.copy()
    n_rows = len(df)

    for col_name, possible_values in columns_to_randomize.items():
        if col_name not in df.columns:
            print(f"Warning: Column '{col_name}' not found in dataframe")
            continue

        print(f"\nProcessing column: {col_name}")

        # For each row, decide independently whether to randomize it
        # (probability `random`).
        randomize_mask = rng.random_sample(n_rows) < random
        n_to_randomize = randomize_mask.sum()
        # Guard the percentage so an empty frame reports 0.0% instead of nan.
        pct_to_randomize = 100 * n_to_randomize / n_rows if n_rows else 0.0

        print(f"  Total values: {n_rows}")
        print(f"  Values to randomize: {n_to_randomize} ({pct_to_randomize:.1f}%)")

        # Generate replacement values for the selected rows
        random_values = rng.choice(possible_values, size=n_to_randomize)

        indices_to_change = df.index[randomize_mask]
        if len(indices_to_change) > 0:
            # Show up to five affected rows before and after the change.
            sample_size = min(5, len(indices_to_change))
            sample_indices = indices_to_change[:sample_size]
            print(f"  Sample original values: {df_anonymized.loc[sample_indices, col_name].tolist()}")

            # Assign random values
            df_anonymized.loc[randomize_mask, col_name] = random_values

            print(f"  Sample new values: {df_anonymized.loc[sample_indices, col_name].tolist()}")

        # Special handling for 'party' column: 'Invalid vote' values that
        # survived the step above are always replaced.
        if col_name == 'party':
            invalid_mask = df_anonymized[col_name] == 'Invalid vote'
            n_invalid = invalid_mask.sum()
            if n_invalid > 0:
                print(f"  Found {n_invalid} 'Invalid vote' values, randomizing to Red/Green")
                df_anonymized.loc[invalid_mask, col_name] = rng.choice(['Red', 'Green'], size=n_invalid)

        # Special handling for 'education' column: the catch-all category is
        # always replaced by a concrete education level.
        if col_name == 'education':
            other_mask = df_anonymized[col_name] == 'Other/Not stated'
            n_other = other_mask.sum()
            if n_other > 0:
                print(f"  Found {n_other} 'Other/Not stated' values, randomizing to other education levels")
                edu_levels = ['Primary education', 'Secondary education', 'Vocational/Medium cycle', 'Higher education']
                df_anonymized.loc[other_mask, col_name] = rng.choice(edu_levels, size=n_other)

    return df_anonymized


# Example usage:
if __name__ == "__main__":
    # Demo run: perturb a copy of `anon` and report value counts before/after.
    df = anon.copy()

    # (printed label, column name) pairs used for both value-count reports.
    report_columns = (
        ("Party:", 'party'),
        ("Evote:", 'evote'),
        ("Education:", 'education'),
        ('Zip:', 'zip'),
    )

    print("\nOriginal value counts:")
    for label, column in report_columns:
        print(label, df[column].value_counts().to_dict())

    # Columns to randomize, each with the pool of values replacements are drawn from.
    columns_to_randomize = {
        'party': ['Green', 'Red'],
        'evote': [0, 1],
        'education': ['Primary education', 'Secondary education', 'Vocational/Medium cycle', 'Higher education'],
        'zip': df['zip'].unique().tolist()
    }

    # Run the randomization with the function's default probability and seed.
    df_anon = randomize_within_groups(df=df, columns_to_randomize=columns_to_randomize)

    banner = "="*50
    print("\n" + banner)
    print("RESULTS")
    print(banner)
    print("\nAnonymized value counts:")
    for label, column in report_columns:
        print(label, df_anon[column].value_counts().to_dict())

    # Count, per column, the rows whose value differs from the input.
    party_changed = df['party'].ne(df_anon['party']).sum()
    evote_changed = df['evote'].ne(df_anon['evote']).sum()
    edu_changed = df['education'].ne(df_anon['education']).sum()
    zip_changed = df['zip'].ne(df_anon['zip']).sum()

    print(f"\nRows where 'party' changed: {party_changed}")
    remaining_changes = (
        ('evote', evote_changed),
        ('education', edu_changed),
        ('zip', zip_changed),
    )
    for column, n_changed in remaining_changes:
        print(f"Rows where '{column}' changed: {n_changed}")



Original value counts:
Party: {'Green': 126, 'Red': 72, 'Invalid vote': 2}
Evote: {0: 134, 1: 66}
Education: {'Secondary education': 85, 'Vocational/Medium cycle': 44, 'Primary education': 37, 'Higher education': 31, 'Other/Not stated': 3}
Zip: {2400: 67, 2300: 55, 2200: 47, 2100: 31}

Processing column: party
  Total values: 200
  Values to randomize: 42 (21.0%)
  Sample original values: ['Green', 'Green', 'Green', 'Green', 'Red']
  Sample new values: ['Green', 'Green', 'Red', 'Red', 'Red']
  Found 2 'Invalid vote' values, randomizing to Red/Green

Processing column: evote
  Total values: 200
  Values to randomize: 23 (11.5%)
  Sample original values: [0, 0, 1, 0, 1]
  Sample new values: [0, 0, 0, 1, 0]

Processing column: education
  Total values: 200
  Values to randomize: 31 (15.5%)
  Sample original values: ['Secondary education', 'Secondary education', 'Vocational/Medium cycle', 'Secondary education', 'Primary education']
  Sample new values: ['Higher education', 'Vocational/Med

In [None]:
# df_anon.to_csv('anon_data_gaston.csv', index=False)