## Privacy Engineering in Practice

### Initialization
- Importing DEEZER Brazil data set
- Turning into a Sparse Matrix

In [1]:
import pandas as pd
from anonypy import Preserver
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

# Load the .inter file 
file_path = r"P:\pCloud Offline\PLUS\2nd Sem\Privacy Engineering\Data\DEEZER_BR.inter"

# Read the file
deezer_data = pd.read_csv(file_path, delimiter=",")  
deezer_data = deezer_data.rename(columns={'user_id:token': 'user_id','item_id:token': 'item_id'})

# Display the first few rows
print(deezer_data.head())

# Check column names and data types
print(deezer_data.info())

   user_id  item_id
0    26371    24082
1    26371   294519
2    26371    46312
3    26371   114070
4    26371   216818
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1045554 entries, 0 to 1045553
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype
---  ------   --------------    -----
 0   user_id  1045554 non-null  int64
 1   item_id  1045554 non-null  int64
dtypes: int64(2)
memory usage: 16.0 MB
None


Drop duplicates so that what remains are interacted/not interacted with

In [2]:
deezer_data_unique = deezer_data.drop_duplicates(subset=['user_id', 'item_id'])
print(deezer_data_unique.info())

<class 'pandas.core.frame.DataFrame'>
Index: 669686 entries, 0 to 1045550
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   user_id  669686 non-null  int64
 1   item_id  669686 non-null  int64
dtypes: int64(2)
memory usage: 15.3 MB
None


In [3]:
df = deezer_data_unique
df['interaction'] = 1

# Encode user and item IDs to integer indices
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

user_indices = user_encoder.fit_transform(df['user_id'])
item_indices = item_encoder.fit_transform(df['item_id'])

# Build the binary interaction matrix
interaction_matrix = csr_matrix(
    (df['interaction'], (user_indices, item_indices))
)

print("Shape of matrix:", interaction_matrix.shape)
print("Number of interactions:", interaction_matrix.nnz)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['interaction'] = 1


Shape of matrix: (10000, 144178)
Number of interactions: 669686


Converting the sparse matrix into dataframe

In [5]:
dense_array = interaction_matrix.toarray()  # or use .todense()
df_orig = pd.DataFrame(dense_array)
print(df_orig.head)

<bound method NDFrame.head of       0       1       2       3       4       5       6       7       8       \
0          0       0       0       0       0       0       0       0       0   
1          0       0       0       0       0       0       0       0       0   
2          0       0       0       0       0       0       0       0       0   
3          0       0       0       0       0       0       0       0       0   
4          0       0       0       0       0       0       0       0       0   
...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
9995       0       0       0       0       0       0       0       0       0   
9996       0       0       0       0       0       0       0       0       0   
9997       0       0       0       0       0       0       0       0       0   
9998       0       0       0       0       0       0       0       0       0   
9999       0       0       0       0       0       0       0       0       0   

      9  

Using a subset of the original data: filtering based on number of interactions (users/items)

In [None]:
# sequential !! items first before users
items_min_interaction = df_orig.sum(axis=0) < 100 # each item(column) should have at least ?? users
print(f"Number of columns with few interactions: {items_min_interaction.sum()}")

df_cleaned = df_orig.loc[:, ~items_min_interaction]

users_min_interaction = df_cleaned.sum(axis=1) < 100 # each user(row) should have at least ?? items
print(f"Number of rows with few interactions: {users_min_interaction.sum()}")

df_cleaned = df_cleaned.loc[~users_min_interaction, :]
original_shape = df_orig.shape
cleaned_shape = df_cleaned.shape
print(f"Original shape: {original_shape}")
print(f"Cleaned shape: {cleaned_shape}")
print(f"Proportion of 1s: {((df_cleaned == 1).sum().sum())/(df_cleaned.shape[0]*df_cleaned.shape[1])}") # sparsity

Number of rows with few interactions: 9314
Original shape: (10000, 144178)
Cleaned shape: (686, 1065)
Proportion of 1s: 0.10595819816860345


Mondrian implementation

In [None]:
KANON = 15 # change the k value

for_anon = df_cleaned.copy()
col_count = for_anon.shape[1]
for_anon.columns = for_anon.columns.astype(str)

for_anon['dummy_sensitive'] = 0 # dummy variable for the sensitive attribute (none for this analysis)

sensitive_column = 'dummy_sensitive'
feature_columns =  [str(col) for col in for_anon.columns if col != sensitive_column]

p = Preserver(for_anon, feature_columns, sensitive_column)
anonymized_data = p.anonymize_k_anonymity(k=KANON)  
anonymized_df = pd.DataFrame(anonymized_data)
print(anonymized_df.head())


     132    189    190    290    686    743    761    801    878    925  ...  \
0  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  ...   
1  [0-1]    [0]  [0-1]  [0-1]    [0]    [0]    [0]  [0-1]  [0-1]  [0-1]  ...   
2    [0]  [0-1]    [0]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]    [0]  ...   
3    [0]  [0-1]  [0-1]  [0-1]  [0-1]    [0]    [0]  [0-1]  [0-1]  [0-1]  ...   
4    [0]  [0-1]    [0]    [0]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  ...   

  143265 143621 143879 143896 144003 144049 144131 144167 dummy_sensitive  \
0  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]               0   
1    [0]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]    [0]               0   
2    [0]  [0-1]  [0-1]  [0-1]    [0]  [0-1]  [0-1]  [0-1]               0   
3    [0]  [0-1]    [0]  [0-1]    [0]  [0-1]  [0-1]  [0-1]               0   
4    [0]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]    [0]               0   

  count  
0   137  
1    19  
2    21  
3    26  
4    1

This is the same implementation as above but was modified to have a dummy variable that contains the row numbers in each group. This is needed for the flipping later.

In [68]:
for_anon = df_cleaned.copy()
col_count = for_anon.shape[1]
for_anon.columns = for_anon.columns.astype(str)

for_anon['dummy_sensitive'] = range(len(for_anon))

sensitive_column = 'dummy_sensitive'
feature_columns =  [str(col) for col in for_anon.columns if col != sensitive_column]


p = Preserver(for_anon, feature_columns, sensitive_column)
anonymized_data = p.anonymize_k_anonymity(k=KANON)  
anonymized_df_withrows = pd.DataFrame(anonymized_data)
print(anonymized_df_withrows.head())



     132    189    190    290    686    743    761    801    878    925  ...  \
0  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  ...   
1  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  ...   
2  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  ...   
3  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  ...   
4  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  ...   

  143265 143621 143879 143896 144003 144049 144131 144167 dummy_sensitive  \
0  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]              18   
1  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]              20   
2  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]              22   
3  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]              24   
4  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]  [0-1]              27   

  count  
0     1  
1     1  
2     1  
3     1  
4     

Checker of the sizes of the groups formed

In [None]:
vc = anonymized_df['count'].value_counts()
vc.name = 'frequency'
freq_table = vc.reset_index()
freq_table.columns = ['count_value', 'frequency']
print(freq_table)

    count_value  frequency
0            15          5
1            21          4
2            19          4
3            22          4
4            18          3
5            16          2
6            17          1
7           137          1
8            26          1
9            25          1
10           23          1
11           29          1
12           20          1


Convert group sizes and row number to lists in preparation for the flipping.

In [None]:
df_groups = df_cleaned.copy()
df_groups['group_number'] = None

# convert group sizes and row numbers to lists
group_sizes = anonymized_df['count'].tolist()
row_numbers = anonymized_df_withrows['dummy_sensitive'].tolist()

print(group_sizes)
print(row_numbers)


[137, 19, 21, 26, 17, 15, 19, 16, 25, 22, 22, 15, 15, 18, 21, 21, 21, 23, 15, 29, 15, 22, 18, 19, 20, 16, 18, 19, 22]
[18, 20, 22, 24, 27, 39, 41, 53, 63, 65, 68, 71, 94, 103, 116, 118, 119, 126, 130, 138, 147, 148, 149, 150, 153, 164, 173, 180, 183, 187, 195, 203, 204, 206, 208, 212, 214, 215, 219, 226, 231, 234, 236, 250, 252, 256, 267, 269, 272, 275, 276, 278, 279, 280, 289, 295, 296, 307, 318, 320, 321, 323, 326, 332, 350, 358, 361, 366, 374, 377, 408, 409, 410, 412, 416, 419, 427, 434, 440, 441, 447, 449, 453, 454, 459, 461, 468, 479, 481, 484, 487, 488, 499, 501, 502, 504, 505, 521, 526, 532, 541, 544, 546, 550, 552, 554, 557, 565, 568, 571, 577, 583, 585, 586, 602, 603, 604, 608, 609, 613, 616, 621, 622, 627, 631, 636, 639, 640, 641, 643, 654, 657, 661, 669, 670, 672, 685, 110, 134, 161, 167, 225, 248, 310, 339, 389, 401, 404, 466, 498, 529, 537, 579, 645, 662, 676, 10, 78, 140, 141, 213, 217, 229, 232, 247, 262, 290, 316, 344, 364, 483, 495, 530, 549, 566, 619, 648, 35, 36, 48,

In [71]:
# Assign groups iteratively
df_groups.reset_index(drop=True, inplace=True)
start_idx = 0
for group_num, size in enumerate(group_sizes, start=1):
    # Extract row numbers for current group
    group_rows = row_numbers[start_idx:start_idx + size]
    
    # Assign group number to valid rows in A
    for row in group_rows:
        idx = row   # Convert to 0-based index
        df_groups.at[idx, 'group_number'] = group_num
    
    start_idx += size  # Move to next group

Anonymize each group by doing the flipping based on majority value

In [None]:
# anonymize each group
def anonymize_per_group(matrix_df):

    # create working copy and remove group_number column
    anonymized_mat = matrix_df.drop(columns=['group_number']).copy()
    
    # get group assignments from the list
    groups = matrix_df.groupby('group_number').groups
    
    for group_id, row_indices in groups.items():
        # extract cluster submatrix
        cluster_mat = anonymized_mat.loc[row_indices]
        
        # process each column
        for col in cluster_mat.columns:
            col_sum = cluster_mat[col].sum()
            if col_sum > 0:  # only process non-zero columns
                # determine majority value 
                majority_val = 1 if col_sum >= len(row_indices)/2 else 0
                # apply to all rows in cluster
                anonymized_mat.loc[row_indices, col] = majority_val
                
    return anonymized_mat

anonymized_A = anonymize_per_group(df_groups)


Exporting the dataset in preparation for recommender systems

In [None]:
transactions = []
for user_id, row in anonymized_A.iterrows():
    for item_id, value in row.items():
        if value == 1:  # only include interactions
            transactions.append({
                'userID': user_id,
                'itemID': item_id,
                'rating': 1,  # dummy for recsys
                'timestamp': 0  # dummy for recsys
            })

recsys_data = pd.DataFrame(transactions)
recsys_data.head()
recsys_data.to_csv(r'P:\pCloud Offline\PLUS\2nd Sem\Privacy Engineering\Data\Results\mondrian_k15_75_50.csv', index=False)



Computing the Hamming distance loss

In [73]:
import numpy as np

def compute_loss(original, anonymized) -> float:
    return np.abs(original.values - anonymized.values).sum() / original.size

print(compute_loss(df_cleaned,anonymized_A))

0.08670526560724894
