# About

This notebook is a sandbox for developing randomized response (obfuscation) algorithms for user-item interactions, i.e., user-item ratings reduced to binary interactions.  
These correspond to algorithms 5-8 of the Polat and Batmaz framework (labelled Frameworks 5-8 in the sections below).

This is used only for testing and debugging and **shouldn't be used to generate obfuscated datasets**.

In [1]:
import pandas as pd
import numpy as np
import math
from io import StringIO

In [2]:
# Read the ratings file; only the first three "::"-separated fields are kept
# and then given explicit column names.
df_ratings = pd.read_csv(
    "../data/ml-1m/ratings.dat",
    engine="python",
    sep="::",
    header=None,
    usecols=[0, 1, 2],
)
df_ratings.columns = ['user_id', 'movie_id', 'rating']

In [3]:
# Number of distinct users and distinct movies that occur in the ratings.
n_users, n_items = (df_ratings[key].nunique() for key in ('user_id', 'movie_id'))
n_users, n_items

(6040, 3706)

In [4]:
# NO NEED TO EXECUTE IF YOU DON'T GENERATE SYNTHETIC FEATURES
# generate synthetic binary data from binomial distribution - coin flip
# n_features = 5
# col_names = ['f_{}'.format(i) for i in range(n_features)]
# response_col_names = ['fr_{}'.format(i) for i in range(n_features)]
# df_ratings[col_names] = np.random.binomial(1, 0.5, (len(df_ratings), n_features))

In [4]:
# Transform ratings to binary interactions: rating >= 3 => 1, otherwise => 0
# (a rating of exactly 3 counts as an interaction).
# The boolean mask is cast directly instead of calling `.replace(..., inplace=True)` on the
# selected column: that is a chained in-place update, which pandas warns about and which
# leaves the frame unchanged when Copy-on-Write is active.
df_ratings['interaction'] = (df_ratings['rating'] >= 3).astype(int)
# true columns and the matching obfuscated ("response") columns, suffixed with "_r"
col_names = ['interaction']
response_col_names = [col+'_r' for col in col_names]
n_features = len(response_col_names)
col_names, response_col_names

(['interaction'], ['interaction_r'])

In [5]:
# Columns written to the exported CSV files (used as df_ratings[cols_export] below).
cols_export = ['user_id', 'movie_id', 'rating', 'interaction', 'interaction_r']

In [6]:
# Candidate group sizes. NOTE(review): not referenced by any other cell visible in this notebook.
grp_sizes = [3, 5, 10, 20, 30, 50, 70, 100]

In [7]:
# Initialise the response column(s) as a copy of the true column(s);
# the obfuscation cells below then flip entries of the response column(s).
df_ratings[response_col_names] = df_ratings[col_names].copy()

In [8]:
# Number of random groups a user's ratings are split into (group ids are 0..n_groups-1).
n_groups = 3
# Upper bound for theta: the fixed-response cells flip with probability theta_max/2,
# the variable-response cells draw theta from uniform(0, theta_max).
theta_max = 1

In [9]:
# group by user id
# This groupby object is reused by later cells (per-user sizes, per-user movie lists,
# per-user random draws).
grouped = df_ratings.groupby('user_id')

In [10]:
# Assign every rating row a random group id in [0, n_groups); the ids are drawn
# user by user, one independent draw per row of that user.
df_ratings['group'] = grouped['user_id'].transform(lambda x: np.random.choice(n_groups, len(x)))

In [13]:
# Inspect the ratings frame with the interaction, response and group columns added.
df_ratings

Unnamed: 0,user_id,movie_id,rating,interaction,interaction_r,group
0,1,1193,5,1,1,2
1,1,661,3,1,1,0
2,1,914,3,1,1,1
3,1,3408,4,1,1,1
4,1,2355,5,1,1,2
...,...,...,...,...,...,...
1000204,6040,1091,1,0,0,1
1000205,6040,1094,5,1,1,1
1000206,6040,562,5,1,1,1
1000207,6040,1096,4,1,1,0


# Fixed response
## No masking
Corresponds to **Framework 5**.

In [17]:
# Show which column(s) the obfuscation will modify.
response_col_names

['interaction_r']

In [19]:
def response_fixed_no_masking(df: pd.DataFrame, response_cols=None, theta_max: float = 1.0, rng=None):
    """Fixed randomized response without masking (Framework 5).

    One Bernoulli draw with success probability ``theta_max / 2`` is made for every
    ``(user_id, group)`` pair and stored in column ``r`` (all rows of a pair share the
    draw). Where ``r == 1`` each response column is replaced by ``1 - value``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``user_id``, ``group`` and the response columns. Not modified.
    response_cols : list of str, optional
        Columns to flip. When omitted, the notebook-level ``response_col_names``
        is used, which is what the function relied on before.
    theta_max : float, default 1.0
        The flip probability is ``theta_max / 2``; the default gives the previous
        hard-coded 0.5 and matches the inline cells that use ``theta_max/2``.
    rng : numpy.random.Generator, optional
        Source of randomness for reproducible runs. When omitted, the global
        ``np.random`` state is used, as before.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the extra column ``r`` and flipped response columns.

    Raises
    ------
    ValueError
        Raised by the binomial sampler if ``theta_max / 2`` is outside ``[0, 1]``.
    """
    cols = response_col_names if response_cols is None else list(response_cols)
    draw = np.random.binomial if rng is None else rng.binomial
    flip_prob = theta_max / 2
    out = df.copy()
    # the lambda returns a scalar, which transform broadcasts to every row of the group
    out['r'] = out.groupby(['user_id', 'group'])['group'].transform(lambda g: draw(1, flip_prob))
    flip = out['r'] == 1
    out.loc[flip, cols] = 1 - out.loc[flip, cols]
    return out

In [20]:
# Apply the fixed-response helper. NOTE: the helper works on a copy and returns it, so this
# re-binds df_ratings to a new frame; the `grouped` object created earlier still wraps the
# previous frame.
df_ratings = response_fixed_no_masking(df_ratings)

   user_id  movie_id  rating  interaction  interaction_r  group  r
0        1      1193       5            1              1      2  1
1        1       661       3            1              1      1  0
2        1       914       3            1              1      2  1
3        1      3408       4            1              1      0  1
4        1      2355       5            1              1      2  1


In [19]:
# One Bernoulli(theta_max/2) draw per (user, group); every row of the group shares it.
df_ratings['r'] = df_ratings.groupby(['user_id', 'group'])['group'].transform(lambda x: np.random.binomial(1, theta_max/2))
# Build the response from the untouched true columns instead of flipping the response
# columns in place: the in-place flip compounds when this cell is re-run or executed
# after another obfuscation step, so the response would no longer correspond to `r`.
flip_mask = (df_ratings['r'] == 1).to_numpy()[:, None]
true_values = df_ratings[col_names].to_numpy()
df_ratings[response_col_names] = np.where(flip_mask, 1 - true_values, true_values)

In [23]:
# Write the columns listed in cols_export for the fixed-response, no-masking variant.
df_ratings[cols_export].to_csv('../data/ml-1m-response/ratings_obf5_fixed_no_mask.csv', index=False)

## With masking
Corresponds to **Framework 7**

In [11]:
# One Bernoulli(theta_max/2) draw per (user, group); every row of the group shares it.
# Only `r` is set here; the flip itself is applied in a later cell.
df_ratings['r'] = df_ratings.groupby(['user_id', 'group'])['group'].transform(lambda x: np.random.binomial(1, theta_max/2))

In [12]:
# choose a beta max based on its user-wise distribution:
# the largest number of ratings given by any single user, divided by n_items
beta_max = grouped.size().max()/n_items
# and now generate a beta
# (a single value drawn from uniform(0, beta_max), shared by all users)
beta = np.random.uniform(0, beta_max)

In [35]:
# initialise a user-wise data frame for the unrated movies: carry over each user's group and r
# Re-create the groupby from the current df_ratings so the 'r' column is guaranteed to be
# visible, even if df_ratings was re-bound to a new frame after `grouped` was first built.
grouped = df_ratings.groupby('user_id')
# NOTE(review): 'min' keeps, per user, the smallest group id and the smallest r over that
# user's rows, so the synthetic rows are not tied to one particular group - confirm intended.
df_users_unrated = grouped[['group', 'r']].agg('min')
# group sizes are never missing, so no fillna is needed (the previous fillna result was discarded)
df_users_unrated['n_rated'] = grouped.size()
df_users_unrated = df_users_unrated.reset_index()
# for each user, randomly choose a number of unrated movie ids
# The candidate pool is the set of movie ids that actually occur in the data: the ids are
# not the contiguous range 0..n_items-1, so range(n_items) would offer non-existent ids
# and miss real ones.
all_items = set(df_ratings['movie_id'].unique().tolist())
df_users_unrated['unselected_set'] = grouped['movie_id'].agg(list).apply(lambda s: sorted(all_items.difference(s))).reset_index()['movie_id']
# sample a fraction beta of the user's unrated movies, without replacement
df_users_unrated['unselected_subset'] = df_users_unrated.apply(lambda x: np.random.choice(x['unselected_set'], int(beta*len(x['unselected_set'])), replace=False), axis=1)
df_users_unrated = df_users_unrated.drop(columns=['unselected_set']).rename(columns={'unselected_subset': 'movie_id'})
# explode the table into one row per (user_id, unrated movie_id); a user with an empty
# sample explodes into a single row with a missing movie_id, which is dropped here
df_users_unrated = df_users_unrated.explode('movie_id', ignore_index=True).dropna(subset=['movie_id']).reset_index(drop=True)
# explode leaves an object column; restore the id dtype of the real ratings
df_users_unrated = df_users_unrated.astype({'movie_id': df_ratings['movie_id'].dtype})

In [37]:
# generate random binary values for the synthetic (previously unrated) rows
df_users_unrated[col_names] = np.random.binomial(1, 0.5, (len(df_users_unrated), n_features))
# Initialise the response from the synthetic values just generated. Copying from df_ratings
# instead would align on the row index and pull in the true interactions of unrelated
# real ratings.
df_users_unrated[response_col_names] = df_users_unrated[col_names].copy()

In [38]:
# apply obfuscation to the original ratings: flip where the (user, group) draw r is 1.
# The response is rebuilt from the untouched true columns, so re-running this cell (or
# running it after another obfuscation section) does not flip already-flipped values again.
flip_mask = (df_ratings['r'] == 1).to_numpy()[:, None]
true_values = df_ratings[col_names].to_numpy()
df_ratings[response_col_names] = np.where(flip_mask, 1 - true_values, true_values)


In [39]:
# apply obfuscation to the fictive ratings: flip where the user's r is 1.
# The response is rebuilt from the generated values in `col_names`, so re-running this
# cell does not flip already-flipped values again.
flip_mask = (df_users_unrated['r'] == 1).to_numpy()[:, None]
generated_values = df_users_unrated[col_names].to_numpy()
df_users_unrated[response_col_names] = np.where(flip_mask, 1 - generated_values, generated_values)

In [58]:
# Sanity check: count the synthetic rows generated for each user.
df_compare = pd.DataFrame()
df_compare['n_synthetic'] = df_users_unrated.groupby('user_id').size()

In [59]:
# n_rated is constant within a user, so the per-user minimum simply recovers it.
# Select the column before aggregating (instead of aggregating every column with the
# builtin `min`, which pandas deprecates in favour of the string/method form).
df_compare['n_rated'] = df_users_unrated.groupby('user_id')['n_rated'].min()

In [60]:
# Show the number of synthetic rows next to the number of real ratings per user.
df_compare

Unnamed: 0_level_0,n_synthetic,n_rated
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10,53
2,10,129
3,10,51
4,10,21
5,9,198
...,...,...
6036,8,888
6037,9,202
6038,10,20
6039,10,123


In [19]:
# add the original/synthetic flag
df_ratings['is_original'] = True
df_users_unrated['is_original'] = False
# Export list for the combined data: drop 'rating' (the synthetic rows have none) and add
# the flag. The list is rebuilt rather than appended to, so re-running this cell does not
# add 'is_original' twice.
cols_export = [c for c in cols_export if c not in ('rating', 'is_original')] + ['is_original']

In [20]:
# concatenate the original and synthetic data with noise
# (row labels of the two frames are kept as they are; the index is not reset)
df_ratings_agg = pd.concat([df_ratings[cols_export], df_users_unrated[cols_export]])

In [21]:
# Write the combined original + synthetic rows for the fixed-response, with-masking variant.
df_ratings_agg.to_csv('../data/ml-1m-response/ratings_obf7_fixed_with_mask.csv', index=False)

# Variable response

In [None]:
def response_variable_no_masking(df: pd.DataFrame, response_col_names:list, theta_max: float = 1.0, group_level_r: bool = False):
    """Variable randomized response without masking (Framework 6).

    A threshold ``theta`` is drawn from ``uniform(0, theta_max)`` once per
    ``(user_id, group)`` pair and a value ``r`` from ``uniform(0, 1)``. Wherever
    ``r >= theta`` each response column is replaced by ``1 - value``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``user_id``, ``group`` and the response columns. Not modified.
    response_col_names : list
        Columns to flip.
    theta_max : float, default 1.0
        Upper bound of the per-group ``theta`` draw. The default reproduces the
        previous hard-coded ``uniform()`` and matches the inline cells, which use
        ``uniform(0, theta_max)``.
    group_level_r : bool, default False
        If False (previous behaviour), ``r`` is drawn independently for every row.
        If True, ``r`` is drawn once per ``(user_id, group)`` pair and shared by its
        rows, as the inline variable-response cells do.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the extra columns ``r`` and ``theta`` and flipped
        response columns.
    """
    out = df.copy()
    per_group = out.groupby(['user_id', 'group'])['group']
    # r is drawn before theta, keeping the order of random draws of the original code
    if group_level_r:
        out['r'] = per_group.transform(lambda g: np.random.uniform())
    else:
        out['r'] = np.random.uniform(size=len(out))
    # the lambda returns a scalar, which transform broadcasts to every row of the group
    out['theta'] = per_group.transform(lambda g: np.random.uniform(0, theta_max))
    flip = out['r'] >= out['theta']
    out.loc[flip, response_col_names] = 1 - out.loc[flip, response_col_names]
    return out

In [15]:
# Per-row alternative, kept for reference:
# df_ratings['r'] = np.random.uniform(size=len(df_ratings))
# Active version: one uniform(0, 1) draw per (user, group), shared by the rows of that group.
df_ratings['r'] = df_ratings.groupby(['user_id', 'group'])['group'].transform(lambda x: np.random.uniform())

In [20]:
# generate theta for each group of each user - this is needed both for the masking and non-masking
# (one uniform(0, theta_max) draw per (user, group), shared by the rows of that group)
df_ratings['theta'] = df_ratings.groupby(['user_id', 'group'])['group'].transform(lambda x: np.random.uniform(0, theta_max))

## No masking
*This and the next section are mutually exclusive!*  
Corresponds to **Framework 6**.

In [14]:
# Flip the response wherever r >= theta.
# The response is rebuilt from the untouched true columns, so re-running this cell (or
# running it after another obfuscation section) does not flip already-flipped values again.
flip_mask = (df_ratings['r'] >= df_ratings['theta']).to_numpy()[:, None]
true_values = df_ratings[col_names].to_numpy()
df_ratings[response_col_names] = np.where(flip_mask, 1 - true_values, true_values)

In [16]:
# Write the variable-response, no-masking variant.
# NOTE(review): unlike the other exports, this writes every column of df_ratings rather
# than df_ratings[cols_export] - confirm this is intended.
df_ratings.to_csv('../data/ml-1m-response/ratings_obf6_variable_no_mask.csv', index=False)

## With masking
*This and the previous section are mutually exclusive!*  
Corresponds to **Framework 8**.

In [17]:
# choose a beta max based on its user-wise distribution:
# the largest number of ratings given by any single user, divided by n_items
beta_max = grouped.size().max()/n_items
# generate a beta for each user
# (one uniform(0, beta_max) draw per user, repeated on every row of that user)
df_ratings['beta'] = grouped['user_id'].transform(lambda x: np.random.uniform(0, beta_max))

In [18]:
# initialise a user-wise data frame for the unrated movies: carry over each user's group, beta, theta and r
# Re-create the groupby from the current df_ratings so the 'beta', 'theta' and 'r' columns
# are guaranteed to be visible, even if df_ratings was re-bound after `grouped` was first built.
grouped = df_ratings.groupby('user_id')
# NOTE(review): 'min' keeps, per user, the smallest group id / theta / r over that user's
# rows, so the synthetic rows are not tied to one particular group - confirm intended.
df_users_unrated = grouped[['group', 'beta', 'theta', 'r']].agg('min').reset_index()
# for each user, randomly choose a number of unrated movie ids
# The candidate pool is the set of movie ids that actually occur in the data: the ids are
# not the contiguous range 0..n_items-1, so range(n_items) would offer non-existent ids
# and miss real ones.
all_items = set(df_ratings['movie_id'].unique().tolist())
df_users_unrated['unselected_set'] = grouped['movie_id'].agg(list).apply(lambda s: sorted(all_items.difference(s))).reset_index()['movie_id']
# NOTE(review): the sample size is beta * (number of movies the user rated), whereas the
# fixed-response section scales beta by the number of *unrated* movies - confirm which is meant.
# The size is capped at the pool size because sampling without replacement cannot exceed it.
df_users_unrated['unselected_subset'] = df_users_unrated.apply(
    lambda x: np.random.choice(
        x['unselected_set'],
        min(int(x['beta']*(len(all_items)-len(x['unselected_set']))), len(x['unselected_set'])),
        replace=False),
    axis=1)
df_users_unrated = df_users_unrated.drop(columns=['unselected_set']).rename(columns={'unselected_subset': 'movie_id'})
# explode the table into one row per (user_id, unrated movie_id); a user with an empty
# sample explodes into a single row with a missing movie_id, which is dropped here
df_users_unrated = df_users_unrated.explode('movie_id', ignore_index=True).dropna(subset=['movie_id']).reset_index(drop=True)
# explode leaves an object column; restore the id dtype of the real ratings
df_users_unrated = df_users_unrated.astype({'movie_id': df_ratings['movie_id'].dtype})

In [19]:
# generate random binary values for the synthetic (previously unrated) rows
df_users_unrated[col_names] = np.random.binomial(1, 0.5, (len(df_users_unrated), n_features))
# Initialise the response from the synthetic values just generated. Copying from df_ratings
# instead would align on the row index and pull in the true interactions of unrelated
# real ratings.
df_users_unrated[response_col_names] = df_users_unrated[col_names].copy()

In [20]:
# apply obfuscation to the original ratings: flip wherever r >= theta.
# The response is rebuilt from the untouched true columns, so re-running this cell (or
# running it after another obfuscation section) does not flip already-flipped values again.
flip_mask = (df_ratings['r'] >= df_ratings['theta']).to_numpy()[:, None]
true_values = df_ratings[col_names].to_numpy()
df_ratings[response_col_names] = np.where(flip_mask, 1 - true_values, true_values)

In [21]:
# apply obfuscation to the fictive ratings: flip wherever r >= theta.
# The response is rebuilt from the generated values in `col_names`, so re-running this
# cell does not flip already-flipped values again.
flip_mask = (df_users_unrated['r'] >= df_users_unrated['theta']).to_numpy()[:, None]
generated_values = df_users_unrated[col_names].to_numpy()
df_users_unrated[response_col_names] = np.where(flip_mask, 1 - generated_values, generated_values)

In [29]:
# add the original/synthetic flag
df_ratings['is_original'] = True
df_users_unrated['is_original'] = False
# Export list for the combined data: drop 'rating' (the synthetic rows have none) and add
# the flag. The list is rebuilt rather than mutated: an unconditional remove('rating')
# raises ValueError once 'rating' is already gone (e.g. after the fixed-response masking
# section or a re-run), and a repeated append would duplicate 'is_original'.
cols_export = [c for c in cols_export if c not in ('rating', 'is_original')] + ['is_original']

In [30]:
# concatenate the original and synthetic data with noise
# (row labels of the two frames are kept as they are; the index is not reset)
df_ratings_agg = pd.concat([df_ratings[cols_export], df_users_unrated[cols_export]])

In [31]:
# Write the combined original + synthetic rows for the variable-response, with-masking variant.
df_ratings_agg.to_csv('../data/ml-1m-response/ratings_obf8_variable_with_mask.csv', index=False)