In [1]:
import numpy as np
import pandas as pd 
import sys

sys.path.append('../code/')

from eval_metrics import prediction_shift, filterRecsByTargetItem, getHitRatioPerItem, getAvgHitRatio
from collections import defaultdict
from surprise import Dataset
from surprise import Reader
from surprise.prediction_algorithms.knns import KNNBaseline


In [2]:
train_cols = ['user_id', 'item_id', 'rating', 'timestamp']
item_cols = ['item_id', 'movie', 'release_date', 'v_release_date', 'imdb_url', 'unknown', 'action', 
             'adventure', 'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy',
             'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi','Thriller', 'War', 'Western']

trainDf = pd.read_csv('../data/MovieLens.training', sep='\t', lineterminator='\n')
testDf = pd.read_csv('../data/MovieLens.test', sep='\t', lineterminator='\n')
itemDf = pd.read_csv('../data/MovieLens.item', sep='|', lineterminator='\n')

trainDf.columns = train_cols
testDf.columns = train_cols
itemDf.columns = item_cols

In [5]:
def get_top_n(df, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        test_df(list of Prediction objects): dataframe with predictions.
        n(int): The number of recommendation to output for each user. Default is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [raw item id, ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for index, row in df.iterrows():
        top_n[row["user_id"]].append((row['item_id'], row['prediction']))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_items = [item_rating_t[0] for item_rating_t in user_ratings[:n]]
        top_n[uid] = top_items

    return top_n

In [6]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(trainDf[['user_id', 'item_id', 'rating']], reader)
test_data = Dataset.load_from_df(testDf[['user_id', 'item_id', 'rating']], reader)

trainset = train_data.build_full_trainset()

In [7]:
NUM_SEL_ITEMS = 3
NUM_FILLER_ITEMS = 90
selected_items = [ 50, 181, 258]
target_items = [868, 1162, 927, 1521, 1301, 1191]

def getTargetUsers(targetItems):
    users_rated_target = set(trainDf[trainDf.item_id.isin(targetItems)].user_id.values)
    # - Users who have not rated target item
    data_tmp = trainDf[~trainDf.user_id.isin(users_rated_target)].copy()

    # - Users who have not rated target item and have rated selected_items
    target_users = data_tmp[data_tmp.item_id.isin(selected_items)].groupby('user_id').size()
    
    print("Number of target users: ", 
           target_users[(target_users == NUM_SEL_ITEMS)].shape[0])
    target_users = sorted(target_users[(target_users == NUM_SEL_ITEMS)].index)
    return target_users

target_users = getTargetUsers(target_items)
print("target_users: ", len(target_users))

Number of target users:  190
target_users:  190


### Attack data

### User based KNN recommender system model

In [27]:
attackType = ['bandwagon', 'random', 'sampling']

# - Before attack model data
userBasedKNN = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': True})
userBasedKNN.fit(trainset)
b4AttackTestDf = testDf.copy()

# - get predictions on test set from trained model with attack data
prediction = []
for index, row in b4AttackTestDf.iterrows():
    pred = userBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
    prediction.append(pred[3])
b4AttackTestDf['prediction'] = prediction
b4Attacktop10 = get_top_n(b4TestDf)

# - https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=KNNBaseline#use-a-custom-dataset
for a_type in attackType:
    print("\n", '-' * 30)
    print('Simulating attack: ', a_type)
    print('-' * 30, "\n")
    
    # - Attack data
    attackDataDf = pd.read_csv('../code/%s.csv' %a_type)
    attackTrainData = pd.concat([trainDf, attackDataDf]).sort_values(by=['user_id', 'item_id'])
    
    # - Attack dataset
    attacktrain_data = Dataset.load_from_df(attackTrainData[['user_id', 'item_id', 'rating']], reader)
    attackTrainset = attacktrain_data.build_full_trainset()
    
    attackUserBasedKNN = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': True})
    attackUserBasedKNN.fit(attackTrainset)
    attackTestDf = testDf.copy()

    prediction = []
    for index, row in attackTestDf.iterrows():
        pred = attackUserBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
        prediction.append(pred[3])

    attackTestDf['prediction'] = prediction
    attackTop10 = get_top_n(attackTestDf)

    allUsersPredShift, targetUserPredShift = prediction_shift(b4TestDf, attackTestDf, target_users, testDf)
    print(f'[{a_type}] Prediction shift - Target users: {targetUserPredShift}')
    print(f'[{a_type}] Prediction shift - All users: {allUsersPredShift}')
    
    topNRecAllUsersWithTargetsB4 = filterRecsByTargetItem(top10B4, target_items)
    topNRecAllUsersWithTargets = filterRecsByTargetItem(attackTop10, target_items)

    print(f'[{a_type}] Number of users with targets: {len(topNRecAllUsersWithTargets)}')
    print(f'[{a_type}] Number of users with targets before attack: {len(topNRecAllUsersWithTargetsB4)}')

    hitRatioPerItem = getHitRatioPerItem(attackTop10, target_items)
    print(f'[{a_type}] hitRatioPerItem: {hitRatioPerItem}')
    avgHitRatio = getAvgHitRatio(hitRatioPerItem)
    print(f'[{a_type}] avgHitRatio after attack: {avgHitRatio}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

 ------------------------------
Simulating attack:  bandwagon
------------------------------ 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Number of target users in test: 2228, uniq: 40
diff sum:  32.23810260723493
count:  19999  uniq:  459
[bandwagon] Prediction shift - Target users: -0.0004385865868390656
[bandwagon] Prediction shift - All users: 0.0016119857296482288
[bandwagon] Number of users with targets: 6
[bandwagon] Number of users with targets before attack: 2
[bandwagon] hitRatioPerItem: {868: 0.004357298474945534, 1162: 0.002178649237472767, 927: 0.002178649237472767, 1521: 0.002178649237472767, 1301: 0.002178649237472767, 1191: 0.0}
[bandwagon] avgHitRatio after attack: 0.002178649237472767

 ------------------------------
Simulating attack:  random
------------------------------ 

Est

### Item based KNN recommender system model

In [28]:
itemBasedKNN = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': False})
itemBasedKNN.fit(trainset)

b4ItemTestDf = testDf.copy()
prediction = []

for index, row in b4ItemTestDf.iterrows():
    pred = itemBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
    prediction.append(pred[3])
    
b4ItemTestDf['prediction'] = prediction
b4AttackItemtop10 = get_top_n(b4ItemTestDf)

# - https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=KNNBaseline#use-a-custom-dataset
for a_type in attackType:
    print("\n", '-' * 30)
    print('(Item based) Simulating attack: ', a_type)
    print('-' * 30, "\n")
    
    # - Attack data
    attackDataDf = pd.read_csv('../code/%s.csv' %a_type)
    attackTrainData = pd.concat([trainDf, attackDataDf]).sort_values(by=['user_id', 'item_id'])
    
    # - Attack dataset
    attacktrain_data = Dataset.load_from_df(attackTrainData[['user_id', 'item_id', 'rating']], reader)
    attackTrainset = attacktrain_data.build_full_trainset()
    
    attackItemBasedKNN = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': False})
    attackItemBasedKNN.fit(attackTrainset)
    attackItemTestDf = testDf.copy()

    prediction = []
    for index, row in attackItemTestDf.iterrows():
        pred = attackItemBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
        prediction.append(pred[3])

    attackItemTestDf['prediction'] = prediction
    #print('prediction after: ', prediction)
    
    allUsersPredShift, targetUserPredShift = prediction_shift(b4ItemTestDf, attackItemTestDf, target_users, testDf)
    print(f'[{a_type}] Prediction shift - Target users: {targetUserPredShift}')
    print(f'[{a_type}] Prediction shift - All users: {allUsersPredShift}')
    
    attackItemTop10 = get_top_n(attackItemTestDf)
    topNRecAllUsersWithTargetsB4 = filterRecsByTargetItem(b4AttackItemtop10, target_items)
    topNRecAllUsersWithTargets = filterRecsByTargetItem(attackItemTop10, target_items)

    print(f'[{a_type}] Number of users with targets: {len(topNRecAllUsersWithTargets)}')
    print(f'[{a_type}] Number of users with targets before attack: {len(topNRecAllUsersWithTargetsB4)}')

    hitRatioPerItem = getHitRatioPerItem(attackItemTop10, target_items)
    print(f'[{a_type}] hitRatioPerItem: {hitRatioPerItem}')
    avgHitRatio = getAvgHitRatio(hitRatioPerItem)
    print(f'[{a_type}] avgHitRatio after attack: {avgHitRatio}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

 ------------------------------
(Item based) Simulating attack:  bandwagon
------------------------------ 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Number of target users in test: 2228, uniq: 40
diff sum:  25.819735219792616
count:  19999  uniq:  459
[bandwagon] Prediction shift - Target users: -0.000975204874138421
[bandwagon] Prediction shift - All users: 0.0012910513135553085
[bandwagon] Number of users with targets: 5
[bandwagon] Number of users with targets before attack: 0
[bandwagon] hitRatioPerItem: {868: 0.002178649237472767, 1162: 0.004357298474945534, 927: 0.002178649237472767, 1521: 0.002178649237472767, 1301: 0.0, 1191: 0.0}
[bandwagon] avgHitRatio after attack: 0.0018155410312273058

 ------------------------------
(Item based) Simulating attack:  random
--------------------------

### Set 2: User based KNN recommender system model (sel items: 50 100 174)


In [29]:
# - https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=KNNBaseline#use-a-custom-dataset
for a_type in attackType:
    print("\n", '-' * 30)
    print('Simulating attack: ', a_type)
    print('-' * 30, "\n")
    
    # - Attack data
    attackDataDf = pd.read_csv('../code1/%s.csv' %a_type)
    attackTrainData = pd.concat([trainDf, attackDataDf]).sort_values(by=['user_id', 'item_id'])
    
    # - Attack dataset
    attacktrain_data = Dataset.load_from_df(attackTrainData[['user_id', 'item_id', 'rating']], reader)
    attackTrainset = attacktrain_data.build_full_trainset()
    
    attackUserBasedKNN = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': True})
    attackUserBasedKNN.fit(attackTrainset)
    attackTestDf = testDf.copy()

    prediction = []
    for index, row in attackTestDf.iterrows():
        pred = attackUserBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
        prediction.append(pred[3])

    attackTestDf['prediction'] = prediction
    attackTop10 = get_top_n(attackTestDf)

    allUsersPredShift, targetUserPredShift = prediction_shift(b4TestDf, attackTestDf, target_users, testDf)
    print(f'[{a_type}] Prediction shift - Target users: {targetUserPredShift}')
    print(f'[{a_type}] Prediction shift - All users: {allUsersPredShift}')
    
    topNRecAllUsersWithTargetsB4 = filterRecsByTargetItem(top10B4, target_items)
    topNRecAllUsersWithTargets = filterRecsByTargetItem(attackTop10, target_items)

    print(f'[{a_type}] Number of users with targets: {len(topNRecAllUsersWithTargets)}')
    print(f'[{a_type}] Number of users with targets before attack: {len(topNRecAllUsersWithTargetsB4)}')

    hitRatioPerItem = getHitRatioPerItem(attackTop10, target_items)
    print(f'[{a_type}] hitRatioPerItem: {hitRatioPerItem}')
    avgHitRatio = getAvgHitRatio(hitRatioPerItem)
    print(f'[{a_type}] avgHitRatio after attack: {avgHitRatio}')


 ------------------------------
Simulating attack:  bandwagon
------------------------------ 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Number of target users in test: 2228, uniq: 40
diff sum:  43.61698127651284
count:  19999  uniq:  459
[bandwagon] Prediction shift - Target users: 0.0006858931091536728
[bandwagon] Prediction shift - All users: 0.0021809581117312286
[bandwagon] Number of users with targets: 7
[bandwagon] Number of users with targets before attack: 2
[bandwagon] hitRatioPerItem: {868: 0.004357298474945534, 1162: 0.002178649237472767, 927: 0.004357298474945534, 1521: 0.002178649237472767, 1301: 0.002178649237472767, 1191: 0.0}
[bandwagon] avgHitRatio after attack: 0.002541757443718228

 ------------------------------
Simulating attack:  random
------------------------------ 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Numb

### Set 2: Item based KNN recommender system model (sel items: 50 100 174)


In [30]:
# - https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=KNNBaseline#use-a-custom-dataset
for a_type in attackType:
    print("\n", '-' * 30)
    print('(Item based) Simulating attack: ', a_type)
    print('-' * 30, "\n")
    
    # - Attack data
    attackDataDf = pd.read_csv('../code1/%s.csv' %a_type)
    attackTrainData = pd.concat([trainDf, attackDataDf]).sort_values(by=['user_id', 'item_id'])
    
    # - Attack dataset
    attacktrain_data = Dataset.load_from_df(attackTrainData[['user_id', 'item_id', 'rating']], reader)
    attackTrainset = attacktrain_data.build_full_trainset()
    
    attackItemBasedKNN = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': False})
    attackItemBasedKNN.fit(attackTrainset)
    attackItemTestDf = testDf.copy()

    prediction = []
    for index, row in attackItemTestDf.iterrows():
        pred = attackItemBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
        prediction.append(pred[3])

    attackItemTestDf['prediction'] = prediction
    #print('prediction after: ', prediction)
    
    allUsersPredShift, targetUserPredShift = prediction_shift(b4ItemTestDf, attackItemTestDf, target_users, testDf)
    print(f'[{a_type}] Prediction shift - Target users: {targetUserPredShift}')
    print(f'[{a_type}] Prediction shift - All users: {allUsersPredShift}')
    
    attackItemTop10 = get_top_n(attackItemTestDf)
    topNRecAllUsersWithTargetsB4 = filterRecsByTargetItem(b4AttackItemtop10, target_items)
    topNRecAllUsersWithTargets = filterRecsByTargetItem(attackItemTop10, target_items)

    print(f'[{a_type}] Number of users with targets: {len(topNRecAllUsersWithTargets)}')
    print(f'[{a_type}] Number of users with targets before attack: {len(topNRecAllUsersWithTargetsB4)}')

    hitRatioPerItem = getHitRatioPerItem(attackItemTop10, target_items)
    print(f'[{a_type}] hitRatioPerItem: {hitRatioPerItem}')
    avgHitRatio = getAvgHitRatio(hitRatioPerItem)
    print(f'[{a_type}] avgHitRatio after attack: {avgHitRatio}')


 ------------------------------
(Item based) Simulating attack:  bandwagon
------------------------------ 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Number of target users in test: 2228, uniq: 40
diff sum:  21.081483245764055
count:  19999  uniq:  459
[bandwagon] Prediction shift - Target users: -0.00021525229904695856
[bandwagon] Prediction shift - All users: 0.0010541268686316344
[bandwagon] Number of users with targets: 7
[bandwagon] Number of users with targets before attack: 0
[bandwagon] hitRatioPerItem: {868: 0.002178649237472767, 1162: 0.004357298474945534, 927: 0.004357298474945534, 1521: 0.002178649237472767, 1301: 0.002178649237472767, 1191: 0.0}
[bandwagon] avgHitRatio after attack: 0.002541757443718228

 ------------------------------
(Item based) Simulating attack:  random
------------------------------ 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done comp