In [1]:
import numpy as np
import pandas as pd 

from collections import defaultdict
from surprise import Dataset
from surprise import Reader
from surprise.prediction_algorithms.knns import KNNBaseline


In [2]:
train_cols = ['user_id', 'item_id', 'rating', 'timestamp']
item_cols = ['item_id', 'movie', 'release_date', 'v_release_date', 'imdb_url', 'unknown', 'action', 
             'adventure', 'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy',
             'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi','Thriller', 'War', 'Western']

trainDf = pd.read_csv('../data/MovieLens.training', sep='\t', lineterminator='\n')
testDf = pd.read_csv('../data/MovieLens.test', sep='\t', lineterminator='\n')
itemDf = pd.read_csv('../data/MovieLens.item', sep='|', lineterminator='\n')

trainDf.columns = train_cols
testDf.columns = train_cols
itemDf.columns = item_cols

In [3]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(trainDf[['user_id', 'item_id', 'rating']], reader)
test_data = Dataset.load_from_df(testDf[['user_id', 'item_id', 'rating']], reader)

trainset = train_data.build_full_trainset()

In [4]:
def get_top_n(df, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        test_df(list of Prediction objects): dataframe with predictions.
        n(int): The number of recommendation to output for each user. Default is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [raw item id, ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for index, row in df.iterrows():
        top_n[row["user_id"]].append((row['item_id'], row['prediction']))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_items = [item_rating_t[0] for item_rating_t in user_ratings[:n]]
        top_n[uid] = top_items

    return top_n

In [5]:
# - get filler items
from random import randrange
rating_mean, rating_std = trainDf.rating.mean(), trainDf.rating.std()

class FakeProfile(object): 
    MAX_RATING = 5
    
    def __init__(self, target_items, 
                 filler_item_count = 70):
        self.target_items  = target_items
        self.filler_item_count = filler_item_count
        self.selected_items = {}
        self.filler_items = {}
        
    def setSelectedItems(self, selectedItems):
        #selectedItems = freqRatedItems[np.random.choice(len(freqRatedItems), size=self.selected_items_count, replace=False)]
        for item in selectedItems:
            self.selected_items[item] = self.MAX_RATING
    
    def fillerItems(self, selectedItems):
        targetSelItems = list(self.target_items)
        targetSelItems.extend(selectedItems)
        #print(targetSelItems)
        fillers_candidates = list(set(trainDf.item_id.unique()) - set(targetSelItems) )
        fillers = np.random.choice(fillers_candidates, size=self.filler_item_count, replace=False)
        ratings = np.round(np.random.normal(loc=rating_mean, scale=rating_std, size=self.filler_item_count), 1)
        for item, rating in zip(fillers, ratings):
            self.filler_items[item] = rating
                
    def create(self, selectedItems):
        self.setSelectedItems(selectedItems)
        self.fillerItems(selectedItems)
        
    def print(self):
        print(f'target_item : {self.target_items}')
        print(f'selected_items : {self.selected_items}')
        print(f'filler_items : {self.filler_items}')
        print("\n")
        
    def getAllItemRatings(self):
        itemRatings = [(item, self.MAX_RATING) for item in self.target_items]
        for item in self.selected_items:
            itemRatings.append((item, self.selected_items[item]))
        for item in self.filler_items:
            itemRatings.append((item, self.filler_items[item]))
        return itemRatings
    
def createProfiles(targetItems, selected_items, n):
    fake_profiles = []
    for _ in range(n):
        fp = FakeProfile(targetItems)
        fp.create(selected_items)
        fake_profiles.append(fp)
    
    '''
    # - Uncomment to print
    for fp in fake_profiles:
        fp.print()
    '''
    return fake_profiles
 
def createProfilesAsDf(targetItems, selected_items, n):
    fake_profiles = createProfiles(targetItems, selected_items, n)
    userId = 1100
    timestamp = 874965758
    fakeRatingsdata = {'userId': [], 'item_id': [], 'ratings': [], 'timestamp': []}
    for fp in fake_profiles:
        userId += 1
        itemRatings = fp.getAllItemRatings()
        for itemRatingPair in itemRatings:
            fakeRatingsdata['userId'].append(userId)
            fakeRatingsdata['item_id'].append(itemRatingPair[0])
            fakeRatingsdata['ratings'].append(itemRatingPair[1])
            fakeRatingsdata['timestamp'].append(timestamp)

    columnsZipped = zip(fakeRatingsdata['userId'], fakeRatingsdata['item_id'],
                       fakeRatingsdata['ratings'], fakeRatingsdata['timestamp'])
    fakeProfileDf = pd.DataFrame(list(columnsZipped),
                   columns =['user_id', 'item_id', 'rating', 'timestamp'])
    return fakeProfileDf

In [6]:
def prediction_shift(predBefore, predAtk, target_users, testDf):
    
    targetUsersTest = testDf[testDf.user_id.isin(target_users)]
    numTargetUsersInTest = len(targetUsersTest.user_id)
    print(f'Number of target users in test: {numTargetUsersInTest}, uniq: {len(targetUsersTest.user_id.unique())}')
    
    # - Prediction shift across targetted users
    predAttackTargetUser = predAtk[predAtk.user_id.isin(target_users)].sort_values(['user_id', 'item_id']).prediction
    predTargetUser = predBefore[predBefore.user_id.isin(target_users)].sort_values(['user_id', 'item_id']).prediction
    targetUserPredShift = np.sum(predAttackTargetUser - predTargetUser)/numTargetUsersInTest
    
    predAfterAttack = predAtk.sort_values(['user_id', 'item_id']).prediction
    predBeforeAttack = predBefore.sort_values(['user_id', 'item_id']).prediction
    print('diff sum: ', np.sum(predAfterAttack - predBeforeAttack))
    print('count: ', testDf.user_id.count(), ' uniq: ', len(testDf.user_id.unique()))
    allUsersPredShift = np.sum(predAfterAttack - predBeforeAttack)/len(testDf.user_id)
    
    return (allUsersPredShift, targetUserPredShift)

def filterRecsByTargetItem(recommendations, targetItems):
    recWithTargetItems = {}
    for user_id in recommendations.keys():
        topNRec = recommendations[user_id]
        is_target_item_present = any(item in topNRec for item in targetItems)
        if is_target_item_present:
            recWithTargetItems[user_id] = topNRec
            #print(user_id, topNRec)
    
    return recWithTargetItems

def getHitRatioPerItem(topNRecAllUsers, targetItems):
    hitRatioAllItems = {}
    
    for item in targetItems:
        usersWithItem = 0
        for user in topNRecAllUsers.keys():
            if item in topNRecAllUsers[user]:
                usersWithItem += 1
        hitRatio_i = usersWithItem/(len(topNRecAllUsers.keys()) * 1.0)
        hitRatioAllItems[item] = hitRatio_i
                                    
    return hitRatioAllItems 

def getAvgHitRatio(hitRatioPerItem):
    sumHitRatio = 0
    for hitRatio_i in hitRatioPerItem.values():
        sumHitRatio += hitRatio_i 
    return sumHitRatio/(len(hitRatioPerItem.keys()) * 1.0)

In [7]:
NUM_SEL_ITEMS = 3
NUM_FILLER_ITEMS = 90
selected_items = [ 50, 181, 258]
target_items = [868, 1162, 927, 1521, 1301, 1191]

def getTargetUsers(targetItems):
    users_rated_target = set(trainDf[trainDf.item_id.isin(targetItems)].user_id.values)
    # - Users who have not rated target item
    data_tmp = trainDf[~trainDf.user_id.isin(users_rated_target)].copy()

    # - Users who have not rated target item and have rated selected_items
    target_users = data_tmp[data_tmp.item_id.isin(selected_items)].groupby('user_id').size()
    
    print("Number of target users: ", 
           target_users[(target_users == NUM_SEL_ITEMS)].shape[0])
    target_users = sorted(target_users[(target_users == NUM_SEL_ITEMS)].index)
    return target_users

target_users = getTargetUsers(target_items)
print("target_users: ", len(target_users))

Number of target users:  190
target_users:  190


### Attack data

In [8]:
NUM_FAKE_USERS = 50
attackDataDf = createProfilesAsDf(target_items, selected_items, NUM_FAKE_USERS)
attackTrainData = pd.concat([trainDf, attackDataDf]).sort_values(by=['user_id', 'item_id'])
attacktrain_data = Dataset.load_from_df(attackTrainData[['user_id', 'item_id', 'rating']], reader)
attackTrainset = attacktrain_data.build_full_trainset()

### User based KNN recommender system model

In [16]:
# - https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=KNNBaseline#use-a-custom-dataset
userBasedKNN = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': True})
userBasedKNN.fit(trainset)
b4TestDf = testDf.copy()

prediction = []
for index, row in b4TestDf.iterrows():
    pred = userBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
    prediction.append(pred[3])
#print('b4 prediction: ', prediction)
    
b4TestDf['prediction'] = prediction

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [17]:
top10B4 = get_top_n(b4TestDf)
for user in top10B4.keys():
    if user < 16:
        print("User ", user, " : ", top10B4[user])

User  1.0  :  [114.0, 64.0, 272.0, 174.0, 98.0, 134.0, 100.0, 12.0, 190.0, 56.0]
User  2.0  :  [50.0, 313.0, 251.0, 19.0, 315.0, 316.0, 297.0, 303.0, 257.0, 298.0]
User  3.0  :  [318.0, 272.0, 345.0, 307.0, 300.0, 328.0, 327.0, 332.0, 331.0, 343.0]
User  4.0  :  [50.0, 357.0, 303.0, 354.0, 288.0, 361.0, 294.0, 356.0, 264.0, 260.0]
User  5.0  :  [173.0, 89.0, 445.0, 176.0, 100.0, 1.0, 98.0, 42.0, 79.0, 185.0]
User  6.0  :  [480.0, 318.0, 483.0, 515.0, 513.0, 488.0, 528.0, 479.0, 187.0, 134.0]
User  7.0  :  [174.0, 223.0, 127.0, 511.0, 483.0, 661.0, 182.0, 543.0, 185.0, 657.0]
User  8.0  :  [50.0, 172.0, 127.0, 183.0, 79.0, 651.0, 511.0, 210.0, 190.0, 89.0]
User  9.0  :  [487.0, 479.0, 527.0, 521.0, 691.0, 298.0, 507.0, 340.0, 6.0, 286.0]
User  10.0  :  [483.0, 474.0, 127.0, 98.0, 603.0, 488.0, 493.0, 56.0, 100.0, 199.0]
User  11.0  :  [190.0, 100.0, 735.0, 12.0, 524.0, 191.0, 194.0, 22.0, 736.0, 740.0]
User  12.0  :  [318.0, 50.0, 196.0, 172.0, 96.0, 28.0, 191.0, 735.0, 204.0, 82.0]
Use

In [18]:
attackUserBasedKNN = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': True})
attackUserBasedKNN.fit(attackTrainset)
attackTestDf = testDf.copy()

prediction = []
for index, row in attackTestDf.iterrows():
    pred = attackUserBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
    prediction.append(pred[3])
    
attackTestDf['prediction'] = prediction
#print('prediction after: ', prediction)
attackTop10 = get_top_n(attackTestDf)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [19]:
for user in attackTop10.keys():
    if user < 16:
        print("User ", user, " : ", attackTop10[user])

User  1.0  :  [114.0, 64.0, 272.0, 174.0, 100.0, 98.0, 12.0, 134.0, 190.0, 56.0]
User  2.0  :  [50.0, 313.0, 251.0, 19.0, 315.0, 316.0, 297.0, 303.0, 257.0, 298.0]
User  3.0  :  [318.0, 272.0, 345.0, 307.0, 300.0, 328.0, 331.0, 332.0, 327.0, 343.0]
User  4.0  :  [50.0, 357.0, 303.0, 354.0, 288.0, 361.0, 294.0, 356.0, 264.0, 260.0]
User  5.0  :  [173.0, 89.0, 176.0, 100.0, 1.0, 98.0, 42.0, 79.0, 185.0, 429.0]
User  6.0  :  [480.0, 318.0, 483.0, 515.0, 488.0, 479.0, 528.0, 513.0, 134.0, 187.0]
User  7.0  :  [174.0, 223.0, 127.0, 511.0, 483.0, 185.0, 661.0, 657.0, 166.0, 182.0]
User  8.0  :  [50.0, 172.0, 127.0, 183.0, 651.0, 79.0, 511.0, 210.0, 190.0, 89.0]
User  9.0  :  [479.0, 487.0, 527.0, 521.0, 691.0, 298.0, 507.0, 340.0, 6.0, 286.0]
User  10.0  :  [483.0, 474.0, 127.0, 603.0, 98.0, 488.0, 199.0, 100.0, 56.0, 493.0]
User  11.0  :  [190.0, 100.0, 735.0, 191.0, 12.0, 194.0, 524.0, 736.0, 22.0, 740.0]
User  12.0  :  [318.0, 50.0, 172.0, 196.0, 96.0, 28.0, 191.0, 735.0, 204.0, 82.0]
Use

In [20]:
allUsersPredShift, targetUserPredShift = prediction_shift(b4TestDf, attackTestDf, target_users, testDf)
print("Prediction shift - Target users: ", targetUserPredShift)
print("Prediction shift - All users: ", allUsersPredShift)

Number of target users in test: 2228, uniq: 40
diff sum:  58.2863687449364
count:  19999  uniq:  459
Prediction shift - Target users:  0.0006773320921356566
Prediction shift - All users:  0.0029144641604548424


In [21]:
topNRecAllUsersWithTargetsB4 = filterRecsByTargetItem(top10B4, target_items)
topNRecAllUsersWithTargets = filterRecsByTargetItem(attackTop10, target_items)

print(f'Number of users with targets: {len(topNRecAllUsersWithTargets)}')
print(f'Number of users with targets before attack: {len(topNRecAllUsersWithTargetsB4)}')

hitRatioPerItem = getHitRatioPerItem(attackTop10, target_items)
print("hitRatioPerItem: ", hitRatioPerItem)
avgHitRatio = getAvgHitRatio(hitRatioPerItem)
print("\navgHitRatio after attack: ", avgHitRatio)

Number of users with targets: 6
Number of users with targets before attack: 2
hitRatioPerItem:  {868: 0.004357298474945534, 1162: 0.002178649237472767, 927: 0.004357298474945534, 1521: 0.002178649237472767, 1301: 0.0, 1191: 0.0}

avgHitRatio after attack:  0.002178649237472767


### Item based KNN recommender system model

In [26]:
itemBasedKNN = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': False})
itemBasedKNN.fit(trainset)

itemTestDf = testDf.copy()
prediction = []

for index, row in itemTestDf.iterrows():
    pred = itemBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
    prediction.append(pred[3])
    
itemTestDf['prediction'] = prediction

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [27]:
topItem10 = get_top_n(itemTestDf)
for user in topItem10.keys():
    if user < 16:
        print("User ", user, " : ", topItem10[user])

User  1.0  :  [170.0, 174.0, 272.0, 64.0, 134.0, 100.0, 12.0, 60.0, 190.0, 98.0]
User  2.0  :  [50.0, 315.0, 251.0, 19.0, 313.0, 297.0, 316.0, 303.0, 292.0, 298.0]
User  3.0  :  [318.0, 345.0, 272.0, 307.0, 331.0, 354.0, 348.0, 335.0, 327.0, 334.0]
User  4.0  :  [303.0, 357.0, 361.0, 50.0, 354.0, 288.0, 356.0, 260.0, 264.0, 294.0]
User  5.0  :  [89.0, 176.0, 173.0, 100.0, 144.0, 1.0, 79.0, 69.0, 109.0, 429.0]
User  6.0  :  [483.0, 515.0, 318.0, 513.0, 488.0, 357.0, 134.0, 480.0, 478.0, 199.0]
User  7.0  :  [643.0, 483.0, 174.0, 172.0, 223.0, 657.0, 528.0, 8.0, 127.0, 191.0]
User  8.0  :  [50.0, 511.0, 183.0, 172.0, 79.0, 176.0, 127.0, 651.0, 89.0, 190.0]
User  9.0  :  [479.0, 527.0, 487.0, 521.0, 340.0, 298.0, 507.0, 6.0, 691.0, 286.0]
User  10.0  :  [483.0, 603.0, 98.0, 474.0, 64.0, 127.0, 191.0, 488.0, 199.0, 651.0]
User  11.0  :  [190.0, 735.0, 191.0, 216.0, 12.0, 22.0, 100.0, 194.0, 429.0, 736.0]
User  12.0  :  [50.0, 196.0, 318.0, 735.0, 28.0, 282.0, 172.0, 143.0, 96.0, 15.0]
User

In [30]:
attackItemBasedKNN = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': False})
attackItemBasedKNN.fit(attackTrainset)
attackTestDf = testDf.copy()

prediction = []
for index, row in attackTestDf.iterrows():
    pred = attackItemBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
    prediction.append(pred[3])
    
attackTestDf['prediction'] = prediction
#print('prediction after: ', prediction)
attackItemTop10 = get_top_n(attackTestDf)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [31]:
allUsersPredShift, targetUserPredShift = prediction_shift(b4TestDf, attackTestDf, target_users, testDf)
print("Prediction shift - Target users: ", targetUserPredShift)
print("Prediction shift - All users: ", allUsersPredShift)

Number of target users in test: 2228, uniq: 40
diff sum:  642.2561006161402
count:  19999  uniq:  459
Prediction shift - Target users:  0.024533410919141473
Prediction shift - All users:  0.03211441075134458


In [32]:
topNRecAllUsersWithTargetsB4 = filterRecsByTargetItem(topItem10, target_items)
topNRecAllUsersWithTargets = filterRecsByTargetItem(attackItemTop10, target_items)

print(f'Number of users with targets: {len(topNRecAllUsersWithTargets)}')
print(f'Number of users with targets before attack: {len(topNRecAllUsersWithTargetsB4)}')

hitRatioPerItem = getHitRatioPerItem(attackItemTop10, target_items)
print("hitRatioPerItem: ", hitRatioPerItem)
avgHitRatio = getAvgHitRatio(hitRatioPerItem)
print("\navgHitRatio after attack: ", avgHitRatio)

Number of users with targets: 5
Number of users with targets before attack: 0
hitRatioPerItem:  {868: 0.004357298474945534, 1162: 0.004357298474945534, 927: 0.0, 1521: 0.002178649237472767, 1301: 0.0, 1191: 0.0}

avgHitRatio after attack:  0.0018155410312273058
