## Contents:
* [Load Data](#load-data)
* [Target Item and Users](#target-item-user)
* [Create Fake Profile](#create-fake-data)
* [Evaluation](#eval-attack)
 * [Prediction Shift](#pred-shift)
 * [Hit Ratio](#hit-ratio)
 




In [512]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [3]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

## Load Data <a class="anchor" id="load-data"></a>

In [513]:
schema_ratings = StructType([
    StructField("user_id", IntegerType(), False),
    StructField("item_id", IntegerType(), False),
    StructField("rating", IntegerType(), False),
    StructField("timestamp", IntegerType(), False)])

schema_items = StructType([
    StructField("item_id", IntegerType(), False),
    StructField("movie", StringType(), False)])

training = spark.read.option("sep", "\t").csv("../data/MovieLens.training", header=False, schema=schema_ratings)
test = spark.read.option("sep", "\t").csv("../data/MovieLens.test", header=False, schema=schema_ratings)
items = spark.read.option("sep", "|").csv("../data/MovieLens.item", header=False, schema=schema_items)

In [514]:
trainDf = training.toPandas()
testDf = test.toPandas()
num_users, num_items = len(trainDf.user_id.unique()), len(trainDf.item_id.unique())
max_uid, max_iid = trainDf.user_id.unique().max(), trainDf.item_id.unique().max()
print(f'num_users: {num_users} num_items : {num_items}')
print(f'max_user_id: ', max_uid, " max_item_id: ", max_iid)

num_users: 943 num_items : 1650
max_user_id:  943  max_item_id:  1682


## Target Item and Users <a class="anchor" id="target-item-user"></a>

In [515]:
NUM_SEL_ITEMS = 3
NUM_FILLER_ITEMS = 90

In [516]:
# - Create popular selected item list
item_ratings = dict(trainDf.groupby('item_id').size())
item_pop = [0] * (max_iid + 1)

for item_id in item_ratings.keys():
    item_pop[item_id] = item_ratings[item_id]
    
items_sorted = np.array(item_pop).argsort()[::-1]
selected_items = items_sorted[:NUM_SEL_ITEMS]
print("selected_items: ", selected_items)

selected_items:  [ 50 181 258]


In [517]:
trainDf.groupby('item_id', as_index=False).agg(
    rating_sum= ('rating', 'count')).sort_values('rating_sum', ascending=False).head(NUM_SEL_ITEMS)

Unnamed: 0,item_id,rating_sum
49,50,484
180,181,422
257,258,402


In [633]:
# - Select target item
target_items = [j for i in range(8, 10) for j in
                    items_sorted[i * len(items_sorted) // 10:(i * len(items_sorted) // 10) + 2]][::-1]
target_items = list(
    np.random.choice([i for i in range(len(item_pop)) if item_pop[i] == 3], 3, replace=False)) + target_items


print('target_items before filtering with test :', target_items)
target_items = list(testDf[testDf.item_id.isin(target_items)].item_id.unique())
target_items += [1191]

# -- HARD CODING VALUES HERE 
target_items = [868, 1162, 927, 1521, 1301, 1191]
print('target_items:', target_items)
print('target_items rating count: ', [(i, item_pop[i]) for i in target_items])


target_items before filtering with test : [1162, 1521, 868, 1564, 1647, 1301, 927]
target_items: [868, 1162, 927, 1521, 1301, 1191]
target_items rating count:  [(868, 3), (1162, 3), (927, 4), (1521, 3), (1301, 4), (1191, 3)]


In [634]:
# - create target users
threshold = testDf.rating.mean()
threshold = threshold if threshold < 3 else 3.0
print(f'threshold: {threshold}')
    
target_item = 243
users_rated_target = set(trainDf[trainDf.item_id.isin(target_items)].user_id.values)
# - Users who have not rated target item
data_tmp = trainDf[~trainDf.user_id.isin(users_rated_target)].copy()
#data_tmp = data_tmp[data_tmp.rating >= threshold]

# - Users who have not rated target item and have rated selected_items
target_users = data_tmp[data_tmp.item_id.isin(selected_items)].groupby('user_id').size()


print("target_users[(target_users == selected_num)].shape[0]: ", 
       target_users[(target_users == NUM_SEL_ITEMS)].shape[0])
target_users = sorted(target_users[(target_users == NUM_SEL_ITEMS)].index)
target_users
print("target_users: ", len(target_users))

threshold: 3.0
target_users[(target_users == selected_num)].shape[0]:  190
target_users:  190


In [635]:
# - Get ratings mean and rating std
stdDf = trainDf.groupby('item_id', as_index=False).agg(rating_mean= ('rating', 'mean'), rating_std= ('rating', 'std'), 
                                  rating_count = ('rating', 'count'))
item_mean = dict(zip(stdDf.item_id,stdDf.rating_mean))
item_std = dict(zip(stdDf.item_id,stdDf.rating_std))
rating_mean, rating_std = trainDf.rating.mean(), trainDf.rating.std()
print(f'rating_mean: {rating_mean} rating_std: {rating_std}')

rating_mean: 3.52835 rating_std: 1.118564668374818


## Fake profile creation <a class="anchor" id="create-fake-data"></a>

In [636]:
# - get filler items
from random import randrange

class FakeProfile(object): 
    MAX_RATING = 5
    
    def __init__(self, target_items, 
                 filler_item_count = 70):
        self.target_items  = target_items
        self.filler_item_count = filler_item_count
        self.selected_items = {}
        self.filler_items = {}
        
    def setSelectedItems(self, selectedItems):
        #selectedItems = freqRatedItems[np.random.choice(len(freqRatedItems), size=self.selected_items_count, replace=False)]
        for item in selectedItems:
            self.selected_items[item] = self.MAX_RATING
    
    def fillerItems(self, selectedItems):
        targetSelItems = list(self.target_items)
        targetSelItems.extend(selectedItems)
        #print(targetSelItems)
        fillers_candidates = list(set(trainDf.item_id.unique()) - set(targetSelItems) )
        fillers = np.random.choice(fillers_candidates, size=self.filler_item_count, replace=False)
        ratings = np.round(np.random.normal(loc=rating_mean, scale=rating_std, size=self.filler_item_count), 1)
        for item, rating in zip(fillers, ratings):
            self.filler_items[item] = rating
                
    def create(self, selectedItems):
        self.setSelectedItems(selectedItems)
        self.fillerItems(selectedItems)
        
    def print(self):
        print(f'target_item : {self.target_items}')
        print(f'selected_items : {self.selected_items}')
        print(f'filler_items : {self.filler_items}')
        print("\n")
        
    def getAllItemRatings(self):
        itemRatings = [(item, self.MAX_RATING) for item in self.target_items]
        for item in self.selected_items:
            itemRatings.append((item, self.selected_items[item]))
        for item in self.filler_items:
            itemRatings.append((item, self.filler_items[item]))
        return itemRatings

In [637]:
NUM_FAKE_USERS = 50
fake_profiles = []

for u in range(NUM_FAKE_USERS):
    fp = FakeProfile(target_items)
    fp.create(selected_items)
    fake_profiles.append(fp)
    
for fp in fake_profiles:
    fp.print()

target_item : [868, 1162, 927, 1521, 1301, 1191]
selected_items : {50: 5, 181: 5, 258: 5}
filler_items : {1474: 2.5, 48: 1.6, 1051: 0.1, 1057: 2.8, 859: 2.2, 1139: 2.8, 476: 3.4, 1491: 3.3, 892: 1.3, 722: 4.4, 835: 2.2, 1410: 1.6, 1465: 4.8, 1212: 2.9, 838: 5.3, 1519: 4.8, 353: 4.0, 1067: 3.7, 1382: 3.7, 1518: 3.7, 520: 3.3, 686: 3.9, 325: 2.7, 322: 1.3, 1371: 1.4, 204: 2.1, 286: 3.4, 765: 3.8, 1218: 5.2, 104: 3.1, 863: 4.6, 702: 4.3, 182: 4.8, 736: 2.5, 291: 1.8, 619: 2.1, 1612: 1.9, 878: 3.5, 652: 1.5, 366: 3.6, 368: 3.8, 891: 3.9, 112: 2.1, 559: 3.5, 193: 3.6, 285: 5.3, 695: 3.9, 687: 3.0, 1148: 1.3, 969: 4.1, 921: 4.2, 1628: 3.3, 1678: 4.6, 1023: 1.5, 958: 2.3, 110: 2.4, 1043: 2.2, 160: 2.7, 713: 3.2, 1341: 3.3, 593: 5.4, 294: 2.4, 1441: 2.2, 432: 2.0, 685: 3.9, 80: 3.1, 1478: 4.5, 974: 3.6, 661: 2.8, 423: 3.3}


target_item : [868, 1162, 927, 1521, 1301, 1191]
selected_items : {50: 5, 181: 5, 258: 5}
filler_items : {55: 4.4, 740: 2.1, 77: 2.9, 1064: 4.7, 1347: 3.5, 280: 3.7, 714: 

In [638]:
# - Create attack data frame
userId = 1100
timestamp = 874965758
fakeRatingsdata = {'userId': [], 'item_id': [], 'ratings': [], 'timestamp': []}
for fp in fake_profiles:
    userId += 1
    itemRatings = fp.getAllItemRatings()
    for itemRatingPair in itemRatings:
        fakeRatingsdata['userId'].append(userId)
        fakeRatingsdata['item_id'].append(itemRatingPair[0])
        fakeRatingsdata['ratings'].append(itemRatingPair[1])
        fakeRatingsdata['timestamp'].append(timestamp)
        
columnsZipped = zip(fakeRatingsdata['userId'], fakeRatingsdata['item_id'],
                   fakeRatingsdata['ratings'], fakeRatingsdata['timestamp'])
attackDataDf = pd.DataFrame(list(columnsZipped),
               columns =['user_id', 'item_id', 'rating', 'timestamp'])
attackDataDf

Unnamed: 0,user_id,item_id,rating,timestamp
0,1101,868,5.0,874965758
1,1101,1162,5.0,874965758
2,1101,927,5.0,874965758
3,1101,1521,5.0,874965758
4,1101,1301,5.0,874965758
...,...,...,...,...
3945,1150,1189,3.7,874965758
3946,1150,1633,3.9,874965758
3947,1150,439,3.8,874965758
3948,1150,1496,4.3,874965758


In [639]:
attackTrainData = pd.concat([trainDf, attackDataDf]).sort_values(by=['user_id', 'item_id'])
attackTrainData

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5.0,874965758
1,1,2,3.0,876893171
2,1,3,4.0,878542960
3,1,4,3.0,876893119
4,1,5,3.0,889751712
...,...,...,...,...
3917,1150,1566,5.4,874965758
3946,1150,1633,3.9,874965758
3921,1150,1637,2.9,874965758
3916,1150,1679,5.1,874965758


## Evaluation <a class="anchor" id="eval-attack"></a>

### RMSE <a class="anchor" id="rmse"></a>

#### Model before attack data

In [525]:
# 0.1
als = ALS(maxIter=10, rank=100, regParam=0.1, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

0.9283872090543989

In [526]:
userRecs = model.recommendForAllUsers(10)
userRecs = userRecs.toPandas()

In [527]:
count = 0
for index, row in userRecs.iterrows():
    recommendations = [r['item_id'] for r in row['recommendations']]
    if target_item in recommendations:
        print(row['user_id'], recommendations)
        count += 1
print(f'Total users with {target_item}: {count}')

Total users with 243: 0


#### Model with train data + attack data

In [640]:
# 0.1
attackDF = spark.createDataFrame(attackTrainData)
als_atk = ALS(maxIter=10, rank=100, regParam=0.1, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop")
model_atk = als_atk.fit(attackDF)
predictions_atk = model_atk.transform(test)
evaluator_atk = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse_atk = evaluator_atk.evaluate(predictions_atk)
rmse_atk

0.9258617814201447

### Calculate prediction shift <a class="anchor" id="pred-shift"></a>

In [641]:
predAtk = predictions_atk.toPandas()
pred = predictions.toPandas()
targetUsersTest = testDf[testDf.user_id.isin(target_users)]
numTargetUsersInTest = len(targetUsersTest.user_id.unique())
print(f'Number of target users in test: {numTargetUsersInTest}')

Number of target users in test: 40


In [642]:
pred[pred.user_id.isin(target_users)]

Unnamed: 0,user_id,item_id,rating,timestamp,prediction
4,222,148,2,881061164,2.995661
10,328,148,3,885048638,3.160234
14,447,148,4,878854729,3.034730
38,313,148,2,891031979,2.723193
51,354,463,4,891217575,3.740919
...,...,...,...,...,...
19903,301,89,2,882076046,3.966234
19916,151,89,5,879524491,4.600976
19928,222,401,2,878184422,1.868581
19934,301,401,4,882078040,2.969263


<h4>Prediction shift across targetted users</h4>

In [643]:
predAttackTargetUser = predAtk[predAtk.user_id.isin(target_users)].sort_values(['user_id', 'item_id']).prediction
predTargetUser = pred[pred.user_id.isin(target_users)].sort_values(['user_id', 'item_id']).prediction

np.sum(predAttackTargetUser - predTargetUser)/numTargetUsersInTest

0.07029163837432861

<h4>Prediction across all users</h4>

In [644]:
predAfterAttack = predAtk.sort_values(['user_id', 'item_id']).prediction
predBeforeAttack = pred.sort_values(['user_id', 'item_id']).prediction

np.sum(predAfterAttack - predBeforeAttack)/len(testDf.user_id.unique())

0.3404002137693185

### Calculate Hit Ratio <a class="anchor" id="hit-ratio"></a>

In [645]:
testDf[testDf.item_id.isin(target_items)].item_id.unique().size

5

In [646]:
def getTopNRecommendations(m, testUserIds, n=10):
    recommendations = {}
    userRecs = m.recommendForAllUsers(10)
    userRecs = userRecs.toPandas()
    
    for index, row in userRecs.iterrows():
        if row['user_id'] in testUserIds:
            userRec = [r['item_id'] for r in row['recommendations']]
            recommendations[row['user_id']] = userRec 
    return recommendations

def filterRecsByTargetItem(recommendations, targetItems):
    recWithTargetItems = {}
    for user_id in recommendations.keys():
        topNRec = recommendations[user_id]
        is_target_item_present = any(item in topNRec for item in targetItems)
        if is_target_item_present:
            recWithTargetItems[user_id] = topNRec
            print(user_id, topNRec)
    
    return recWithTargetItems

In [647]:
testUserIds = testDf.user_id.unique()
topNRecAllUsersAtk = getTopNRecommendations(model_atk, testUserIds)
topNRecAllUsersWithTargets = filterRecsByTargetItem(topNRecAllUsersAtk, target_items)
print(f'Number of users with targets: {len(topNRecAllUsersWithTargets)}')

137 [50, 174, 181, 96, 1269, 868, 1301, 1515, 1191, 79]
300 [1515, 868, 1313, 1191, 1301, 927, 1124, 1521, 881, 922]
332 [169, 868, 313, 50, 318, 1301, 1191, 174, 64, 927]
388 [96, 1269, 1301, 927, 1191, 868, 50, 79, 174, 98]
372 [868, 1482, 1124, 1301, 1449, 1191, 927, 1220, 98, 1063]
319 [318, 316, 272, 1662, 313, 64, 246, 133, 1301, 1191]
355 [1515, 1662, 1301, 1191, 927, 1449, 50, 868, 313, 318]
348 [313, 927, 1301, 1191, 868, 126, 22, 1449, 318, 50]
164 [272, 318, 1443, 1449, 313, 927, 22, 1191, 868, 126]
169 [258, 64, 190, 313, 1137, 408, 50, 927, 1301, 1191]
283 [50, 169, 483, 12, 114, 1191, 64, 168, 172, 1301]
57 [64, 50, 313, 318, 181, 169, 174, 1191, 963, 927]
279 [1491, 1206, 502, 169, 868, 1191, 189, 862, 927, 1301]
257 [318, 315, 1449, 316, 251, 19, 285, 14, 1191, 515]
309 [1515, 258, 1301, 868, 1191, 331, 927, 1269, 1293, 1521]
262 [1449, 190, 131, 483, 699, 1191, 251, 318, 486, 813]
127 [230, 271, 228, 222, 901, 1269, 343, 258, 1521, 927]
88 [1515, 1449, 1191, 921, 1558,

In [648]:
topNRecAllUsersB4 = getTopNRecommendations(model, testUserIds)
topNRecAllUsersWithTargetsB4 = filterRecsByTargetItem(topNRecAllUsersB4, target_items)
print(f'Number of users with targets before attack: {len(topNRecAllUsersWithTargetsB4)}')

440 [190, 1449, 302, 1191, 408, 12, 1194, 513, 64, 242]
88 [1191, 1449, 286, 745, 311, 1466, 251, 537, 19, 921]
Number of users with targets before attack: 2


In [649]:
def getHitRatioPerItem(topNRecAllUsers, targetItems):
    hitRatioAllItems = {}
    
    for item in targetItems:
        usersWithItem = 0
        for user in topNRecAllUsers.keys():
            if item in topNRecAllUsers[user]:
                usersWithItem += 1
        hitRatio_i = usersWithItem/(len(topNRecAllUsers.keys()) * 1.0)
        hitRatioAllItems[item] = hitRatio_i
                                    
    return hitRatioAllItems  

In [650]:
hitRatioPerItem = getHitRatioPerItem(topNRecAllUsersAtk, target_items)
print("hitRatioPerItem: ", hitRatioPerItem)
sumHitRatio = 0
for hitRatio_i in hitRatioPerItem.values():
    sumHitRatio += hitRatio_i 
avgHitRatio = sumHitRatio/(len(hitRatioPerItem.keys()) * 1.0)
print("\navgHitRatio after attack: ", avgHitRatio)

hitRatioPerItem:  {868: 0.054466230936819175, 1162: 0.0, 927: 0.0718954248366013, 1521: 0.017429193899782137, 1301: 0.08278867102396514, 1191: 0.10239651416122005}

avgHitRatio after attack:  0.05482933914306464
