In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [3]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [4]:
schema_ratings = StructType([
    StructField("user_id", IntegerType(), False),
    StructField("item_id", IntegerType(), False),
    StructField("rating", IntegerType(), False),
    StructField("timestamp", IntegerType(), False)])

schema_items = StructType([
    StructField("item_id", IntegerType(), False),
    StructField("movie", StringType(), False)])

training = spark.read.option("sep", "\t").csv("../data/MovieLens.training", header=False, schema=schema_ratings)
test = spark.read.option("sep", "\t").csv("../data/MovieLens.test", header=False, schema=schema_ratings)
items = spark.read.option("sep", "|").csv("../data/MovieLens.item", header=False, schema=schema_items)

In [22]:
trainDf = training.toPandas()
testDf = test.toPandas()
num_users, num_items = len(trainDf.user_id.unique()), len(trainDf.item_id.unique())
max_uid, max_iid = trainDf.user_id.unique().max(), trainDf.item_id.unique().max()
print(f'num_users: {num_users} num_items : {num_items}')
print(f'max_user_id: ', max_uid, " max_item_id: ", max_iid)

num_users: 943 num_items : 1650
max_user_id:  943  max_item_id:  1682


In [339]:
NUM_SEL_ITEMS = 3
NUM_FILLER_ITEMS = 90

In [340]:
# - Create popular selected item list
item_ratings = dict(trainDf.groupby('item_id').size())
item_pop = [0] * (max_iid + 1)

for item_id in item_ratings.keys():
    item_pop[item_id] = item_ratings[item_id]
    
items_sorted = np.array(item_pop).argsort()[::-1]
selected_items = items_sorted[:NUM_SEL_ITEMS]
print("selected_items: ", selected_items)

selected_items:  [ 50 181 258]


In [341]:
trainDf.groupby('item_id', as_index=False).agg(
    rating_sum= ('rating', 'count')).sort_values('rating_sum', ascending=False).head(5)

Unnamed: 0,item_id,rating_sum
49,50,484
180,181,422
257,258,402
99,100,395
293,294,394


In [482]:
# - Select target item
target_items = [j for i in range(8, 10) for j in
                    items_sorted[i * len(items_sorted) // 10:(i * len(items_sorted) // 10) + 2]][::-1]
target_items = list(
    np.random.choice([i for i in range(len(item_pop)) if item_pop[i] == 3], 2, replace=False)) + target_items
target_items += [243]
print('target_items:', target_items)
print('target_items rating count: ', [(i, item_pop[i]) for i in target_items])

target_items: [992, 1155, 1564, 1647, 1301, 927, 243]
target_items rating count:  [(992, 3), (1155, 3), (1564, 1), (1647, 1), (1301, 4), (927, 4), (243, 108)]


In [483]:
# - create target users
threshold = testDf.rating.mean()
threshold = threshold if threshold < 3 else 3.0
print(f'threshold: {threshold}')
    
target_item = 243
users_rated_target = set(trainDf[trainDf.item_id.isin(target_items)].user_id.values)
# - Users who have not rated target item
data_tmp = trainDf[~trainDf.user_id.isin(users_rated_target)].copy()
#data_tmp = data_tmp[data_tmp.rating >= threshold]

# - Users who have not rated target item and have rated selected_items
target_users = data_tmp[data_tmp.item_id.isin(selected_items)].groupby('user_id').size()
print("target_users[(target_users == selected_num)].shape[0]: ", 
       target_users[(target_users == NUM_SEL_ITEMS)].shape[0])
target_users = sorted(target_users[(target_users == NUM_SEL_ITEMS)].index)
target_users
print("target_users: ", len(target_users))

threshold: 3.0
target_users[(target_users == selected_num)].shape[0]:  160
target_users:  160


In [484]:
# - Get ratings mean and rating std
stdDf = trainDf.groupby('item_id', as_index=False).agg(rating_mean= ('rating', 'mean'), rating_std= ('rating', 'std'), 
                                  rating_count = ('rating', 'count'))
item_mean = dict(zip(stdDf.item_id,stdDf.rating_mean))
item_std = dict(zip(stdDf.item_id,stdDf.rating_std))
rating_mean, rating_std = trainDf.rating.mean(), trainDf.rating.std()
print(f'rating_mean: {rating_mean} rating_std: {rating_std}')

rating_mean: 3.52835 rating_std: 1.118564668374818


In [485]:
# - get filler items
from random import randrange

class FakeProfile(object): 
    MAX_RATING = 5
    
    def __init__(self, target_items, 
                 filler_item_count = 70):
        self.target_items  = target_items
        self.filler_item_count = filler_item_count
        self.selected_items = {}
        self.filler_items = {}
        
    def setSelectedItems(self, selectedItems):
        #selectedItems = freqRatedItems[np.random.choice(len(freqRatedItems), size=self.selected_items_count, replace=False)]
        for item in selectedItems:
            self.selected_items[item] = self.MAX_RATING
    
    def fillerItems(self, selectedItems):
        targetSelItems = list(self.target_items)
        targetSelItems.extend(selectedItems)
        print(targetSelItems)
        fillers_candidates = list(set(trainDf.item_id.unique()) - set(targetSelItems) )
        fillers = np.random.choice(fillers_candidates, size=self.filler_item_count, replace=False)
        ratings = np.round(np.random.normal(loc=rating_mean, scale=rating_std, size=self.filler_item_count), 1)
        for item, rating in zip(fillers, ratings):
            self.filler_items[item] = rating
                
    def create(self, selectedItems):
        self.setSelectedItems(selectedItems)
        self.fillerItems(selectedItems)
        
    def print(self):
        print(f'target_item : {self.target_items}')
        print(f'selected_items : {self.selected_items}')
        print(f'filler_items : {self.filler_items}')
        print("\n")
        
    def getAllItemRatings(self):
        itemRatings = [(item, self.MAX_RATING) for item in self.target_items]
        for item in self.selected_items:
            itemRatings.append((item, self.selected_items[item]))
        for item in self.filler_items:
            itemRatings.append((item, self.filler_items[item]))
        return itemRatings

In [486]:
NUM_FAKE_USERS = 50
fake_profiles = []

for u in range(NUM_FAKE_USERS):
    fp = FakeProfile(target_items)
    fp.create(selected_items)
    fake_profiles.append(fp)
    
for fp in fake_profiles:
    fp.print()

[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301, 927, 243, 50, 181, 258]
[992, 1155, 1564, 1647, 1301

In [487]:
# - Create attack data frame
userId = 1100
timestamp = 874965758
fakeRatingsdata = {'userId': [], 'item_id': [], 'ratings': [], 'timestamp': []}
for fp in fake_profiles:
    userId += 1
    itemRatings = fp.getAllItemRatings()
    for itemRatingPair in itemRatings:
        fakeRatingsdata['userId'].append(userId)
        fakeRatingsdata['item_id'].append(itemRatingPair[0])
        fakeRatingsdata['ratings'].append(itemRatingPair[1])
        fakeRatingsdata['timestamp'].append(timestamp)
        
columnsZipped = zip(fakeRatingsdata['userId'], fakeRatingsdata['item_id'],
                   fakeRatingsdata['ratings'], fakeRatingsdata['timestamp'])
attackDataDf = pd.DataFrame(list(columnsZipped),
               columns =['user_id', 'item_id', 'rating', 'timestamp'])
attackDataDf

Unnamed: 0,user_id,item_id,rating,timestamp
0,1101,992,5.0,874965758
1,1101,1155,5.0,874965758
2,1101,1564,5.0,874965758
3,1101,1647,5.0,874965758
4,1101,1301,5.0,874965758
...,...,...,...,...
3995,1150,346,3.4,874965758
3996,1150,302,2.7,874965758
3997,1150,520,3.0,874965758
3998,1150,988,3.0,874965758


In [488]:
attackTrainData = pd.concat([trainDf, attackDataDf]).sort_values(by=['user_id', 'item_id'])
attackTrainData

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5.0,874965758
1,1,2,3.0,876893171
2,1,3,4.0,878542960
3,1,4,3.0,876893119
4,1,5,3.0,889751712
...,...,...,...,...
3943,1150,1605,4.2,874965758
3938,1150,1611,4.7,874965758
3978,1150,1612,3.5,874965758
3923,1150,1647,5.0,874965758


<h3>Evaluation</h3>

<h4>Base model</h4>

In [368]:
# 0.1
als = ALS(maxIter=10, rank=100, regParam=0.1, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

0.9283872090543989

In [369]:
userRecs = model.recommendForAllUsers(10)
userRecs = userRecs.toPandas()

In [370]:
count = 0
for index, row in userRecs.iterrows():
    recommendations = [r['item_id'] for r in row['recommendations']]
    if target_item in recommendations:
        print(row['user_id'], recommendations)
        count += 1
print(f'Total users with {target_item}: {count}')

Total users with 243: 0


<h4>Model with train data + attack data</h4>

In [489]:
# 0.1
attackDF = spark.createDataFrame(attackTrainData)
als_atk = ALS(maxIter=10, rank=100, regParam=0.1, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop")
model_atk = als_atk.fit(attackDF)
predictions_atk = model_atk.transform(test)
evaluator_atk = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse_atk = evaluator_atk.evaluate(predictions_atk)
rmse_atk

0.9253753449349769

<h3>Calculate prediction shift</h3>

In [499]:
predAtk = predictions_atk.toPandas()
pred = predictions.toPandas()
targetUsersTest = testDf[testDf.user_id.isin(target_users)]
numTargetUsersInTest = len(targetUsersTest.user_id.unique())
print(f'Number of target users in test: {numTargetUsersInTest}')

Number of target users in test: 40


<h4>Prediction shift across targetted users</h4>

In [500]:
predAttackTargetUser = predAtk[predAtk.user_id.isin(target_users)].sort_values(['user_id', 'item_id']).prediction
predTargetUser = pred[pred.user_id.isin(target_users)].sort_values(['user_id', 'item_id']).prediction

np.sum(predAttackTargetUser - predTargetUser)/numTargetUsersInTest

0.09506261944770814

<h4>Prediction across all users</h4>

In [503]:
predAfterAttack = predAtk.sort_values(['user_id', 'item_id']).prediction
predBeforeAttack = pred.sort_values(['user_id', 'item_id']).prediction

np.sum(predAfterAttack - predBeforeAttack)/len(testDf.user_id.unique())

0.3056491029028799

In [504]:
userRecs = model_atk.recommendForAllUsers(10)
userRecs = userRecs.toPandas()

In [507]:
count = 0
user_ids = {}
for index, row in userRecs.iterrows():
    recommendations = [r['item_id'] for r in row['recommendations']]
    for target_item in target_items:
        if row['user_id'] < 1000 and row['user_id'] not in user_ids and target_item in recommendations:
            print(row['user_id'], recommendations)
            user_ids[row['user_id']] = recommendations
print(f'Total users with {target_items}: {len(user_ids)}')

897 [634, 50, 64, 318, 1155, 22, 913, 313, 98, 174]
580 [50, 181, 913, 927, 1647, 1301, 1155, 1564, 992, 608]
451 [873, 333, 331, 313, 328, 1155, 1301, 329, 1647, 22]
137 [50, 174, 181, 96, 144, 1269, 79, 1155, 1301, 1647]
53 [272, 64, 318, 313, 15, 628, 316, 1155, 282, 22]
472 [913, 1155, 1647, 50, 1301, 927, 1564, 169, 992, 181]
513 [313, 22, 174, 50, 520, 181, 1155, 1269, 210, 966]
642 [50, 181, 1155, 1647, 1564, 927, 966, 83, 992, 1301]
300 [881, 922, 1094, 1645, 1155, 1301, 1647, 56, 1262, 1564]
784 [313, 169, 50, 318, 408, 302, 258, 483, 1155, 22]
688 [169, 50, 1155, 313, 408, 927, 1647, 913, 1301, 1564]
577 [50, 22, 64, 313, 1155, 318, 966, 927, 181, 1647]
811 [313, 169, 189, 608, 1122, 318, 1155, 1449, 190, 300]
388 [96, 50, 1155, 1647, 1301, 927, 181, 12, 992, 64]
372 [913, 64, 98, 1449, 302, 1301, 1155, 313, 1647, 927]
319 [318, 316, 64, 272, 313, 913, 268, 246, 306, 1301]
599 [64, 313, 1278, 1269, 888, 255, 12, 1155, 917, 1647]
671 [50, 181, 96, 174, 927, 172, 1647, 313, 115

In [356]:
len(testDf.user_id.unique())

459

In [357]:
35/459

0.07625272331154684