In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [3]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [4]:
schema_ratings = StructType([
    StructField("user_id", IntegerType(), False),
    StructField("item_id", IntegerType(), False),
    StructField("rating", IntegerType(), False),
    StructField("timestamp", IntegerType(), False)])

schema_items = StructType([
    StructField("item_id", IntegerType(), False),
    StructField("movie", StringType(), False)])

training = spark.read.option("sep", "\t").csv("../data/MovieLens.training", header=False, schema=schema_ratings)
test = spark.read.option("sep", "\t").csv("../data/MovieLens.test", header=False, schema=schema_ratings)
items = spark.read.option("sep", "|").csv("../data/MovieLens.item", header=False, schema=schema_items)

In [22]:
trainDf = training.toPandas()
testDf = test.toPandas()
num_users, num_items = len(trainDf.user_id.unique()), len(trainDf.item_id.unique())
max_uid, max_iid = trainDf.user_id.unique().max(), trainDf.item_id.unique().max()
print(f'num_users: {num_users} num_items : {num_items}')
print(f'max_user_id: ', max_uid, " max_item_id: ", max_iid)

num_users: 943 num_items : 1650
max_user_id:  943  max_item_id:  1682


In [339]:
NUM_SEL_ITEMS = 3
NUM_FILLER_ITEMS = 90

In [340]:
# - Create popular selected item list
item_ratings = dict(trainDf.groupby('item_id').size())
item_pop = [0] * (max_iid + 1)

for item_id in item_ratings.keys():
    item_pop[item_id] = item_ratings[item_id]
    
items_sorted = np.array(item_pop).argsort()[::-1]
selected_items = items_sorted[:NUM_SEL_ITEMS]
print("selected_items: ", selected_items)

selected_items:  [ 50 181 258]


In [341]:
trainDf.groupby('item_id', as_index=False).agg(
    rating_sum= ('rating', 'count')).sort_values('rating_sum', ascending=False).head(5)

Unnamed: 0,item_id,rating_sum
49,50,484
180,181,422
257,258,402
99,100,395
293,294,394


In [342]:
# - Select target item
target_items = [j for i in range(2, 10) for j in
                    items_sorted[i * len(items_sorted) // 10:(i * len(items_sorted) // 10) + 2]][::-1]
target_items = list(
    np.random.choice([i for i in range(len(item_pop)) if item_pop[i] == 3], 4, replace=False)) + target_items
target_items += [243]
print('target_items:', target_items)
print('target_items rating count: ', [(i, item_pop[i]) for i in target_items])

target_items: [1102, 1517, 1247, 104, 1564, 1647, 1301, 927, 917, 1276, 1057, 1029, 1137, 1045, 1142, 224, 277, 145, 421, 699, 243]
target_items rating count:  [(1102, 3), (1517, 3), (1247, 3), (104, 3), (1564, 1), (1647, 1), (1301, 4), (927, 4), (917, 7), (1276, 7), (1057, 12), (1029, 12), (1137, 21), (1045, 21), (1142, 34), (224, 34), (277, 52), (145, 52), (421, 79), (699, 79), (243, 108)]


In [343]:
# - create target users
threshold = testDf.rating.mean()
threshold = threshold if threshold < 3 else 3.0
print(f'threshold: {threshold}')
    
target_item = 243
target_rated = set(trainDf[trainDf.item_id == target_item].user_id.values)
data_tmp = trainDf[~trainDf.user_id.isin(target_rated)].copy()
data_tmp = data_tmp[data_tmp.rating >= threshold]

target_users = data_tmp[data_tmp.item_id.isin(selected_items)].groupby('user_id').size()
print("target_users[(target_users == selected_num)].shape[0]: ", 
      target_users[(target_users == NUM_SEL_ITEMS)].shape[0])
target_users = sorted(target_users[(target_users == NUM_SEL_ITEMS)].index)
target_users

threshold: 3.0
target_users[(target_users == selected_num)].shape[0]:  140


[57,
 85,
 104,
 116,
 151,
 177,
 185,
 222,
 249,
 250,
 263,
 294,
 301,
 313,
 323,
 328,
 334,
 345,
 347,
 350,
 354,
 363,
 395,
 402,
 411,
 413,
 416,
 417,
 429,
 430,
 435,
 447,
 450,
 453,
 455,
 456,
 459,
 464,
 465,
 466,
 468,
 470,
 472,
 474,
 479,
 483,
 484,
 486,
 487,
 493,
 497,
 500,
 504,
 506,
 507,
 509,
 513,
 517,
 521,
 523,
 528,
 535,
 538,
 540,
 541,
 545,
 546,
 548,
 549,
 555,
 560,
 564,
 580,
 582,
 584,
 588,
 595,
 596,
 601,
 606,
 619,
 623,
 624,
 625,
 632,
 642,
 653,
 654,
 659,
 661,
 671,
 689,
 693,
 698,
 703,
 706,
 708,
 711,
 714,
 735,
 738,
 742,
 748,
 749,
 756,
 757,
 758,
 759,
 760,
 770,
 773,
 790,
 805,
 807,
 815,
 826,
 834,
 838,
 839,
 843,
 844,
 846,
 862,
 864,
 867,
 870,
 871,
 875,
 878,
 889,
 890,
 896,
 899,
 902,
 907,
 913,
 922,
 924,
 931,
 940]

In [344]:
# - Get ratings mean and rating std
stdDf = trainDf.groupby('item_id', as_index=False).agg(rating_mean= ('rating', 'mean'), rating_std= ('rating', 'std'), 
                                  rating_count = ('rating', 'count'))
item_mean = dict(zip(stdDf.item_id,stdDf.rating_mean))
item_std = dict(zip(stdDf.item_id,stdDf.rating_std))
rating_mean, rating_std = trainDf.rating.mean(), trainDf.rating.std()
print(f'rating_mean: {rating_mean} rating_std: {rating_std}')

rating_mean: 3.52835 rating_std: 1.118564668374818


In [345]:
# - get filler items
from random import randrange

class FakeProfile(object): 
    MAX_RATING = 5
    
    def __init__(self, target_item, 
                 filler_item_count = 70):
        self.target_item  = target_item
        self.filler_item_count = filler_item_count
        self.selected_items = {}
        self.filler_items = {}
        
    def setSelectedItems(self, selectedItems):
        #selectedItems = freqRatedItems[np.random.choice(len(freqRatedItems), size=self.selected_items_count, replace=False)]
        for item in selectedItems:
            self.selected_items[item] = self.MAX_RATING
    
    def fillerItems(self, selectedItems):
        fillers_candidates = list(set(trainDf.item_id.unique()) - set([self.target_item] + selectedItems))
        fillers = np.random.choice(fillers_candidates, size=self.filler_item_count, replace=False)
        ratings = np.random.normal(loc=rating_mean, scale=rating_std, size=self.filler_item_count)
        for item, rating in zip(fillers, ratings):
            self.filler_items[item] = rating
                
    def create(self, selectedItems):
        self.setSelectedItems(selectedItems)
        self.fillerItems(selectedItems)
        
    def print(self):
        print(f'target_item : {self.target_item}')
        print(f'selected_items : {self.selected_items}')
        print(f'filler_items : {self.filler_items}')
        print("\n")
        
    def getAllItemRatings(self):
        itemRatings = [(self.target_item, self.MAX_RATING)]
        for item in self.selected_items:
            itemRatings.append((item, self.selected_items[item]))
        for item in self.filler_items:
            itemRatings.append((item, self.filler_items[item]))
        return itemRatings

In [365]:
NUM_FAKE_USERS = 70
fake_profiles = []

for u in range(NUM_FAKE_USERS):
    fp = FakeProfile(target_item)
    fp.create(selected_items)
    fake_profiles.append(fp)
    
for fp in fake_profiles:
    fp.print()

target_item : 243
selected_items : {50: 5, 181: 5, 258: 5}
filler_items : {1135: 4.092222562377607, 1636: 3.2257575650730606, 246: 4.290588925005136, 211: 2.9914553100499814, 194: 3.247931822784448, 1620: 5.163043805422188, 33: 4.445985494558568, 633: 2.4788446602161915, 1434: 3.0983690559526997, 1127: 3.9969069872374163, 979: 1.9373988479185917, 361: 4.088259352900344, 230: 3.567213687144892, 9: 2.167021797009584, 1259: 2.982647442857739, 1587: 1.9553402472440529, 273: 4.5489247337725525, 1420: 3.1225495312449003, 1119: 4.305854983110574, 774: 3.6895041281522327, 1676: 5.0141102352849405, 855: 4.178705855678049, 550: 2.788987307082859, 927: 2.454291371734013, 1196: 3.970515155386866, 1639: 3.3057149847914977, 1500: 4.276682679836125, 1606: 4.376420953698456, 1467: 4.3073438902967816, 1439: 5.69109094895015, 1484: 3.398012824763863, 1412: 3.554849595908315, 1028: 3.3210438387310433, 998: 4.044652838806465, 1332: 2.866804669097862, 932: 3.2676474628343297, 380: 3.7807097594696586, 398: 

In [366]:
# - Create attack data frame
userId = 1100
timestamp = 874965758
fakeRatingsdata = {'userId': [], 'item_id': [], 'ratings': [], 'timestamp': []}
for fp in fake_profiles:
    userId += 1
    itemRatings = fp.getAllItemRatings()
    for itemRatingPair in itemRatings:
        fakeRatingsdata['userId'].append(userId)
        fakeRatingsdata['item_id'].append(itemRatingPair[0])
        fakeRatingsdata['ratings'].append(itemRatingPair[1])
        fakeRatingsdata['timestamp'].append(timestamp)
        
columnsZipped = zip(fakeRatingsdata['userId'], fakeRatingsdata['item_id'],
                   fakeRatingsdata['ratings'], fakeRatingsdata['timestamp'])
attackDataDf = pd.DataFrame(list(columnsZipped),
               columns =['user_id', 'item_id', 'rating', 'timestamp'])
attackDataDf

Unnamed: 0,user_id,item_id,rating,timestamp
0,1101,243,5.000000,874965758
1,1101,50,5.000000,874965758
2,1101,181,5.000000,874965758
3,1101,258,5.000000,874965758
4,1101,1135,4.092223,874965758
...,...,...,...,...
5175,1170,549,4.686273,874965758
5176,1170,32,2.230388,874965758
5177,1170,349,1.981155,874965758
5178,1170,182,2.162999,874965758


In [367]:
attackTrainData = pd.concat([trainDf, attackDataDf]).sort_values(by=['user_id', 'item_id'])
attackTrainData

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5.000000,874965758
1,1,2,3.000000,876893171
2,1,3,4.000000,878542960
3,1,4,3.000000,876893119
4,1,5,3.000000,889751712
...,...,...,...,...
5113,1170,1567,3.659204,874965758
5131,1170,1607,4.994548,874965758
5160,1170,1622,4.170744,874965758
5154,1170,1627,4.496605,874965758


<h3>Evaluation</h3>

<h4>Base model</h4>

In [368]:
# 0.1
als = ALS(maxIter=10, rank=100, regParam=0.1, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

0.9283872090543989

In [369]:
userRecs = model.recommendForAllUsers(10)
userRecs = userRecs.toPandas()

In [370]:
count = 0
for index, row in userRecs.iterrows():
    recommendations = [r['item_id'] for r in row['recommendations']]
    if target_item in recommendations:
        print(row['user_id'], recommendations)
        count += 1
print(f'Total users with {target_item}: {count}')

Total users with 243: 0


<h4>Model with train data + attack data</h4>

In [371]:
# 0.1
attackDF = spark.createDataFrame(attackTrainData)
als_atk = ALS(maxIter=10, rank=100, regParam=0.1, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop")
model_atk = als_atk.fit(attackDF)
predictions_atk = model_atk.transform(test)
evaluator_atk = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse_atk = evaluator_atk.evaluate(predictions_atk)
rmse_atk

0.9268555596684868

In [372]:
np.mean(predictions_atk.toPandas().iloc[target_users].values - predictions.toPandas().iloc[target_users].values)

-0.0005520679269518171

In [373]:
userRecs = model_atk.recommendForAllUsers(10)
userRecs = userRecs.toPandas()

In [374]:
count = 0
for index, row in userRecs.iterrows():
    recommendations = [r['item_id'] for r in row['recommendations']]
    if target_item in recommendations:
        print(row['user_id'], recommendations)
        count += 1
print(f'Total users with {target_item}: {count}')

471 [932, 1625, 1594, 102, 477, 8, 258, 422, 243, 82]
1143 [1621, 50, 1131, 1570, 1467, 1004, 181, 1631, 258, 243]
1163 [587, 1570, 1196, 363, 1396, 1187, 333, 258, 1675, 243]
1148 [1570, 1675, 362, 243, 50, 1312, 258, 181, 454, 1449]
1114 [793, 896, 1570, 682, 50, 243, 1025, 1336, 181, 258]
1145 [1548, 1298, 1442, 491, 243, 1142, 1570, 50, 258, 1402]
1137 [788, 346, 1131, 1673, 50, 243, 1570, 1662, 1192, 1641]
1144 [607, 1631, 1594, 1004, 1570, 50, 538, 1675, 243, 1449]
1167 [1211, 1570, 1285, 1329, 1288, 243, 1662, 1594, 1110, 50]
1166 [1131, 1534, 1300, 388, 258, 1625, 50, 1511, 1662, 243]
1151 [1247, 1676, 50, 181, 1570, 1124, 243, 1442, 649, 258]
1119 [1472, 1299, 1570, 279, 1449, 50, 258, 1131, 836, 243]
1124 [1043, 962, 1523, 1144, 60, 1570, 1004, 243, 50, 138]
1123 [1662, 1015, 838, 1654, 618, 1675, 243, 987, 181, 50]
1169 [1681, 1496, 837, 50, 1131, 181, 243, 1462, 1530, 1467]
1104 [1631, 1594, 1589, 1358, 1678, 243, 50, 1131, 181, 1442]
261 [1662, 1570, 272, 243, 866, 64, 159

In [356]:
len(testDf.user_id.unique())

459

In [357]:
35/459

0.07625272331154684