In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [2]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [4]:
schema_ratings = StructType([
    StructField("user_id", IntegerType(), False),
    StructField("item_id", IntegerType(), False),
    StructField("rating", IntegerType(), False),
    StructField("timestamp", IntegerType(), False)])

schema_items = StructType([
    StructField("item_id", IntegerType(), False),
    StructField("movie", StringType(), False)])

training = spark.read.option("sep", "\t").csv("../data/MovieLens.training", header=False, schema=schema_ratings)
test = spark.read.option("sep", "\t").csv("../data/MovieLens.test", header=False, schema=schema_ratings)
items = spark.read.option("sep", "|").csv("../data/MovieLens.item", header=False, schema=schema_items)

<h3>Base Model</h3>

In [5]:
# 0.1
als = ALS(maxIter=10, rank=100, regParam=0.1, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

0.9289473527471047

<h3>Data statistics</h3>

In [7]:
movieTrain = training.toPandas()
movieTrain

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
79995,943,1067,2,875501756
79996,943,1074,4,888640250
79997,943,1188,3,888640250
79998,943,1228,3,888640275


<h3>Select frequently rated items</h3>

In [45]:
# select frequently rated items
#.agg(min_height=('height', 'min'), 
#                               max_weight=('weight', 'max'))
freqRatedItemDf = movieTrain.groupby('item_id', as_index=False).agg(rating_mean= ('rating', 'mean'), rating_count= ('rating', 'count'), 
                                  rating_med = ('rating', 'median')).sort_values(by='rating_count', 
                                                            ascending = False).iloc[:60]
freqRatedItemDf

Unnamed: 0,item_id,rating_mean,rating_count,rating_med
49,50,4.359504,484,5.0
180,181,4.011848,422,4.0
257,258,3.823383,402,4.0
99,100,4.108861,395,4.0
293,294,3.164975,394,3.0
287,288,3.445013,391,4.0
285,286,3.641753,388,4.0
0,1,3.89295,383,4.0
120,121,3.390935,353,4.0
299,300,3.65625,352,4.0


In [54]:
# Get distinct item_ids : Range = 1 - 1681
uniqItemIds = sorted(movieTrain.item_id.unique())
print(f'uniqItemIds: {len(uniqItemIds)}')
print(f'first 5 uniqItemIds: {uniqItemIds[:5]}')
print(f'last 5 uniqItemIds: {uniqItemIds[-5:-1]}')

uniqItemIds: 1650
first 5 uniqItemIds: [1, 2, 3, 4, 5]
last 5 uniqItemIds: [1678, 1679, 1680, 1681]


In [59]:
freqRatedItems = freqRatedItemDf['item_id'].to_numpy()
freqRatedItems[np.random.choice(len(freqRatedItems), size=20, replace=False)]
target_item = 243
target_array = np.array([target_item])
if target_item in freqRatedItems:
    print("Target item present in freqRatedItems, hence remvoing")
    freqRatedItems = np.setdiff1d(freqRatedItems, target_array)
freqRatedItems

array([ 50, 181, 258, 100, 294, 288, 286,   1, 121, 300, 174, 127,  56,
       237,   7, 117,  98, 172, 222, 405, 204, 313, 210,  79, 173, 168,
       151,  69, 748, 269, 257,   9, 302,  96, 118, 195, 423, 328, 318,
        15, 276, 111,  25,  22, 176, 202,  89, 183,  64,  28, 216, 234,
       191, 275,  12, 289, 357, 546,  97,  82])

In [29]:
movieTrain.groupby('user_id').agg(rating_count= ('rating', 'count')).agg('median')
# - rating_count    50.0

rating_count    50.0
dtype: float64

<h3>Creating fake user profiles </h3><br/>
Number of fake user profiles 5% of 1k users = 50 <br/>
Target item = 243 <br/>
Profile size = 50 <br/>
selected_items = 40% = 20 <br/>
filler_items = 40% = 20 <br/>
unrated_items = 20% = 10 <br/>


In [82]:
from random import randrange

class FakeProfile(object): 
    MAX_RATING = 5
    
    def __init__(self, target_item, 
                 selected_items_count = 20, 
                 filler_item_count = 20):
        self.target_item  = target_item
        self.selected_items_count = selected_items_count
        self.filler_item_count = filler_item_count
        self.unrated_item_count = 10
        self.selected_items = {}
        self.filler_items = {}
        
    def setSelectedItems(self, freqRatedItems):
        selectedItems = freqRatedItems[np.random.choice(len(freqRatedItems), size=20, replace=False)]
        for item in selectedItems:
            self.selected_items[item] = self.MAX_RATING
    
    def fillerItems(self, item_start, item_end):
        while len(self.filler_items) < self.filler_item_count:
            randomItem = randrange(item_start, item_end)
            # - Use normal distribution to generate rating? np.random.normal?
            randomRating = randrange(1, 5)
            
            if randomItem not in self.selected_items:
                self.filler_items[randomItem] = randomRating
                
    def create(self, freqRatedItems, itemId_start, itemId_end):
        self.setSelectedItems(freqRatedItems)
        self.fillerItems(itemId_start, itemId_end)
        
    def print(self):
        print(f'target_item : {self.target_item}')
        print(f'selected_items : {self.selected_items}')
        print(f'filler_items : {self.filler_items}')
        print("\n")
        
    def getAllItemRatings(self):
        itemRatings = [(self.target_item, self.MAX_RATING)]
        for item in self.selected_items:
            itemRatings.append((item, self.selected_items[item]))
        for item in self.filler_items:
            itemRatings.append((item, self.filler_items[item]))
        return itemRatings
        
            

In [83]:
NUM_FAKE_USERS = 50
fake_profiles = []
target_item = 243
itemIdStart = 1 
itemIdEnd = 1681

for u in range(NUM_FAKE_USERS):
    fp = FakeProfile(target_item)
    fp.create(freqRatedItems, itemIdStart, itemIdEnd)
    fake_profiles.append(fp)
    
for fp in fake_profiles:
    fp.print()

target_item : 243
selected_items : {25: 5, 1: 5, 257: 5, 202: 5, 286: 5, 15: 5, 172: 5, 210: 5, 258: 5, 237: 5, 748: 5, 111: 5, 121: 5, 12: 5, 289: 5, 204: 5, 294: 5, 151: 5, 318: 5, 328: 5}
filler_items : {1115: 2, 10: 3, 352: 3, 1484: 3, 1525: 1, 828: 3, 1056: 4, 565: 1, 1083: 3, 234: 2, 1508: 4, 1368: 2, 1615: 2, 1334: 1, 1086: 2, 222: 1, 1618: 4, 591: 4, 17: 2, 1039: 2}


target_item : 243
selected_items : {183: 5, 357: 5, 173: 5, 191: 5, 237: 5, 288: 5, 12: 5, 1: 5, 423: 5, 9: 5, 300: 5, 202: 5, 28: 5, 69: 5, 302: 5, 222: 5, 172: 5, 151: 5, 748: 5, 269: 5}
filler_items : {7: 3, 886: 4, 924: 1, 243: 4, 1277: 3, 1212: 3, 261: 2, 720: 3, 763: 1, 466: 2, 24: 1, 1550: 2, 662: 3, 1356: 4, 597: 1, 1273: 4, 1366: 2, 779: 4, 482: 1, 1590: 2}


target_item : 243
selected_items : {204: 5, 111: 5, 7: 5, 289: 5, 257: 5, 269: 5, 183: 5, 222: 5, 12: 5, 191: 5, 313: 5, 117: 5, 546: 5, 328: 5, 748: 5, 286: 5, 121: 5, 181: 5, 237: 5, 97: 5}
filler_items : {753: 2, 1089: 3, 1480: 2, 903: 2, 322: 3, 

In [85]:
# - Create attack data frame
userId = 1100
timestamp = 874965758
fakeRatingsdata = {'userId': [], 'item_id': [], 'ratings': [], 'timestamp': []}
for fp in fake_profiles:
    userId += 1
    itemRatings = fp.getAllItemRatings()
    for itemRatingPair in itemRatings:
        fakeRatingsdata['userId'].append(userId)
        fakeRatingsdata['item_id'].append(itemRatingPair[0])
        fakeRatingsdata['ratings'].append(itemRatingPair[1])
        fakeRatingsdata['timestamp'].append(timestamp)
        
columnsZipped = zip(fakeRatingsdata['userId'], fakeRatingsdata['item_id'],
                   fakeRatingsdata['ratings'], fakeRatingsdata['timestamp'])
attackDf = pd.DataFrame(list(columnsZipped),
               columns =['user_id', 'item_id', 'rating', 'timestamp'])
attackDf

Unnamed: 0,user_id,item_id,rating,timestamp
0,1101,243,5,874965758
1,1101,25,5,874965758
2,1101,1,5,874965758
3,1101,257,5,874965758
4,1101,202,5,874965758
...,...,...,...,...
2045,1150,1151,3,874965758
2046,1150,531,1,874965758
2047,1150,1381,3,874965758
2048,1150,114,2,874965758


<h3>Evaluating attack</h3>

In [89]:
attack = pd.concat([movieTrain, attackDf]).sort_values(by=['user_id', 'item_id'])
attack

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
2049,1150,1354,1,874965758
2044,1150,1355,3,874965758
2047,1150,1381,3,874965758
2037,1150,1508,3,874965758


In [90]:
attackDF = spark.createDataFrame(attack)
attackDF

DataFrame[user_id: bigint, item_id: bigint, rating: bigint, timestamp: bigint]

In [92]:
# 0.1
als = ALS(maxIter=10, rank=100, regParam=0.1, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(attackDF)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

0.9319700196247223