In [2]:
import sys
import numpy as np
import random
import pandas as pd
import pickle
import time
from lenskit.algorithms.als import BiasedMF
from lenskit.batch import predict
from lenskit.metrics.predict import rmse

In [74]:
# Module-level switch: when True, alt_min prints per-round diagnostics.
verbose = False


class MatrixFactorization:
  """Base class for a low-rank rating model trained by alternating minimisation.

  Predictions are the dot product ``U[user] . V[item]``. Subclasses supply the
  per-row update direction through ``get_u_step`` and ``get_v_step``.

  Parameters
  ----------
  train, test : pandas.DataFrame
    Rating tables with "user" and "item" columns. After ``to_numpy()`` the
    code reads column 0 as the user id, column 1 as the item id and column 2
    as the rating, so the frames are presumably ordered (user, item, rating)
    with integer values -- TODO confirm against the data loader.
  num_items : int
    Number of rows allocated for the item factor matrix ``V``.
  num_factors : int
    Latent dimension of ``U`` and ``V``.
  lrate : float
    Multiplier applied to every step returned by ``get_*_step``.
  reg : float
    Regularisation strength, stored for use by subclasses.

  Notes
  -----
  ``U`` has one row per distinct training user and is indexed directly by
  user id, so training user ids are assumed to be exactly ``0..n-1``.
  """

  def __init__(self, train, test, num_items, num_factors, lrate, reg):
    self.num_items = num_items
    self.num_factors = num_factors
    self.lrate = lrate
    self.reg = reg

    # Two sorted copies of each split so that one user's (or one item's)
    # ratings form a contiguous slice.
    self.train_u = train.sort_values(by="user", axis=0).reset_index(drop=True).to_numpy()
    self.train_v = train.sort_values(by="item", axis=0).reset_index(drop=True).to_numpy()
    self.test_u = test.sort_values(by="user", axis=0).reset_index(drop=True).to_numpy()
    self.test_v = test.sort_values(by="item", axis=0).reset_index(drop=True).to_numpy()

    U_freqs = train.groupby("user").size().values
    U_freqs_test = test.groupby("user").size().values
    self.num_train_users = len(U_freqs)
    self.num_test_users = len(U_freqs_test)

    # V_index lists the items that actually occur in train; only those are
    # updated by alt_min.
    self.V_index, V_freqs = self._item_counts(train, num_items)
    _, V_freqs_test = self._item_counts(test, num_items)

    # start[k] : start[k + 1] is the slice of the k-th segment in the
    # correspondingly sorted array.
    self.U_start = np.insert(np.cumsum(U_freqs), 0, 0)
    self.U_start_test = np.insert(np.cumsum(U_freqs_test), 0, 0)
    self.V_start = np.insert(np.cumsum(V_freqs), 0, 0)
    self.V_start_test = np.insert(np.cumsum(V_freqs_test), 0, 0)

  @staticmethod
  def _item_counts(frame, num_items):
    """Return (item ids present in `frame`, dense per-item rating counts)."""
    counts = frame.groupby("item").size()
    present = counts.index.values
    freqs = np.zeros(num_items, dtype="int")
    # Cast only for indexing so an empty frame (object-typed index) still works.
    freqs[np.asarray(present, dtype=np.intp)] = counts.values
    return present, freqs

  def evaluate(self, test=True):
    """Return the RMSE of the current factors on the test (default) or train split.

    Returns NaN when the selected split has no ratings.
    """
    if test:
      data = self.test_u
      start = self.U_start_test
      num_users = self.num_test_users
    else:
      data = self.train_u
      start = self.U_start
      num_users = self.num_train_users

    if len(data) == 0:
      return float("nan")

    square_error = 0
    for segment in range(num_users):
      user_data = data[start[segment] : start[segment + 1]]
      # Segments are ordered by user id but are not guaranteed to be numbered
      # by it (a user may be absent from this split), so read the id from the
      # data instead of using the segment position.
      user = user_data[0, 0]
      square_error += np.sum((user_data[:, 2] - np.matmul(self.V[user_data[:, 1]], self.U[user])) ** 2)
    return np.sqrt(square_error / len(data))

  def evaluate_item(self, item):
    """Return the test RMSE restricted to `item` (NaN if it has no test ratings)."""
    data = self.test_v[self.V_start_test[item] : self.V_start_test[item + 1]]
    if len(data) == 0:
      return float("nan")
    return np.sqrt(np.sum((data[:, 2] - np.matmul(self.U[data[:, 0]], self.V[item])) ** 2) / len(data))

  def get_u_step(self, user):
    """Return the update direction for ``U[user]``; must be overridden."""
    raise NotImplementedError("subclasses must implement get_u_step")

  def get_v_step(self, item):
    """Return the update direction for ``V[item]``; must be overridden."""
    raise NotImplementedError("subclasses must implement get_v_step")

  def alt_min(self, num_iters=20, threshold=-0.001):
    """Fit ``U`` and ``V`` by alternating blocks of steps on each.

    Both matrices are re-initialised uniformly in [-1, 1). Each round runs
    `num_iters` sweeps over all users, then `num_iters` sweeps over the items
    present in train. Rounds continue while the RMSE change is below
    `threshold` (i.e. RMSE dropped by more than ``-threshold``).

    NOTE: the stopping criterion uses ``self.evaluate()``, which defaults to
    the *test* split.
    """
    self.U = np.random.uniform(-1, 1, (self.num_train_users, self.num_factors))
    self.V = np.random.uniform(-1, 1, (self.num_items, self.num_factors))

    cur_rmse = self.evaluate()
    prev_rmse = 1000  # sentinel so that the first round always runs
    rounds = 0

    while cur_rmse - prev_rmse < threshold:
      if verbose:
        t0 = time.time()
      prev_rmse = cur_rmse

      # Optimize U
      for _ in range(num_iters):
        for user in range(self.num_train_users):
          step = self.get_u_step(user)
          self.U[user] += self.lrate * step

      # Optimize V
      for _ in range(num_iters):
        for item in self.V_index:
          step = self.get_v_step(item)
          self.V[item] += self.lrate * step

      cur_rmse = self.evaluate()
      rounds += 1

      if verbose:
        t1 = time.time()
        train_rmse = self.evaluate(False)
        print("\n==================== ROUND {} ====================\nRMSE: {}\nPrev RMSE: {}\nDiff: {}\nTrain RMSE: {}\nExecution time: {}\n" \
          .format(rounds, round(cur_rmse, 4), round(prev_rmse, 4), round(cur_rmse - prev_rmse, 4), round(train_rmse, 4), round(t1 - t0, 2)))

    if verbose:
      print("max U: {}\nmin U: {}\navg U: {}\n".format(np.amax(self.U), np.amin(self.U), np.mean(self.U)))
      print("max V: {}\nmin V: {}\navg V: {}\n".format(np.amax(self.V), np.amin(self.V), np.mean(self.V)))
      print("Regularization: {}".format(self.reg))
      print("Final RMSE: {}\n".format(cur_rmse))

In [75]:
class LeastSquares(MatrixFactorization):
  """Squared-error factorisation: steps are averaged regularised residual gradients."""

  def __init__(self, train, test, num_items, num_factors=30, lrate=0.1, reg=0.1):
    super().__init__(train, test, num_items, num_factors, lrate, reg)

  def get_u_step(self, user):
    """Return the step for ``U[user]``.

    For each training rating of `user` the term is
    ``residual * V[item] - reg * U[user]``; the step is their mean.
    """
    lo, hi = self.U_start[user], self.U_start[user + 1]
    ratings = self.train_u[lo:hi]
    item_factors = self.V[ratings[:, 1]]
    residuals = ratings[:, 2] - item_factors @ self.U[user]
    per_rating = residuals[:, None] * item_factors - self.reg * self.U[user]
    return per_rating.mean(axis=0)

  def get_v_step(self, item):
    """Return the step for ``V[item]``.

    For each training rating of `item` the term is
    ``residual * U[user] - reg * V[item]``; the step is their mean.
    """
    lo, hi = self.V_start[item], self.V_start[item + 1]
    ratings = self.train_v[lo:hi]
    user_factors = self.U[ratings[:, 0]]
    residuals = ratings[:, 2] - user_factors @ self.V[item]
    per_rating = residuals[:, None] * user_factors - self.reg * self.V[item]
    return per_rating.mean(axis=0)

In [76]:
# Seed Python's and NumPy's global RNGs: they drive the filler-item sample,
# the sampled filler ratings and the factor initialisation inside alt_min.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
train = pd.read_pickle("../data/ml-1m-split/train.pkl").drop(["item_id", "timestamp"], axis=1)
test = pd.read_pickle("../data/ml-1m-split/test.pkl").drop(["item_id", "timestamp"], axis=1)
full = pd.read_pickle("../data/ml-1m-split/full.pkl").drop(["item_id", "timestamp"], axis=1)

num_users = len(full.groupby("user").size())
num_items = len(full.groupby("item").size())

ModelClass = LeastSquares
model_clean = ModelClass(train, test, num_items)


# The `.values` arrays below are indexed by item id, which presumes item ids
# are exactly 0..num_items-1 -- TODO confirm against the split script.
item_freqs = full.groupby("item").size().values
item_freqs_train = train.groupby("item").size()  # Series: `i in ...` tests the item index
item_rating_avgs = full.groupby("item").mean()["rating"].values
item_rating_stds = full.groupby("item").std()["rating"].values

# Candidate targets: items seen in train, rated by 2%-5% of users, mean rating < 3.
target_items_list = [
  i for i in range(num_items)
  if i in item_freqs_train
  and 0.02 * num_users <= item_freqs[i] <= 0.05 * num_users
  and item_rating_avgs[i] < 3
]

filler_prop = 0.05
filler_size = int(filler_prop * num_items)
filler_items_list = [x for x in range(num_items) if x in item_freqs_train]


target_items = [117, 1792, 2837, 3157, 2206, 3038, 1597, 3466, 1988, 3014]

# Baseline: both models trained on the clean data.
model_clean.alt_min()
overall_rmse = model_clean.evaluate()
results = []
algo = BiasedMF(30, bias=False, iterations=8)
algo.fit(train)
preds = predict(algo, test)


target_item = 1792
print("Target item {} freq: {}".format(target_item, len(train.loc[train["item"] == target_item])))
filler_items = random.sample([x for x in filler_items_list if x != target_item], k=filler_size)
target_rmse = model_clean.evaluate_item(target_item)

target_preds = preds.loc[preds["item"] == target_item]
algo_target_rmse = rmse(target_preds["prediction"], target_preds["rating"])
algo_overall_rmse = rmse(preds["prediction"], preds["rating"])

original_entry = [target_item, 0.0, round(target_rmse, 4), round(algo_target_rmse, 4), round(overall_rmse, 4), round(algo_overall_rmse, 4)]
results.append(original_entry)
print(original_entry)

# Build the injected profiles: each fake user rates the target item 5 and rates
# every filler item with a draw around that item's mean rating.
profile_prop = 0.03
profile_size = int(profile_prop * num_users)
attack_data = []
for i in range(profile_size):
  # Fake ids start right after the real ones.
  user = i + num_users
  # Boost target item
  attack_data.append([user, target_item, 5])

  for filler_item in filler_items:
    if np.isnan(item_rating_stds[filler_item]):
      # Std is NaN (e.g. a single rating): fall back to the mean.
      raw_rating = item_rating_avgs[filler_item]
    else:
      raw_rating = np.random.normal(item_rating_avgs[filler_item], item_rating_stds[filler_item])

    rating = int(round(np.clip(raw_rating, 1, 5)))
    attack_data.append([user, filler_item, rating])

# Explicit column names keep the schema even if no profile was generated.
attack_df = pd.DataFrame(attack_data, columns=["user", "item", "rating"])
# DataFrame.append was removed in pandas 2.0; concat with ignore_index yields
# the same rows with a fresh 0..n-1 index.
train_attacked = pd.concat([train, attack_df], ignore_index=True)
model_attack = ModelClass(train_attacked, test, num_items)
model_attack.alt_min()
overall_rmse_attacked = model_attack.evaluate()
target_rmse_attacked = model_attack.evaluate_item(target_item)

algo.fit(train_attacked)
preds_attacked = predict(algo, test)
target_preds_attacked = preds_attacked.loc[preds_attacked["item"] == target_item]
algo_target_rmse_attacked = rmse(target_preds_attacked["prediction"], target_preds_attacked["rating"])
algo_overall_rmse_attacked = rmse(preds_attacked["prediction"], preds_attacked["rating"])

# Share of the target item's training ratings that are injected.
attack_prop = profile_size / (profile_size + len(train.loc[train["item"] == target_item]))
entry = [target_item, round(attack_prop, 4),
         round(target_rmse_attacked, 4), round(algo_target_rmse_attacked, 4), round(overall_rmse_attacked, 4), round(algo_overall_rmse_attacked, 4)]
results.append(entry)
print(entry)
print("Target item factors:\n{}\nAlgo target item factors:\n{}\n".format(model_attack.V[target_item], algo.item_features_[algo.item_index_.get_indexer([target_item])[0]]))

Target item 1792 freq: 149
[1792, 0.0, 0.8024, 0.7915, 0.8723, 0.8686]
[1792, 0.5485, 1.0165, 1.1434, 0.8718, 0.8687]
Target item factors:
[-0.1521613  -0.03606174  0.12979539  1.356082   -0.59235444 -0.00714804
  0.03775234  0.41347589 -0.03170108  0.32937708  0.51282665  0.13173392
  0.11163167  1.27621528  0.62742199  0.3664579  -0.43748857 -1.62253097
  0.04562578 -0.16921736 -0.31062888 -0.34619092 -0.4928845  -0.04036666
  0.24252487 -0.37149621 -0.62765002  0.12051612 -0.83325069 -0.15468249]
Algo target item factors:
[ 0.47438498  1.41245031 -1.09438774 -0.30445571 -0.99825386 -0.43128842
 -0.40364215 -1.01318219 -0.042908   -0.13841504 -0.02481691  0.10074652
 -0.0111109  -0.06170742  0.12685186 -0.15165611 -0.42965833  0.10060357
  1.77670667 -0.17611242  0.3827099   0.39589877 -0.00928763 -0.45572155
  0.2116667  -0.0041291   0.26454575 -0.15091072  0.40220262 -0.01018236]



In [34]:
# Item-factor row that the attacked model learned for the target item.
model_v = model_attack.V[target_item, :]

In [35]:
# Row of the target item inside the fitted algo's item-feature matrix.
# NOTE(review): Index.get_indexer yields -1 for a label it does not contain,
# which would silently pick the last row -- confirm the item was in training.
target_pos = algo.item_index_.get_indexer([target_item])[0]
algo_v = algo.item_features_[target_pos]

In [38]:
# Test ratings of the target item: one contiguous slice of the item-sorted test array.
seg_start = model_attack.V_start_test[target_item]
seg_stop = model_attack.V_start_test[target_item + 1]
data = model_attack.test_v[seg_start:seg_stop]
users = data[:, 0]

# Factor rows of the users who rated the target item, from each model.
# NOTE(review): get_indexer yields -1 for ids missing from the index -- confirm
# every one of these users is known to `algo`.
algo_u = algo.user_features_[algo.user_index_.get_indexer(users)]
model_u = model_attack.U[users]

In [60]:
# Predicted ratings for the target item under each model ...
model_preds = model_u @ model_v
algo_preds = algo_u @ algo_v

# ... and the residuals against the observed test ratings (column 2 of `data`).
model_res = data[:, 2] - model_preds
algo_res = data[:, 2] - algo_preds

In [61]:
# RMSE of the custom model on the target item's test ratings.
np.sqrt(np.mean(np.square(model_res)))

1.1425585623090941

In [62]:
# RMSE of the lenskit model on the target item's test ratings.
np.sqrt(np.mean(np.square(algo_res)))

1.4573903502303955

In [66]:
# Refit BiasedMF on the clean training data with 10 iterations and report its
# overall test RMSE.
# NOTE: this rebinds `algo`, `preds` and `algo_overall_rmse`, which the attack
# experiment cell also assigns (there with iterations=8).
algo = BiasedMF(30, bias=False, iterations=10)
algo.fit(train)
preds = predict(algo, test)
algo_overall_rmse = rmse(preds["prediction"], preds["rating"])
algo_overall_rmse

0.8976247461335377

In [79]:
# Print the mean rating (from `item_rating_avgs`) of another set of candidate
# target items, one value per line.
target_items = [
  371, 1100, 2531, 40, 2818,
  1314, 1747, 3081, 2984, 871,
]
for item in target_items:
  print(item_rating_avgs[item])

3.2606635071090047
4.25278810408922
3.4355555555555557
3.958677685950413
3.5401785714285716
3.25531914893617
4.1940298507462686
4.387453874538745
3.661764705882353
3.9523809523809526


In [12]:
# Reload the training split and count ratings per user and per item.
train = pd.read_pickle("../data/ml-1m-split/train.pkl").drop(columns=["item_id", "timestamp"])
U_freqs = train.groupby("user").size().to_numpy()
V_freqs = train.groupby("item").size().to_numpy()

In [18]:
# Average number of training ratings per user.
U_freqs.mean()

132.4822847682119