In [1]:
import numpy as np, pandas as pd

# Load the BookCrossing ratings as integer (UserId, ItemId, Rating) triples.
# header=0 tells pandas the first row of the file is a header, and `names` replaces it,
# so that row never enters the data and does not have to be dropped afterwards.
ratings = pd.read_csv(
    'models/data/bookcrossing/ratings.csv',
    header=0,
    names=['UserId', 'ItemId', 'Rating'],
).astype(int)  # explicit cast kept so all three columns are guaranteed to be integers

In [2]:
# Rename the Rating column to Count; the hpfrec fit log further down shows the library
# reading a 'Count' column. Only UserId/ItemId/Rating are loaded above, so there is no
# Timestamp column to remove here.
ratings = ratings.rename({'Rating': 'Count'}, axis='columns')

In [3]:
from sklearn.model_selection import train_test_split

# 80/20 random split with a fixed seed so the partition is the same on every run.
train, test = train_test_split(ratings, test_size=0.2, random_state=12)

# Keep only the test rows whose user AND item both occur somewhere in the training split.
seen_users, seen_items = set(train.UserId), set(train.ItemId)
user_is_known = test.UserId.isin(seen_users)
item_is_known = test.ItemId.isin(seen_items)
test = test.loc[user_is_known & item_is_known].reset_index(drop=True)

# Drop the temporaries so they do not linger in the kernel namespace.
del seen_users, seen_items, user_is_known, item_is_known

# Displayed as the cell output: (rows, columns) of the filtered test set.
test.shape

(163094, 3)

In [4]:
%%time
from hpfrec import HPF

recommender = HPF(k=50, full_llk=False, random_seed=123,
                  check_every=10, maxiter=150, reindex=True,
                  allow_inconsistent_math=False,
                  save_folder='models/parameters_dump_bc_50/')
recommender.fit(train)

**********************************
Hierarchical Poisson Factorization
**********************************


Saving user and item mappings...



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.input_df["UserId"], self.user_mapping_ = pd.factorize(self.input_df["UserId"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.input_df["ItemId"], self.item_mapping_ = pd.factorize(self.input_df["ItemId"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.input_df['Count'] = self.input_d

Number of users: 59842
Number of items: 129060
Latent factors to use: 50

Initializing parameters...
Allocating Phi matrix...
Initializing optimization procedure...
Iteration 10 | train llk: -12653290 | train rmse: 7.7435
Iteration 20 | train llk: -11417208 | train rmse: 7.6831
Iteration 30 | train llk: -11351700 | train rmse: 7.6799
Iteration 40 | train llk: -11334521 | train rmse: 7.6803
Iteration 50 | train llk: -11325954 | train rmse: 7.6805
Iteration 60 | train llk: -11321549 | train rmse: 7.6803
Iteration 70 | train llk: -11318207 | train rmse: 7.6802
Iteration 80 | train llk: -11316044 | train rmse: 7.6802
Iteration 90 | train llk: -11314566 | train rmse: 7.6801
Iteration 100 | train llk: -11313452 | train rmse: 7.6801
Iteration 110 | train llk: -11312549 | train rmse: 7.6800
Iteration 120 | train llk: -11312130 | train rmse: 7.6800
Iteration 130 | train llk: -11311544 | train rmse: 7.6800
Iteration 140 | train llk: -11311011 | train rmse: 7.6800
Iteration 150 | train llk: -1131

<hpfrec.HPF at 0x16c0b91d0>

In [5]:
# Seeded generator, so the same evaluation users are drawn on every run.
rng = np.random.default_rng(seed=12)

# Draw 1200 distinct user IDs (no replacement) from the users present in the filtered test set.
sampled_users = rng.choice(a=test['UserId'].unique(), size=1200, replace=False)

In [23]:
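# Optional override: uncomment to score every user in `test` instead of the
# 1200-user random sample drawn above.
#sampled_users = test.UserId.unique()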

In [6]:
# Every distinct item ID in the full ratings frame (taken from `ratings`, i.e. before the
# train/test split, so it is not restricted to items seen in training).
items = pd.unique(ratings['ItemId'])

In [7]:
# Score every (sampled user, item) pair.
# Result: 2-D array with one row per entry of `sampled_users` and one column per entry of
# `items`, in the same order as those arrays.
#
# predict() is called once per user with aligned user/item arrays instead of once per
# (user, item) pair: this avoids len(sampled_users) * len(items) Python-level calls and
# the equally large nested Python list the per-pair loop had to build.
# NOTE(review): values should match the per-pair loop up to floating-point rounding;
# confirm that consumers of the saved .npy files do not depend on an exact dtype.
n_items = len(items)
predictions = np.array([
    recommender.predict(user=np.full(n_items, user), item=items)
    for user in sampled_users
])

# Persist the score matrix together with the user IDs that label its rows.
#np.save("models/data/predictions/poisson_predictions_50.npy", predictions)
#np.save("models/data/predictions/poisson_test_users_50.npy", sampled_users)
np.save("poisson_predictions_bc_50.npy", predictions)
np.save("poisson_test_users_bc_50.npy", sampled_users)