# BiVAE

https://github.com/microsoft/recommenders/blob/main/examples/02_model_collaborative_filtering/cornac_bivae_deep_dive.ipynb

In [48]:
import sys
import os
import torch
import cornac
import papermill as pm
import scrapbook as sb
import pandas as pd
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED
from recommenders.datasets.pandas_df_utils import filter_by, negative_feedback_sampler

from tqdm import tqdm

print("System version: {}".format(sys.version))
print("PyTorch version: {}".format(torch.__version__))
print("Cornac version: {}".format(cornac.__version__))

System version: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
PyTorch version: 1.7.1
Cornac version: 1.14.2


## Hyperparmaters

In [2]:
# top k items to recommend
TOP_K = 10

# Model parameters
LATENT_DIM = 50
ENCODER_DIMS = [100]
ACT_FUNC = "tanh"
LIKELIHOOD = "pois"
NUM_EPOCHS = 10
BATCH_SIZE = 256
LEARNING_RATE = 0.001

# BiVAE

## Import Data

In [3]:
COL_USER = "userID"
COL_ITEM = "itemID"
COL_RATING = "rating"
COL_PREDICTION = "rating"
COL_TIMESTAMP = "timestamp"

root_dir = '/opt/ml/input/data/train/'
df = pd.read_csv(os.path.join(root_dir,'train_ratings.csv'), names=[COL_USER, COL_ITEM, COL_TIMESTAMP], header=0)
df.head()

Unnamed: 0,userID,itemID,timestamp
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563


In [4]:
df['rating'] = 1

In [5]:
# Obtain both usercount and itemcount after filtering
usercount = df[['userID']].groupby('userID', as_index = False).size()
itemcount = df[['itemID']].groupby('itemID', as_index = False).size()

# Compute sparsity after filtering
sparsity = 1. * df.shape[0] / (usercount.shape[0] * itemcount.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (df.shape[0], usercount.shape[0], itemcount.shape[0], sparsity * 100))

After filtering, there are 5154471 watching events from 31360 users and 6807 movies (sparsity: 2.415%)


In [7]:
train, test = python_random_split(df, 0.75)

ValueError: Split ratio has to be between 0 and 1

## Cornac Dataset

In [8]:
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

Number of users: 31360
Number of items: 6807


## Train

In [9]:
bivae = cornac.models.BiVAECF(
    k=LATENT_DIM,
    encoder_structure=ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    n_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    seed=SEED,
    use_gpu=torch.cuda.is_available(),
    verbose=True
)

with Timer() as t:
    bivae.fit(train_set)
print("Took {} seconds for training.".format(t))

  0%|          | 0/10 [00:00<?, ?it/s]

Took 40.6273 seconds for training.


## Prediction and Evaluation

In [11]:
with Timer() as t:
    all_predictions = predict_ranking(bivae, train, usercol='userID', itemcol='itemID', remove_seen=True)
print("Took {} seconds for prediction.".format(t))

Took 245.9086 seconds for prediction.


In [None]:
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
print("Recall@K:\t%f" % eval_recall)

# Submission

In [12]:
all_predictions

Unnamed: 0,userID,itemID,prediction
5154471,11,8961,0.432683
5154472,11,1396,0.064274
5154473,11,471,0.033174
5154474,11,1042,0.127116
5154475,11,1947,0.036844
...,...,...,...
213467515,138493,7753,0.006361
213467516,138493,93422,0.003947
213467517,138493,6519,0.008478
213467518,138493,8830,0.010108


In [20]:
unique_users = list(all_predictions['userID'].unique())

In [73]:
submission_df = pd.DataFrame(columns=["user","item"])

In [74]:
for user in tqdm(unique_users):
    top_k = all_predictions[all_predictions['userID'] == user].sort_values('prediction', ascending=False)[:10][["userID", "itemID"]]
    top_k.rename(columns = {"userID": "user", "itemID": "item"}, inplace=True)
    submission_df = pd.concat([submission_df, top_k], ignore_index=True)

100%|███████████████████████████████████████████████████████████████████████████████████████| 31360/31360 [2:52:55<00:00,  3.02it/s]


In [75]:
submission_df

Unnamed: 0,user,item
0,11,40815
1,11,4886
2,11,5989
3,11,8360
4,11,5418
...,...,...
313595,138493,2762
313596,138493,1704
313597,138493,8961
313598,138493,1197


In [76]:
# submission_df = all_predictions[['userID', 'itemID']]
# submission_df.rename(columns = {"userID": "user", "itemID": "item"}, inplace=True)
submission_df.to_csv('/opt/ml/input/submission.csv', index=False)

In [None]:
submission_df