In [1]:
import sys
import os
import torch
import cornac
import papermill as pm
import scrapbook as sb
import pandas as pd
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

print("System version: {}".format(sys.version))
print("PyTorch version: {}".format(torch.__version__))
print("Cornac version: {}".format(cornac.__version__))

System version: 3.8.13 (default, Mar 28 2022, 11:38:47) 
[GCC 7.5.0]
PyTorch version: 1.7.1
Cornac version: 1.14.2


In [2]:
# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# top k items to recommend
TOP_K = 10

# Model parameters
LATENT_DIM = 50
ENCODER_DIMS = [100]
ACT_FUNC = "tanh"
LIKELIHOOD = "pois"
NUM_EPOCHS = 500
BATCH_SIZE = 128
LEARNING_RATE = 0.001

In [4]:
data = pd.read_csv("/opt/ml/input/data/train/train_ratings.csv")

In [6]:
data = data[['user', 'item']]

In [8]:
data.columns = ['userID', 'itemID']

In [9]:
data['rating'] = 1
data.head()

Unnamed: 0,userID,itemID,rating
0,11,4643,1
1,11,170,1
2,11,531,1
3,11,616,1
4,11,2140,1


In [10]:
train, test = python_random_split(data, 0.75)

In [11]:
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

Number of users: 31360
Number of items: 6807


In [12]:
bivae = cornac.models.BiVAECF(
    k=LATENT_DIM,
    encoder_structure=ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    n_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    seed=SEED,
    use_gpu=torch.cuda.is_available(),
    verbose=True
)

with Timer() as t:
    bivae.fit(train_set)
print("Took {} seconds for training.".format(t))

  0%|          | 0/500 [00:00<?, ?it/s]

Took 3502.8108 seconds for training.


In [13]:
with Timer() as t:
    all_predictions = predict_ranking(bivae, test, usercol='userID', itemcol='itemID', remove_seen=True)
print("Took {} seconds for prediction.".format(t))

Took 242.9335 seconds for prediction.


In [14]:
eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.000000
NDCG:	0.000000
Precision@K:	0.000000
Recall@K:	0.000000


In [16]:
all_predictions[all_predictions['userID'] == 69029]

Unnamed: 0,userID,itemID,prediction
1288618,69029,5902,0.473038
1288619,69029,4896,0.158540
1288620,69029,253,0.157383
1288621,69029,924,0.595679
1288622,69029,33166,0.285728
...,...,...,...
1295362,69029,49276,0.000008
1295363,69029,2170,0.000018
1295364,69029,43,0.000241
1295365,69029,27744,0.000087


In [22]:
users = list(data['userID'].unique())

In [38]:
submission = pd.DataFrame()

for user in users:
    interaction = all_predictions[all_predictions['userID']==user]
    interaction = interaction.sort_values(by=['prediction'], ascending=False).iloc[:10]
    submission = pd.concat([submission, interaction])
    
    

print(submission)

KeyboardInterrupt: 

In [39]:
submission

Unnamed: 0,userID,itemID,prediction
17053044,11,6365,0.713736
17052838,11,4993,0.675530
17052811,11,6502,0.667166
17052971,11,780,0.656038
17053410,11,5952,0.655307
...,...,...,...
180548767,102813,3535,0.414706
180549106,102813,2329,0.413652
180549414,102813,8957,0.383607
180548923,102813,48394,0.375871


In [41]:
pred_userid = all_predictions['userID'].tolist()

In [43]:
pred_user_set = set(pred_userid)

In [44]:
len(pred_user_set)

31360

In [45]:
len(users)

31360

In [46]:
submission

Unnamed: 0,userID,itemID,prediction
17053044,11,6365,0.713736
17052838,11,4993,0.675530
17052811,11,6502,0.667166
17052971,11,780,0.656038
17053410,11,5952,0.655307
...,...,...,...
180548767,102813,3535,0.414706
180549106,102813,2329,0.413652
180549414,102813,8957,0.383607
180548923,102813,48394,0.375871


In [51]:
all_predictions.iloc[:300]

Unnamed: 0,userID,itemID,prediction
1288618,69029,5902,0.473038
1288619,69029,4896,0.158540
1288620,69029,253,0.157383
1288621,69029,924,0.595679
1288622,69029,33166,0.285728
...,...,...,...
1288913,69029,6539,0.452225
1288914,69029,6122,0.001120
1288915,69029,3147,0.398187
1288916,69029,2524,0.000221
