In [4]:
import sys
import os
import torch
import cornac
import papermill as pm
import scrapbook as sb
import pandas as pd
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

print("System version: {}".format(sys.version))
print("PyTorch version: {}".format(torch.__version__))
print("Cornac version: {}".format(cornac.__version__))

System version: 3.8.13 (default, Mar 28 2022, 11:38:47) 
[GCC 7.5.0]
PyTorch version: 1.7.1
Cornac version: 1.14.2


In [5]:
# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# top k items to recommend
TOP_K = 10

# Model parameters
LATENT_DIM = 100
ENCODER_DIMS = [200]
ACT_FUNC = "relu"
LIKELIHOOD = "pois"
NUM_EPOCHS = 500
BATCH_SIZE = 64
LEARNING_RATE = 0.0001

In [6]:
train = pd.read_csv("/opt/ml/input/data/train/train_ratings.csv")

In [7]:
train = train[['user', 'item']]

In [8]:
train.columns = ['userID', 'itemID']

In [9]:
train['rating'] = 1
train.head()

Unnamed: 0,userID,itemID,rating
0,11,4643,1
1,11,170,1
2,11,531,1
3,11,616,1
4,11,2140,1


In [10]:
# train, test = python_random_split(data, 0.75)

In [11]:
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

Number of users: 31360
Number of items: 6807


In [12]:
bivae = cornac.models.BiVAECF(
    k=LATENT_DIM,
    encoder_structure=ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    n_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    seed=SEED,
    use_gpu=torch.cuda.is_available(),
    verbose=True
)

with Timer() as t:
    bivae.fit(train_set)
print("Took {} seconds for training.".format(t))

  0%|          | 0/500 [00:00<?, ?it/s]

Took 6892.7718 seconds for training.


In [13]:
with Timer() as t:
    all_predictions = predict_ranking(bivae, train, usercol='userID', itemcol='itemID', remove_seen=True)
print("Took {} seconds for prediction.".format(t))

Took 236.7133 seconds for prediction.


In [14]:
# eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
# eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
# eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
# eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)

# print("MAP:\t%f" % eval_map,
#       "NDCG:\t%f" % eval_ndcg,
#       "Precision@K:\t%f" % eval_precision,
#       "Recall@K:\t%f" % eval_recall, sep='\n')

In [15]:
users = list(train['userID'].unique())

In [16]:
data = all_predictions.values.tolist()

In [17]:
data.sort()

In [18]:
data

[[11.0, 2.0, 0.6813569664955139],
 [11.0, 3.0, 0.01855277456343174],
 [11.0, 4.0, 0.0002733002766035497],
 [11.0, 5.0, 0.028076065704226494],
 [11.0, 6.0, 0.04262160509824753],
 [11.0, 7.0, 0.013901784084737301],
 [11.0, 8.0, 0.0009838115656748414],
 [11.0, 9.0, 0.002243969589471817],
 [11.0, 10.0, 0.47316157817840576],
 [11.0, 11.0, 0.01649295538663864],
 [11.0, 12.0, 0.04432025924324989],
 [11.0, 13.0, 0.11136652529239655],
 [11.0, 14.0, 0.002582028741016984],
 [11.0, 15.0, 0.01269945316016674],
 [11.0, 16.0, 0.04723123461008072],
 [11.0, 17.0, 0.020828817039728165],
 [11.0, 18.0, 0.03599970042705536],
 [11.0, 20.0, 0.04320024326443672],
 [11.0, 21.0, 0.019389180466532707],
 [11.0, 22.0, 0.05924242362380028],
 [11.0, 23.0, 0.04645363613963127],
 [11.0, 24.0, 0.28145065903663635],
 [11.0, 25.0, 0.06326969712972641],
 [11.0, 26.0, 0.007497945334762335],
 [11.0, 27.0, 0.0016050022095441818],
 [11.0, 28.0, 0.0001132588877226226],
 [11.0, 29.0, 0.2475733458995819],
 [11.0, 30.0, 8.2948517

In [19]:
users.sort()

In [20]:
users

[11,
 14,
 18,
 25,
 31,
 35,
 43,
 50,
 58,
 60,
 61,
 65,
 72,
 77,
 82,
 85,
 90,
 91,
 96,
 98,
 99,
 102,
 116,
 121,
 124,
 129,
 132,
 133,
 135,
 136,
 147,
 152,
 154,
 155,
 162,
 163,
 168,
 175,
 182,
 189,
 190,
 201,
 204,
 205,
 206,
 208,
 209,
 211,
 213,
 215,
 218,
 220,
 232,
 237,
 239,
 241,
 248,
 252,
 254,
 258,
 264,
 266,
 271,
 279,
 284,
 285,
 294,
 304,
 312,
 313,
 316,
 317,
 318,
 337,
 340,
 342,
 348,
 351,
 359,
 361,
 367,
 370,
 372,
 375,
 379,
 383,
 387,
 388,
 394,
 395,
 398,
 407,
 409,
 413,
 419,
 421,
 422,
 425,
 427,
 430,
 431,
 436,
 440,
 442,
 448,
 451,
 455,
 457,
 459,
 462,
 466,
 469,
 471,
 482,
 485,
 486,
 492,
 500,
 503,
 504,
 505,
 508,
 512,
 520,
 521,
 532,
 534,
 535,
 540,
 546,
 548,
 557,
 563,
 571,
 572,
 573,
 577,
 578,
 586,
 588,
 598,
 604,
 609,
 612,
 614,
 617,
 619,
 626,
 631,
 633,
 637,
 649,
 650,
 661,
 662,
 664,
 672,
 689,
 692,
 693,
 694,
 700,
 710,
 724,
 729,
 735,
 737,
 738,
 739,
 741,
 

In [21]:
from collections import deque

In [22]:
data_ = deque(data)

In [23]:
submission = []
tmp = []
for user in users:
    while data_:
        if data_[0][0] != user:
            break
        else:
            interaction = data_.popleft()
            tmp.append(interaction)
    tmp.sort(key=lambda x: -x[2])
    submission += tmp[:10]
    tmp = []
    
print(len(submission))

313600


In [24]:
len(users)

31360

In [25]:
submission_bivae = pd.DataFrame(submission)

submission_bivae


Unnamed: 0,0,1,2
0,11.0,4370.0,0.842008
1,11.0,37386.0,0.803314
2,11.0,3986.0,0.774804
3,11.0,4886.0,0.765114
4,11.0,8861.0,0.761554
...,...,...,...
313595,138493.0,5349.0,0.736810
313596,138493.0,2762.0,0.690189
313597,138493.0,551.0,0.679910
313598,138493.0,589.0,0.654789


In [26]:
submission_bivae.columns = ['user', 'item', 'rating']

In [27]:
submission_bivae

Unnamed: 0,user,item,rating
0,11.0,4370.0,0.842008
1,11.0,37386.0,0.803314
2,11.0,3986.0,0.774804
3,11.0,4886.0,0.765114
4,11.0,8861.0,0.761554
...,...,...,...
313595,138493.0,5349.0,0.736810
313596,138493.0,2762.0,0.690189
313597,138493.0,551.0,0.679910
313598,138493.0,589.0,0.654789


In [28]:
submission_bivae = submission_bivae[['user', 'item']]

In [29]:
submission_bivae.to_csv('/opt/ml/level2-movie-recommendation-level2-recsys-15/임경연/CODE/Recommenders/BiVAE/submission_bivae_7.csv', index=False)