In [1]:
import sys
import os
import torch
import cornac
import papermill as pm
import scrapbook as sb
import pandas as pd
from sklearn.model_selection import train_test_split
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

# Report the interpreter and library versions this notebook was run with.
version_report = (
    ("System version: {}", sys.version),
    ("PyTorch version: {}", torch.__version__),
    ("Cornac version: {}", cornac.__version__),
)
for template, version in version_report:
    print(template.format(version))

System version: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
PyTorch version: 1.7.1
Cornac version: 1.14.2


In [2]:
# MovieLens dataset size selector: one of 100k, 1m, 10m, 20m.
# NOTE(review): no cell visible in this part of the notebook reads this value;
# the ratings are loaded from a local CSV instead -- confirm it is still needed.
MOVIELENS_DATA_SIZE = "100k"

# Number of top items to recommend per user.
TOP_K = 10

# Hyperparameters forwarded to cornac.models.BiVAECF in the training cell.
LATENT_DIM = 50        # -> k
ENCODER_DIMS = [100]   # -> encoder_structure
ACT_FUNC = "tanh"      # -> act_fn
LIKELIHOOD = "pois"    # -> likelihood
NUM_EPOCHS = 500       # -> n_epochs
BATCH_SIZE = 128       # -> batch_size
LEARNING_RATE = 0.001  # -> learning_rate

In [70]:
# Directory holding the training data, and the ratings CSV inside it.
DATA_PATH = '/opt/ml/input/data/train/'
ratings_csv = os.path.join(DATA_PATH, 'train_ratings.csv')

# Read the raw interaction log into a DataFrame.
ratings = pd.read_csv(ratings_csv)

In [72]:
# Display the loaded ratings frame (the saved output is truncated by pandas).
ratings

Unnamed: 0,user,item,time,rate
0,11,4643,1230782529,1.0
1,11,170,1230782534,1.0
2,11,531,1230782539,1.0
3,11,616,1230782542,1.0
4,11,2140,1230782563,1.0
...,...,...,...,...
5154466,138493,44022,1260209449,1.0
5154467,138493,4958,1260209482,1.0
5154468,138493,68319,1260209720,1.0
5154469,138493,40819,1260209726,1.0


In [73]:
# Drop the timestamp column. errors='ignore' keeps the cell safe to re-run
# after 'time' has already been removed (otherwise a second run raises KeyError).
ratings = ratings.drop(columns='time', errors='ignore')

# Keep only the first 3000 rows. The explicit copy detaches the subset from
# the full frame, so the column assignment in the following cell writes to an
# independent DataFrame instead of a slice (no SettingWithCopyWarning).
# NOTE(review): the saved outputs further down (e.g. 213467520 unstacked rows)
# appear to come from a run without this truncation -- re-run to refresh them.
ratings = ratings[:3000].copy()

In [74]:
# Set every row's 'rate' to 1.0. assign() returns a new frame rather than
# writing into the row slice created by the previous cell, which avoids
# pandas' SettingWithCopyWarning.
ratings = ratings.assign(rate=1.0)

In [75]:
# Build an item x user matrix of 'rate' values (items as rows, users as
# columns); pairs with no interaction are NaN at this point.
table = ratings.pivot_table(
    values='rate',
    index='item',
    columns='user',
)

In [76]:
# Display the item x user matrix (NaN marks pairs with no rating).
table

user,11,14,18,25,31,35,43,50,58,60,...,138459,138461,138470,138471,138472,138473,138475,138486,138492,138493
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,1.0,,,,,,,1.0,,...,,,,1.0,,1.0,,1.0,,1.0
2,,,,,,,,,,,...,,,,,,,,,,1.0
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118700,,,,,,,,,,,...,,,,,,,,,,
118900,,,,,,,,,,,...,,,,,,,,,,
118997,,,,,,,,,,,...,,,,,,,,,,
119141,,,,,,,,,,,...,,,,,,,,,,


In [77]:
# Replace missing (user, item) entries with an explicit 0.0 rating.
table = table.fillna(0.0)

In [78]:
# Preview the long format: one value per (user, item) pair, including the
# zero-filled pairs.
table.unstack()

user    item  
11      1         1.0
        2         0.0
        3         0.0
        4         0.0
        5         0.0
                 ... 
138493  118700    0.0
        118900    0.0
        118997    0.0
        119141    0.0
        119145    0.0
Length: 213467520, dtype: float64

In [79]:
# Flatten the matrix into long format with columns user, item, rating.
# NOTE(review): this materialises every (user, item) pair, zeros included
# (the saved output shows 213,467,520 rows) -- confirm the dense frame is
# intended before running on the full data.
data = table.unstack().reset_index(name='rating')

In [80]:
# Display the long-format (user, item, rating) frame.
data

Unnamed: 0,user,item,rating
0,11,1,1.0
1,11,2,0.0
2,11,3,0.0
3,11,4,0.0
4,11,5,0.0
...,...,...,...
213467515,138493,118700,0.0
213467516,138493,118900,0.0
213467517,138493,118997,0.0
213467518,138493,119141,0.0


In [45]:
# 80/20 train/test split, stratified on the rating value so both parts keep
# the same proportion of each rating. random_state pins the shuffle so the
# split is identical on every run.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data['rating'],
    random_state=SEED,
)

In [46]:
# Build the cornac dataset from the training rows; itertuples(index=False)
# yields one (user, item, rating) tuple per row. The shared SEED constant
# replaces the literal 42 so this cell matches the seed given to the model.
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

print(f'number of users : {train_set.num_users}')
print(f'number of items : {train_set.num_items}')

number of users : 23
number of items : 1495


In [64]:
# Show the dataset class. The cell previously ended in a dangling attribute
# access (`cornac.data.`), which is a SyntaxError and stops Run All.
cornac.data.Dataset

cornac.data.dataset.Dataset

In [None]:
# Gather the BiVAECF constructor arguments in one place, then build the model.
bivae_params = dict(
    k=LATENT_DIM,
    encoder_structure=ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    n_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    seed=SEED,
    use_gpu=torch.cuda.is_available(),  # use the GPU only when one is present
    verbose=True,
)
bivae = cornac.models.BiVAECF(**bivae_params)

# Fit on the training set and report how long training took.
with Timer() as t:
    bivae.fit(train_set)
print("Took {} seconds for training.".format(t))