In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [33]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import pickle
import json
import h5py

def load_file(path, name):
    import importlib.util
    spec = importlib.util.spec_from_file_location(name, path)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod

plot = load_file("../../utils/plot.py", "plot")
radam = load_file("../../utils/optim.py", "radam")
models = load_file("../../utils/models.py", "models")

cuda = torch.device('cuda')
frame_size = 10
# https://drive.google.com/open?id=1kTyu05ZmtP2MA33J5hWdX8OyUYEDW4iI
movie_ref = pickle.load(open('../../data/infos_pca128.pytorch', 'rb'))
# on my system os error often pops up
# try to open the file explorer and copy paste the path again 
# https://drive.google.com/open?id=1pPf-7AmUVceVfgfmKEJ6ireEDKEJHw-7
f = h5py.File("../../data/test/test.hdf5", "r")

In [41]:
csv_ratings = pd.read_csv('../../data/ml-20m/ratings.csv')
users = csv_ratings.groupby(["userId"]).size()
users = users[users >= frame_size + 1][users < 5700]
users = users.sort_values(ascending=False).index
# del csv_ratings

In [35]:
def soft_update(net, target_net, soft_tau=1e-2):
    for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
            )

In [36]:
frame_size = 10
batch_size = 1000

In [37]:
user_bar = tqdm(total=len(users))
batch = []

g_idx = 0
batch = [[], [], []]

class UserDataset(Dataset):

    def __init__(self, users):
        self.users = users

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        idx = self.users[idx]
        group = f['u' + str(idx)]
        movies = torch.tensor(group['movies'][:])
        print(movies)
        rates = torch.tensor(group['ratings'][:])
        size = movies.size(0)
        return {'movies': movies, 'rates': rates, 'size': size}
        

HBox(children=(IntProgress(value=0, max=138491), HTML(value='')))

In [38]:
def padder(x):
    movies_t = []
    ratings_t = []
    sizes_t = []
    for i in range(len(x)):
        movies_t.append(x[i]['movies'])
        ratings_t.append(x[i]['rates'])
        sizes_t.append(x[i]['size'])
    movies_t = torch.nn.utils.rnn.pad_sequence(movies_t, batch_first=True)
    ratings_t = torch.nn.utils.rnn.pad_sequence(ratings_t, batch_first=True)
    sizes_t = torch.tensor(sizes_t)
    return {'movies': movies_t, 'rates': ratings_t, 'size': sizes_t}

In [39]:
user_dataset = UserDataset(users)
dataloader = DataLoader(user_dataset, batch_size=25,
                        shuffle=False, num_workers=1,collate_fn=padder)

In [40]:
for batch in tqdm(dataloader):
    print(batch['movies'])
    break

HBox(children=(IntProgress(value=0, max=5540), HTML(value='')))

tensor([     1,      2,      3,  ..., 118997, 119155, 129659],
       dtype=torch.int32)
tensor([     1,      2,      3,  ..., 119145, 120400, 122131],
       dtype=torch.int32)
tensor([     1,      2,      5,  ..., 128812, 129644, 129818],
       dtype=torch.int32)
tensor([     1,      2,      3,  ..., 129401, 129474, 130900],
       dtype=torch.int32)
tensor([    1,     2,     3,  ..., 89275, 89594, 89607], dtype=torch.int32)
tensor([     1,      2,      6,  ..., 128488, 129340, 130622],
       dtype=torch.int32)
tensor([   1,    2,    3,  ..., 6531, 6910, 6918], dtype=torch.int32)
tensor([     1,      2,      5,  ..., 129447, 129449, 131017],
       dtype=torch.int32)
tensor([    1,     6,    16,  ..., 92167, 92174, 92176], dtype=torch.int32)
tensor([    1,     2,     3,  ..., 80421, 80424, 80494], dtype=torch.int32)
tensor([     1,      4,      6,  ..., 128898, 129068, 130512],
       dtype=torch.int32)
tensor([     2,      6,      7,  ..., 109893, 110412, 110441],
       dtype=tor