In [1]:
import random

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from models import Recommender
from data_processing import get_context, pad_list, map_column, MASK, PAD


In [1]:
# Scratch arithmetic: the value is only displayed and never assigned to a name.
10 - 3 * 3

1

In [3]:
# Trained checkpoint restored further down.
model_path = "../recommender_models/recommender-v4.ckpt"

# Training interactions (CSV) and the movie title table (TSV).
movies_path = "../../data/train/titles.tsv"
data_csv_path = "../../data/train/rating_1.csv"

In [4]:
# Interaction log: plain comma-separated file.
data = pd.read_csv(filepath_or_buffer=data_csv_path)

# Title table: tab-separated and latin-1 encoded.
movies = pd.read_csv(
    filepath_or_buffer=movies_path,
    sep="\t",
    encoding="latin-1",
)

In [5]:
# Order the interaction log by timestamp, mutating `data` in place.
data.sort_values(by=["timestamp"], ascending=True, inplace=True)

In [6]:
# `map_column` hands back the frame together with the forward and inverse
# lookups built for the "movieId" column.
(data, mapping, inverse_mapping) = map_column(data, col_name="movieId")

# Per-user view of the interaction log.
grp_by_train = data.groupby("userId")

In [7]:
# Peek at the first five rows of the title table.
movies.head(n=5)

Unnamed: 0,item,title
0,318,"Shawshank Redemption, The (1994)"
1,2571,"Matrix, The (1999)"
2,2959,Fight Club (1999)
3,296,Pulp Fiction (1994)
4,356,Forrest Gump (1994)


In [8]:
# Only "item" actually changes name; "title" already has its final name.
movies = movies.rename(columns={"item": "movieId"})

In [9]:
# Number of rows in the title table.
movies.shape[0]

6807

In [10]:
# Number of entries in the movie-id mapping.
print(f"{len(mapping)}")

6807


In [11]:
# Raw movie id -> vocabulary index, restricted to ids that occur in `data`
# and are present in `mapping`.
movie_to_idx = {
    movie_id: mapping[movie_id]
    for movie_id in data["movieId"].unique().tolist()
    if movie_id in mapping
}

# Reverse lookup: vocabulary index -> raw movie id.
idx_to_movie = dict(zip(movie_to_idx.values(), movie_to_idx.keys()))

In [12]:
# Draw ten group keys (user ids) at random for a quick look.
random.sample(list(grp_by_train.groups.keys()), k=10)

[101021, 82896, 83598, 24295, 131787, 134250, 60250, 100890, 97195, 18932]

In [13]:
# Rebuild the network and restore the trained weights from `model_path`.
model = Recommender(
    vocab_size=len(mapping) + 2,  # two ids beyond the mapping; presumably PAD and MASK - TODO confirm
    lr=1e-4,
    dropout=0.3,
)
# Inference mode: disables dropout.
model.eval()
# `map_location="cpu"` lets a checkpoint saved on a GPU load on a CPU-only
# machine. The model is never moved to another device in this notebook and
# `predict` converts the scores with `.numpy()`, so CPU is the right target.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load(model_path, map_location="cpu")["state_dict"])

<All keys matched successfully>

In [16]:
# Scratch loop: print 10 down to 1, one number per line.
for i in reversed(range(1, 11)):
    print(i)

10
9
8
7
6
5
4
3
2
1


In [18]:
from collections import Counter
result_collector = list()


def predict(list_movies, model, len_saw, max_len=1000, n_splits=10):
    """Recommend up to ``n_splits`` distinct movies for one viewing history.

    The history is encoded with ``movie_to_idx`` and left-padded with ``PAD``
    so that the model input is ``max_len`` tokens long, including one ``MASK``
    token. ``MASK`` is inserted at ``n_splits`` different offsets; for each
    offset the scores at the ``MASK`` position are ranked and the best id that
    is neither part of the input nor already selected is kept.

    Args:
        list_movies: Iterable of raw movie ids; each must be a key of
            ``movie_to_idx``.
        model: Callable taking a ``(1, seq_len)`` long tensor and returning a
            tensor indexable as ``[0, position]`` with one score per
            vocabulary id.
        len_saw: Number of items in ``list_movies``.
        max_len: Total length of the model input, ``MASK`` included.
        n_splits: Number of ``MASK`` placements, i.e. the maximum number of
            recommendations returned.

    Returns:
        Raw movie ids (looked up through ``idx_to_movie``) in selection order.
        The selected vocabulary ids are also left in the module-level
        ``result_collector``.

    Raises:
        KeyError: If an entry of ``list_movies`` is missing from
            ``movie_to_idx``.
    """
    history = [movie_to_idx[a] for a in list_movies]

    # One slot is reserved for MASK. Without this cap a history of `max_len`
    # items makes the pad count negative and the input one token too long.
    budget = max(max_len - 1, 0)
    if len(history) > budget:
        history = history[len(history) - budget:]
    len_saw = min(len_saw, len(history))

    space = len_saw // max(n_splits, 1)
    pad_count = max_len - len(history) - 1

    result_collector.clear()

    for i in range(n_splits):
        select = len_saw - i * space
        # NOTE(review): the tail of the history goes before MASK and the head
        # after it, exactly as in the previous implementation. Confirm that
        # this rotation is intended.
        head = [PAD] * pad_count + history[select:]
        ids = head + [MASK] + history[:select]
        mask_pos = len(head)

        src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

        with torch.no_grad():
            prediction = model(src)

        # Read the scores at the MASK slot itself. MASK is never the last
        # token here (select > 0), so position -1 would score an ordinary
        # history item instead of the masked one.
        masked_pred = prediction[0, mask_pos].detach().cpu().numpy()

        # Set membership keeps the filter linear in the vocabulary size.
        excluded = set(ids)
        excluded.update(result_collector)

        ranked_ids = np.argsort(masked_pred)[::-1].tolist()
        best = next((a for a in ranked_ids if a not in excluded), None)
        if best is not None:
            result_collector.append(best)

    return [idx_to_movie[a] for a in result_collector if a in idx_to_movie]


In [19]:
rating_df = pd.read_csv('../../data/train/train_ratings.csv')
users = rating_df["user"].unique()

# `predict` builds 1000-token inputs that include one MASK token, so at most
# 999 history items fit.
MAX_HISTORY = 999

result = []

# Group once instead of re-filtering the whole frame for every user.
# sort=False keeps users in first-appearance order, matching `unique()`.
for i, (user, saw_movies) in enumerate(rating_df.groupby("user", sort=False)["item"]):
    if len(saw_movies) > MAX_HISTORY:
        # A seeded sample keeps the run reproducible. sort_index() restores
        # the file's row order, so long histories reach the model in the same
        # order as short ones.
        saw_movies = saw_movies.sample(n=MAX_HISTORY, random_state=0).sort_index()

    pred = predict(saw_movies, model, len(saw_movies))
    result.extend((user, item) for item in pred)

    # Report progress only after the user has actually been processed.
    print(str(i) + "done")

0done
1done


KeyboardInterrupt: 

In [17]:
# Persist the (user, item) recommendation pairs as the submission file.
pd.DataFrame(data=result, columns=["user", "item"]).to_csv(
    path_or_buf="../../output/burt4Rec_10split_submission.csv",
    index=False,
)