In [1]:
import random
import numpy as np
import pandas as pd
import scipy.sparse as sp
from lightfm import LightFM

from sklearn.metrics import ndcg_score
from sklearn.metrics.pairwise import cosine_similarity

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm

### Воспользуемся датасетом MovieLens. Мы будем реализовывать методы из статьи Neural Collaborative Filtering, а этот датасет там используется.

In [2]:
ratings = pd.read_csv('./ml-1m/ratings.dat', delimiter='::', header=None, 
        names=['user_id', 'movie_id', 'rating', 'timestamp'], 
        usecols=['user_id', 'movie_id', 'rating'], engine='python')

In [3]:
movie_info = pd.read_csv('./ml-1m/movies.dat', delimiter='::', header=None, 
        names=['movie_id', 'name', 'category'], engine='python')

In [4]:
ratings.head(4)

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4


In [5]:
movie_info.head(1)

Unnamed: 0,movie_id,name,category
0,1,Toy Story (1995),Animation|Children's|Comedy


### Будем работать с implicit данными. Считаем, что позитивная оценка - это оценка >= 4.

In [6]:
ratings = ratings.loc[(ratings['rating'] >= 4)]
ratings = ratings.drop(columns=['rating'])

### Удалим пользователей с малым числом просмотров

In [7]:
ratings.user_id.value_counts()

4277    1435
4169    1210
1680    1108
3032     861
5831     826
        ... 
4192       5
4295       4
4636       2
4349       2
5850       1
Name: user_id, Length: 6038, dtype: int64

In [8]:
ratings = ratings[ratings.user_id != 5850]

### Построим соответствие между пользователями/фильмами и номерами их эмбеддингов

In [9]:
emb2user = np.unique(ratings.user_id)
emb2movie = np.unique(ratings.movie_id)
user2emb = {user: emb for emb, user in enumerate(emb2user)}
movie2emb = {movie: emb for emb, movie in enumerate(emb2movie)}

### Заменим id фильмов и пользователей на номера их эмбеддингов

In [10]:
ratings['user_emb'] = ratings['user_id'].map(user2emb.get)
ratings['movie_emb'] = ratings['movie_id'].map(movie2emb.get)
ratings = ratings.drop(columns=['user_id', 'movie_id'])

In [11]:
ratings

Unnamed: 0,user_emb,movie_emb
0,0,1039
3,0,3027
4,0,2053
6,0,1130
7,0,2476
...,...,...
1000202,6036,962
1000205,6036,967
1000206,6036,536
1000207,6036,969


### Поделим на train/test. Для каждого пользователя в тесте будет 1 последний оцененный фильм и 99 случайных фильмов, которые он не смотрел.

In [12]:
train_users = []
train_movies = []

test_pos = dict()
test_neg = dict()
test = dict()

unseen = dict()

all_movies = set(ratings.movie_emb)

for user_emb in np.unique(ratings.user_emb):
  user_views = ratings[ratings.user_emb == user_emb].movie_emb
  
  train_users.extend([user_emb] * len(user_views.iloc[:-1]))
  train_movies.extend(user_views.iloc[:-1])

  test_pos[user_emb] = user_views.iloc[-1]
  
  unseen_movies = np.array(list(all_movies - set(user_views)), dtype=int)
  test_neg[user_emb] = np.random.choice(unseen_movies, 99, replace=False)

  unseen[user_emb] = unseen_movies


for user_emb in test_pos:
  test[user_emb] = [test_pos[user_emb]]
  test[user_emb].extend(test_neg[user_emb])

### Удобнее работать с sparse матрицами. Преобразуем данные в CSR.

In [13]:
train_coo = sp.coo_matrix((np.ones(len(train_users)), (train_users, train_movies)))
train_csr = train_coo.tocsr()

### Натренируем WARP

In [14]:
warp = LightFM(loss='warp', no_components=64)
warp.fit(train_csr, epochs=60, num_threads=2, verbose=False)

<lightfm.lightfm.LightFM at 0x7f9552490588>

In [15]:
class WARP:
  def __init__(self,  model):
    self.model = model

    self.user_embs = warp.user_embeddings
    self.user_biases = warp.user_biases

    self.movie_embs = warp.item_embeddings
    self.movie_biases = warp.item_biases

  def recommend(self, user_emb, n=10):
    ratings = self.movie_embs[unseen[user_emb]] @ self.user_embs[user_emb] + self.user_biases[user_emb] + self.movie_biases[unseen[user_emb]]
    argsort = ratings.argsort()[-n:][::-1]
    return unseen[user_emb][argsort]
  
  def similars(self, movie_emb, n=10):
    distances = cosine_similarity(self.movie_embs[movie_emb, :].reshape(1, -1), self.movie_embs)
    most_similar = distances[0].argsort()[-n:][::-1]
    return most_similar

  def predict(self, user_embs, movie_embs):
    return self.model.predict(user_embs, movie_embs)

warp_model = WARP(warp)

### Посмотрим на WARP симилары. Будем смотреть на фильмы, которые похожи на Toy Story

In [16]:
movie_info.iloc[0]

movie_id                              1
name                   Toy Story (1995)
category    Animation|Children's|Comedy
Name: 0, dtype: object

In [17]:
get_similars = lambda movie_emb, model : [movie_info[movie_info["movie_id"] == emb2movie[x]]["name"].to_string() for x in model.similars(movie_emb)]

In [18]:
get_similars(movie2emb[1], warp_model)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '584    Aladdin (1992)',
 "2286    Bug's Life, A (1998)",
 '2225    Antz (1998)',
 '2252    Pleasantville (1998)',
 '360    Lion King, The (1994)',
 '33    Babe (1995)',
 '591    Beauty and the Beast (1991)',
 '1132    Wrong Trousers, The (1993)']

Симилары хорошие

### Посмотрим на WARP рекомендации для пользователя из ДЗ 1. Он любит фантастику и боевики. 

In [19]:
get_user_history = lambda user_emb, ratings : [movie_info[movie_info["movie_id"] == emb2movie[movie_emb]]["name"].to_string() 
                                                         for movie_emb in ratings[ratings.user_emb == user_emb].movie_emb]

In [20]:
get_user_history(user2emb[4], ratings)

['3399    Hustler, The (1961)',
 '2882    Fistful of Dollars, A (1964)',
 '1196    Alien (1979)',
 '1023    Die Hard (1988)',
 '257    Star Wars: Episode IV - A New Hope (1977)',
 '1959    Saving Private Ryan (1998)',
 '476    Jurassic Park (1993)',
 '1180    Raiders of the Lost Ark (1981)',
 '1885    Rocky (1976)',
 '1081    E.T. the Extra-Terrestrial (1982)',
 '3349    Thelma & Louise (1991)',
 '3633    Mad Max (1979)',
 '2297    King Kong (1933)',
 '1366    Jaws (1975)',
 '1183    Good, The Bad and The Ugly, The (1966)',
 '2623    Run Lola Run (Lola rennt) (1998)',
 '2878    Goldfinger (1964)',
 '1220    Terminator, The (1984)']

In [21]:
get_recommendations = lambda user_emb, model : [movie_info[movie_info["movie_id"] == emb2movie[movie_emb]]["name"].to_string() 
                                                for movie_emb in model.recommend(user_emb)]

In [22]:
get_recommendations(user2emb[4], warp_model)

['847    Godfather, The (1972)',
 '585    Terminator 2: Judgment Day (1991)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '1284    Butch Cassidy and the Sundance Kid (1969)',
 '1203    Godfather: Part II, The (1974)',
 '907    Wizard of Oz, The (1939)',
 '1271    Indiana Jones and the Last Crusade (1989)',
 '108    Braveheart (1995)',
 '957    African Queen, The (1951)',
 '453    Fugitive, The (1993)']

Рекомендации хорошие

### Посчитаем метрики WARP на тесте

In [23]:
def compute_metrics(model, test):
  hit_ratio = []
  ndcg = []

  y_true = np.array([1] + [0] * 99).reshape(1, -1)

  for user_emb in test:
    y_pred = model.predict(np.array([user_emb] * 100), test[user_emb]).reshape(1, -1)

    ndcg.append(ndcg_score(y_true, y_pred, k=10)) 
    hit_ratio.append(0 in y_pred.argsort()[-10:])

  print(f'NDCG@10: {np.mean(ndcg)}')
  print(f'HR@10: {np.mean(hit_ratio)}')

In [24]:
compute_metrics(warp_model, test)

NDCG@10: 0.4249864025233029
HR@10: 1.0


### Построим NCF

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Для начала реализуем MLP

In [26]:
class MLP(nn.Module):
    def __init__(self, n_users, n_movies, n_components, hidden_size):
        super().__init__()

        self.user_embs = nn.Embedding(num_embeddings=n_users, embedding_dim=n_components)
        self.movie_embs = nn.Embedding(num_embeddings=n_movies, embedding_dim=n_components)

        self.mlp = nn.Sequential(
            nn.Linear(2 * n_components, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU()
        )

        self.clf = nn.Linear(hidden_size, 1)

    def forward(self, user_emb, movie_emb):
        return self.clf(self.mlp(torch.cat([self.user_embs(user_emb), self.movie_embs(movie_emb)], dim=-1)))

    def part_forward(self, user_emb, movie_emb):
        return self.mlp(torch.cat([self.user_embs(user_emb), self.movie_embs(movie_emb)], dim=-1))

    def compute_loss(self, batch):
        user_embs, movie_embs, y_true = (item.to(device) for item in batch)
        y_pred = self(user_embs, movie_embs)
        return F.binary_cross_entropy_with_logits(y_pred, y_true.view(-1, 1))

### Подготовим данные

In [27]:
class MoviesDataset(Dataset):
    def __init__(self, users, movies):
        self.users = users
        self.movies = movies

        self.unique_users = np.unique(users)

    def __getitem__(self, idx):
        if idx < len(self.users):
            return self.users[idx], self.movies[idx], 1.0
        user = self.unique_users[random.randint(0, len(self.unique_users) - 1)]
        return user, unseen[user][random.randint(0, len(unseen[user]) - 1)], 0.0

    def __len__(self):
        return 3 * len(self.users)

In [28]:
train_dataset = MoviesDataset(train_users, train_movies)
train_dataloader = DataLoader(train_dataset, batch_size=2048, shuffle=True)

### Обучим MLP

In [29]:
def train(train_dataloader, model, lr, n_epoch):
  optimizer = optim.Adam(model.parameters(), lr=lr)
  for epoch in tqdm(range(n_epoch), desc='Training'):
      train_epoch(train_dataloader, model, optimizer)

def train_epoch(train_dataloader, model, optimizer):
  pbar = tqdm(train_dataloader, total=len(train_dataloader), leave=False)
  for batch in pbar:
    loss = model.compute_loss(batch)
    pbar.set_description(f'loss:{loss.item():.4f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [30]:
mlp = MLP(len(emb2user), len(emb2movie), 32, 32).to(device)
train(train_dataloader, mlp, lr=1e-2, n_epoch=50)
torch.save(mlp.state_dict(), 'mlp.pth')

HBox(children=(FloatProgress(value=0.0, description='Training', max=50.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))




### Посмотрим на MLP симилары

In [31]:
class UserMovieDataset(Dataset):
  def __init__(self, users, movies):
    self.users = users
    self.movies = movies

  def __getitem__(self, idx):
    return self.users[idx], self.movies[idx]

  def __len__(self):
    return len(self.users)

In [32]:
class ModelWrapper:
  def __init__(self, model):
    self.model = model
    self.movie_embs = model.movie_embs.weight.detach().cpu().numpy()

  def recommend(self, user_emb, n=10):
    ratings = self.predict([user_emb] * len(unseen[user_emb]), unseen[user_emb])
    argsort = ratings.argsort()[-n:][::-1]
    return unseen[user_emb][argsort]
  
  def similars(self, movie_emb, n=10):
    distances = cosine_similarity(self.movie_embs[movie_emb, :].reshape(1, -1), self.movie_embs)
    most_similar = distances[0].argsort()[-n:][::-1]
    return most_similar

  @torch.no_grad()
  def predict(self, user_embs, movie_embs):
    predictions = []
    dataloader = DataLoader(UserMovieDataset(user_embs, movie_embs), batch_size=2048)
    for user_embs, movie_embs in dataloader:
      user_embs, movie_embs = user_embs.to(device), movie_embs.to(device)
      prediction = self.model(user_embs, movie_embs).sigmoid()
      predictions.extend(prediction.squeeze().cpu().tolist())
    return np.array(predictions)

In [33]:
mlp_model = ModelWrapper(mlp)

In [34]:
get_similars(movie2emb[1], mlp_model)

['0    Toy Story (1995)',
 '1245    Groundhog Day (1993)',
 '3045    Toy Story 2 (1999)',
 '2252    Pleasantville (1998)',
 '33    Babe (1995)',
 '360    Lion King, The (1994)',
 '584    Aladdin (1992)',
 '2011    Lady and the Tramp (1955)',
 '1120    Monty Python and the Holy Grail (1974)',
 '1287    When Harry Met Sally... (1989)']

Симилары хорошие

### Посмотрим на MLP рекомендации

In [35]:
get_recommendations(user2emb[4], mlp_model)

['847    Godfather, The (1972)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '1203    Godfather: Part II, The (1974)',
 '1182    Aliens (1986)',
 '1575    L.A. Confidential (1997)',
 '740    Dr. Strangelove or: How I Learned to Stop Worr...',
 '1190    Apocalypse Now (1979)',
 '589    Silence of the Lambs, The (1991)',
 '49    Usual Suspects, The (1995)',
 '108    Braveheart (1995)']

Рекомендации тоже неплохие

### Метрики

In [36]:
compute_metrics(mlp_model, test)

NDCG@10: 0.44074087428342673
HR@10: 1.0


### Реализуем GMF

In [37]:
class GMF(nn.Module):
  def __init__(self, n_users, n_movies, n_components):
    super().__init__()

    self.user_embs = nn.Embedding(num_embeddings=n_users, embedding_dim=n_components)
    self.movie_embs = nn.Embedding(num_embeddings=n_movies, embedding_dim=n_components)

    self.clf = nn.Linear(n_components, 1)

  def forward(self, user_emb, movie_emb):
    return self.clf(torch.mul(self.user_embs(user_emb), self.movie_embs(movie_emb)))

  def part_forward(self, user_emb, movie_emb):
    return torch.mul(self.user_embs(user_emb), self.movie_embs(movie_emb))

  def compute_loss(self, batch):
    user_embs, movie_embs, y_true = (item.to(device) for item in batch)
    y_pred = self(user_embs, movie_embs)
    return F.binary_cross_entropy_with_logits(y_pred, y_true.view(-1, 1))

### Обучим GMF

In [38]:
gmf = GMF(len(emb2user), len(emb2movie), 32).to(device)
train(train_dataloader, gmf, lr=1e-2, n_epoch=50)
torch.save(gmf.state_dict(), 'gmf.pth')

HBox(children=(FloatProgress(value=0.0, description='Training', max=50.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))




### Посмотрим на GMF симилары

In [39]:
gmf_model = ModelWrapper(gmf)

In [40]:
get_similars(movie2emb[1], gmf_model)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '33    Babe (1995)',
 '584    Aladdin (1992)',
 '1245    Groundhog Day (1993)',
 '3327    Muppet Movie, The (1979)',
 '360    Lion King, The (1994)',
 "2286    Bug's Life, A (1998)",
 '591    Beauty and the Beast (1991)',
 '907    Wizard of Oz, The (1939)']

Хорошие симилары

### Посмотрим на GMF рекомендации

In [41]:
get_recommendations(user2emb[4], gmf_model)

['1203    Godfather: Part II, The (1974)',
 '847    Godfather, The (1972)',
 '1950    Seven Samurai (The Magnificent Seven) (Shichin...',
 '585    Terminator 2: Judgment Day (1991)',
 '2875    Dirty Dozen, The (1967)',
 '2460    Planet of the Apes (1968)',
 '1267    Ben-Hur (1959)',
 '2502    Matrix, The (1999)',
 '2879    From Russia with Love (1963)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...']

Хорошие рекомендации

### Метрики

In [42]:
compute_metrics(gmf_model, test)

NDCG@10: 0.4632383208355241
HR@10: 1.0


### Реализуем NCF

In [43]:
class NCF(nn.Module):
    def __init__(self, n_users, n_movies, n_components, mlp_hidden_size, mlp_filepath, gmf_filepath):
        super().__init__()

        self.mlp = MLP(n_users, n_movies, n_components, mlp_hidden_size)
        self.gmf = GMF(n_users, n_movies, n_components)

        self.mlp.load_state_dict(torch.load(mlp_filepath))
        self.gmf.load_state_dict(torch.load(gmf_filepath))

        self.clf = nn.Linear(n_components + mlp_hidden_size, 1)

    def forward(self, user_emb, movie_emb):
        return self.clf(torch.cat((self.mlp.part_forward(user_emb, movie_emb), self.gmf.part_forward(user_emb, movie_emb)), dim=-1))

    def compute_loss(self, batch):
        user_embs, movie_embs, y_true = (item.to(device) for item in batch)
        y_pred = self(user_embs, movie_embs)
        return F.binary_cross_entropy_with_logits(y_pred, y_true.view(-1, 1))

### Обучим NCF

In [44]:
ncf = NCF(len(emb2user), len(emb2movie), 32, 32, 'mlp.pth', 'gmf.pth').to(device)
train(train_dataloader, ncf, lr=0.005, n_epoch=100)
torch.save(ncf.state_dict(), 'ncf.pth')

HBox(children=(FloatProgress(value=0.0, description='Training', max=50.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=834.0), HTML(value='')))




### Посмотрим на NCF симилары

In [45]:
class NCFWrapper(ModelWrapper):
  def __init__(self, model):
    self.model = model
    self.movie_embs = np.hstack((model.gmf.movie_embs.weight.detach().cpu().numpy(), model.mlp.movie_embs.weight.detach().cpu().numpy()))

In [46]:
ncf_model = NCFWrapper(ncf)

In [47]:
get_similars(movie2emb[1], ncf_model)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '1245    Groundhog Day (1993)',
 '33    Babe (1995)',
 '584    Aladdin (1992)',
 '2252    Pleasantville (1998)',
 '1120    Monty Python and the Holy Grail (1974)',
 '360    Lion King, The (1994)',
 '2327    Shakespeare in Love (1998)',
 "2286    Bug's Life, A (1998)"]

### Посмотрим на NCF рекомендации

In [48]:
get_recommendations(user2emb[4], ncf_model)

['847    Godfather, The (1972)',
 '1950    Seven Samurai (The Magnificent Seven) (Shichin...',
 '1203    Godfather: Part II, The (1974)',
 '585    Terminator 2: Judgment Day (1991)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '2875    Dirty Dozen, The (1967)',
 '1267    Ben-Hur (1959)',
 '2502    Matrix, The (1999)',
 '108    Braveheart (1995)',
 '1182    Aliens (1986)']

### Метрики

In [49]:
compute_metrics(ncf_model, test)

NDCG@10: 0.4545789004114006
HR@10: 1.0


## Вывод

У всех моделей симилары и рекомендации были адекватными. На Toy Story выдавались мультфильмы, а для любителя фантастики и блокбастеров рекомендовались фантастика и блокбастеры. Для некоторых фильмов рекомендовались сиквелы.

Значения метрик странные. Не рискну по ним делать выводы.