In [None]:
# Install the third-party packages imported below: pytorch_lightning, faiss (GPU build) and tensorboardX.
!pip install pytorch_lightning
!pip install faiss-gpu
!pip install tensorboardX

In [None]:
from collections import namedtuple

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as td

import pytorch_lightning as pl

import tqdm
import json
import sklearn.metrics as sm

import tensorflow as tf
import datetime, os

import matplotlib.pyplot as plt
import seaborn as sns
import faiss
import tensorboardX as tb

# Seed every RNG the notebook draws from: NumPy drives the train/val/test split
# and the random baseline, torch drives the embedding initialisation.
SEED = 31337
np.random.seed(SEED)
torch.manual_seed(SEED)

## Create pairs (first track, subsequent track, time)

In [None]:
# Mount Google Drive at /content/drive; the data directory used below lives under that path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Folder on the mounted Drive holding the input files. Paths are built by plain string
# concatenation, so the trailing slash is required.
DATA_DIR = "/content/drive/MyDrive/recsys_mobod_2023/Week5Seminar/data/"

In [None]:
# Load the ranker training log; later cells read its user, track, time, message and timestamp columns.
data = pd.read_csv(DATA_DIR + "train_ranker.csv")

In [None]:
# Shape of the subset of rows whose message is 'last' (the rows that close a session in get_pairs).
data[data["message"] == "last"].shape

In [None]:
# Shape of the full log, for comparison with the 'last'-only subset.
data.shape

In [None]:
# One training example: within a session of `user`, `track` was played after the
# session's first track `start`; `time` is the "time" value of the row for `track`.
# The typename matches the variable name so that repr() and pickling refer to `Pair`.
Pair = namedtuple("Pair", ["user", "start", "track", "time"])


def get_pairs(user_data):
    """Build (first track, subsequent track, time) pairs from one user's log.

    Rows are processed in ascending "timestamp" order. The first row seen at
    the beginning, or right after a row whose "message" is "last", opens a
    session and its track becomes ``start``; that row itself yields no pair.
    Every later row of the session yields one ``Pair``. A row with message
    "last" still yields its pair (unless it opened the session) and then
    closes the session.

    Args:
        user_data: DataFrame with the columns "user", "track", "time",
            "message" and "timestamp".

    Returns:
        list[Pair]: pairs in timestamp order; empty when no session contains
        more than one row.
    """
    pairs = []
    first = None
    # itertuples avoids materialising a Series per row, which is what makes
    # iterrows slow on long logs.
    for row in user_data.sort_values("timestamp").itertuples(index=False):
        if first is None:
            first = row.track
        else:
            pairs.append(Pair(row.user, first, row.track, row.time))

        # Checked after the append so the closing row is still paired with `first`.
        if row.message == "last":
            first = None
    return pairs

In [None]:
# Run get_pairs per user, then flatten the per-user lists into one row per pair.
pairs = pd.DataFrame(
    data
    .groupby("user")
    .apply(get_pairs)
    .explode()
    # explode() emits NaN for a user whose pair list is empty; drop those so
    # every remaining element is a 4-field Pair the constructor can unpack.
    .dropna()
    .values
    .tolist(),
    columns=["user", "start", "track", "time"]
)

In [None]:
# Histogram of the "time" target over all pairs; the trailing semicolon keeps the
# Axes repr out of the cell output.
figure, ax = plt.subplots()
sns.histplot(pairs["time"], ax=ax);

In [None]:
# Preview the first five (user, start, track, time) rows.
pairs.head(5)

## Train Model

In [None]:
# One uniform draw per pair decides its split:
# below 0.8 -> train, from 0.8 up to (not including) 0.9 -> validation, 0.9 and above -> test.
rdm = np.random.random(len(pairs))
train_data = pairs.loc[rdm < 0.8]
val_data = pairs.loc[(0.8 <= rdm) & (rdm < 0.9)]
test_data = pairs.loc[0.9 <= rdm]

tuple(map(len, (train_data, val_data, test_data)))

In [None]:
class ContextualRanker(pl.LightningModule):
    """Dot-product model scoring a (start track, next track) pair.

    Two embedding tables of identical shape are learned: ``context`` is looked
    up with the id in column 0 of the input (the start track) and ``track``
    with the id in column 1 (the next track). The prediction is the inner
    product of the two vectors and is trained with mean squared error.

    Args:
        embedding_dim: Length of every embedding vector.
        num_embeddings: Number of rows in each embedding table. Every id fed
            to the model must be smaller than this value.
    """

    def __init__(self, embedding_dim=10, num_embeddings=50000):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings

        # We won't have embeddings for everything, but that's ok
        self.context = nn.Embedding(num_embeddings=self.num_embeddings, embedding_dim=self.embedding_dim)
        self.track = nn.Embedding(num_embeddings=self.num_embeddings, embedding_dim=self.embedding_dim)

    def forward(self, x):
        """Return one score per row of ``x``.

        Args:
            x: Integer tensor indexed as ``x[:, 0]`` (start track id) and
                ``x[:, 1]`` (next track id).

        Returns:
            1-D tensor with the inner product of the two looked-up embeddings
            for every row.
        """
        context = self.context(x[:, 0])  # start track
        track = self.track(x[:, 1])  # next track
        return torch.sum(context * track, dim=1)

    def step(self, batch, batch_idx, metric, prog_bar=False):
        """Shared train/validation step: MSE of predictions against ``y``.

        Args:
            batch: ``(x, y)`` pair; ``y`` is cast to float before the loss.
            batch_idx: Unused; kept for the Lightning step signature.
            metric: Name under which the loss is logged.
            prog_bar: Whether the logged value is shown in the progress bar.

        Returns:
            The scalar MSE loss.
        """
        x, y = batch
        predictions = self.forward(x)
        loss = F.mse_loss(predictions, y.float(), reduction='mean')
        self.log(metric, loss, prog_bar=prog_bar)
        return loss

    def test_step(self, batch, batch_idx, prog_bar=False):
        """Log the model's MSE next to the MSE of two baselines.

        ``y`` is read column-wise: column 0 is the target, columns 1 and 2
        hold the baseline predictions logged as "avg_loss" and "rdm_loss".
        """
        x, y = batch
        predictions = self.forward(x)
        targets = y[:, 0].float()
        avgs = y[:, 1].float()
        rdms = y[:, 2].float()

        loss = F.mse_loss(predictions, targets, reduction='mean')
        avg_loss = F.mse_loss(avgs, targets, reduction='mean')
        rdm_loss = F.mse_loss(rdms, targets, reduction='mean')

        self.log("test_loss", loss, prog_bar=prog_bar)
        self.log("avg_loss", avg_loss, prog_bar=prog_bar)
        self.log("rdm_loss", rdm_loss, prog_bar=prog_bar)

    def training_step(self, batch, batch_idx):
        """Training loss, logged as "train_loss"."""
        return self.step(batch, batch_idx, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Validation loss, logged as "val_loss" and shown in the progress bar."""
        return self.step(batch, batch_idx, "val_loss", True)

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau scheduler that monitors "val_loss"."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3, weight_decay=1e-5)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)
        scheduler = {
            'scheduler': lr_scheduler,
            'monitor': 'val_loss'
        }
        return [optimizer], [scheduler]

In [None]:
class ContextualRankerData(pl.LightningDataModule):
    """Wraps the train/val/test pair frames as tensor datasets and loaders.

    Args:
        train_data: DataFrame with the ``features`` columns and a "time" column.
        val_data: Same layout as ``train_data``.
        test_data: Same layout as ``train_data``.
        features: Column names that form the model input, in order.

    The input frames are never modified. The two baseline columns used at test
    time are derived on a copy inside ``setup``; no state is assigned in
    ``prepare_data``, so a repeated fit/test call does not stack up changes on
    ``self.test_data``.
    """

    def __init__(self, train_data, val_data, test_data, features):
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.test_data = test_data
        self.features = features

    def _make_dataset(self, frame, target):
        """Build a TensorDataset of (features, target) from ``frame``."""
        return td.TensorDataset(
            torch.from_numpy(frame[self.features].values),
            torch.from_numpy(frame[target].values),
        )

    def setup(self, stage=None):
        """Create the datasets required by ``stage`` (all of them when it is None)."""
        if stage in ("fit", None):
            self.train_dataset = self._make_dataset(self.train_data, "time")

        # The validation set is needed while fitting and for a standalone validate run.
        if stage in ("fit", "validate", None):
            self.val_dataset = self._make_dataset(self.val_data, "time")

        if stage in ("test", None):
            # Baselines reported next to the model: a uniform random guess ("rdm")
            # and the mean "time" of the training data ("avg").
            test_frame = self.test_data.assign(
                rdm=np.random.random(len(self.test_data)),
                avg=self.train_data["time"].mean(),
            )
            self.test_dataset = self._make_dataset(test_frame, ["time", "avg", "rdm"])

    def train_dataloader(self):
        """Shuffled training batches of 2048 rows."""
        return td.DataLoader(self.train_dataset, batch_size=2048, shuffle=True, num_workers=0)

    def val_dataloader(self):
        """Validation batches of 2048 rows, in dataset order."""
        return td.DataLoader(self.val_dataset, batch_size=2048, num_workers=0)

    def test_dataloader(self):
        """Test batches of 512 rows, in dataset order."""
        return td.DataLoader(self.test_dataset, batch_size=512, shuffle=False, num_workers=0)

In [None]:
# Model and data: 100-dimensional embeddings fed with the (start, track) id pair.
net = ContextualRanker(embedding_dim=100)
data_module = ContextualRankerData(
    train_data=train_data,
    val_data=val_data,
    test_data=test_data,
    features=["start", "track"],
)

# Kept in a variable because the best checkpoint path is read from it after training.
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_loss")

trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=200,
    callbacks=[
        pl.callbacks.EarlyStopping(monitor="val_loss", patience=5),
        pl.callbacks.LearningRateMonitor(logging_interval="step"),
        checkpoint_callback,
    ],
)

In [None]:
# Train until early stopping triggers or max_epochs is reached.
trainer.fit(net, datamodule=data_module)

In [None]:
# Reload the checkpoint that the val_loss-monitoring ModelCheckpoint recorded as best.
# embedding_dim is passed again so the rebuilt model matches the trained one.
best = ContextualRanker.load_from_checkpoint(checkpoint_callback.best_model_path, embedding_dim=100)

In [None]:
# Report test_loss together with the avg_loss / rdm_loss baselines for the best checkpoint.
trainer.test(best, datamodule=data_module)

## Compute top recommendations

In [None]:
# Track metadata, one JSON object per line; later cells read its track, artist and title fields.
track_meta = pd.read_json(DATA_DIR + "tracks.json", lines=True)

In [None]:
# Preview the first five metadata rows.
track_meta.head(5)

In [None]:
# Pull both learned embedding tables out of the best model as NumPy arrays (moved to CPU first).
context_embeddings = best.context.weight.data.cpu().numpy()
track_embeddings = best.track.weight.data.cpu().numpy()

In [None]:
# Sanity check: both tables should have the same (rows, embedding_dim) shape.
track_embeddings.shape, context_embeddings.shape

In [None]:
# "Flat" inner-product index over the track embeddings, built on CPU and then moved to GPU 0.
index_flat = faiss.index_factory(track_embeddings.shape[1], "Flat", faiss.METRIC_INNER_PRODUCT)
gpu_res = faiss.StandardGpuResources()

index = faiss.index_cpu_to_gpu(gpu_res, 0, index_flat)
index.add(track_embeddings.astype("float32"))

In [None]:
# Write every metadata row back out with its top-k recommended track ids attached.
k = 50

# Query the index once for all tracks instead of once per row: row i of `neighbours`
# holds the k track-embedding rows with the largest inner product against the
# context embedding of the i-th row of track_meta.
queries = context_embeddings[track_meta["track"].values].astype("float32")
_, neighbours = index.search(queries, k)

with open(DATA_DIR + "tracks_with_recs.json", "w") as rf:
    rows = tqdm.tqdm(track_meta.iterrows(), total=len(track_meta))
    for position, (_, track) in enumerate(rows):
        recommendation = dict(track)
        recommendation["recommendations"] = neighbours[position].tolist()

        # One JSON object per line, mirroring the layout tracks.json is read with.
        rf.write(json.dumps(recommendation) + "\n")

In [None]:
# Spot check: pick one track id, take its context embedding and show which track it is.
track = 188
embedding = context_embeddings[track]
track_meta.loc[track_meta["track"] == track, ["artist", "title"]]

In [None]:
# Brute-force check against the example track: score every track embedding by its dot product
# with `embedding`, then keep the k highest. argpartition does not order those k among
# themselves, and the isin filter lists them in track_meta row order, not by score.
k = 10
neighbours = np.argpartition(-np.dot(track_embeddings, embedding), k)[:k]
track_meta.loc[track_meta["track"].isin(neighbours), ["artist", "title"]]

In [None]:
# Export the track embeddings with artist/title labels for TensorBoard's embedding projector.
writer = tb.SummaryWriter(comment='nn_embeddings', log_dir="/content/drive/MyDrive/recsys_mobod_2023/Week5Seminar/tb")
writer.add_embedding(
    # Embedding rows are addressed by track id, so select them in track_meta order:
    # vector i and metadata row i then describe the same track even when track_meta
    # is not sorted by id or has fewer rows than the embedding table.
    track_embeddings[track_meta["track"].values],
    metadata=list(track_meta[["artist", "title"]].itertuples(index=False, name=None)),
    tag="nn",
    metadata_header=["artist", "title"],
)
writer.close()

In [None]:
!cp -r /content/lightning_logs /gdrive/MyDrive/recsys_mobod_2023/Week5Seminar/lightning_logs