In [None]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as td

import pytorch_lightning as pl

from tqdm.autonotebook import tqdm
import json

import typing as tp
import faiss
import shutil

import random

# Seed every random number generator this notebook draws from, not only NumPy:
# model weights and DataLoader shuffling come from PyTorch's generator, and the
# history sampling helper further down uses the stdlib `random` module.
SEED = 31337
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Raw feedback log (one row per user/track interaction).
data = pd.read_csv('/Users/nadys/recsys_data/1600kfeedbacks.csv')

# Track metadata is stored as JSON lines; keep a single row per track id.
track_metadata = (
    pd.read_json('/Users/nadys/recsys_data/tracks.json', lines=True)
    .drop_duplicates(subset=['track'])
)

In [None]:
# A feedback row counts as a positive when its `time` value is strictly above this.
POSITIVES_THRESHOLD = 0.8
# A track is kept only if it appears in at least this many positive rows.
NUM_TRACKS_THRESHOLD = 20

# Each positive (user, track) pair is replicated this many times, every copy
# paired with its own randomly drawn negative track.
NUM_NEGATIVE_SAMPLES = 10

# Size of the user and item embeddings produced by the DSSM towers.
EMBEDDING_DIM = 128

In [None]:
# Keep only the feedback rows whose `time` exceeds the positives threshold.
positives = data.loc[data['time'] > POSITIVES_THRESHOLD].copy()
track_counts = positives.groupby('track').size()

# Ids of tracks that occur at least NUM_TRACKS_THRESHOLD times among the positives
tracks = set(track_counts.index.values[(track_counts >= NUM_TRACKS_THRESHOLD).values])
# Positive rows restricted to those tracks
data_filt = positives.loc[positives['track'].isin(tracks)]

(len(data_filt), len(tracks))

In [None]:
# Dense user x track matrix; each cell holds the mean `time` for that pair
# (pivot_table's default aggregation) and 0 where there is no interaction.
interactions = (
    data_filt
    .pivot_table(values='time', index='user', columns='track')
    .fillna(0)
)
# print('Interactions matrix: \nshape=' + str(interactions.shape))
# print('Sparsity=' + str((interactions != 0).values.sum() / interactions.size))

In [None]:
# Tracks without a genre get the explicit placeholder 'Unk', so the one-hot
# encoding further down yields a dedicated column for them.
track_metadata = track_metadata.fillna({'genre': 'Unk'})
# track_metadata.head()

In [None]:
# One-hot encode the genre column.
dummy_features = pd.get_dummies(track_metadata[['genre']])

# Item feature table indexed by track id: log-popularity first, genre dummies after.
# NOTE(review): np.log yields -inf for pop == 0 — confirm `pop` is strictly positive.
item_features = (
    pd.concat([track_metadata[['track', 'pop']], dummy_features], axis=1)
    .set_index('track', drop=True)
    .assign(pop=lambda frame: np.log(frame['pop']))
)
item_features.shape

In [None]:
# Replicate every positive (user, track_pos) pair NUM_NEGATIVE_SAMPLES times,
# keeping the copies of a pair adjacent, then renumber the rows.
triplets = (
    pd.concat(
        [data_filt[['user', 'track']].rename(columns={'track': 'track_pos'})]
        * NUM_NEGATIVE_SAMPLES
    )
    .sort_index()
    .reset_index(drop=True)
)
# Each copy gets a negative drawn uniformly from track ids 0..49999.
# NOTE(review): the draw ignores the user's history, so a "negative" can be a
# track the user actually listened to — confirm this is acceptable.
triplets['track_neg'] = np.random.choice(range(50000), len(triplets))

In [None]:
# One uniform draw in [0, 1) per triplet decides its split: below 0.9 goes to
# train, everything else to validation.
rdm = np.random.random(len(triplets))
train_data, val_data = triplets[rdm < 0.9], triplets[rdm >= 0.9]

(len(train_data), len(val_data))

In [None]:
# Sanity check: (n_tracks, n_item_features) and (n_users, n_tracks_in_interactions).
item_features.shape, interactions.shape

In [None]:
from random import shuffle

def pad_with_specific_value(lst, size, val):
    """Return exactly `size` entries sampled from the unique items of `lst`.

    The items are de-duplicated, shuffled and truncated to at most `size`
    entries; if fewer than `size` remain, the result is right-padded with `val`.

    Shuffling uses NumPy's global RNG rather than the stdlib `random` module,
    because NumPy is the generator this notebook seeds; with the stdlib shuffle
    the sampled histories changed from run to run.

    Args:
        lst: iterable of hashable, mutually orderable items (track ids here).
        size: length of the returned array.
        val: padding value appended when there are fewer than `size` unique items.

    Returns:
        1-D ``np.ndarray`` of length `size`.
    """
    # Sort first so the pre-shuffle order does not depend on set iteration order.
    unique_items = sorted(set(lst))
    np.random.shuffle(unique_items)
    kept = unique_items[:size]
    return np.pad(kept, (0, size - len(kept)), 'constant', constant_values=(val))

# Per user: up to 30 distinct positive tracks, padded to length 30 with the
# out-of-range id 50000 (the padding index of the user tower's embedding table).
padded_users = (
    triplets
    .groupby('user')
    .apply(lambda x: pad_with_specific_value(x['track_pos'].tolist(), 30, 50000).tolist())
)

# Align to user ids 0..9999 (users without positives get an all-padding row)
# and stack into a single 2-D array with one row per user id.
padded_users = np.stack(
    padded_users.reindex(range(10000), fill_value=[50000] * 30).values
)

In [None]:
class DSSMData(pl.LightningDataModule):
    """Data module turning (user, track_pos, track_neg) triplets into tensor datasets.

    Every sample is a tuple of:
      * the user's padded listening history (track ids, long),
      * the feature vector of the positive track (float64),
      * the feature vector of the negative track (float64).

    Args:
        train_triplets: frame with 'user', 'track_pos', 'track_neg' columns for training.
        val_triplets: same layout, used for validation.
        test_triplets: same layout, or None when there is no test split.
        item_features: frame indexed by track id holding the item feature columns.
        padded_users: 2-D integer array, row `u` is the padded history of user `u`.
        pad_idx: track id used as padding inside `padded_users`.
    """

    def __init__(self, train_triplets, val_triplets, test_triplets, item_features, padded_users, pad_idx=50000):
        super().__init__()
        self.train_triplets = train_triplets
        self.val_triplets = val_triplets
        self.test_triplets = test_triplets
        self.item_features = item_features
        self.padded_users = padded_users
        self.pad_idx = pad_idx

    def _collect_data(self, triplets):
        """Build a TensorDataset (history, positive features, negative features) from triplets."""
        users = triplets['user'].values
        positives = triplets['track_pos'].values
        negatives = triplets['track_neg'].values

        # Indexing with an integer array returns a copy, so the masking below
        # does not modify self.padded_users (assumes it is a NumPy array).
        listened_tracks = self.padded_users[users]
        # Replace the sample's own positive and negative track in the history with
        # padding, so the user representation cannot simply contain the target.
        listened_tracks[listened_tracks == positives.reshape(-1, 1)] = self.pad_idx
        listened_tracks[listened_tracks == negatives.reshape(-1, 1)] = self.pad_idx

        # Use the features handed to this module instead of whatever notebook
        # global happens to be called `item_features`.
        # The float32 round-trip is kept on purpose: inference prepares its item
        # features the same way.
        features = self.item_features
        return td.TensorDataset(
            torch.from_numpy(listened_tracks).long(),
            torch.from_numpy(features.loc[positives].values.astype(np.float32)).double(),
            torch.from_numpy(features.loc[negatives].values.astype(np.float32)).double()
        )

    def prepare_data(self, stage=None):
        """Materialise the datasets for `stage` ('fit', 'test', or None for everything available).

        Raises:
            ValueError: if `stage` is 'test' but no test triplets were provided.
        """
        # NOTE(review): Lightning recommends assigning state in `setup`, since
        # `prepare_data` is not run on every process in distributed training;
        # this is fine for the single-device run in this notebook.
        if stage == 'fit' or stage is None:
            self.train_dataset = self._collect_data(self.train_triplets)
            self.val_dataset = self._collect_data(self.val_triplets)

        # Previously an `elif`, which made the `stage is None` case unreachable:
        # a stage-less call never built the test dataset.
        if stage == 'test' and self.test_triplets is None:
            raise ValueError("stage='test' requested but no test triplets were provided")
        if (stage == 'test' or stage is None) and self.test_triplets is not None:
            self.test_dataset = self._collect_data(self.test_triplets)

    def train_dataloader(self):
        return td.DataLoader(self.train_dataset, batch_size=2048, shuffle=True, num_workers=0)

    def val_dataloader(self):
        return td.DataLoader(self.val_dataset, batch_size=2048, num_workers=0)

    def test_dataloader(self):
        return td.DataLoader(self.test_dataset, batch_size=2048, shuffle=False, num_workers=0)

In [None]:
class ItemNet(nn.Module):
    """Item tower: maps a track feature vector to an `n_factors`-dimensional embedding.

    Column 0 of the input is treated as the popularity scalar; the remaining
    `dim_input - 1` columns are the genre indicators.
    """

    def __init__(self, n_factors: int, dim_input: int, activation: tp.Callable[[torch.Tensor], torch.Tensor] = F.relu) -> None:
        super().__init__()
        genre_dim = 32  # width of the learned genre representation
        self.embedding_layer = nn.Linear(dim_input - 1, genre_dim, bias=False)
        self.dense_layer = nn.Linear(genre_dim + 1, n_factors, bias=False)
        self.output_layer = nn.Linear(n_factors + genre_dim, n_factors, bias=False)
        self.activation = activation

    def forward(self, item_features: torch.Tensor) -> torch.Tensor:
        """Embed a 2-D batch of item features; returns a (batch, n_factors) tensor."""
        # Split off popularity as a (batch, 1) column and project the genre columns.
        popularity = item_features[:, :1]
        genre_emb = self.embedding_layer(item_features[:, 1:])

        # Non-linear mix of popularity with the genre representation.
        hidden = self.activation(
            self.dense_layer(torch.cat((popularity, genre_emb), dim=1))
        )

        # Skip connection: the raw genre representation is fed to the output
        # layer alongside the hidden features.
        return self.output_layer(torch.cat((genre_emb, hidden), dim=1))

class UserNet(nn.Module):
    """User tower: pools the embeddings of a user's listened tracks into one vector.

    The embedding table has one extra row, at index `num_embeddings`, that is
    registered as the padding index of the EmbeddingBag.
    """

    def __init__(self, n_factors: int, num_embeddings: int, activation: tp.Callable[[torch.Tensor], torch.Tensor] = F.relu) -> None:
        super().__init__()
        self.track_embeddings = nn.EmbeddingBag(
            num_embeddings + 1,
            n_factors,
            padding_idx=num_embeddings,
        )
        self.dense_layer = nn.Linear(n_factors, n_factors, bias=False)
        self.output_layer = nn.Linear(2 * n_factors, n_factors, bias=False)
        self.activation = activation

    def forward(self, user_tracks: torch.Tensor) -> torch.Tensor:
        """Embed a (batch, history_len) tensor of track ids; returns (batch, n_factors)."""
        # One pooled vector per row of track ids.
        pooled = self.track_embeddings(user_tracks)
        hidden = self.activation(self.dense_layer(pooled))
        # Skip connection: the pooled embedding and its non-linear transform
        # are concatenated before the final projection.
        return self.output_layer(torch.cat((pooled, hidden), dim=1))

In [None]:
class DSSM(pl.LightningModule):
    """Two-tower model (user tower + item tower) trained with a triplet margin loss.

    Args:
        dim_item_features: number of columns in an item feature vector.
        item_number: number of distinct track ids; the user tower adds one more
            embedding row, used as padding.
        embedding_dim: size of the user and item embeddings.
        activation: non-linearity used inside both towers.
        lr: Adam learning rate.
        triplet_loss_margin: margin of the triplet loss.
        weight_decay: Adam weight decay.
        log_to_prog_bar: whether validation/test losses are shown in the progress bar.
    """

    def __init__(
        self,
        dim_item_features: int,
        item_number: int,
        embedding_dim: int,
        activation: tp.Callable[[torch.Tensor], torch.Tensor] = F.relu,
        lr: float = 1e-3,
        triplet_loss_margin: float = 0.4,
        weight_decay: float = 1e-6,
        log_to_prog_bar: bool = True,
    ) -> None:
        super().__init__()
        self.lr = lr
        self.triplet_loss_margin = triplet_loss_margin
        self.weight_decay = weight_decay
        self.log_to_prog_bar = log_to_prog_bar
        self.item_net = ItemNet(embedding_dim, dim_item_features, activation)
        # The activation was previously not forwarded here, so a custom
        # `activation` silently applied to the item tower only.
        self.user_net = UserNet(embedding_dim, item_number, activation)

    def forward(
        self,
        user_ids: torch.Tensor,
        item_features_pos: torch.Tensor,
        item_features_neg: torch.Tensor,
    ) -> tp.Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Embed one batch of triplets.

        Despite its name, `user_ids` is passed straight to the user tower, which
        consumes padded track-id histories rather than user identifiers.

        Returns:
            (anchor, pos, neg): user embedding, positive item embedding and
            negative item embedding.
        """
        anchor = self.user_net(user_ids)
        pos = self.item_net(item_features_pos)
        neg = self.item_net(item_features_neg)

        return anchor, pos, neg

    def _step(self, batch, batch_idx, metric, prog_bar=False):
        """Compute the triplet loss for `batch` and log it under the name `metric`."""
        user_ids, pos, neg = batch
        anchor, positive, negative = self(user_ids, pos, neg)
        loss = F.triplet_margin_loss(anchor, positive, negative, margin=self.triplet_loss_margin)
        self.log(metric, loss, prog_bar=prog_bar)
        return loss

    def training_step(self, batch: tp.Sequence[torch.Tensor], batch_idx: int) -> torch.Tensor:
        return self._step(batch, batch_idx, 'train_loss')

    def validation_step(self, batch: tp.Sequence[torch.Tensor], batch_idx: int) -> torch.Tensor:
        return self._step(batch, batch_idx, 'val_loss', self.log_to_prog_bar)

    def test_step(self, batch, batch_idx, prog_bar=False):
        # `prog_bar` is unused and kept only for signature compatibility;
        # progress-bar logging follows `self.log_to_prog_bar`.
        return self._step(batch, batch_idx, 'test_loss', self.log_to_prog_bar)

    def inference(self, dataloader: td.DataLoader[tp.Any], mode: str = 'item') -> tp.Tuple[np.ndarray, np.ndarray]:
        """Run one tower over a dataloader of (ids, features) batches.

        Args:
            dataloader: yields `(ids, features)` pairs; `features` is fed to the
                selected tower, `ids` is passed through untouched.
            mode: 'item' to use the item tower, 'user' to use the user tower.

        Returns:
            `(ids, vectors)`: the concatenated ids and the embeddings computed
            for them, in dataloader order, both as NumPy arrays.

        Raises:
            ValueError: if `mode` is neither 'user' nor 'item'.
        """
        if mode == 'user':
            model = self.user_net
        elif mode == 'item':
            model = self.item_net
        else:
            raise ValueError(f'Unsupported model {mode}!')

        id_batches = []
        vector_batches = []

        # Note: the module is left in eval mode afterwards.
        self.eval()
        with torch.no_grad():
            for ids, features in dataloader:
                vector_batches.append(model(features.to(self.device)))
                id_batches.append(ids)

        vectors = torch.cat(vector_batches, dim=0).cpu().numpy()
        vectors_ids = torch.cat(id_batches, dim=0).cpu().numpy()
        return vectors_ids, vectors

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau scheduler driven by the logged 'val_loss'."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)
        scheduler = {
            'scheduler': lr_scheduler,
            'monitor': 'val_loss'
        }
        return [optimizer], [scheduler]

In [None]:
data_module = DSSMData(train_data, val_data, None, item_features, padded_users)

# The item tower's input width must equal the number of item feature columns
# (popularity + one column per genre); derive it from the data instead of
# hard-coding 23 so a different genre vocabulary does not break the model.
# The model runs in float64 because the datasets produce float64 features.
net = DSSM(
    dim_item_features=item_features.shape[1],
    item_number=50000,
    embedding_dim=EMBEDDING_DIM,
).double()

# Keeps the checkpoint with the lowest validation loss.
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor='val_loss')

trainer = pl.Trainer(
    max_epochs=40,
    accelerator='cpu',
    devices=1,
    callbacks=[
        pl.callbacks.early_stopping.EarlyStopping(monitor='val_loss', patience=5),
        pl.callbacks.LearningRateMonitor(logging_interval='step'),
        checkpoint_callback,
    ])

In [None]:
# Train until max_epochs is reached or early stopping on 'val_loss' triggers.
trainer.fit(net, datamodule=data_module)

In [None]:
# Give the best checkpoint (lowest 'val_loss') a stable name in the working directory.
shutil.move(src=checkpoint_callback.best_model_path, dst='dssm_improved.ckpt')

In [None]:
# Hyperparameters are not stored in the checkpoint, so the constructor arguments
# are repeated here. The item feature width is taken from the data rather than
# hard-coded as 23, so it always matches the table the towers are fed with.
best = DSSM.load_from_checkpoint(
    "dssm_improved.ckpt",
    dim_item_features=item_features.shape[1],
    item_number=50000,
    embedding_dim=EMBEDDING_DIM,
)

In [None]:
# Order the item features by track id and cast to float32, as the training data did.
item_features = item_features.sort_index().astype(np.float32)

# Parallel tensors: track ids and their float64 feature rows.
items = torch.from_numpy(item_features.index.values)
inf_items = torch.from_numpy(item_features.values).double()
items_ds = td.TensorDataset(items, inf_items)

In [None]:
# Unshuffled loader, so the embeddings come back in track-id order.
inf_dl_items = td.DataLoader(items_ds, batch_size=128, shuffle=False, num_workers=1)
# The checkpoint is reloaded in default precision; cast to float64 to match the features.
track_ids, track_embeddings = best.double().inference(inf_dl_items, mode='item')

In [None]:
# Only users that appear in the interaction matrix get recommendations.
interactions = interactions.sort_index()
user_ids_inf = torch.from_numpy(interactions.index.values)
# Row u of padded_users is the padded listening history of user u.
user_inf_feat = torch.from_numpy(padded_users[user_ids_inf.numpy()]).long()
user_ds = td.TensorDataset(user_ids_inf, user_inf_feat)

In [None]:
# Unshuffled loader, so the embeddings come back in user-id order.
inf_dl_users = td.DataLoader(user_ds, batch_size=128, shuffle=False, num_workers=1)
user_ids, user_embeddings = best.double().inference(inf_dl_users, mode="user")

In [None]:
# Sanity check: both embedding matrices must have the same number of columns.
track_embeddings.shape, user_embeddings.shape

In [None]:
# Flat L2 index over the track embeddings; L2 matches the Euclidean distance
# used by the triplet loss during training.
index_flat = faiss.index_factory(track_embeddings.shape[1], "Flat", faiss.METRIC_L2)
index_flat.add(track_embeddings.astype('float32'))

# CPU search; the GPU variant is left disabled:
# gpu_res = faiss.StandardCpuResources()
# index = faiss.index_cpu_to_gpu(gpu_res, 0, index_flat)
index = index_flat

In [None]:
# Write the top-k nearest tracks for every user as JSON lines.
k = 30
with open("/Users/nadys/recsys_data/dssm_improved.json", "w") as rf:
    for user, user_emb in tqdm(zip(user_ids, user_embeddings), total=len(user_ids)):
        _, neighbours = index.search(user_emb.astype('float32')[np.newaxis, :], k)

        # FAISS returns row positions within the matrix that was added to the
        # index, not track ids, and uses -1 when fewer than k results exist.
        positions = neighbours.flatten()
        positions = positions[positions >= 0]

        recommendation = {
            "user": int(user),
            # Translate positions to real track ids; writing the positions
            # directly is only correct if the ids are exactly 0..n-1.
            "tracks": track_ids[positions].tolist()
        }
        rf.write(json.dumps(recommendation) + "\n")

In [None]:
# Copy the generated recommendations into the botify project's data directory.
!cp /Users/nadys/recsys_data/dssm_improved.json /Users/nadys/PythonProjects/recsys-course-spring-2024/botify/data/recommendations_dssm_improved.json