In [6]:
import pandas as pd
import numpy as np
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as td

import pytorch_lightning as pl

from tqdm.autonotebook import tqdm
import json
import sklearn.metrics as sm
from sklearn.preprocessing import LabelEncoder

import tensorboardX as tb
import tensorflow as tf
import datetime, os

import matplotlib.pyplot as plt
import seaborn as sns

import typing as tp
import faiss
import glob
from sklearn.metrics.pairwise import euclidean_distances
from functools import partial
import shutil

# Seed every RNG this notebook draws from so that "Restart Kernel -> Run All"
# reproduces the sampled negatives, the data split, the shuffled user histories
# (``random.shuffle``) and the initial model weights (torch).
import random

SEED = 31337
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED);

In [2]:
# Read every JSON-lines `data.json` found one directory below ./data, append the
# rows of `contextual_data.csv`, keep the five columns used downstream and parse
# the timestamp column.
json_logs = [
    pd.read_json(log_path, lines=True)
    for log_path in glob.glob("./data/*/data.json")
]
contextual_log = pd.read_csv("./data/contextual_data.csv")

data = (
    pd.concat(json_logs + [contextual_log])
    [["message", "timestamp", "user", "track", "time"]]
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
)

# Track metadata, one row per distinct `track` value.
track_metadata = (
    pd.read_json("data/tracks.json", lines=True)
    .drop_duplicates(subset=["track"])
)

In [12]:
# Rows with `time` above 0.8 are treated as positives; rows with `time`
# strictly between 0.1 and 0.3 as negatives.
# NOTE(review): presumably `time` is the listened fraction of the track - confirm.
positives = data.loc[data["time"].gt(0.8)].copy()
negatives = data.loc[data["time"].gt(0.1) & data["time"].lt(0.3)].copy()

# Only tracks with at least 20 positive rows are kept.
track_counts = positives.groupby("track").size()
tracks = set(track_counts.loc[track_counts >= 20].index.to_numpy())

pos_data_filt = positives.loc[positives["track"].isin(tracks)]
neg_data_filt = negatives.loc[negatives["track"].isin(tracks)]

(len(pos_data_filt), len(neg_data_filt), len(tracks))

(813355, 149852, 11149)

In [7]:
# Missing genres become the literal category 'Unk'; afterwards `genre` and
# `artist` are each replaced by integer codes from their own LabelEncoder.
track_metadata = track_metadata.fillna(value={'genre': 'Unk'}).assign(
    genre=lambda frame: LabelEncoder().fit_transform(frame["genre"]),
    artist=lambda frame: LabelEncoder().fit_transform(frame["artist"]),
)

In [8]:
# Per-track feature table indexed by track id. The column order
# (track, genre, artist, pop) matters: ItemNet.forward reads columns 1, 2 and 3
# by position. `pop` is replaced by its natural logarithm.
item_features = (
    track_metadata[["track", "genre", "artist", "pop"]]
    .set_index("track", drop=False)
    .assign(pop=lambda frame: np.log(frame["pop"]))
)
item_features.shape

(50000, 4)

In [18]:
# Cardinalities used to size the embedding tables of the model.
genre_number = item_features["genre"].nunique()
artist_number = item_features["artist"].nunique()
# Fixed catalogue / audience sizes used throughout the notebook.
track_number, user_number = 50000, 10000

In [14]:
# Start the triplet table from the filtered positives: one (user, track_pos)
# row per positive listen. The negative column is added in the next cell.
triplets = (
    pos_data_filt
    .rename(columns={"track": "track_pos"})
    .loc[:, ["user", "track_pos"]]
)

In [15]:
NUM_NEGATIVE_SAMPLES = 15

# Repeat every (user, track_pos) row NUM_NEGATIVE_SAMPLES times, keeping the
# copies of a row next to each other, then renumber the rows.
# NOTE: `triplets` is rebuilt from itself, so re-running this cell multiplies
# the table again.
triplets = (
    pd.concat([triplets for _ in range(NUM_NEGATIVE_SAMPLES)])
    .sort_index()
    .reset_index(drop=True)
)

# Each row gets a negative drawn uniformly (with replacement) from the ids
# 0..49999; nothing here prevents it from coinciding with the positive.
triplets["track_neg"] = np.random.choice(50000, size=len(triplets))

In [16]:
# One uniform draw per triplet decides its split: [0, 0.8) -> train,
# [0.8, 0.9) -> validation, [0.9, 1) -> test.
# NOTE(review): the split is per row, after the positives were replicated, so
# copies of the same (user, track_pos) pair can land in different splits.
rdm = np.random.random(len(triplets))

train_data = triplets.loc[rdm < 0.8]
val_data = triplets.loc[(0.8 <= rdm) & (rdm < 0.9)]
test_data = triplets.loc[0.9 <= rdm]

(len(train_data), len(val_data), len(test_data))

(9760693, 1219922, 1219710)

In [19]:
from random import shuffle

def pad_with_specific_value(lst, size, val):
    """Return up to ``size`` distinct items of ``lst`` in random order, padded with ``val``.

    Duplicates are dropped, the remaining items are shuffled with NumPy's global
    RNG (the one this notebook seeds via ``np.random.seed``), the first ``size``
    of them are kept and the unused trailing slots are filled with ``val``.

    Args:
        lst: iterable of hashable items (track ids in this notebook).
        size: length of the returned array.
        val: value written into the padding slots.

    Returns:
        1-D ``np.ndarray`` of length ``size``.
    """
    unique_items = list(set(lst))
    if not unique_items:
        # np.pad on an empty list produces a float64 array; build the
        # all-padding row directly so that its dtype follows ``val``.
        return np.full(size, val)
    np.random.shuffle(unique_items)
    unique_items = unique_items[:size]
    return np.pad(unique_items, (0, size - len(unique_items)), 'constant', constant_values=(val))

def _padded_history(user_rows, column):
    """Length-30 list of one user's distinct ids from ``column``, padded with 50000."""
    return pad_with_specific_value(user_rows[column].tolist(), 30, 50000).tolist()


# Per-user history lists: liked tracks come from the triplet table, disliked
# tracks from the (unfiltered) negatives frame.
pos_padded_users = triplets.groupby("user").apply(_padded_history, column="track_pos")
neg_padded_users = negatives.groupby("user").apply(_padded_history, column="track")

In [20]:
# Make sure every user id in 0..user_number-1 has an entry; users without any
# history get a list made only of the padding id 50000.
all_users = np.arange(user_number)
empty_history = [50000] * 30

pos_padded_users = pos_padded_users.reindex(all_users, fill_value=list(empty_history))
neg_padded_users = neg_padded_users.reindex(all_users, fill_value=list(empty_history))

In [21]:
class DSSMData(pl.LightningDataModule):
    """Data module that turns (user, positive, negative) triplets into tensor datasets.

    Every sample consists of four tensors: the user's padded liked-track list,
    the user's padded disliked-track list, the feature row of the positive
    track and the feature row of the negative track.

    Args:
        train_triplets, val_triplets, test_triplets: frames with the columns
            ``user``, ``track_pos`` and ``track_neg``.
        item_features: frame whose index is the track id; rows are looked up
            with ``.loc``.
        pos_padded_users, neg_padded_users: per-user fixed-length track lists,
            indexed by user id.
        pad_value: id written into masked slots of the track lists.
    """

    def __init__(self, train_triplets, val_triplets, test_triplets, item_features, pos_padded_users, neg_padded_users, pad_value=50000):
        super().__init__()
        self.train_triplets = train_triplets
        self.val_triplets = val_triplets
        self.test_triplets = test_triplets
        self.item_features = item_features
        self.pos_padded_users = pos_padded_users
        self.neg_padded_users = neg_padded_users
        self.pad_value = pad_value
        self.fit_prepared = False
        self.test_prepared = False

    def _masked_history(self, padded_users, users, positives, negatives):
        """Stack the users' track lists and blank out the sample's own tracks.

        ``positives`` / ``negatives`` are column vectors, so the comparison
        broadcasts row-wise: any slot equal to the row's positive or negative
        track is overwritten with ``pad_value``.
        """
        history = np.stack(padded_users[users].values)
        history[history == positives] = self.pad_value
        history[history == negatives] = self.pad_value
        return history

    def _collect_data(self, triplets):
        """Build a ``TensorDataset`` of (liked, disliked, pos features, neg features)."""
        users = triplets["user"].values
        positives = triplets["track_pos"].values
        negatives = triplets["track_neg"].values
        pos_column = positives.reshape(-1, 1)
        neg_column = negatives.reshape(-1, 1)

        print("collecting liked")
        liked_tracks = self._masked_history(self.pos_padded_users, users, pos_column, neg_column)

        print("collecting disliked")
        disliked_tracks = self._masked_history(self.neg_padded_users, users, pos_column, neg_column)

        # Use the feature table handed to the constructor rather than whatever
        # global happens to be called ``item_features``.
        # NOTE: ``.long()`` converts every feature column to int64, so any
        # fractional feature value is truncated.
        return td.TensorDataset(
            torch.from_numpy(liked_tracks).long(),
            torch.from_numpy(disliked_tracks).long(),
            torch.from_numpy(self.item_features.loc[positives].values).long(),
            torch.from_numpy(self.item_features.loc[negatives].values).long(),
        )

    def prepare_data(self, stage=None):
        """Materialise the datasets for ``stage`` ("fit", "test" or ``None`` for both).

        Each stage is built at most once. The two stages are checked
        independently so that ``stage=None`` prepares the test set as well.
        """
        if stage in ("fit", None) and not self.fit_prepared:
            self.train_dataset = self._collect_data(self.train_triplets)
            self.val_dataset = self._collect_data(self.val_triplets)
            self.fit_prepared = True
        if stage in ("test", None) and not self.test_prepared:
            self.test_dataset = self._collect_data(self.test_triplets)
            self.test_prepared = True

    def train_dataloader(self):
        """Shuffled loader over the training triplets."""
        return td.DataLoader(self.train_dataset, batch_size=1024, shuffle=True, num_workers=0)

    def val_dataloader(self):
        """Loader over the validation triplets."""
        return td.DataLoader(self.val_dataset, batch_size=1024, num_workers=0)

    def test_dataloader(self):
        """Loader over the test triplets."""
        return td.DataLoader(self.test_dataset, batch_size=1024, shuffle=False, num_workers=0)

### DSSM-модель

In [25]:
class DSSM(pl.LightningModule):
    """Two-tower model trained with a triplet margin loss.

    The user tower (``UserNet``) embeds a user from liked / disliked track ids
    and serves as the anchor; the item tower (``ItemNet``) embeds the positive
    and the negative track from their feature rows.

    Args:
        genres: number of genre ids, forwarded to ``ItemNet``.
        artists: number of artist ids, forwarded to ``ItemNet``.
        item_number: number of track ids, forwarded to ``UserNet``.
        embedding_dim: size of the embeddings produced by both towers.
        activation: non-linearity used inside ``ItemNet``.
        lr: Adam learning rate.
        triplet_loss_margin: margin of the triplet loss.
        weight_decay: Adam weight decay.
        log_to_prog_bar: whether validation / test loss is shown in the progress bar.
    """

    def __init__(
        self,
        genres: int,
        artists: int,
        item_number: int,
        embedding_dim: int = 100,
        activation: tp.Callable[[torch.Tensor], torch.Tensor] = F.relu,
        lr: float = 1e-3,
        triplet_loss_margin: float = 0.4,
        weight_decay: float = 1e-6,
        log_to_prog_bar: bool = True,
    ) -> None:
        super().__init__()
        self.lr = lr
        self.triplet_loss_margin = triplet_loss_margin
        self.weight_decay = weight_decay
        self.log_to_prog_bar = log_to_prog_bar
        self.item_net = ItemNet(embedding_dim, genres, artists, activation)
        self.user_net = UserNet(embedding_dim, item_number)

    def forward(
        self,
        liked_tracks: torch.Tensor,
        disliked_tracks: torch.Tensor,
        item_features_pos: torch.Tensor,
        item_features_neg: torch.Tensor,
    ) -> tp.Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return the (anchor, positive, negative) embeddings for one batch."""
        anchor = self.user_net(liked_tracks, disliked_tracks)
        pos = self.item_net(item_features_pos)
        neg = self.item_net(item_features_neg)
        return anchor, pos, neg

    def _step(self, batch, batch_idx, metric, prog_bar=False):
        """Compute the triplet loss for ``batch`` and log it under ``metric``."""
        liked_tracks, disliked_tracks, pos, neg = batch
        anchor, positive, negative = self(liked_tracks, disliked_tracks, pos, neg)
        loss = F.triplet_margin_loss(anchor, positive, negative, margin=self.triplet_loss_margin)
        self.log(metric, loss, prog_bar=prog_bar)
        return loss

    def training_step(self, batch: tp.Sequence[torch.Tensor], batch_idx: int) -> torch.Tensor:
        """Training loss, logged as ``train_loss``."""
        return self._step(batch, batch_idx, "train_loss")

    def validation_step(self, batch: tp.Sequence[torch.Tensor], batch_idx: int) -> torch.Tensor:
        """Validation loss, logged as ``val_loss``."""
        return self._step(batch, batch_idx, "val_loss", self.log_to_prog_bar)

    def test_step(self, batch, batch_idx, prog_bar=False):
        """Test loss, logged as ``test_loss``.

        ``prog_bar`` is accepted for signature compatibility; the progress-bar
        flag actually used is ``self.log_to_prog_bar``.
        """
        return self._step(batch, batch_idx, "test_loss", self.log_to_prog_bar)

    def inference(
        self, dataloader: td.DataLoader[tp.Any], mode: str = "item"
    ) -> tp.Tuple[np.ndarray, np.ndarray]:
        """Embed every batch of ``dataloader`` with one of the two towers.

        Each batch must be ``(ids, *inputs)``: the ids are passed through
        untouched and all remaining tensors are fed to the selected tower, i.e.
        ``(ids, item_features)`` for ``mode="item"`` and
        ``(ids, liked_tracks, disliked_tracks)`` for ``mode="user"``.

        Args:
            dataloader: yields batches in the layout described above.
            mode: ``"item"`` or ``"user"``.

        Returns:
            ``(ids, vectors)`` as NumPy arrays, concatenated over all batches.

        Raises:
            ValueError: if ``mode`` is neither ``"user"`` nor ``"item"``.
        """
        if mode == "user":
            model = self.user_net
        elif mode == "item":
            model = self.item_net
        else:
            raise ValueError(f"Unsupported model {mode}!")

        batches = []
        entity_ids = []
        self.eval()
        for batch in dataloader:
            # Unpack a variable number of inputs so the two-input user tower
            # works as well as the single-input item tower.
            ids, *features = batch
            with torch.no_grad():
                v_batch = model(*(feature.to(self.device) for feature in features))
            # Move results off the accelerator right away instead of keeping
            # every batch there until the final concatenation.
            batches.append(v_batch.cpu())
            entity_ids.append(ids)
        vectors = torch.cat(batches, dim=0).numpy()
        vectors_ids = torch.cat(entity_ids, dim=0).cpu().numpy()
        return vectors_ids, vectors

    def configure_optimizers(self):
        """Adam plus a ``ReduceLROnPlateau`` scheduler that watches ``val_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
        scheduler = {
            'scheduler': lr_scheduler,
            'monitor': 'val_loss'
        }
        return [optimizer], [scheduler]

In [34]:
class ItemNet(nn.Module):
    """Item tower: maps a row of item features to an ``n_factors``-dim vector.

    ``forward`` expects a 2-D tensor whose columns 1, 2 and 3 hold the genre id,
    the artist id and the popularity value; column 0 is not read.
    """

    def __init__(self, n_factors: int, genres, artists, activation: tp.Callable[[torch.Tensor], torch.Tensor] = F.relu) -> None:
        super().__init__()
        # Both tables are allocated with one row more than the given count.
        self.genre_embedding_layer = nn.Embedding(genres + 1, n_factors)
        self.artist_embedding_layer = nn.Embedding(artists + 1, n_factors)
        # Input: popularity (1 value) followed by the summed embedding.
        self.dense_layer = nn.Linear(1 + n_factors, n_factors, bias=False)
        # Input: summed embedding followed by the activated dense features.
        self.output_layer = nn.Linear(2 * n_factors, n_factors, bias=False)
        self.activation = activation

    def forward(self, item_features: torch.Tensor) -> torch.Tensor:
        genre_ids = item_features[:, 1]
        artist_ids = item_features[:, 2]
        popularity = item_features[:, 3].view(-1, 1)

        # Categorical part: genre and artist embeddings are added together.
        categorical = self.genre_embedding_layer(genre_ids) + self.artist_embedding_layer(artist_ids)

        # Popularity-aware features computed from [popularity | categorical].
        dense_input = torch.cat([popularity, categorical], dim=1)
        hidden = self.activation(self.dense_layer(dense_input))

        # Final projection of [categorical | hidden].
        return self.output_layer(torch.cat([categorical, hidden], dim=1))

class UserNet(nn.Module):
    """User tower: maps liked / disliked track-id lists to an ``n_factors``-dim vector.

    A single ``EmbeddingBag`` (default reduction) is shared by both lists; index
    ``num_embeddings`` is registered as its padding index.
    """

    def __init__(self, n_factors: int, num_embeddings: int, activation: tp.Callable[[torch.Tensor], torch.Tensor] = F.relu) -> None:
        super().__init__()
        self.track_embeddings = nn.EmbeddingBag(
            num_embeddings + 1,
            n_factors,
            padding_idx=num_embeddings,
        )
        self.dense_layer = nn.Linear(n_factors, n_factors, bias=False)
        self.output_layer = nn.Linear(2 * n_factors, n_factors, bias=False)
        self.activation = activation

    def forward(self, liked_tracks, disliked_tracks) -> torch.Tensor:
        # Taste vector: pooled liked tracks minus pooled disliked tracks.
        liked_emb = self.track_embeddings(liked_tracks)
        disliked_emb = self.track_embeddings(disliked_tracks)
        taste = liked_emb - disliked_emb

        # Project [taste | activated dense features] to the output space.
        hidden = self.activation(self.dense_layer(taste))
        return self.output_layer(torch.cat([taste, hidden], dim=1))

In [31]:
# Bundle the three splits, the item feature table and the per-user histories.
data_module = DSSMData(
    train_triplets=train_data,
    val_triplets=val_data,
    test_triplets=test_data,
    item_features=item_features,
    pos_padded_users=pos_padded_users,
    neg_padded_users=neg_padded_users,
)

In [35]:
# 64-dimensional two-tower model; its parameters are converted to float64.
net = DSSM(
    genres=genre_number,
    artists=artist_number,
    item_number=50000,
    embedding_dim=64,
)
net = net.double()

In [38]:
# Checkpoints are written under ./data/checkpoints/dssm2/ and tracked by val_loss.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath="./data/checkpoints/dssm2/",
    monitor="val_loss",
)
# Stop when val_loss has not improved for 5 validation runs.
early_stopping = pl.callbacks.early_stopping.EarlyStopping(monitor="val_loss", patience=5)
# Log the learning rate at every step.
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval="step")

trainer = pl.Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=30,
    callbacks=[early_stopping, lr_monitor, checkpoint_callback],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [39]:
# Train `net` on the loaders provided by the data module.
trainer.fit(model=net, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type    | Params
-------------------------------------
0 | item_net | ItemNet | 801 K 
1 | user_net | UserNet | 3.2 M 
-------------------------------------
4.0 M     Trainable params
0         Non-trainable params
4.0 M     Total params
16.056    Total estimated model params size (MB)


Sanity Checking: |                                                                               | 0/? [00:00<…

Training: |                                                                                      | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

In [42]:
# Restore the checkpoint with the best monitored value. The constructor
# arguments are passed explicitly and must match the ones used for `net`.
best = DSSM.load_from_checkpoint(
    checkpoint_callback.best_model_path,
    genres=genre_number,
    artists=artist_number,
    item_number=50000,
    embedding_dim=64,
)

In [43]:
# Run inference on the GPU when one is visible; fall back to the CPU instead of
# raising on hosts without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best = best.to(device)

In [45]:
# Embed every row of the item feature table with the item tower. Row i of
# `item_embeds` corresponds to row i of `item_features`.
item_matrix = torch.from_numpy(item_features.values).long().to(device)
with torch.no_grad():
    item_embeds = best.item_net(item_matrix).cpu()

In [47]:
# Exact (flat) L2 index over the item embeddings. The dimensionality is taken
# from the embeddings themselves so it cannot drift from the model's size.
index = faiss.index_factory(item_embeds.shape[1], "Flat", faiss.METRIC_L2)

# Move the index to GPU 0 when both CUDA and a GPU build of faiss are present;
# otherwise keep the CPU index.
if torch.cuda.is_available() and hasattr(faiss, "StandardGpuResources"):
    # Keep a reference to the resources object for as long as the index is used.
    gpu_res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(gpu_res, 0, index)
else:
    gpu_res = None

# Hand faiss a contiguous float32 NumPy matrix rather than a torch tensor.
index.add(np.ascontiguousarray(item_embeds, dtype=np.float32))

In [48]:
# Stack the per-user liked / disliked lists into two integer matrices (one row
# per user, in index order) and iterate over them in batches of 512.
ppu, npu = (
    torch.from_numpy(np.stack(padded.to_numpy()))
    for padded in (pos_padded_users, neg_padded_users)
)
user_ds = td.TensorDataset(ppu, npu)
user_dl = td.DataLoader(
    user_ds,
    batch_size=512,
    shuffle=False,
    num_workers=1,
)

In [49]:
# Embed all users with the user tower, batch by batch, without tracking
# gradients; the results are gathered on the CPU and concatenated.
batches = []
with torch.no_grad():
    for ppu, npu in user_dl:
        v_batch = best.user_net(ppu.to(best.device), npu.to(best.device))
        batches.append(v_batch.cpu())
user_embs = torch.cat(batches, dim=0).numpy()

In [50]:
# The index returns, for each user, the row positions of the 100 nearest item
# embeddings. Those positions refer to rows of `item_features` (the matrix the
# embeddings were computed from), so translate them to real track ids instead
# of assuming that row i holds track i.
_, neighbour_rows = index.search(user_embs, k=100)
tracks = item_features["track"].to_numpy()[neighbour_rows]

In [52]:
# Write one JSON object per line: the user id and that user's row of `tracks`.
with open("../../botify/data/recommendations_dssm_new.json", "w") as rf:
    for user in tqdm(range(user_number)):
        line = json.dumps({"user": int(user), "tracks": tracks[user, :].tolist()})
        rf.write(line + "\n")

  0%|          | 0/10000 [00:00<?, ?it/s]