In [2]:
!pip install tensorboardX
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [3]:
import pandas as pd
import numpy as np
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as td

from tqdm.autonotebook import tqdm
import json
import sklearn.metrics as sm

import tensorboardX as tb
import tensorflow as tf
import datetime, os

import matplotlib.pyplot as plt
import seaborn as sns

import typing as tp
import faiss
from sklearn.metrics.pairwise import euclidean_distances
from functools import partial
import shutil

np.random.seed(19)

  from tqdm.autonotebook import tqdm
2024-04-15 10:42:57.954419: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-15 10:42:57.954523: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-15 10:42:58.087853: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
DATA_DIR = '/kaggle/input/botify-data/'

In [5]:
data = pd.read_csv(DATA_DIR + 'final_collection.csv')

In [6]:
track_metadata = pd.read_json(DATA_DIR + 'tracks.json', lines=True).drop_duplicates(subset=["track"])

In [7]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda', index=0)

In [8]:
track_metadata.shape, track_metadata.isnull().sum(axis = 0)

((50000, 5),
 artist        0
 title         0
 genre     23253
 pop           0
 track         0
 dtype: int64)

In [9]:
track_metadata = track_metadata.fillna(value={'genre': 'Unk'})

In [10]:
dummy_features = pd.get_dummies(track_metadata[["genre"]])
dummy_features = dummy_features.astype(float)
item_features = pd.concat([track_metadata[["track", "pop"]], dummy_features], axis=1).set_index("track", drop=True)
item_features['pop'] = np.log(item_features['pop'])
item_features.shape

(50000, 23)

In [11]:
positives = data[data["time"] > 0.8].copy()

track_counts = positives.groupby("track").size()
tracks = set(track_counts[track_counts >= 20].index.values)

data_filt = positives[positives["track"].isin(tracks)]

len(data_filt), len(tracks)

(496174, 6506)

In [12]:
interactions = pd.pivot_table(data_filt, values="time", index="user", columns="track").fillna(0)

print("Interactions matrix: \nshape=" + str(interactions.shape))
print("Sparsity=" + str((interactions != 0).values.sum() / interactions.size))

Interactions matrix: 
shape=(9998, 6506)
Sparsity=0.007358203887934058


In [13]:
triplets = data_filt[["user", "track"]].rename(columns={"track": "track_pos"})

In [14]:
NUM_NEGATIVE_SAMPLES = 10
triplets =  pd.concat([triplets] * NUM_NEGATIVE_SAMPLES).sort_index().reset_index(drop=True)
triplets["track_neg"] = np.random.choice(range(50000), len(triplets))

In [19]:
rdm = np.random.random(len(triplets))
train_data = triplets[rdm < 0.85]
val_data = triplets[rdm >= 0.85]

len(train_data), len(val_data)

(4217741, 743999)

In [16]:
item_features.shape, interactions.shape

((50000, 23), (9998, 6506))

In [17]:
from random import shuffle

def pad_with_specific_value(lst, size, val):
    lst = list(set(lst))
    shuffle(lst)
    lst = lst[:size]
    return np.pad(lst, (0, size - len(lst)), 'constant', constant_values=(val))

padded_users = triplets.groupby("user").apply(lambda x: (
    pad_with_specific_value(x['track_pos'].tolist(), 30, 50000).tolist()
))

padded_users = padded_users.reindex(range(10000), fill_value=[50000] * 30)
padded_users = np.stack(padded_users.values)

  padded_users = triplets.groupby("user").apply(lambda x: (


In [18]:
padded_users

array([[  496,  2284,  2006, ..., 10222, 50000, 50000],
       [  846,  3955,  9127, ..., 16177,  1866,  3221],
       [19865,    55,  1117, ..., 30588,    84,   539],
       ...,
       [ 2892,  6599,   511, ...,  4537,  3189,  1100],
       [20157, 27209, 34847, ...,  1511,  1123, 14615],
       [ 4450, 39588,    84, ...,   511,     2,    20]])

In [32]:
def collect_data(triplets):
    users = triplets["user"].values
    positives = triplets["track_pos"].values
    negatives = triplets["track_neg"].values

    # Wipe out positive interacted tracks from user listen history
    listened_tracks = padded_users[users]
    listened_tracks[listened_tracks == positives.reshape(-1, 1)] = 50000
    listened_tracks[listened_tracks == negatives.reshape(-1, 1)] = 50000

    return td.TensorDataset(
        torch.tensor(listened_tracks, dtype=torch.long, device=DEVICE),
        torch.tensor(item_features.loc[positives].values, dtype=torch.double, device=DEVICE),
        torch.tensor(item_features.loc[negatives].values, dtype=torch.double, device=DEVICE)
    )
train_ds = collect_data(train_data)
val_ds = collect_data(val_data)

training_loader = td.DataLoader(train_ds, batch_size=1024, shuffle=True)
validation_loader = td.DataLoader(val_ds, batch_size=1024, shuffle=False)

In [33]:
class DSSM(nn.Module):
    def __init__(self, dim_item_features: int, item_number: int, embedding_dim: int = 100):
        super(DSSM, self).__init__()
        self.item_net = ItemNet(embedding_dim, dim_item_features)
        self.user_net = UserNet(embedding_dim, item_number)

    def forward(self, user_ids: torch.Tensor, item_features_pos: torch.Tensor, item_features_neg: torch.Tensor) -> tp.Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        anchor = self.user_net(user_ids)
        pos = self.item_net(item_features_pos)
        neg = self.item_net(item_features_neg)

        return anchor, pos, neg

    def inference(self, dataloader: td.DataLoader[tp.Any], mode: str = "item") -> np.ndarray:
        batches = []
        user_ids = []
        if(mode == "user"):
            model = self.user_net
        elif(mode == "item"):
            model = self.item_net
        else:
            raise ValueError(f"Unsupported model {mode}!")

        self.eval()
        for batch in dataloader:
            ids, features = batch
            with torch.inference_mode():
                v_batch = model(features.to(DEVICE))
            batches.append(v_batch)
            user_ids.append(ids)
        vectors = torch.cat(batches, dim=0).cpu().numpy()
        vectors_ids = torch.cat(user_ids, dim=0).cpu().numpy()
        return vectors_ids, vectors

In [34]:
class ItemNet(nn.Module):
    def __init__(self, n_factors: int, dim_input: int):
        super().__init__()
        self.embedding_layer = nn.Linear(dim_input - 1, 32, bias=False)
        self.dense_layer = nn.Linear(32 + 1, n_factors, bias=False)
        self.output_layer = nn.Linear(n_factors + 32, n_factors, bias=False)
        self.activation = nn.ReLU()

    def forward(self, item_features: torch.Tensor) -> torch.Tensor:
        popularity = item_features[:, 0].view(-1, 1)
        genre_emb = self.embedding_layer(item_features[:, 1:])

        pop_genre = torch.concat([popularity, genre_emb], axis=1)
        features = self.activation(self.dense_layer(pop_genre))

        genre_features = torch.concat([genre_emb, features], axis=1)
        output = self.output_layer(genre_features)
        return output

class UserNet(nn.Module):
    def __init__(self, n_factors: int, num_embeddings: int):
        super().__init__()
        self.track_embeddings = nn.EmbeddingBag(num_embeddings+1, n_factors, padding_idx=num_embeddings)
        self.dense_layer = nn.Linear(n_factors, n_factors, bias=False)
        self.output_layer = nn.Linear(n_factors + n_factors, n_factors, bias=False)
        self.activation = nn.ReLU()

    def forward(self, user_tracks: torch.Tensor) -> torch.Tensor:
        interactions_emb = self.track_embeddings(user_tracks)
        features = self.activation(self.dense_layer(interactions_emb))
        x = torch.concat([interactions_emb, features], axis=1)
        output = self.output_layer(x)
        return output

In [35]:
model = DSSM(dim_item_features=23, item_number=50000, embedding_dim=64).double()
model.to(DEVICE)

DSSM(
  (item_net): ItemNet(
    (embedding_layer): Linear(in_features=22, out_features=32, bias=False)
    (dense_layer): Linear(in_features=33, out_features=64, bias=False)
    (output_layer): Linear(in_features=96, out_features=64, bias=False)
    (activation): ReLU()
  )
  (user_net): UserNet(
    (track_embeddings): EmbeddingBag(50001, 64, mode='mean', padding_idx=50000)
    (dense_layer): Linear(in_features=64, out_features=64, bias=False)
    (output_layer): Linear(in_features=128, out_features=64, bias=False)
    (activation): ReLU()
  )
)

In [36]:
loss_fn = nn.TripletMarginLoss(margin=0.3)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

In [37]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform(m.weight)
        
model.apply(init_weights)

  nn.init.xavier_uniform(m.weight)


DSSM(
  (item_net): ItemNet(
    (embedding_layer): Linear(in_features=22, out_features=32, bias=False)
    (dense_layer): Linear(in_features=33, out_features=64, bias=False)
    (output_layer): Linear(in_features=96, out_features=64, bias=False)
    (activation): ReLU()
  )
  (user_net): UserNet(
    (track_embeddings): EmbeddingBag(50001, 64, mode='mean', padding_idx=50000)
    (dense_layer): Linear(in_features=64, out_features=64, bias=False)
    (output_layer): Linear(in_features=128, out_features=64, bias=False)
    (activation): ReLU()
  )
)

In [38]:
import wandb
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb_key") 

In [39]:
!wandb login {my_secret}

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [40]:
wandb.init(project="dssm_recommender")

[34m[1mwandb[0m: Currently logged in as: [33mviktor-zhuravlev1919[0m ([33mzhuravlevvik[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [41]:
def train_one_epoch():
    running_loss = 0
    last_loss = 0
    
    for i, data in enumerate(training_loader):
        # Every data instance is an input + label pair
        user_tracks, pos, neg = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        anchor, pos, neg = model(user_tracks, pos, neg)
    
        loss = loss_fn(anchor, pos, neg)
        loss.backward()

        optimizer.step()

        running_loss += loss.item()
        last_loss = running_loss / (i + 1)

    return last_loss

In [42]:
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

EPOCHS = 300
losses = []
val_loss = []

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    model.train(True)
    avg_loss = train_one_epoch()
    losses.append(avg_loss)


    running_vloss = 0.0
    model.eval()

    with torch.no_grad():
        for i, vdata in enumerate(validation_loader):
            vusers, vpos, vneg = vdata
            vanchor, vpos, vneg = model(vusers, vpos, vneg)
            vloss = loss_fn(vanchor, vpos, vneg)
            running_vloss += vloss

    avg_vloss = running_vloss / len(validation_loader)

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = '/kaggle/working/model_{}_{}'.format(timestamp, epoch)
        torch.save(model.state_dict(), model_path)
        
        
    if epoch % 3 == 0:
        torch.save(model.state_dict(), '/kaggle/working/last_state.ct')

    wandb.log({
        'train_loss': avg_loss,
        'val_loss': avg_vloss,
        'epoch': epoch
    })

    scheduler.step(avg_loss)

KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), '/kaggle/working/last_state.ct')

In [49]:
best = DSSM(dim_item_features=23, item_number=50000, embedding_dim=64)
best = best.to(DEVICE)
best.load_state_dict(torch.load("/kaggle/working/model_20240415_110604_63"))

<All keys matched successfully>

In [50]:
item_features = item_features.sort_index()
items = torch.from_numpy(item_features.index.values)
inf_items = torch.from_numpy(item_features.values).double()
items_ds = td.TensorDataset(items, inf_items)

In [51]:
inf_dl_items = td.DataLoader(items_ds, batch_size=128, shuffle=False, num_workers=1)
track_ids, track_embeddings = best.double().inference(inf_dl_items)

In [52]:
interactions = interactions.sort_index()
user_ids_inf = torch.from_numpy(interactions.index.values)
user_inf_feat = torch.from_numpy(padded_users[interactions.index.values]).long()
user_ds = td.TensorDataset(user_ids_inf, user_inf_feat)

In [53]:
inf_dl_users = td.DataLoader(user_ds, batch_size=128, shuffle=False, num_workers=1)
user_ids, user_embeddings = best.double().inference(inf_dl_users, "user")

In [54]:
track_embeddings.shape, user_embeddings.shape

((50000, 64), (9998, 64))

In [55]:
gpu_res = faiss.StandardGpuResources()
index_flat = faiss.index_factory(track_embeddings.shape[1], "Flat", faiss.METRIC_L2)

#index = faiss.index_cpu_to_gpu(gpu_res, 0, index_flat)
index = index_flat
index.add(track_embeddings.astype('float32'))

In [56]:
k = 125
with open("/kaggle/working/dssm_recs_my_final.json", "w") as rf:
    for user, user_emb in tqdm(zip(user_ids, user_embeddings), total=len(user_ids)):
      dists, neighbours = index.search(user_emb.astype('float32')[np.newaxis, :], k)
      recommendation = {
            "user": int(user),
            "tracks": neighbours.flatten().tolist()
        }
      rf.write(json.dumps(recommendation) + "\n")

  0%|          | 0/9998 [00:00<?, ?it/s]

In [57]:
writer = tb.SummaryWriter(comment='ncf_embeddings', log_dir="/kaggle/working/tb")
writer.add_embedding(track_embeddings, metadata=list(track_metadata[["artist", "title"]].itertuples(index=False, name=None)), tag="DSSM", metadata_header=["artist", "title"])
writer.close()

In [59]:
!zip -r tb.zip /kaggle/working/tb

  adding: kaggle/working/tb/ (stored 0%)
  adding: kaggle/working/tb/00000/ (stored 0%)
  adding: kaggle/working/tb/00000/DSSM/ (stored 0%)
  adding: kaggle/working/tb/00000/DSSM/metadata.tsv (deflated 54%)
  adding: kaggle/working/tb/00000/DSSM/tensors.tsv (deflated 56%)
  adding: kaggle/working/tb/events.out.tfevents.1713185273.09b0dca983bf (deflated 8%)
  adding: kaggle/working/tb/projector_config.pbtxt (deflated 37%)
