In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss

import lightfm
import lightfm.data as ld
import lightfm.evaluation as lv
from lightfm import LightFM

import glob
from tqdm import tqdm
import json
import optuna

import tensorboardX as tb

import matplotlib.pyplot as pl
import seaborn as sns

np.random.seed(31337)

In [4]:
DATA_DIR = "/home/zer0/Downloads/2024-03-04-seminar-4"

In [5]:
data = pd.concat([
    pd.read_json(data_path, lines=True) 
    for data_path 
    in glob.glob(DATA_DIR + "/data/*/data.json")
])
data["rnd"] = np.random.random(len(data))

data.head(5)

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd
0,next,2024-03-04 09:06:00.867,4692,43584,0.0,0.001748,34265.0,{},0.721852
1,next,2024-03-04 09:06:01.052,5687,29284,1.0,0.001789,32024.0,{},0.966441
2,next,2024-03-04 09:06:01.073,5687,35973,0.03,0.001909,35972.0,{},0.289049
3,next,2024-03-04 09:06:01.088,7957,26410,0.51,0.001521,10027.0,{},0.171346
4,next,2024-03-04 09:06:01.106,7089,11314,0.41,0.000465,952.0,{},0.371618


In [6]:

data["test"] = np.random.random(len(data)) >= 0.99
data.drop_duplicates(["user", "track"], inplace=True)

In [7]:
user_counts = data.groupby("user").size()
users = set(user_counts.index.values)

In [8]:
track_counts = data.groupby("track").size()
tracks = set(track_counts.index.values)

In [9]:
len(users), len(tracks)

(10000, 49146)

## Train LightFM

In [23]:
train_df = data[data["time"] > 0.55].copy() 
train_data = train_df[train_df["user"].isin(users) & train_df["track"].isin(tracks)]
test_data = train_df[train_df["test"] & train_df["user"].isin(users) & train_df["track"].isin(tracks)]

len(train_data), len(test_data)

(231535, 2338)

In [24]:
dataset = ld.Dataset()
dataset.fit(users, tracks)

In [25]:
train_interactions, _ = dataset.build_interactions(train_data[["user", "track"]].itertuples(index=False, name=None))
test_interactions, _ = dataset.build_interactions(test_data[["user", "track"]].itertuples(index=False, name=None))

In [26]:
model = lightfm.LightFM(
    no_components=115,
    loss='warp',
    learning_rate=0.01,
    max_sampled=50,
    user_alpha=0.0001,
    item_alpha=0.0001
)

model.fit(train_interactions, epochs=310, verbose=True, num_threads=2)

Epoch:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 100%|██████████| 310/310 [08:54<00:00,  1.72s/it]


<lightfm.lightfm.LightFM at 0x74303ba679a0>

## Save track embeddings

In [27]:
BOTIFY_DATA_DIR = "/home/zer0/projects/vscode/recsys-course/botify/data/"

In [28]:
biases, embeddings = model.get_item_representations()

In [29]:
model.item_biases *= 0.0

In [30]:
track_meta = pd.read_json(BOTIFY_DATA_DIR + "tracks.json", lines=True)
track_meta["dataset_index"] = track_meta["track"].map(lambda t: dataset.mapping()[2].get(t))

In [31]:
dataset_tracks = track_meta[pd.notnull(track_meta["dataset_index"])].sort_values("dataset_index")

In [32]:
writer = tb.SummaryWriter(comment='top_ligtfm_embeddings', log_dir=DATA_DIR + "tb")
writer.add_embedding(embeddings, metadata=list(dataset_tracks[["artist", "title"]].itertuples(index=False, name=None)), tag="lightfm", metadata_header=["artist", "title"])
writer.close()

## Compute top recommendations

In [33]:
tracks = dataset_tracks["track"].values
users = [user for user, index in sorted(dataset.mapping()[0].items(), key=lambda kv: kv[1])]

In [38]:
with open(BOTIFY_DATA_DIR + "top_recommendations_lfm.json", "w") as rf:
    for user_index in tqdm(range(dataset.user_features_shape()[0])):
        predictions = model.predict(user_index, np.arange(dataset.item_features_shape()[0]), num_threads=2)
        top = tracks[np.argsort(predictions)[-30:]]
        recommendation = {
            "user": int(users[user_index]),
            "tracks": [int(x) for x in top]
        }
        rf.write(json.dumps(recommendation) + "\n")

100%|██████████| 10000/10000 [01:51<00:00, 89.37it/s]
