In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss

import lightfm
import lightfm.data as ld
import lightfm.evaluation as lv

import glob
import tqdm
import json
import optuna

import tensorboardX as tb

import matplotlib.pyplot as pl
import seaborn as sns

np.random.seed(31337)



In [2]:
# Root directory for this notebook's local files: the raw interaction logs are
# globbed from underneath it and the TensorBoard log directory is created in it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATA_DIR = "/Users/n.anokhin/Desktop/"

In [3]:
# Load every per-run interaction log (JSON Lines) and stack them into one frame.
# The paths are sorted because glob.glob returns them in arbitrary,
# filesystem-dependent order; the row order determines which row receives which
# seeded random draw, so without sorting the "rnd" column is not reproducible.
data_paths = sorted(glob.glob(DATA_DIR + "/data/*/data.json"))
data = pd.concat([
    pd.read_json(data_path, lines=True)
    for data_path
    in data_paths
])
# One uniform draw from [0, 1) per row.
data["rnd"] = np.random.random(len(data))

data.head(5)

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd
0,next,2024-03-04 09:06:00.999,2620,32065,0.0,0.001914,32069.0,{},0.721852
1,next,2024-03-04 09:06:01.057,5687,32024,0.8,0.001389,37337.0,{},0.966441
2,next,2024-03-04 09:06:01.078,7957,14080,0.8,0.001621,5540.0,{},0.289049
3,next,2024-03-04 09:06:01.092,7957,10027,0.41,0.000477,492.0,{},0.171346
4,last,2024-03-04 09:06:01.111,7957,14271,0.01,0.000207,,{},0.371618


In [13]:
# Rows whose "time" value exceeds 0.8 are treated as positive feedback.
# The train/test flag is drawn once per positive row *before* de-duplication
# (draw >= 0.7 marks a row as test); afterwards only the first occurrence of
# each (user, track) pair is kept.
is_positive = data["time"] > 0.8
positives = (
    data[is_positive]
    .assign(test=lambda frame: np.random.random(len(frame)) >= 0.7)
    .drop_duplicates(subset=["user", "track"])
)

In [14]:
# Number of training-split positives per user; keep users with at least five.
is_train_row = ~positives["test"]
user_counts = positives[is_train_row].groupby("user").size()
has_enough_positives = user_counts >= 5
users = set(user_counts[has_enough_positives].index.values)

In [15]:
# Number of training-split positives per track; keep tracks with at least five.
is_train_row = ~positives["test"]
track_counts = positives[is_train_row].groupby("track").size()
is_frequent_track = track_counts >= 5
tracks = set(track_counts[is_frequent_track].index.values)

In [16]:
# Sanity check: how many users and tracks survived the minimum-count filters.
len(users), len(tracks)

(9750, 4571)

## Train LightFM

In [17]:
# Split the positives by their "test" flag, keeping only rows whose user and
# track both belong to the retained sets.
is_retained_pair = positives["user"].isin(users) & positives["track"].isin(tracks)
is_test_row = positives["test"]

train_data = positives[~is_test_row & is_retained_pair]
test_data = positives[is_test_row & is_retained_pair]

len(train_data), len(test_data)

(71514, 28207)

In [18]:
# Register the retained user and track ids with a LightFM Dataset, which
# assigns each id an internal index.
dataset = ld.Dataset()
dataset.fit(users=users, items=tracks)

In [19]:
# Turn each split into a LightFM interaction matrix built from plain
# (user, track) tuples; the second return value (weights) is discarded.
pair_columns = ["user", "track"]
train_pairs = train_data[pair_columns].itertuples(index=False, name=None)
test_pairs = test_data[pair_columns].itertuples(index=False, name=None)

train_interactions, _ = dataset.build_interactions(train_pairs)
test_interactions, _ = dataset.build_interactions(test_pairs)

In [20]:
def fit_model(
    epochs=1,
    at=10,
    loss="warp",
    no_components=30,
    learning_rate=0.01,
    max_sampled=10,
    user_alpha=0.0,
    item_alpha=0.0,
    threads=30,
    verbose=False,
    patience=3,
    epsilon=1e-6,
    random_state=None,
):
    """Train a LightFM model epoch by epoch with precision@k early stopping.

    Reads the notebook-level ``train_interactions`` and ``test_interactions``.
    After every epoch the mean precision@``at`` is measured on
    ``test_interactions``; ``train_interactions`` is passed to the metric as
    well (per the LightFM docs this excludes known training positives from
    the ranking).

    Args:
        epochs: Maximum number of training epochs.
        at: Cut-off ``k`` for precision@k.
        loss, no_components, learning_rate, max_sampled, user_alpha, item_alpha:
            Forwarded unchanged to ``lightfm.LightFM``.
        threads: Thread count used for both training and evaluation.
        verbose: Print the per-epoch precision with a 1.96 * SEM margin and a
            message describing how the loop ended.
        patience: Size of the window of most recent epochs that must all fail
            to improve before training stops.
        epsilon: Gain over the reference epoch below which an epoch counts as
            "no improvement".
        random_state: Forwarded to ``lightfm.LightFM``. LightFM keeps its own
            random state instead of the global numpy one, so calling
            ``np.random.seed`` does not seed the model; pass a value here for
            repeatable initialisation. ``None`` (default) keeps the previous
            unseeded behaviour. NOTE(review): multi-threaded training may
            still not be bit-for-bit repeatable — confirm if that matters.

    Returns:
        ``(model, precisions_at)``: the model as of the last epoch that ran
        (not necessarily the best-scoring epoch) and the list of per-epoch
        mean precisions.
    """
    model = lightfm.LightFM(
        no_components=no_components,
        loss=loss,
        learning_rate=learning_rate,
        max_sampled=max_sampled,
        user_alpha=user_alpha,
        item_alpha=item_alpha,
        random_state=random_state,
    )

    precisions_at = []

    for epoch in range(epochs):
        model = model.fit_partial(train_interactions, num_threads=threads)

        precision_at = lv.precision_at_k(
            model,
            test_interactions,
            train_interactions=train_interactions,
            k=at,
            num_threads=threads,
        )
        mean_precision = np.mean(precision_at)

        if verbose:
            print(f"{epoch}:\t{mean_precision} +/- {ss.sem(precision_at) * 1.96}")

        precisions_at.append(mean_precision)

        # Early stopping is only considered once more than `patience` epochs
        # have completed. Training stops when none of the last `patience`
        # epochs beats the epoch just before that window by at least `epsilon`.
        if epoch > patience:
            reference = precisions_at[-patience - 1]
            stalled = all(
                precisions_at[-j] - reference < epsilon
                for j in range(1, patience + 1)
            )
            if stalled:
                if verbose:
                    print("Early stopiing!")
                break

    else:
        # Reached only when the loop ran to completion without a break.
        if verbose:
            print("No early stopiing happened: increase epochs maybe?")

    return model, precisions_at


def objective(trial):
    """Optuna objective: sample one configuration and score a short training run.

    Each hyper-parameter is drawn with ``trial.suggest_categorical`` from the
    fixed choices below, a model is trained with ``fit_model`` for at most
    five epochs, and the mean precision@10 of the last epoch that ran is
    returned as the trial value.
    """
    search_space = {
        "loss": ["warp", "bpr"],
        "no_components": [10, 30, 50],
        "learning_rate": [0.0001, 0.001, 0.01],
        "max_sampled": [10, 20, 50, 100],
        "user_alpha": [0.0, 0.0001],
        "item_alpha": [0.0, 0.0001],
    }

    # Dict order is insertion order, so parameters are suggested in the order
    # they are listed above.
    sampled = {
        name: trial.suggest_categorical(name, choices)
        for name, choices in search_space.items()
    }

    _, history = fit_model(epochs=5, at=10, **sampled)

    return history[-1]

In [21]:
# Run 30 Optuna trials of `objective`, maximising its return value, and keep
# the parameters of the best trial.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=30)

best_params = study.best_params

[32m[I 2024-03-04 12:38:47,280][0m A new study created in memory with name: no-name-05946645-dba5-4e3e-bc36-f93a7894d3a8[0m
[32m[I 2024-03-04 12:38:57,600][0m Trial 0 finished with value: 0.018938032910227776 and parameters: {'loss': 'warp', 'no_components': 30, 'learning_rate': 0.001, 'max_sampled': 100, 'user_alpha': 0.0, 'item_alpha': 0.0001}. Best is trial 0 with value: 0.018938032910227776.[0m
[32m[I 2024-03-04 12:39:06,465][0m Trial 1 finished with value: 0.019117647781968117 and parameters: {'loss': 'warp', 'no_components': 10, 'learning_rate': 0.01, 'max_sampled': 50, 'user_alpha': 0.0001, 'item_alpha': 0.0}. Best is trial 1 with value: 0.019117647781968117.[0m
[32m[I 2024-03-04 12:39:17,553][0m Trial 2 finished with value: 0.019196227192878723 and parameters: {'loss': 'warp', 'no_components': 50, 'learning_rate': 0.01, 'max_sampled': 100, 'user_alpha': 0.0, 'item_alpha': 0.0001}. Best is trial 2 with value: 0.019196227192878723.[0m
[32m[I 2024-03-04 12:39:29,042]

In [23]:
# Hard-coded hyper-parameters; this rebinding replaces whatever `best_params`
# held before this cell ran.
best_params = dict(
    loss='warp',
    no_components=50,
    learning_rate=0.01,
    max_sampled=100,
    user_alpha=0.0,
    item_alpha=0.0001,
)

In [26]:
# Final training run: up to 300 epochs with per-epoch progress printed.
# `best_params` is expected to hold exactly the six keys loss, no_components,
# learning_rate, max_sampled, user_alpha and item_alpha.
model, precisions_at = fit_model(epochs=300, at=10, verbose=True, **best_params)

0:	0.01876964420080185 +/- 0.0010069465027825956
1:	0.018735969439148903 +/- 0.0010108157489557086
2:	0.019050292670726776 +/- 0.0010178373302895138
3:	0.019140098243951797 +/- 0.0010204371480820487
4:	0.019735068082809448 +/- 0.0010346662109506782
5:	0.02030758745968342 +/- 0.0010496671890110566
6:	0.020745398476719856 +/- 0.0010677292154084887
7:	0.02147507853806019 +/- 0.001098915502023381
8:	0.02218230813741684 +/- 0.0011292277559827935
9:	0.022563988342881203 +/- 0.0011426087797321654
10:	0.023080378770828247 +/- 0.0011542035988451082
11:	0.02381005883216858 +/- 0.001181262405882392
12:	0.02442748099565506 +/- 0.001197235820699664
13:	0.02488774061203003 +/- 0.0012102638429437294
14:	0.025213293731212616 +/- 0.0012195238854995482
15:	0.02559497207403183 +/- 0.0012287642781797703
16:	0.026021553203463554 +/- 0.0012409943916958644
17:	0.026459364220499992 +/- 0.0012519737983663977
18:	0.02662775106728077 +/- 0.0012552639147414115
19:	0.027099236845970154 +/- 0.0012704910210945824
20

In [31]:
# TODO 1.4: Plot learning curve precision(epoch)

## Save track embeddings

In [41]:
# Directory from which tracks.json is read and into which the recommendations
# file is written.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
BOTIFY_DATA_DIR = "/Users/n.anokhin/Projects/recsys-course/botify/data/"

In [42]:
# Per-item biases and item embedding matrix of the trained model.
# NOTE(review): with no feature matrix passed, LightFM appears to return the
# model's own arrays rather than copies — confirm before relying on `biases`
# after the model's biases are modified.
biases, embeddings = model.get_item_representations()

In [43]:
# Zero every item bias in place, so the per-item bias contributes nothing to
# scores computed from the model afterwards (assuming predict reads
# model.item_biases — confirm). Presumably done to keep popular tracks from
# dominating the rankings; TODO confirm intent.
model.item_biases *= 0.0

In [44]:
# Load the track catalogue and attach each track's internal LightFM item index.
# Tracks that are not in the fitted dataset get a missing value.
track_meta = pd.read_json(BOTIFY_DATA_DIR + "tracks.json", lines=True)
item_index_by_track = dataset.mapping()[2]
track_meta["dataset_index"] = track_meta["track"].map(item_index_by_track.get)

In [45]:
# Catalogue rows for tracks known to the dataset, ordered by internal item index.
has_dataset_index = track_meta["dataset_index"].notna()
dataset_tracks = track_meta[has_dataset_index].sort_values(by="dataset_index")

In [46]:
# Export the item embeddings for the TensorBoard projector, labelling each row
# with the (artist, title) of the track at the same position in dataset_tracks.
writer = tb.SummaryWriter(comment='msd_ligtfm_embeddings', log_dir=DATA_DIR + "tb")
try:
    writer.add_embedding(
        embeddings,
        metadata=list(dataset_tracks[["artist", "title"]].itertuples(index=False, name=None)),
        tag="lightfm",
        metadata_header=["artist", "title"],
    )
finally:
    # Close the writer even when add_embedding raises, so the event file is
    # not left open in the kernel.
    writer.close()



## Compute top recommendations

In [47]:
# NOTE: from here on `tracks` and `users` are rebound from the earlier id sets
# to position-indexed sequences: tracks ordered by internal item index and
# users ordered by internal user index.
tracks = dataset_tracks["track"].values
user_index_by_id = dataset.mapping()[0]
users = sorted(user_index_by_id, key=user_index_by_id.get)

In [48]:
# Score every item for every user and write one JSON line per user with the
# 100 highest-scored tracks.
n_users = dataset.user_features_shape()[0]
n_items = dataset.item_features_shape()[0]

# tracks[i] is used as the track id of internal item index i, which only holds
# when `tracks` has exactly one entry per item index. Fail loudly instead of
# silently writing wrong track ids.
if len(tracks) != n_items:
    raise ValueError(
        f"tracks has {len(tracks)} entries but the dataset has {n_items} items; "
        "positions would not line up with LightFM item indices"
    )

# Loop-invariant: the same item index vector is scored for every user.
item_indices = np.arange(n_items)

with open(BOTIFY_DATA_DIR + "recommendations_lfm.json", "w") as rf:
    for user_index in tqdm.tqdm(range(n_users)):
        predictions = model.predict(user_index, item_indices, num_threads=30)
        # argsort is ascending, so the selected tracks are ordered from the
        # lowest to the highest score (best track last).
        top = tracks[np.argsort(predictions)[-100:]]
        recommendation = {
            "user": int(users[user_index]),
            "tracks": [int(x) for x in top]
        }
        rf.write(json.dumps(recommendation) + "\n")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9750/9750 [00:08<00:00, 1185.70it/s]
