In [1]:
from lightfm import LightFM
from lightfm.data import Dataset
import pandas as pd
import os
import numpy as np
from numpy.typing import NDArray, ArrayLike
from typing import Dict, List, Tuple, Any
# from ..candidate_generation.generate_candidates import extract_relevant, extract_n_tracks

In [2]:
def extract_n_tracks(model: LightFM, user_id: int, tracks_shape: int, n_tracks: int = 5):
    predictions = model.predict(user_id, np.arange(1, tracks_shape))
    predictions = [(score, index) for index, score in enumerate(predictions)]
    predictions.sort(reverse=False)
    predictions = predictions[:n_tracks]
    predictions = [index for _, index in predictions]
    return predictions

def extract_relevant(playlists: NDArray, tracks: List[int], songs_features: NDArray, user_playlist: ArrayLike) -> Tuple[Any, ArrayLike, NDArray, Dict[Any, int]]:
    def encode(track_encodings: Dict, track: int):
        if track == 0 or track not in track_encodings:
            return 0
        return track_encodings[track]
    tracks_set = set(tracks)
    relevant_playlists = [user_playlist, ]
    # extracting relevant playlists
    for playlist in playlists:
        if set(playlist) & set(tracks_set):
            relevant_playlists.append(playlist)
    # extracting all relevant tracks
    relevant_tracks = set(np.array(relevant_playlists).flatten())
    relevant_tracks.discard(0)
    tracks_encodings = {track: index + 1 for index, track in enumerate(relevant_tracks)}
    encode_vectorizer = np.vectorize(encode)
    relevant_playlists = encode_vectorizer(tracks_encodings, relevant_playlists)
    relevant_tracks_features = []
    for sf in songs_features:
        if sf[-1] in relevant_tracks:
            sf = sf[1:]
            sf[-1] = tracks_encodings[sf[-1]]
            relevant_tracks_features.append(sf)
    relevant_tracks_features = np.array(relevant_tracks_features)
    return relevant_playlists, np.array(list(relevant_tracks)), relevant_tracks_features, tracks_encodings

In [3]:
input_filepath = "../../../data/predictions/candidate_generation"
model_filepath = "../../../models/learn_to_rank"
output_filepath = "../../../data/predictions/learn_to_rank"

In [4]:
features = pd.read_csv(os.path.join(input_filepath, "features.csv"), index_col=False).to_numpy()[:, 1:12]
playlists = pd.read_csv(os.path.join(input_filepath, "playlists.csv"), index_col=False).to_numpy()
user_playlist = pd.read_csv(os.path.join(input_filepath, "user_playlist.csv"), index_col=False).to_numpy()[:, 1].flatten()

In [5]:
N_USERS = playlists.shape[0]
unique_tracks = set(playlists.flatten())
unique_tracks.discard(0)
N_ITEMS = len(unique_tracks)
N_FEATURES = features.shape[1]
interactions_tuples = np.array([(user_id, track_id, 1) for user_id, playlist in enumerate(playlists) for track_id in playlist if track_id != 0])
features_tuples = np.array([(item_id+1, {feature_id: feature_value} )for item_id, f in enumerate(features) for feature_id, feature_value in enumerate(f)])

In [6]:
print(N_USERS, N_ITEMS, N_FEATURES, interactions_tuples.shape, features_tuples.shape)

233560 1354160 11 (23993047, 3) (14895760, 2)


In [7]:
dataset = Dataset(item_identity_features=True)
dataset.fit(users=np.arange(0, N_USERS), items=np.arange(1, N_ITEMS+1), item_features=np.arange(0, N_FEATURES))

In [8]:
interactions, _ = dataset.build_interactions(interactions_tuples)
item_features = dataset.build_item_features(features_tuples)

In [9]:
model = LightFM(no_components=2, loss='warp')
model.fit(interactions, item_features=item_features, epochs=1, verbose=True)

Epoch: 100%|██████████| 1/1 [00:48<00:00, 48.97s/it]


<lightfm.lightfm.LightFM at 0x7fbcdd7360a0>

In [10]:
user_playlist_tuples = [(0, track_id, 1) for track_id in user_playlist if track_id != 0]
user_interactions, _ = dataset.build_interactions(user_playlist_tuples)
model.fit_partial(user_interactions, item_features=item_features, epochs=3, verbose=True)

Epoch: 100%|██████████| 3/3 [00:00<00:00, 167.11it/s]


<lightfm.lightfm.LightFM at 0x7fbcdd7360a0>

In [11]:
predictions = extract_n_tracks(model, 0, user_interactions.shape[0], 100)
relevant_playlists, tracks, features, encodings = extract_relevant(playlists, predictions, features, user_playlist)

In [12]:
user_playlist = np.array([encodings[track] for track in user_playlist if track in encodings])
user_playlist = np.pad(user_playlist, (0, 375-len(user_playlist)), constant_values=(0,0))


In [13]:
print(set(user_playlist.flatten()))
print(set())

{13472, 34304, 0, 24934, 25549, 23502, 35410, 15315, 4569, 30939, 12669}
set()
