In [5]:
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from lightfm import LightFM
from scipy.sparse import csr_matrix, save_npz, load_npz
from lightfm.data import Dataset
import click
import os
import logging
from numpy.typing import NDArray
from typing import Any
import pickle
from tqdm import tqdm
from typing import List, Dict, Tuple
from numpy.typing import ArrayLike, NDArray

In [6]:
def load_pickle(input_path) -> Any:
    with open(input_path, 'rb') as file:
        return pickle.load(file)

def save_pickle(object: Any, output_path: str):
    with open(output_path, 'wb') as file:
        pickle.dump(object, file)

In [7]:
interactions_input_filepath = "../../../data/dataset"
playlist_input_filepath = "../../../data/processed/"
output_filepath = "../../../data/predictions/candidate_generation"
model_path = "../../../models/candidate_generation"

In [8]:
train_playlist = pd.read_csv(os.path.join(playlist_input_filepath, "train_playlists.csv"), index_col=False).to_numpy()
val_playlist = pd.read_csv(os.path.join(playlist_input_filepath, "val_playlists.csv"), index_col=False).to_numpy()
test_playlist = pd.read_csv(os.path.join(playlist_input_filepath, "test_playlists.csv"), index_col=False).to_numpy()
all_playlists = pd.read_csv(os.path.join(playlist_input_filepath, "playlists.csv"), index_col=False).to_numpy()
songs_encodings = pd.read_csv(os.path.join(playlist_input_filepath, "songs_encodings.csv"), index_col=False)
songs_features = pd.read_csv(os.path.join(playlist_input_filepath, "songs_features.csv")).to_numpy()

In [9]:
train_interactions = load_npz(os.path.join(interactions_input_filepath, "train_interactions.npz"))
val_interactions = load_npz(os.path.join(interactions_input_filepath, "val_interactions.npz"))
test_interactions = load_npz(os.path.join(interactions_input_filepath, "test_interactions.npz"))
dataset = load_pickle(os.path.join(interactions_input_filepath, "dataset_lightfm"))

In [10]:
model = LightFM(no_components=2, loss='warp')
model.fit(train_interactions, epochs=2, verbose=True)
model.fit_partial(val_interactions)
model.fit_partial(test_interactions)

Epoch: 100%|██████████| 2/2 [04:44<00:00, 142.16s/it]


<lightfm.lightfm.LightFM at 0x7f179b3abf40>

In [12]:
save_pickle(model, os.path.join(model_path, "candidate_generator"))

In [13]:
def extract_n_tracks(model: LightFM, user_id: int, tracks_shape: int, n_tracks: int = 5):
    predictions = model.predict(user_id, np.arange(1, tracks_shape))
    predictions = [(score, index) for index, score in enumerate(predictions)]
    predictions.sort(reverse=False)
    predictions = predictions[:n_tracks]
    predictions = [index for _, index in predictions]
    return predictions

In [19]:
preds = extract_n_tracks(model, 999998, train_interactions.shape[0], 250)

In [20]:
def extract_relevant(playlists: NDArray, tracks: List[int], songs_encodings: Dict, songs_features: NDArray, user_playlist: ArrayLike) -> Tuple[Any, ArrayLike, NDArray, Dict[Any, int]]:
    def encode(track_encodings: Dict, track: int):
        if track == 0 or track not in track_encodings:
            return 0
        return track_encodings[track]
    tracks_set = set(tracks)
    relevant_playlists = [user_playlist, ]
    # extracting relevant playlists
    for playlist in playlists:
        if set(playlist) & set(tracks_set):
            relevant_playlists.append(playlist)
    # extracting all relevant tracks
    relevant_tracks = set(np.array(relevant_playlists).flatten())
    relevant_tracks.discard(0)
    tracks_encodings = {track: index + 1 for index, track in enumerate(relevant_tracks)}
    encode_vectorizer = np.vectorize(encode)
    relevant_playlists = encode_vectorizer(tracks_encodings, relevant_playlists)
    relevant_tracks_features = []
    for sf in songs_features:
        if sf[-1] in relevant_tracks:
            sf = sf[1:]
            sf[-1] = tracks_encodings[sf[-1]]
            relevant_tracks_features.append(sf)
    relevant_tracks_features = np.array(relevant_tracks_features)
    return relevant_playlists, np.array(list(relevant_tracks)), relevant_tracks_features, tracks_encodings


In [23]:
relevant_playlists, tracks, features, encodings = extract_relevant(train_playlist, preds, songs_encodings, songs_features, all_playlists[999998])

In [24]:
print(relevant_playlists.shape)
print(tracks.shape)
print(features.shape)
print(len(encodings))
print(set(tracks) & set(all_playlists[999999]))
print(set(all_playlists[999999]))

(3173, 375)
(126858,)
(5310, 13)
126858
{493056, 1207233, 1910306, 1726979, 1796070, 353641, 813485, 1246350, 866480, 1979674}
{493056, 1207233, 1910306, 1726979, 0, 1796070, 353641, 813485, 1246350, 866480, 1753143, 1979674, 519453}


In [25]:
user_playlist = np.array([encodings[track] for track in all_playlists[999999] if track in encodings])
user_playlist = np.pad(user_playlist, (0, 375-len(user_playlist)), constant_values=(0,0))
print(user_playlist.shape)
print(user_playlist[:100])


(375,)
[ 40453  13672  80211  46218  73314  38031  78129  97813 109244 112756
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0]


In [27]:
pd.Series(user_playlist).to_csv(os.path.join(output_filepath, "user_playlist.csv"))
pd.DataFrame(relevant_playlists).to_csv(os.path.join(output_filepath, "playlists.csv"), index=False)
pd.DataFrame(features).to_csv(os.path.join(output_filepath, "features.csv"), index=False)
