In [13]:
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from lightfm import LightFM
from scipy.sparse import csr_matrix, save_npz, load_npz
from lightfm.evaluation import auc_score
from lightfm.data import Dataset
import click
import os
import logging
from numpy.typing import NDArray
from typing import Any
import pickle
from tqdm import tqdm

In [2]:
! pwd

/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/src/features


In [3]:
input_filepath = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/data/processed"
output_filepath = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/data/dataset"

In [8]:
def create_interactions_tuples(playlists: NDArray, starting_index: int = 0):
    return np.array([(user + starting_index, item, 1)  for user, row in tqdm(enumerate(playlists), total=playlists.shape[0]) for item in row if item != 0])

def load_pickle(input_path) -> Any:
    with open(input_path) as file:
        return pickle.load(file)

def save_pickle(object: Any, output_path: str):
    with open(output_path, 'wb') as file:
        pickle.dump(object, file)

In [9]:
dataset = Dataset(user_identity_features=False, item_identity_features=True)
logging.info("Loading playlists")
train_playlist = pd.read_csv(os.path.join(input_filepath, "train_playlists.csv"), index_col=False).to_numpy()
val_playlist = pd.read_csv(os.path.join(input_filepath, "val_playlists.csv"), index_col=False).to_numpy()
test_playlist = pd.read_csv(os.path.join(input_filepath, "test_playlists.csv"), index_col=False).to_numpy()
songs_encodings = pd.read_csv(os.path.join(input_filepath, "songs_encodings.csv"), index_col=False)

N_USERS = train_playlist.shape[0] + test_playlist.shape[0] + test_playlist.shape[0]
N_ITEMS = songs_encodings.shape[0]
logging.info(f"N_USERS: {N_USERS} N_ITEMS: {N_ITEMS}")

In [10]:
logging.info("Fitting datset")
dataset = Dataset(user_identity_features=False, item_identity_features=False)
dataset.fit(users=np.arange(0, N_USERS), items=np.arange(1, N_ITEMS+1))


In [14]:
train_interactions = create_interactions_tuples(train_playlist, 0)
train_interactions, _ = dataset.build_interactions(train_interactions)

100%|██████████| 990000/990000 [00:48<00:00, 20346.02it/s]


In [16]:
val_interactions = create_interactions_tuples(val_playlist, train_playlist.shape[0])
val_interactions, _ = dataset.build_interactions(val_interactions)   

test_interactions = create_interactions_tuples(test_playlist, train_playlist.shape[0] + test_playlist.shape[0])
test_interactions, _ = dataset.build_interactions(test_interactions)

100%|██████████| 5000/5000 [00:00<00:00, 18603.57it/s]
100%|██████████| 5000/5000 [00:00<00:00, 20124.13it/s]


 11%|█         | 104400/990000 [00:21<00:42, 20794.13it/s]

In [19]:
save_npz(os.path.join(output_filepath, 'train_interactions.npz'), train_interactions)
save_npz(os.path.join(output_filepath, 'val_interactions.npz'), val_interactions)
save_npz(os.path.join(output_filepath, 'test_interactions.npz'), test_interactions)

In [20]:
save_pickle(dataset, os.path.join(output_filepath, "dataset_lightfm"))

In [38]:
# pure collaborative model
dataset = Dataset(user_identity_features=False, item_identity_features=False)
dataset.fit(users=np.arange(0, N_USERS), items=np.arange(1, N_ITEMS+1), user_features=None, item_features=None)
(interactions, weights) = dataset.build_interactions(train_interactions)

In [39]:
model = LightFM(no_components=20)
model.fit(interactions, epochs=3, verbose=True, num_threads=8)

Epoch: 100%|██████████| 3/3 [00:52<00:00, 17.67s/it]


<lightfm.lightfm.LightFM at 0x7fecb2172610>

In [40]:
test_interactions = test_playlist.to_numpy()
test_interactions = np.array([(train_playlist.shape[0] + 1 + user, item, 1)  for user, row in enumerate(test_interactions) for item in row if item != 0])
test_interactions, _ = dataset.build_interactions(test_interactions)

In [41]:
print(test_interactions.shape)

(1000000, 2252192)


In [42]:
model.fit_partial(test_interactions)

<lightfm.lightfm.LightFM at 0x7fecb2172610>

In [44]:
model.predict(999, item_ids=np.arange(0, test_interactions.shape[1]))

array([1.1334163 , 0.71205145, 0.29885036, ..., 0.42696986, 0.29637343,
       0.40504098], dtype=float32)