In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset, Interactions
from rectools.models.popular import PopularModel
from rectools.models.lightfm import LightFMWrapperModel
from rectools.metrics import Precision, Recall, MeanInvUserFreq, MAP, Serendipity, calc_metrics

In [3]:
# Load the raw interaction log and bring it to the rectools column schema.
# Built as a single chain (no `inplace=True`) so that re-running the cell
# always starts from the CSV and yields the same frame.
interactions = (
    pd.read_csv('interactions.csv')
    .rename(columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    })
    # Parse the renamed timestamp column; the column is addressed through
    # Columns.Datetime rather than a repeated string literal so the rename
    # target and the parsed column cannot drift apart.
    .assign(**{Columns.Datetime: lambda df: pd.to_datetime(df[Columns.Datetime])})
)

In [4]:
from lightfm import LightFM

In [5]:
# Build the rectools Dataset from interactions only: both feature frames are
# explicitly passed as None, so no user or item features are supplied.
dataset = Dataset.construct(interactions_df=interactions, user_features_df=None, item_features_df=None)

In [14]:
# Seed LightFM explicitly: without `random_state` the embeddings are
# initialised differently on every kernel restart, so the saved
# recommendations could not be reproduced with Restart & Run All.
LIGHTFM_RANDOM_STATE = 42

# NOTE(review): the scores displayed for `recos` further down are on the order
# of 1e24-1e27, which looks like numerical blow-up during training. Possibly
# caused by feeding raw `total_dur` values as interaction weights - TODO confirm
# and consider normalising/clipping the weights before fitting.
lfm = LightFMWrapperModel(LightFM(random_state=LIGHTFM_RANDOM_STATE))
lfm.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f82e72b8e20>

In [21]:
# Every distinct user id present in the interaction log.
all_user_ids = interactions[Columns.User].unique()

# Request 60 recommendations per user with `filter_viewed=True`.
recos = lfm.recommend(users=all_user_ids, dataset=dataset, k=60, filter_viewed=True)

In [22]:
# Show the recommendations frame (columns: user_id, item_id, score, rank);
# the rendered output is truncated to the leading and trailing rows.
recos

Unnamed: 0,user_id,item_id,score,rank
0,176549,4003,2.715359e+27,1
1,176549,1654,1.555407e+27,2
2,176549,5859,7.066884e+26,3
3,176549,15791,5.287224e+26,4
4,176549,2412,3.352055e+25,5
...,...,...,...,...
57730735,697262,7614,1.814607e+24,56
57730736,697262,12180,1.598932e+24,57
57730737,697262,13944,1.287519e+24,58
57730738,697262,12363,1.252760e+24,59


In [23]:
# Keep only the ten best-ranked recommendations of each user.
recos_offline = recos.loc[recos['rank'].le(10)]

In [35]:
# Collapse the per-row recommendations into one list of item ids per user.
# The result is a frame indexed by user_id with a single `item_id` column.
agreggated = recos_offline.groupby('user_id').agg(item_id=('item_id', list))

In [39]:
# Number of retained recommendation rows per user.
count_recs_by_users = recos_offline['user_id'].value_counts()

In [42]:
# Show the per-user lists of recommended item ids (index: user_id).
agreggated

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
0,"[10794, 11559, 8645, 13856, 1654, 5859, 4328, ..."
1,"[11559, 8645, 13856, 2412, 11303, 4399, 9033, ..."
2,"[10794, 11559, 1654, 8645, 13856, 2412, 5859, ..."
3,"[11559, 10794, 8645, 13856, 4328, 9694, 13827,..."
4,"[10794, 11559, 8645, 13856, 1654, 4328, 5859, ..."
...,...
1097553,"[4003, 10794, 15791, 11303, 1654, 1611, 15555,..."
1097554,"[4003, 15791, 4328, 1611, 4399, 9033, 14565, 1..."
1097555,"[10794, 11559, 8645, 13856, 4328, 1654, 11303,..."
1097556,"[10794, 11559, 4003, 15791, 1654, 11303, 15555..."


In [43]:
# Mapping user_id -> list of recommended item ids.
results = agreggated.loc[:, 'item_id'].to_dict()

In [44]:
# Convert the recommendations to plain Python containers: keys are passed
# through int() and values through ndarray.tolist(), so numpy scalars are
# replaced by built-in ints before the mapping is serialised.
reserved = {}

for user_id, item_ids in tqdm(results.items()):
    first_item = item_ids[0]
    if not isinstance(first_item, np.ndarray):
        # Usual case: a flat list of item ids.
        reserved[int(user_id)] = np.array(item_ids).tolist()
        continue
    # The first element is itself an array: report the user and store
    # that array's contents as the user's recommendations.
    print(user_id)
    reserved[int(user_id)] = first_item.tolist()

  0%|          | 0/962179 [00:00<?, ?it/s]

In [45]:
# Sanity check: print the first user (if any) whose list does not contain
# exactly 10 recommendations; prints nothing when every list has 10 items.
first_mismatch = next(
    ((user_id, item_ids) for user_id, item_ids in reserved.items() if len(item_ids) != 10),
    None,
)
if first_mismatch is not None:
    print(*first_mismatch)

In [46]:
import json

# Persist the user -> recommended items mapping as a JSON file.
with open('lightfm-precalculated.json', 'w') as json_file:
    json.dump(obj=reserved, fp=json_file)

In [6]:
import joblib

# Serialise the rectools dataset object to disk with joblib.
with open('lightfm-dataset.joblib', 'wb') as dataset_file:
    joblib.dump(dataset, dataset_file)