In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset, Interactions
from rectools.models.popular import PopularModel
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel
from rectools.metrics import Precision, Recall, MeanInvUserFreq, MAP, Serendipity, calc_metrics

In [2]:
# Show every column and allow up to 200 characters per cell when frames
# are displayed.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 200),
):
    pd.set_option(option_name, option_value)

In [3]:
# Read the interaction log and map its column names onto the RecTools
# column constants in a single chained expression.
interactions = pd.read_csv('kion_train/interactions.csv').rename(
    columns={'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight}
)

# Convert the timestamp column from text to real datetimes.
interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [4]:
# Build the RecTools dataset from interactions only; no user or item
# feature frames are passed.
dataset_kwargs = dict(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None,
)
dataset = Dataset.construct(**dataset_kwargs)

In [5]:
# BM25 nearest-neighbour recommender (K=60) wrapped in RecTools'
# ImplicitItemKNNWrapperModel and fitted on the full dataset.
bm25_core = BM25Recommender(K=60)
bmp25_k60_model = ImplicitItemKNNWrapperModel(bm25_core)
bmp25_k60_model.fit(dataset)

<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f56ffd445b0>

In [6]:

# Number of recommendations requested per user.
K_RECOS = 30

# Ask the BM25 model for recommendations for every user id that appears in
# the interaction log.
# NOTE(review): filter_viewed=True presumably drops already-watched items — confirm.
all_user_ids = interactions[Columns.User].unique()
recos_offline_bmp25 = bmp25_k60_model.recommend(
    users=all_user_ids,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [7]:
# Display the BM25 recommendation frame (pandas truncates the middle rows).
recos_offline_bmp25

Unnamed: 0,user_id,item_id,score,rank
0,176549,13865,8.899597e+10,1
1,176549,10440,8.153085e+10,2
2,176549,15297,7.204604e+10,3
3,176549,3734,6.953473e+10,4
4,176549,4151,4.674591e+10,5
...,...,...,...,...
28862258,697262,5434,1.615419e+10,26
28862259,697262,1132,1.605160e+10,27
28862260,697262,7476,1.566697e+10,28
28862261,697262,11237,1.546907e+10,29


In [8]:
# Persist the full BM25 recommendation table. The frame's index is written
# too, because `index` is left at its default.
recos_offline_bmp25.to_csv(path_or_buf='offline-bm25.csv')

In [10]:
# Inspect the first 50 recommendation rows.
recos_offline_bmp25.head(50)

Unnamed: 0,user_id,item_id,score,rank
0,176549,13865,88995970000.0,1
1,176549,10440,81530850000.0,2
2,176549,15297,72046040000.0,3
3,176549,3734,69534730000.0,4
4,176549,4151,46745910000.0,5
5,176549,7571,41345350000.0,6
6,176549,4880,38581290000.0,7
7,176549,142,34837610000.0,8
8,176549,2657,29255500000.0,9
9,176549,8636,27311580000.0,10


In [15]:
# Keep only the rows ranked 1..10 for each user.
within_top10 = recos_offline_bmp25['rank'].le(10)
recos_offline_10 = recos_offline_bmp25.loc[within_top10]

In [16]:
# Display the rank<=10 subset of the BM25 recommendations.
recos_offline_10

Unnamed: 0,user_id,item_id,score,rank
0,176549,13865,8.899597e+10,1
1,176549,10440,8.153085e+10,2
2,176549,15297,7.204604e+10,3
3,176549,3734,6.953473e+10,4
4,176549,4151,4.674591e+10,5
...,...,...,...,...
28862238,697262,142,7.403072e+10,6
28862239,697262,2657,7.134247e+10,7
28862240,697262,12192,6.349703e+10,8
28862241,697262,4880,5.963177e+10,9


In [103]:
# Collapse the long recommendation table into one row per user whose
# `item_id` cell holds that user's recommended items as a Python list.
agreggated = (
    recos_offline_10
    .groupby('user_id')['item_id']
    .agg(list)
    .to_frame()
)

In [18]:
# Display the per-user item lists.
agreggated

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
0,"[10440, 4151, 13865, 3734, 142, 2657, 4880, 9996, 16228, 8636]"
1,"[15297, 13865, 9728, 4151, 2657, 3734, 142, 9996, 4880, 12192]"
2,"[9728, 13865, 3734, 10440, 15297, 3182, 4151, 7626, 12173, 4880]"
3,"[15297, 13865, 9996, 8636, 6809, 12995, 4495, 7417, 4740, 1844]"
4,"[9728, 13865, 10440, 15297, 8636, 3734, 4151, 4457, 142, 2657]"
...,...
1097553,"[15297, 13865, 101, 10440, 4151, 9728, 14470, 1916, 142, 15531]"
1097554,"[9728, 10440, 13865, 15297, 4151, 142, 11237, 4880, 1844, 3734]"
1097555,"[10440, 13865, 15297, 4151, 3734, 2657, 142, 9996, 12192, 11237]"
1097556,"[15297, 3734, 9728, 10440, 13865, 4151, 4880, 12192, 142, 3935]"


In [97]:
# Rebuild the per-user item lists from `recos_offline_10`.
# NOTE(review): this repeats the aggregation already performed earlier in
# the notebook; re-running it replaces `agreggated` with a fresh frame.
agreggated = (
    recos_offline_10
    .groupby('user_id')['item_id']
    .agg(list)
    .to_frame()
)

In [26]:
from rectools.models.popular import PopularModel, Popularity
from rectools.models.popular_in_category import PopularInCategoryModel, RatioStrategy, MixingStrategy
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import MAP, Recall, calc_metrics

In [27]:
# Fit a popularity model on the same dataset; its output is used further
# down to top up users whose BM25 list has fewer than 10 items.
popular = PopularModel()
popular.fit(dataset)

<rectools.models.popular.PopularModel at 0x7f56a376c100>

In [28]:
# Popular-model recommendations (10 per user) for every user id in the
# interaction log.
popular_target_users = interactions[Columns.User].unique()
recos_popular = popular.recommend(
    users=popular_target_users,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [29]:
# Display the popular-model recommendation frame.
recos_popular

Unnamed: 0,user_id,item_id,score,rank
0,176549,10440,202457.0,1
1,176549,15297,193123.0,2
2,176549,13865,122119.0,3
3,176549,4151,91167.0,4
4,176549,3734,74803.0,5
...,...,...,...,...
9621785,697262,2657,68581.0,6
9621786,697262,4880,55043.0,7
9621787,697262,142,45367.0,8
9621788,697262,6809,40372.0,9


In [30]:
# Restrict the popular recommendations to ranks 1..10. The model was
# already asked for k=10, so this is a safeguard.
popular_within_top10 = recos_popular['rank'].le(10)
recos_popular = recos_popular.loc[popular_within_top10]

In [31]:
# Display the popular recommendations after the rank filter.
recos_popular

Unnamed: 0,user_id,item_id,score,rank
0,176549,10440,202457.0,1
1,176549,15297,193123.0,2
2,176549,13865,122119.0,3
3,176549,4151,91167.0,4
4,176549,3734,74803.0,5
...,...,...,...,...
9621785,697262,2657,68581.0,6
9621786,697262,4880,55043.0,7
9621787,697262,142,45367.0,8
9621788,697262,6809,40372.0,9


In [33]:
# User id stored in the row labelled 0 (scalar lookup by row label).
recos_popular.at[0, 'user_id']

176549

In [36]:
# Item list for user_id 0, fetched with a single label lookup instead of
# chained indexing.
agreggated.at[0, 'item_id']

[10440, 4151, 13865, 3734, 142, 2657, 4880, 9996, 16228, 8636]

In [40]:
# Count how many top-10 rows each user has.
count_recs_by_users = recos_offline_10['user_id'].value_counts()

In [54]:
# User ids that have at least one but fewer than 10 BM25 recommendations.
short_list_mask = count_recs_by_users.lt(10)
users_without_recos = count_recs_by_users.loc[short_list_mask].index.tolist()

In [53]:
# Popular items for user 447799, gathered into a single list.
recos_popular.query('user_id == 447799').agg({'item_id': list})

item_id    [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809]
dtype: object

In [57]:
# BM25 item list for user_id 10440.
agreggated.at[10440, 'item_id']

[10440, 4151, 9728, 3734, 142, 2657, 4880, 12192, 9996, 8636]

In [59]:
# All top-10 BM25 rows for user_id 10440.
recos_offline_10.query('user_id == 10440')

Unnamed: 0,user_id,item_id,score,rank
13367698,10440,10440,190042300000.0,1
13367699,10440,4151,130245000000.0,2
13367700,10440,9728,99446980000.0,3
13367701,10440,3734,88431180000.0,4
13367702,10440,142,63568280000.0,5
13367703,10440,2657,62248210000.0,6
13367704,10440,4880,57414390000.0,7
13367705,10440,12192,52488530000.0,8
13367706,10440,9996,42241840000.0,9
13367707,10440,8636,39198640000.0,10


In [62]:
# Recompute per-user row counts and the list of users whose BM25 top-10
# contains fewer than 10 items.
count_recs_by_users = recos_offline_10['user_id'].value_counts()
has_short_list = count_recs_by_users.lt(10)
users_without_recos = count_recs_by_users.loc[has_short_list].index.tolist()

In [63]:
# Show the user ids that need to be topped up from the popular model.
users_without_recos

[4139,
 447799,
 997730,
 477369,
 864456,
 848806,
 858838,
 582189,
 762653,
 132302,
 269023,
 864594,
 734934,
 623554,
 285722,
 722341,
 129750,
 445377,
 766982,
 286631,
 573891]

In [66]:
# Preview, without mutating anything, the combined list for user 1053:
# that user's BM25 items followed by their popular items.
# A row-label lookup such as `.loc[1053]` returns a single row whose
# `item_id` is a scalar (no `.append`), so both lists are built by
# filtering on the `user_id` column instead.
example_user_id = 1053
(
    recos_offline_10.loc[recos_offline_10['user_id'] == example_user_id, 'item_id'].tolist()
    + recos_popular.loc[recos_popular['user_id'] == example_user_id, 'item_id'].tolist()
)

In [67]:
# Re-inspect the top-10 BM25 rows for user_id 10440.
recos_offline_10.query('user_id == 10440')

Unnamed: 0,user_id,item_id,score,rank
13367698,10440,10440,190042300000.0,1
13367699,10440,4151,130245000000.0,2
13367700,10440,9728,99446980000.0,3
13367701,10440,3734,88431180000.0,4
13367702,10440,142,63568280000.0,5
13367703,10440,2657,62248210000.0,6
13367704,10440,4880,57414390000.0,7
13367705,10440,12192,52488530000.0,8
13367706,10440,9996,42241840000.0,9
13367707,10440,8636,39198640000.0,10


In [70]:
# First 10 - 3 = 7 popular rows for user_id 10440, i.e. what would be
# needed to top up a list that already holds 3 items.
recos_popular.query('user_id == 10440').head(10 - 3)

Unnamed: 0,user_id,item_id,score,rank
4456120,10440,10440,202457.0,1
4456121,10440,9728,132865.0,2
4456122,10440,4151,91167.0,3
4456123,10440,3734,74803.0,4
4456124,10440,2657,68581.0,5
4456125,10440,4880,55043.0,6
4456126,10440,142,45367.0,7


In [104]:
# Top up users who have fewer than 10 BM25 recommendations with popular
# items. Popular items already present in a user's list are skipped so the
# final list never contains the same item id twice.

# One pass over the popular table: rank-ordered item list per affected user.
popular_fallback = (
    recos_popular[recos_popular['user_id'].isin(users_without_recos)]
    .sort_values('rank')
    .groupby('user_id')['item_id']
    .agg(list)
)

for user_id in users_without_recos:
    # `.at` returns the list object stored in the cell, so extending it
    # updates `agreggated` in place without chained indexing.
    user_items = agreggated.at[user_id, 'item_id']
    already_recommended = set(user_items)
    candidates = [
        item_id
        for item_id in popular_fallback.get(user_id, [])
        if item_id not in already_recommended
    ]
    # Add only as many items as are missing to reach 10.
    user_items.extend(candidates[:max(0, 10 - len(user_items))])

In [105]:
# Map each user id (the frame's index) to its list of recommended items.
results = dict(agreggated['item_id'].items())

In [106]:
# Convert keys and values to plain Python ints/lists so the mapping can be
# serialised with `json`.
reserved = {}

for user_key, item_values in tqdm(results.items()):
    first_entry = item_values[0]
    if not isinstance(first_entry, np.ndarray):
        # Usual case: a flat list of item ids.
        reserved[int(user_key)] = np.array(item_values).tolist()
        continue
    # The first element is itself an array: report the user and unwrap it.
    print(user_key)
    reserved[int(user_key)] = first_entry.tolist()

  0%|          | 0/962105 [00:00<?, ?it/s]

In [101]:
# Number of users in the final mapping.
len(reserved)

962105

In [107]:
# Sanity check: print the first user (if any) whose list does not contain
# exactly 10 items; print nothing when every list is complete.
first_incomplete = next(
    ((user_key, items) for user_key, items in reserved.items() if len(items) != 10),
    None,
)
if first_incomplete is not None:
    print(*first_incomplete)


In [108]:
import json

# Write the user -> item list mapping to disk. JSON object keys are always
# strings, so the integer user ids are stored as text.
serialized = json.dumps(reserved)
with open('userknnbm25-pop-precalculated.json', 'w') as out_file:
    out_file.write(serialized)

In [85]:
# Confirm the value stored for user 0 is a plain Python list.
type(reserved[0])

list