In [2]:
import os
import pandas as pd
import numpy as np
import requests
import warnings
import pickle
import time

import nmslib

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

In [3]:
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download: 100%|█████████▉| 78.6M/78.8M [00:05<00:00, 17.1MiB/s]

In [4]:
!unzip kion_train.zip

Archive:  kion_train.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


In [5]:
warnings.filterwarnings('ignore')

In [6]:
os.environ["OPENBLAS_NUM_THREADS"] = "1" 

# Load data

In [7]:
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

# Users

In [8]:
users.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0


In [9]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840197 entries, 0 to 840196
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   user_id   840197 non-null  int64 
 1   age       826102 non-null  object
 2   income    825421 non-null  object
 3   sex       826366 non-null  object
 4   kids_flg  840197 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 32.1+ MB


# Add 3 users

In [10]:
max_user_id = users['user_id'].max()

In [11]:
def random_dates(start, end, n = 3, unit = 'D'):
    ndays = (end - start).days + 1
    dates = pd.to_timedelta(np.random.rand(n) * ndays, unit = unit) + start
    return pd.to_datetime(dates, format = '%m/%d/%Y').strftime('%Y-%m-%d')

In [12]:
Columns.Datetime = 'last_watch_dt'

max_date = pd.to_datetime(interactions[Columns.Datetime].max())
min_date = pd.to_datetime(interactions[Columns.Datetime].min())

print("Max date of interactions: ", max_date)
print("Min date of interactions: ", min_date)   

Max date of interactions:  2021-08-22 00:00:00
Min date of interactions:  2021-03-13 00:00:00


### 1.A user who likes comedies and melodramas

In [13]:
user_id = max_user_id + 1
print("User id: ", user_id)

User id:  1097559


In [14]:
dict_user = {'user_id': [user_id],
        'age': ['age_25_34'],
        'income': ['income_40_60'],
        'sex': ['Ж'],
        'kids_flg': ['0']
       }

user_1 = pd.DataFrame(dict_user)

#### Adding interactions with relevant items

In [15]:
dict_interaction = {'user_id': [user_id, user_id, user_id],
        'item_id': [6699, 6526, 11594],
        'last_watch_dt': random_dates(min_date, max_date, 3),
        'total_dur': [110, 80, 98],
        'watched_pct': [99., 66., 96.]
       }
       
interaction_user_1 = pd.DataFrame(dict_interaction)

In [16]:
items.query('item_id in [6699, 6526, 11594]')

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
144,6699,film,Как отделаться от парня за 10 дней,How to Lose a Guy in 10 Days,2003.0,"мелодрамы, комедии",США,,12.0,,"Дональд Питри, Дональд Петри","Кейт Хадсон, Мэттью Мак-Конахи, Кэтрин Хан, Эн...",Журналистка Энди Андерсон из журнала получает ...,"Нью-Йорк, США, ставка, журналист, терапевт, сп..."
15641,6526,film,Мистер и миссис Смит,Mr. & Mrs. Smith,1941.0,"мелодрамы, комедии",США,,16.0,,Альфред Хичкок,"Кэрол Ломбард, Роберт Монтгомери, Джин Рэймонд...",Дэвид и Энн Смит начали свою совместную жизнь ...,"такси, курение, офис, дождь, джин, дневник, ша..."
15809,11594,film,Мой парень из зоопарка,"ZOOKEEPER, THE",2011.0,"мелодрамы, комедии",США,,12.0,,Фрэнк Корачи,"Кевин Джеймс, Розарио Доусон, Лесли Бибб, Кен ...",Звери пытаются научить незадачливого смотрител...,"зоопарк, антропоморфизм, ученый, страус, обезь..."


In [17]:
users = pd.concat([users, user_1]).reset_index(drop=True)
interactions = pd.concat([interactions, interaction_user_1]).reset_index(drop=True)

### 2.A user who likes thriller series

In [18]:
user_id = max_user_id + 2
print("User id: ", user_id)

User id:  1097560


In [19]:
dict_user = {'user_id': [user_id],
        'age': ['age_45_54'],
        'income': ['income_60_90'],
        'sex': ['М'],
        'kids_flg': ['1']
       }

user_2 = pd.DataFrame(dict_user)

In [20]:
dict_interaction = {'user_id': [user_id, user_id, user_id],
        'item_id': [9747, 11213, 12148],
        'last_watch_dt': random_dates(min_date, max_date, 3),
        'total_dur': [350, 800, 480],
        'watched_pct': [50., 65., 100.]
       }
       
interaction_user_2 = pd.DataFrame(dict_interaction)

In [21]:
items.query('item_id in [9747, 11213, 12148]')

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
2325,12148,series,Квест,,2015.0,триллеры,Россия,0.0,16.0,,"Андрей Загидуллин, Александр Данилов","Павел Прилучный, Агата Муцениеце, Марина Петре...",Триллер с Павлом Прилучным и Агатой Муцениеце....,"Квест, 2015, Россия"
11520,11213,series,Фортитьюд,Fortitude,2015.0,триллеры,США,,18.0,,"Кирон Хоукс, Сэм Миллер, Хетти Макдональд","Бьерн Хлинур, Ричард Дормер, Софи Гробёль, Стэ...",Британский психологический триллер в скандинав...,"Фортитьюд, 2015, США"
13695,9747,series,Шторм,,2019.0,триллеры,Россия,0.0,18.0,,Борис Хлебников,"Александр Робак, Максим Лагашкин, Анна Михалко...",Сериал «Шторм» отмечен множеством престижных н...,"Шторм, 2019, Россия"


In [22]:
users = pd.concat([users, user_2]).reset_index(drop=True)
interactions = pd.concat([interactions, interaction_user_2]).reset_index(drop=True)

### 3.A user who likes documentaries.

In [23]:
user_id = max_user_id + 3
print("User id: ", user_id)

User id:  1097561


In [24]:
dict_user = {'user_id': [user_id],
        'age': ['age_25_34'],
        'income': ['income_60_90'],
        'sex': ['Ж'],
        'kids_flg': ['0']
       }

user_3 = pd.DataFrame(dict_user)

In [25]:
dict_interaction = {'user_id': [user_id, user_id, user_id],
        'item_id': [10111, 3454, 14654],
        'last_watch_dt': random_dates(min_date, max_date, 3),
        'total_dur': [67, 40, 95],
        'watched_pct': [69., 42., 100.]
       }
       
interaction_user_3 = pd.DataFrame(dict_interaction)

In [26]:
items.query('item_id in [10111, 3454, 14654]')

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
51,10111,film,Андрей Тарковский. Кино как молитва,Andrey Tarkovsky. A Cinema Prayer,2019.0,документальное,"Россия, Италия, Швеция",,12.0,,Андрей А. Тарковский,,Рассказ режиссера о самом себе. В основе фильм...,"Советский союз, тоскана, италия, портрет худож..."
204,14654,film,После правды: Дезинформация и цена фейк ньюс,After Truth: Disinformation and the Cost of Fa...,2020.0,документальное,США,,18.0,HBO,Эндрю Росси,"Александр Кит, Грег Эбботт, Джеймс Алефантис","Документальный проект, исследующий феномен «фа...","После, правды, Дезинформация, цена, фейк, ньюс..."
586,3454,film,Луис навсегда,"Siempre, Luis",2020.0,документальное,США,,18.0,HBO,,"Лин-Мануэль Миранда, Луис Миранда",Трогательный документальный фильм о Луисе Мира...,"Луис, навсегда, 2020, США"


In [27]:
users = pd.concat([users, user_3]).reset_index(drop=True)
interactions = pd.concat([interactions, interaction_user_3]).reset_index(drop=True)

# Items

In [28]:
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [29]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15963 entries, 0 to 15962
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   item_id       15963 non-null  int64  
 1   content_type  15963 non-null  object 
 2   title         15963 non-null  object 
 3   title_orig    11218 non-null  object 
 4   release_year  15865 non-null  float64
 5   genres        15963 non-null  object 
 6   countries     15926 non-null  object 
 7   for_kids      566 non-null    float64
 8   age_rating    15961 non-null  float64
 9   studios       1065 non-null   object 
 10  directors     14454 non-null  object 
 11  actors        13344 non-null  object 
 12  description   15961 non-null  object 
 13  keywords      15540 non-null  object 
dtypes: float64(3), int64(1), object(10)
memory usage: 1.7+ MB


# Interactions

In [30]:
interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [31]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476260 entries, 0 to 5476259
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   user_id        int64  
 1   item_id        int64  
 2   last_watch_dt  object 
 3   total_dur      int64  
 4   watched_pct    float64
dtypes: float64(1), int64(3), object(1)
memory usage: 208.9+ MB


# Preprocess

In [32]:
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')   #bringing the date to a single format

In [33]:
max_date = interactions[Columns.Datetime].max()
print("Max date of interactions: ", max_date)

Max date of interactions:  2021-08-22 00:00:00


In [34]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)   #if the user viewed more than 10%, then the weight of interactions = 3, otherwise 1

In [35]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()   #data separation, test - data about the last week 

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985278, 6)
test: (490982, 6)


In [36]:
cold_users = set(test[Columns.User]) - set(train[Columns.User])   #filter out cold users from the test
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

kion dataset download: 100%|██████████| 78.8M/78.8M [00:20<00:00, 17.1MiB/s]

## Add cold users

To avoid the problem with cold users, we will take one interaction each and add it to the train.

In [37]:
list_cold_users = list(cold_users)
df_cold = interactions[interactions['user_id'].isin(list_cold_users)].copy()
df_cold = df_cold.drop_duplicates(subset = "user_id")
df_cold['last_watch_dt'].loc[(df_cold['last_watch_dt'] > min_date)] = min_date

train = pd.concat([train, df_cold]).reset_index(drop=True)

# Prepare features

## User features

In [38]:
users.isnull().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [39]:
users.fillna('Unknown', inplace=True)

In [40]:
users.nunique()

user_id     840200
age              7
income           7
sex              3
kids_flg         4
dtype: int64

In [41]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [42]:
user_features_frames = []
for feature in ["sex", "age"]:    #as features, we take the sex  and age of the user
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [43]:
user_features.query(f"id == 973171")

Unnamed: 0,id,value,feature
0,973171,М,sex
0,973171,age_25_34,age


## Item features

In [44]:
items.isnull().sum()

item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [45]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [46]:
items.nunique()

item_id         15619
content_type        2
title           14980
title_orig      10413
release_year      105
genres           2720
countries         677
for_kids            2
age_rating          6
studios            38
directors        7834
actors          12717
description     15279
keywords        15158
dtype: int64

## Genre

In [47]:
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")       #explode genres to flatten table
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


## Content

In [48]:
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
item_features = pd.concat((genre_feature, content_feature))

In [49]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


# Metrics

In [50]:
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

# Models

In [51]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 8
N_FACTORS = (20, 30)
N_EPOCHS = (6, ) 
USER_ALPHA = [0, 0.1] 
ITEM_ALPHA = [0, 0.1] 
LEARNING_RATE = 0.05 

In [52]:
models = {
    'popular': PopularModel(),
}

In [53]:
implicit_models = {
    'ALS': AlternatingLeastSquares,
}

for implicit_name, implicit_model in implicit_models.items():
    for is_fitting_features in (True, False):
        for n_factors in N_FACTORS:
            models[f"{implicit_name}_n_factors:{n_factors}_is_fitting_features:{is_fitting_features}"] = (
                ImplicitALSWrapperModel(
                    model=implicit_model(
                        factors=n_factors, 
                        random_state=RANDOM_STATE, 
                        num_threads=NUM_THREADS,
                    ),
                    fit_features_together=is_fitting_features,
                )
            )



In [54]:
lightfm_losses = ('bpr', 'warp') 

for n_epoch in N_EPOCHS:
  for user_alpha in USER_ALPHA:
    for item_alpha in ITEM_ALPHA:
      for loss in lightfm_losses:
          for n_factors in N_FACTORS:
              models[f"LightFM_{loss}_n_factors:{n_factors}_user_alpha:{user_alpha}_item_alpha:{item_alpha}_n_epoch:{n_epoch}"] = LightFMWrapperModel(
                  LightFM(
                      no_components=n_factors, 
                      loss=loss, 
                      random_state=RANDOM_STATE,
                      learning_rate=LEARNING_RATE,
                      user_alpha=user_alpha,
                      item_alpha=item_alpha,
                  ),
                  epochs=n_epoch,
                  num_threads=NUM_THREADS,
              )

In [55]:
%%time
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 1.85 s, sys: 20.9 ms, total: 1.87 s
Wall time: 1.89 s


In [56]:
test_users = test[Columns.User].unique()

In [57]:
%%time
results = []
for model_name, model in models.items():
    print(f"Fitting model {model_name}...")
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=test_users,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

Fitting model popular...
Fitting model ALS_n_factors:20_is_fitting_features:True...
Fitting model ALS_n_factors:30_is_fitting_features:True...
Fitting model ALS_n_factors:20_is_fitting_features:False...
Fitting model ALS_n_factors:30_is_fitting_features:False...
Fitting model LightFM_bpr_n_factors:20_user_alpha:0_item_alpha:0_n_epoch:6...
Fitting model LightFM_bpr_n_factors:30_user_alpha:0_item_alpha:0_n_epoch:6...
Fitting model LightFM_warp_n_factors:20_user_alpha:0_item_alpha:0_n_epoch:6...
Fitting model LightFM_warp_n_factors:30_user_alpha:0_item_alpha:0_n_epoch:6...
Fitting model LightFM_bpr_n_factors:20_user_alpha:0_item_alpha:0.1_n_epoch:6...
Fitting model LightFM_bpr_n_factors:30_user_alpha:0_item_alpha:0.1_n_epoch:6...
Fitting model LightFM_warp_n_factors:20_user_alpha:0_item_alpha:0.1_n_epoch:6...
Fitting model LightFM_warp_n_factors:30_user_alpha:0_item_alpha:0.1_n_epoch:6...
Fitting model LightFM_bpr_n_factors:20_user_alpha:0.1_item_alpha:0_n_epoch:6...
Fitting model LightFM

In [58]:
df_quality = pd.DataFrame(results).T

df_quality.columns = df_quality.iloc[0]

df_quality.drop('model', inplace=True)

In [59]:
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,popular,ALS_n_factors:20_is_fitting_features:True,ALS_n_factors:30_is_fitting_features:True,ALS_n_factors:20_is_fitting_features:False,ALS_n_factors:30_is_fitting_features:False,LightFM_bpr_n_factors:20_user_alpha:0_item_alpha:0_n_epoch:6,LightFM_bpr_n_factors:30_user_alpha:0_item_alpha:0_n_epoch:6,LightFM_warp_n_factors:20_user_alpha:0_item_alpha:0_n_epoch:6,LightFM_warp_n_factors:30_user_alpha:0_item_alpha:0_n_epoch:6,LightFM_bpr_n_factors:20_user_alpha:0_item_alpha:0.1_n_epoch:6,LightFM_bpr_n_factors:30_user_alpha:0_item_alpha:0.1_n_epoch:6,LightFM_warp_n_factors:20_user_alpha:0_item_alpha:0.1_n_epoch:6,LightFM_warp_n_factors:30_user_alpha:0_item_alpha:0.1_n_epoch:6,LightFM_bpr_n_factors:20_user_alpha:0.1_item_alpha:0_n_epoch:6,LightFM_bpr_n_factors:30_user_alpha:0.1_item_alpha:0_n_epoch:6,LightFM_warp_n_factors:20_user_alpha:0.1_item_alpha:0_n_epoch:6,LightFM_warp_n_factors:30_user_alpha:0.1_item_alpha:0_n_epoch:6,LightFM_bpr_n_factors:20_user_alpha:0.1_item_alpha:0.1_n_epoch:6,LightFM_bpr_n_factors:30_user_alpha:0.1_item_alpha:0.1_n_epoch:6,LightFM_warp_n_factors:20_user_alpha:0.1_item_alpha:0.1_n_epoch:6,LightFM_warp_n_factors:30_user_alpha:0.1_item_alpha:0.1_n_epoch:6
Precision@1,0.073308,0.085513,0.085513,0.06389,0.06389,0.023639,0.024146,0.078311,0.080079,0.0,0.0,1.7e-05,8e-06,0.003933,0.002539,2.5e-05,8e-06,0.0,8e-06,0.078411,0.078577
Recall@1,0.038149,0.045159,0.045159,0.032618,0.032618,0.012638,0.012558,0.039713,0.040779,0.0,0.0,3e-06,1e-06,0.001846,0.001248,1.3e-05,3e-06,0.0,1e-06,0.041624,0.041699
Precision@2,0.069263,0.073383,0.073383,0.056078,0.056078,0.018142,0.018844,0.067023,0.066807,0.0,8e-06,3.3e-05,4e-06,0.00412,0.002419,2.1e-05,4e-06,0.0,4e-06,0.06925,0.069786
Recall@2,0.071011,0.075109,0.075109,0.055862,0.055862,0.018933,0.019309,0.066532,0.065855,0.0,3e-06,1.6e-05,1e-06,0.00386,0.002365,1.6e-05,3e-06,0.0,1e-06,0.071002,0.071801
Precision@3,0.066225,0.062657,0.062657,0.051914,0.051914,0.015184,0.015815,0.059523,0.059103,3e-06,1.1e-05,3.6e-05,8e-06,0.004257,0.002417,1.4e-05,3e-06,0.0,3e-06,0.066175,0.065967
Recall@3,0.1004,0.094391,0.094391,0.076889,0.076889,0.023232,0.023607,0.087754,0.086371,4e-06,5e-06,2.1e-05,1.2e-05,0.005951,0.003567,1.6e-05,3e-06,0.0,1e-06,0.100329,0.100503
Precision@4,0.059383,0.05507,0.05507,0.047853,0.047853,0.013224,0.013855,0.053606,0.053214,6e-06,1.2e-05,3.7e-05,6e-06,0.003499,0.003033,1e-05,4e-06,0.0,2e-06,0.059294,0.057373
Recall@4,0.118878,0.109322,0.109322,0.093312,0.093312,0.026443,0.027134,0.103837,0.102759,9e-06,7e-06,3.4e-05,1.2e-05,0.006465,0.006522,1.6e-05,1.1e-05,0.0,1e-06,0.118715,0.114912
Precision@5,0.052735,0.048726,0.048726,0.044063,0.044063,0.011879,0.012576,0.048472,0.048319,1.2e-05,2.3e-05,3.3e-05,7e-06,0.003045,0.002879,1e-05,8e-06,0.0,2e-06,0.052511,0.052307
Recall@5,0.130473,0.119502,0.119502,0.105857,0.105857,0.029258,0.030222,0.11614,0.11559,2e-05,2.1e-05,3.5e-05,1.2e-05,0.007046,0.007852,1.7e-05,2e-05,0.0,1e-06,0.12993,0.129515


## Approximate Nearest Neighbors

In [60]:
user_embeddings, item_embeddings = model.get_vectors(dataset)

user_embeddings.shape, item_embeddings.shape

((962182, 32), (15619, 32))

In [61]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [62]:
print('Pre shape items: ', item_embeddings.shape)

max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)

print('Shape items after augmented: ', augmented_item_embeddings.shape)

Pre shape items:  (15619, 32)
Shape items after augmented:  (15619, 33)


In [63]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)

print('Shape users after augmented: ', augmented_user_embeddings.shape)

Shape users after augmented:  (962182, 33)


### Examples of user embeddings and item embeddings

In [64]:
user_id = 30

print('User embeddings for ', user_id)
user_embeddings[user_id]

User embeddings for  30


array([-8.98748476e-06,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00])

In [65]:
print('User augmented embeddings for ', user_id)
augmented_user_embeddings[user_id]

User augmented embeddings for  30


array([-8.98748476e-06,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00])

In [66]:
item_id = 0

print('Item embeddings for ', item_id)
item_embeddings[item_id]

Item embeddings for  0


array([ 1.00000000e+00, -1.03900925e-06,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00])

In [67]:
print('Item augmented embeddings for ', item_id)
augmented_item_embeddings[item_id]

Item augmented embeddings for  0


array([ 1.00000000e+00, -1.03900925e-06,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        1.78602710e-06])

In [68]:
#set index parameters
M = 48
efC = 100

num_threads = 4
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}

print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}


In [69]:
K = 10   #number of neighbors 

space_name = 'negdotprod'   #used for brute-force search

In [70]:
#intitialize the library, specify the space, the type of the vector and add data points 
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 

15619

In [71]:
#create an index
start = time.time()
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params) 
end = time.time() 

print('Index-time parameters', index_time_params)

print('Indexing time = %f' % (end-start))

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100}
Indexing time = 23.439706


In [72]:
#setting query-time parameters
efS = 100
query_time_params = {'efSearch': efS}

print('Setting query-time parameters', query_time_params) 

index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [73]:
query_matrix = augmented_user_embeddings[:1000, :]

In [74]:
# Querying
query_qty = query_matrix.shape[0]
start = time.time() 
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.time() 

print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) 

kNN time total=0.065958 (sec), per query=0.000066 (sec), per query adjusted for thread number=0.000264 (sec)


In [75]:
nbrs[0]

(array([ 136, 1740, 1810, 1718,  806, 1367,   96,   14, 1383, 1068],
       dtype=int32),
 array([6.4693395e-06, 6.6390999e-06, 6.6727857e-06, 6.6913003e-06,
        6.6985122e-06, 6.7161727e-06, 6.7268497e-06, 6.7268552e-06,
        6.7377132e-06, 6.7395817e-06], dtype=float32))

# The recommendations received for the three users we added earlier

## User 1 (who likes comedies and melodramas)

#### Model popular

In [76]:
recos = models['popular'].recommend(
        users=[1097559, 1097560, 1097561],
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
  )

recos.merge(items[['item_id', 'title', 'genre']], on='item_id', how='left')

Unnamed: 0,user_id,item_id,score,rank,title,genre
0,1097559,10440,194098.0,1,Хрустальный,"[триллеры, детективы]"
1,1097559,15297,183474.0,2,Клиника счастья,"[драмы, мелодрамы]"
2,1097559,9728,122263.0,3,Гнев человеческий,"[боевики, триллеры]"
3,1097559,13865,115906.0,4,Девятаев,"[драмы, военные, приключения]"
4,1097559,4151,86395.0,5,Секреты семейной жизни,[комедии]
5,1097559,3734,69971.0,6,Прабабушка легкого поведения,[комедии]
6,1097559,2657,66962.0,7,Подслушано,"[драмы, триллеры]"
7,1097559,4880,53324.0,8,Афера,[комедии]
8,1097559,142,42864.0,9,Маша,"[драмы, триллеры]"
9,1097559,6809,39551.0,10,Дуров,[документальное]


In [81]:
recos = models['LightFM_warp_n_factors:20_user_alpha:0.1_item_alpha:0.1_n_epoch:6'].recommend(
        users=[1097559],
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
  )

recos.merge(items[['item_id', 'title', 'genre']], on='item_id', how='left')

Unnamed: 0,user_id,item_id,score,rank,title,genre
0,1097559,15297,-348.624112,1,Клиника счастья,"[драмы, мелодрамы]"
1,1097559,4151,-348.791454,2,Секреты семейной жизни,[комедии]
2,1097559,5424,-349.024323,3,Марафон желаний,"[мелодрамы, комедии]"
3,1097559,10440,-349.113923,4,Хрустальный,"[триллеры, детективы]"
4,1097559,3734,-349.221212,5,Прабабушка легкого поведения,[комедии]
5,1097559,1189,-349.25142,6,Привычка расставаться,"[мелодрамы, комедии]"
6,1097559,14982,-349.259405,7,Пока свадьба не разлучит нас,"[мелодрамы, комедии]"
7,1097559,7144,-349.259965,8,Срочно выйду замуж,"[мелодрамы, комедии]"
8,1097559,14809,-349.302331,9,Красотка,"[мелодрамы, комедии]"
9,1097559,933,-349.318312,10,Статус: Свободен,"[мелодрамы, комедии]"


We see that 9 out of 10 recommendations contain genres of melodrama or comedy. Also, the recommendations do not repeat the popular ones and are individual for the user.

## User 2 (who likes thriller series)

In [82]:
recos = models['LightFM_warp_n_factors:20_user_alpha:0.1_item_alpha:0.1_n_epoch:6'].recommend(
        users=[1097560],
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
  )
recos.merge(items[['item_id', 'title', 'genre', 'content_type']], 
       on='item_id',
       how='left')

Unnamed: 0,user_id,item_id,score,rank,title,genre,content_type
0,1097560,10440,-314.549195,1,Хрустальный,"[триллеры, детективы]",series
1,1097560,15297,-314.776322,2,Клиника счастья,"[драмы, мелодрамы]",series
2,1097560,8346,-315.133061,3,Одиночка,[детективы],series
3,1097560,947,-315.274269,4,Плацента (Правило лабиринта),[триллеры],series
4,1097560,4151,-315.311079,5,Секреты семейной жизни,[комедии],series
5,1097560,11640,-315.311307,6,Преступление,[детективы],series
6,1097560,9996,-315.36291,7,Немцы,[драмы],series
7,1097560,16406,-315.380046,8,Мистер Мерседес,"[фантастика, триллеры, детективы]",series
8,1097560,12615,-315.410011,9,Вызов,"[боевики, драмы, фантастика]",series
9,1097560,2916,-315.411787,10,Инквизитор,"[триллеры, детективы]",series


We see that 4 out of 10 recommendations contain the thriller genre and mostly recommend series, not movies.

## User 3 (wholikes documentaries)

In [84]:
recos = models['LightFM_warp_n_factors:20_user_alpha:0.1_item_alpha:0.1_n_epoch:6'].recommend(
        users=[1097561],
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
  )
recos.merge(items[['item_id', 'title', 'genre']], on='item_id', how='left')

Unnamed: 0,user_id,item_id,score,rank,title,genre
0,1097561,4740,-339.426893,1,Сахаров. Две жизни,[документальное]
1,1097561,6809,-339.593363,2,Дуров,[документальное]
2,1097561,16201,-339.833234,3,[4К] Сахаров. Две жизни,[документальное]
3,1097561,4345,-339.943316,4,Съесть слона,"[комедии, документальное]"
4,1097561,14526,-340.037365,5,Спорт XX века,[документальное]
5,1097561,2223,-340.11823,6,Как попасть в «Содержанки»,[документальное]
6,1097561,3886,-340.119118,7,[4К] Вид сверху. Корсика. Остров красоты,[документальное]
7,1097561,10586,-340.134167,8,[4К] Вид сверху. Канадская Арктика. Царство льда,[документальное]
8,1097561,2682,-340.148126,9,Бессмертие человека,[документальное]
9,1097561,12574,-340.206615,10,Into_нация большой Одессы,[документальное]


We see that 10 out of 10 recommendations contain the genre of documentary.