# Download and import libraries.

In [1]:
%%capture
!pip install rectools
!pip install optuna

In [71]:
import requests
import datetime
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from pprint import pprint
from pathlib import Path
import dill

from rectools.metrics import MAP, Recall, calc_metrics
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools import Columns
from rectools.dataset import Dataset

from lightfm import LightFM
from implicit.als import AlternatingLeastSquares
import nmslib

import optuna

import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

import warnings
warnings.filterwarnings('ignore')

# Download KION dataset 

In [3]:
# download dataset by chunks
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [4]:
!unzip kion_train.zip

Archive:  kion_train.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


In [5]:
!rm -rf kion_train.zip

In [6]:
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

In [7]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                             'total_dur': Columns.Weight}, inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

# Preparing data

In [8]:
max_date = interactions[Columns.Datetime].max().normalize()

train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 5)
test: (490982, 5)


In [9]:
# filter out cold users from the test
cold_users = set(test[Columns.User]) - set(train[Columns.User])

test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

# Preparing features

## Users features

In [10]:
# How many % of empty fields in Users
users.isnull().sum()/len(users)

user_id     0.000000
age         0.016776
income      0.017586
sex         0.016462
kids_flg    0.000000
dtype: float64

In [11]:
# % not large => replace them with empty values
users.fillna('Unknown', inplace=True)

In [12]:
# leave only those who are in the train
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
users.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0


In [13]:
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


## Items Features

In [14]:
# How many % of empty fields in Item
items.isnull().sum()/len(items) * 100

item_id          0.000000
content_type     0.000000
title            0.000000
title_orig      29.724989
release_year     0.613920
genres           0.000000
countries        0.231786
for_kids        96.454301
age_rating       0.012529
studios         93.328322
directors        9.453110
actors          16.406690
description      0.012529
keywords         2.649878
dtype: float64

In [15]:
# leave only those who are in the train
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

### Genre

In [16]:
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [17]:
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
content_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


### Country

In [18]:
items["country"] = items["countries"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
country_feature = items[["item_id", "country"]].explode("country")
country_feature.columns = ["id", "value"]
country_feature["feature"] = "country"
country_feature.head()

Unnamed: 0,id,value,feature
0,10711,испания,country
1,2508,сша,country
2,10716,канада,country
3,7868,великобритания,country
4,16268,ссср,country


### Release year

In [19]:
items['binned_r_year'] = pd.qcut(items['release_year'], q=10, labels=list(range(10)))

In [20]:
year_feature = items.reindex(columns=[Columns.Item, "binned_r_year"])
year_feature.columns = ["id", "value"]
year_feature["feature"] = "binned_r_year"
year_feature.head()

Unnamed: 0,id,value,feature
0,10711,1,binned_r_year
1,2508,4,binned_r_year
2,10716,3,binned_r_year
3,7868,5,binned_r_year
4,16268,0,binned_r_year


### Combining features

In [21]:
item_features = pd.concat((genre_feature, content_feature, country_feature, year_feature))
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,7,binned_r_year
15959,2367,8,binned_r_year
15960,10632,6,binned_r_year
15961,4538,7,binned_r_year


# Init dataset

In [22]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "country", "binned_r_year"],
)

# Hyperparametrs via Optuna

## General parameters

In [23]:
TEST_USERS = test[Columns.User].unique()
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 1
N_EPOCHS = 2

# Metric
METRIC_MAP = MAP(k=K_RECOS)

## LightFM

In [24]:
def objective_lightFM(trial):
    '''Objective LifgtFM for optuna'''

    # Parameters for LightFM
    N_FACTORS= trial.suggest_int("n_factors", low=32, high=64, step=32)
    LOSS = trial.suggest_categorical("loss", choices=['bpr', 'warp'])
    LEARNING_RATE = trial.suggest_float("learning_rate", low=0.05, high=0.25, step=0.05)
    ITEM_ALPHA = trial.suggest_float("item_alpha", low=0.0, high=0.1, step=0.05)
    USER_ALPHA= trial.suggest_float("user_alpha", low=0.0, high=0.1, step=0.05)

    # Initialization of Lightfm
    LIGHT_FM = LightFMWrapperModel(
        model=LightFM(
            no_components=N_FACTORS,
            loss=LOSS,
            learning_rate=LEARNING_RATE,
            random_state=RANDOM_STATE,
            user_alpha=USER_ALPHA,
            item_alpha=ITEM_ALPHA,
        ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
    )

    # Model training
    LIGHT_FM.fit(dataset)

    # Make recommendations for users in test
    recos = LIGHT_FM.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

    # MAP@K
    metric_values = METRIC_MAP.calc(recos, test)

    return metric_values

In [25]:
def save_best_trial_lightFM(trial):
    # Use same code objective to reproduce the best model
    N_FACTORS= trial.suggest_int("n_factors", low=32, high=64, step=32)
    LOSS = trial.suggest_categorical("loss", choices=['bpr', 'warp'])
    LEARNING_RATE = trial.suggest_float("learning_rate", low=0.05, high=0.25, step=0.05)
    ITEM_ALPHA = trial.suggest_float("item_alpha", low=0.0, high=0.1, step=0.05)
    USER_ALPHA= trial.suggest_float("user_alpha", low=0.0, high=0.1, step=0.05)

    LIGHT_FM = LightFMWrapperModel(
      model=LightFM(
        no_components=N_FACTORS, 
        loss=LOSS, 
        random_state=RANDOM_STATE,
        learning_rate=LEARNING_RATE,
        user_alpha=USER_ALPHA,
        item_alpha=ITEM_ALPHA,
      ),
      epochs=N_EPOCHS + 1,
      num_threads=NUM_THREADS,
    )

    LIGHT_FM.fit(dataset)

    with open('lightfm.dill', 'wb') as f:
        dill.dump(LIGHT_FM.model, f)

In [26]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_lightFM, n_trials=10)

pprint(f"Number of finished trials: {len(study.trials)}")
trial = study.best_trial
pprint(f"Best trial: {trial}")

[32m[I 2022-12-08 09:16:12,873][0m A new study created in memory with name: no-name-f63c1a94-5c1e-4304-b42d-076bfe3f5a61[0m
[32m[I 2022-12-08 09:19:55,971][0m Trial 0 finished with value: 7.744283750556621e-07 and parameters: {'n_factors': 64, 'loss': 'bpr', 'learning_rate': 0.1, 'item_alpha': 0.05, 'user_alpha': 0.0}. Best is trial 0 with value: 7.744283750556621e-07.[0m
[32m[I 2022-12-08 09:22:37,463][0m Trial 1 finished with value: 0.0590403412591852 and parameters: {'n_factors': 64, 'loss': 'warp', 'learning_rate': 0.1, 'item_alpha': 0.0, 'user_alpha': 0.0}. Best is trial 1 with value: 0.0590403412591852.[0m
[32m[I 2022-12-08 09:25:40,040][0m Trial 2 finished with value: 0.0 and parameters: {'n_factors': 32, 'loss': 'bpr', 'learning_rate': 0.2, 'item_alpha': 0.05, 'user_alpha': 0.1}. Best is trial 1 with value: 0.0590403412591852.[0m
[32m[I 2022-12-08 09:27:58,848][0m Trial 3 finished with value: 0.07680311170477117 and parameters: {'n_factors': 32, 'loss': 'warp', 'l

'Number of finished trials: 10'
('Best trial: FrozenTrial(number=3, values=[0.07680311170477117], '
 'datetime_start=datetime.datetime(2022, 12, 8, 9, 25, 40, 41880), '
 'datetime_complete=datetime.datetime(2022, 12, 8, 9, 27, 58, 848483), '
 "params={'n_factors': 32, 'loss': 'warp', 'learning_rate': 0.2, 'item_alpha': "
 "0.1, 'user_alpha': 0.1}, distributions={'n_factors': "
 "IntDistribution(high=64, log=False, low=32, step=32), 'loss': "
 "CategoricalDistribution(choices=('bpr', 'warp')), 'learning_rate': "
 "FloatDistribution(high=0.25, log=False, low=0.05, step=0.05), 'item_alpha': "
 "FloatDistribution(high=0.1, log=False, low=0.0, step=0.05), 'user_alpha': "
 'FloatDistribution(high=0.1, log=False, low=0.0, step=0.05)}, user_attrs={}, '
 'system_attrs={}, intermediate_values={}, trial_id=3, '
 'state=TrialState.COMPLETE, value=None)')


In [27]:
save_best_trial_lightFM(study.best_trials[0])

In [28]:
with open('lightfm.dill', 'rb') as f:
    assert type(dill.load(f)) == LightFM

## ALS

In [29]:
def objective_ALS(trial):
    '''Objective ALS for optuna'''

    # Parameters for ALS
    NUM_FACTORS = trial.suggest_int('factors', low=32, high=64, step=32)
    REG_COEFS= trial.suggest_float('regularization', low=0.01, high=0.51, step=0.1)
    
    # Initialization of ALS
    ALS = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=NUM_FACTORS,
            regularization=REG_COEFS,
            random_state=RANDOM_STATE,
        ),
        fit_features_together=True,
    )
    
    # Model training
    ALS.fit(dataset)
    
    # Make recommendations for users in test
    recos = ALS.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    
    # MAP@K
    metric_values = METRIC_MAP.calc(recos, test)
    
    return metric_values

In [30]:
def save_best_trial_ALS(trial):
    # Use same code objective to reproduce the best model
    NUM_FACTORS = trial.suggest_int('factors', low=32, high=64, step=32)
    REG_COEFS= trial.suggest_float('regularization', low=0.01, high=0.51, step=0.1)
    
    ALS = ImplicitALSWrapperModel(
      model=AlternatingLeastSquares(
        factors=NUM_FACTORS, 
        regularization=REG_COEFS,
        random_state=RANDOM_STATE, 
      ),
      fit_features_together=True,
    )

    ALS.fit(dataset)

    with open('implicit_als.dill', 'wb') as f:
        dill.dump(ALS.model, f)
     

In [31]:
# запуск подбора гиперпараметров
study = optuna.create_study(direction="maximize")
study.optimize(objective_ALS, n_trials=10)

pprint(f"Number of finished trials: {len(study.trials)}")
trial = study.best_trial
pprint(f"Best trial: {trial}")

[32m[I 2022-12-08 09:49:02,408][0m A new study created in memory with name: no-name-11e59734-3674-4b60-b675-491d462f9084[0m
[32m[I 2022-12-08 09:51:59,571][0m Trial 0 finished with value: 0.07937983219034522 and parameters: {'factors': 32, 'regularization': 0.01}. Best is trial 0 with value: 0.07937983219034522.[0m
[32m[I 2022-12-08 09:55:22,993][0m Trial 1 finished with value: 0.07859404208176704 and parameters: {'factors': 64, 'regularization': 0.11}. Best is trial 0 with value: 0.07937983219034522.[0m
[32m[I 2022-12-08 09:58:14,382][0m Trial 2 finished with value: 0.07775498909538012 and parameters: {'factors': 32, 'regularization': 0.31000000000000005}. Best is trial 0 with value: 0.07937983219034522.[0m
[32m[I 2022-12-08 10:01:26,299][0m Trial 3 finished with value: 0.07859404208176704 and parameters: {'factors': 64, 'regularization': 0.11}. Best is trial 0 with value: 0.07937983219034522.[0m
[32m[I 2022-12-08 10:04:38,812][0m Trial 4 finished with value: 0.079380

'Number of finished trials: 10'
('Best trial: FrozenTrial(number=4, values=[0.0793800173204073], '
 'datetime_start=datetime.datetime(2022, 12, 8, 10, 1, 26, 300281), '
 'datetime_complete=datetime.datetime(2022, 12, 8, 10, 4, 38, 811789), '
 "params={'factors': 64, 'regularization': 0.21000000000000002}, "
 "distributions={'factors': IntDistribution(high=64, log=False, low=32, "
 "step=32), 'regularization': FloatDistribution(high=0.51, log=False, "
 'low=0.01, step=0.1)}, user_attrs={}, system_attrs={}, '
 'intermediate_values={}, trial_id=4, state=TrialState.COMPLETE, value=None)')


In [32]:
save_best_trial_ALS(study.best_trials[0])

In [33]:
with open('implicit_als.dill', 'rb') as f:
    assert type(dill.load(f)) == AlternatingLeastSquares

# Approximate Nearest Neighbors

In [34]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "country", "binned_r_year"],
)

In [38]:
lightFM = LightFMWrapperModel(
  model=LightFM(
    no_components=32, 
    loss='warp', 
    random_state=RANDOM_STATE,
    learning_rate=0.2,
    user_alpha=0.1,
    item_alpha=0.1,
  ),
  epochs=N_EPOCHS,
  num_threads=NUM_THREADS,
)

lightFM.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7fb5e81c8710>

## Create index

In [39]:
user_embeddings, item_embeddings = lightFM.get_vectors(dataset)
user_embeddings.shape, item_embeddings.shape

((962179, 34), (15706, 34))

In [40]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [41]:
print('pre shape: ', item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

pre shape:  (15706, 34)


(15706, 35)

In [42]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)
augmented_user_embeddings.shape

(962179, 35)

In [43]:
M = 48
K = 10
efC = 128
num_threads = 4
space_name='negdotprod'

In [44]:
%%time

index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 

index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params) 
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 128}
CPU times: user 46.2 s, sys: 391 ms, total: 46.6 s
Wall time: 25.2 s


In [45]:
# Setting query-time parameters
efS = 128
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 128}


## Get embeddings for TEST_USERS

In [47]:
def get_mapping(train_df, col):
    inv_mapping = dict(enumerate(train_df[col].unique()))
    mapping = {v: k for k, v in inv_mapping.items()}
    return inv_mapping, mapping

In [48]:
users_inv_mapping, users_mapping = get_mapping(train, 'user_id')

In [49]:
test_emb_ids = [users_mapping[user] for user in TEST_USERS]
test_emb_ids[:10]

[829461, 722089, 83024, 241680, 677649, 667175, 467124, 88243, 682987, 69878]

In [50]:
query_matrix = augmented_user_embeddings[test_emb_ids, :]

In [52]:
assert query_matrix.shape[0] == len(test_emb_ids)

## Querying

In [53]:
%%time
query_qty = query_matrix.shape[0]
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)

CPU times: user 16.4 s, sys: 141 ms, total: 16.5 s
Wall time: 9.36 s


In [54]:
item_inv_mapping, item_mapping = get_mapping(train, 'item_id')
     
recos = [[item_inv_mapping[item] for item in nbrs[i][0]] for i in range(len(nbrs))]
recos[:5]

[[12846, 2616, 11970, 7347, 14347, 7737, 372, 9617, 12173, 334],
 [12846, 2616, 11970, 7347, 14347, 7737, 372, 9617, 12173, 334],
 [12846, 2616, 11970, 7347, 14347, 7737, 372, 9617, 12173, 334],
 [12846, 2616, 11970, 7347, 14347, 7737, 372, 9617, 12173, 334],
 [12846, 2616, 11970, 7347, 14347, 7737, 372, 9617, 12173, 334]]

In [55]:
recos = pd.DataFrame(list(zip(TEST_USERS, recos)), columns=['user_id', 'item_id'])
recos.head()

Unnamed: 0,user_id,item_id
0,203219,"[12846, 2616, 11970, 7347, 14347, 7737, 372, 9..."
1,200197,"[12846, 2616, 11970, 7347, 14347, 7737, 372, 9..."
2,73446,"[12846, 2616, 11970, 7347, 14347, 7737, 372, 9..."
3,10010,"[12846, 2616, 11970, 7347, 14347, 7737, 372, 9..."
4,890735,"[12846, 2616, 11970, 7347, 14347, 7737, 372, 9..."


In [56]:
res = recos.explode('item_id')
res['rank'] = res.groupby('user_id').cumcount() + 1
res.head()

Unnamed: 0,user_id,item_id,rank
0,203219,12846,1
0,203219,2616,2
0,203219,11970,3
0,203219,7347,4
0,203219,14347,5


In [60]:
metrics = {
    "map@10": MAP(k=10),
    "recall@10": Recall(k=10),
}
  
metric_values = calc_metrics(metrics, res, test, train)
metric_values

{'recall@10': 0.00451225871428664, 'map@10': 0.0012408981395582517}

# Let's create three avatars

[example](https://github.com/sharthZ23/your-second-recsys/blob/master/lecture_2/Practice.ipynb)

In [61]:
items[items['genres'].str.contains("детективы")]['title']

0                   Поговори с ней
8               Лабиринты прошлого
21                  Смерть девушки
38                     Тайное окно
46                       Дивергент
                   ...            
15898               Триплексоголик
15904                    Налетчики
15905          [4k] Кловерфилд, 10
15913    Мотылек (жестовым языком)
15916                   До и после
Name: title, Length: 1185, dtype: object

In [62]:
titles = [
    "Тайное окно", 
    "Налетчики", 
    "До и после",
    "Поговори с ней",
    "Смерть девушки"
]
avatar_interactions_detective = pd.DataFrame({"user_id": "avatar_detective", "title": titles})
avatar_interactions_detective = avatar_interactions_detective.merge(items[["item_id", "title", "genres"]], on="title")
avatar_interactions_detective

Unnamed: 0,user_id,title,item_id,genres
0,avatar_detective,Тайное окно,1622,"триллеры, детективы"
1,avatar_detective,Налетчики,13014,"криминал, детективы, зарубежные, триллеры, бое..."
2,avatar_detective,До и после,5274,"драмы, детективы"
3,avatar_detective,Поговори с ней,10711,"драмы, зарубежные, детективы, мелодрамы"
4,avatar_detective,Поговори с ней,4717,мелодрамы
5,avatar_detective,Смерть девушки,3445,"драмы, зарубежные, триллеры, детективы"


In [63]:
items[items['genres'].str.contains("спорт")]['title']

4                             Все решает мгновение
35       Лев Яшин – Эдуард Стрельцов. Перекрестки…
221          Жиросжигающий интенсив HIIT. Выпуск 7
272                                        Дебошир
340                                       Вышибала
                           ...                    
15856                          Влюбленный скорпион
15868                                  Лови волну!
15920                                          Лёд
15938                                   Держи удар
15961                                 Среди камней
Name: title, Length: 298, dtype: object

In [64]:
titles = [
    "Дебошир", 
    "Вышибала", 
    "Лови волну!",
    "Лёд",
    "Среди камней"
]
avatar_interactions_sport = pd.DataFrame({"user_id": "avatar_sport", "title": titles})
avatar_interactions_sport = avatar_interactions_sport.merge(items[["item_id", "title", "genres"]], on="title")
avatar_interactions_sport

Unnamed: 0,user_id,title,item_id,genres
0,avatar_sport,Дебошир,849,"историческое, биография, криминал, драмы, спор..."
1,avatar_sport,Вышибала,4639,"драмы, спорт, комедии"
2,avatar_sport,Лови волну!,6467,"спорт, мультфильм, комедии"
3,avatar_sport,Лёд,13485,"драмы, спорт, мелодрамы"
4,avatar_sport,Лёд,9035,"драмы, спорт, мелодрамы"
5,avatar_sport,Среди камней,4538,"драмы, спорт, криминал"


In [65]:
items[items['genres'].str.contains("триллеры")]['title']

2          Тактическая сила
8        Лабиринты прошлого
17               Дитя крови
21           Смерть девушки
38              Тайное окно
                ...        
15923                  Анна
15939     100 дней на жизнь
15945            Чёрный лес
15958         Полярный круг
15960                Сговор
Name: title, Length: 2379, dtype: object

In [66]:
titles = [
    "Лабиринты прошлого", 
    "Полярный круг", 
    "Сговор",
    "Дитя крови",
    "Тайное окно"
]
avatar_interactions_thriller = pd.DataFrame({"user_id": "avatar_thriller", "title": titles})
avatar_interactions_thriller = avatar_interactions_thriller.merge(items[["item_id", "title", "genres"]], on="title")
avatar_interactions_thriller

Unnamed: 0,user_id,title,item_id,genres
0,avatar_thriller,Лабиринты прошлого,9853,"криминал, детективы, драмы, зарубежные, триллеры"
1,avatar_thriller,Полярный круг,6443,"драмы, триллеры, криминал"
2,avatar_thriller,Сговор,10632,"драмы, триллеры, криминал"
3,avatar_thriller,Дитя крови,6881,"зарубежные, триллеры, фэнтези, ужасы"
4,avatar_thriller,Тайное окно,1622,"триллеры, детективы"


In [67]:

train_with_avatars = pd.concat([
    train, 
    avatar_interactions_detective.drop(['title', 'genres'], axis=1),
    avatar_interactions_sport.drop(['title', 'genres'], axis=1), 
    avatar_interactions_thriller.drop(['title', 'genres'], axis=1),
    ], 
    sort=False)

train_with_avatars.tail()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,avatar_thriller,9853,NaT,,
1,avatar_thriller,6443,NaT,,
2,avatar_thriller,10632,NaT,,
3,avatar_thriller,6881,NaT,,
4,avatar_thriller,1622,NaT,,


In [72]:
train_with_avatars['datetime'] = train_with_avatars['datetime'].fillna(datetime.datetime(2021, 1, 1))
train_with_avatars['weight'] = train_with_avatars['weight'].fillna(3)
train_with_avatars['watched_pct'] = train_with_avatars['watched_pct'].fillna(80)

In [73]:
from lightfm.data import Dataset as LFMDataset

In [74]:
avatar_dataset = LFMDataset()
avatar_dataset.fit(
    users=train_with_avatars["user_id"].values,
    items=train_with_avatars["item_id"].values,
)

train_matrix, _ = avatar_dataset.build_interactions(zip(*train_with_avatars[["user_id", "item_id"]].values.T))

In [75]:
model = LightFM(
    learning_rate=0.2, 
    loss='warp', 
    no_components=32,
    random_state=RANDOM_STATE,
    user_alpha=0.1,
    item_alpha=0.1,
)

model.fit(
    interactions=train_matrix, 
    epochs=N_EPOCHS,
    num_threads=20,
);

# Recommend

In [76]:
id_item_mapping = {v: k for k, v in avatar_dataset._item_id_mapping.items()}

In [77]:
def get_n_recommendations_for_user(user_id, model, train_matrix,
                                   user_to_id, id_to_item, n_recommendations,):
  
    user_inner_id = user_to_id[user_id]
    scores = model.predict(
        user_ids=user_inner_id,
        item_ids=np.arange(train_matrix.shape[1]),
        num_threads=20
    )
    user_watched_items = train_matrix.col[train_matrix.row == user_inner_id]
    scores[user_watched_items] = -np.inf

    recommended_item_inner_ids = np.argpartition(scores, -np.arange(n_recommendations))[-n_recommendations:][::-1]
    recommended_item_ids = [id_to_item[x] for x in recommended_item_inner_ids]
    return recommended_item_ids

In [82]:
user_id = "avatar_detective"

recommended_items = get_n_recommendations_for_user(
    user_id=user_id,
    model=model,
    train_matrix=train_matrix,
    user_to_id=avatar_dataset._user_id_mapping,
    id_to_item=id_item_mapping,
    n_recommendations=K_RECOS
)
pd.DataFrame({"user_id": user_id, "item_id": recommended_items}).merge(items[["item_id", "title", "genres"]])

Unnamed: 0,user_id,item_id,title,genres
0,avatar_detective,1876,Пустой человек,"ужасы, детективы"
1,avatar_detective,606,Азиатская падчерица,для взрослых
2,avatar_detective,6193,Планета обезьян: Революция,"боевики, драмы, фантастика"
3,avatar_detective,15199,Девять ярдов (субтитры),"криминал, комедии"
4,avatar_detective,10761,Моана,"мультфильм, фэнтези, мюзиклы"
5,avatar_detective,14461,Пара из будущего,"мелодрамы, комедии"
6,avatar_detective,14317,Веном,"популярное, фантастика, триллеры, боевики, ужасы"
7,avatar_detective,3784,Маленький воин,"семейное, комедии"
8,avatar_detective,11863,Девятаев - сериал,"драмы, военные, приключения"
9,avatar_detective,10440,Хрустальный,"триллеры, детективы"


In [84]:
user_id = "avatar_sport"

recommended_items = get_n_recommendations_for_user(
    user_id=user_id,
    model=model,
    train_matrix=train_matrix,
    user_to_id=avatar_dataset._user_id_mapping,
    id_to_item=id_item_mapping,
    n_recommendations=K_RECOS
)
pd.DataFrame({"user_id": user_id, "item_id": recommended_items}).merge(items[["item_id", "title", "genres"]])

Unnamed: 0,user_id,item_id,title,genres
0,avatar_sport,1876,Пустой человек,"ужасы, детективы"
1,avatar_sport,606,Азиатская падчерица,для взрослых
2,avatar_sport,6193,Планета обезьян: Революция,"боевики, драмы, фантастика"
3,avatar_sport,15199,Девять ярдов (субтитры),"криминал, комедии"
4,avatar_sport,10761,Моана,"мультфильм, фэнтези, мюзиклы"
5,avatar_sport,14461,Пара из будущего,"мелодрамы, комедии"
6,avatar_sport,14317,Веном,"популярное, фантастика, триллеры, боевики, ужасы"
7,avatar_sport,3784,Маленький воин,"семейное, комедии"
8,avatar_sport,11863,Девятаев - сериал,"драмы, военные, приключения"
9,avatar_sport,7102,Дочь волка,"боевики, триллеры"


In [85]:
user_id = "avatar_thriller"

recommended_items = get_n_recommendations_for_user(
    user_id=user_id,
    model=model,
    train_matrix=train_matrix,
    user_to_id=avatar_dataset._user_id_mapping,
    id_to_item=id_item_mapping,
    n_recommendations=K_RECOS
)
pd.DataFrame({"user_id": user_id, "item_id": recommended_items}).merge(items[["item_id", "title", "genres"]])

Unnamed: 0,user_id,item_id,title,genres
0,avatar_thriller,1876,Пустой человек,"ужасы, детективы"
1,avatar_thriller,606,Азиатская падчерица,для взрослых
2,avatar_thriller,6193,Планета обезьян: Революция,"боевики, драмы, фантастика"
3,avatar_thriller,15199,Девять ярдов (субтитры),"криминал, комедии"
4,avatar_thriller,10761,Моана,"мультфильм, фэнтези, мюзиклы"
5,avatar_thriller,14461,Пара из будущего,"мелодрамы, комедии"
6,avatar_thriller,14317,Веном,"популярное, фантастика, триллеры, боевики, ужасы"
7,avatar_thriller,3784,Маленький воин,"семейное, комедии"
8,avatar_thriller,11863,Девятаев - сериал,"драмы, военные, приключения"
9,avatar_thriller,7102,Дочь волка,"боевики, триллеры"


# Prepare best model for inference
The best model - LightFM. We will use fitted LightFM model object from rectools to inference in the online API.

Cold users will be processed with Popular model result - constant vector of items [9728, 10440, 15297, 13865, 14488, 12192, 12360, 341, 4151, 3734]

Dump mappers

In [87]:
users_inv_mapping, users_mapping = get_mapping(train, 'user_id')
items_inv_mapping, items_mapping = get_mapping(train, 'item_id')

In [88]:
with open('users_mapping.dill', 'wb') as f:
    dill.dump(users_mapping, f)

In [89]:
with open('items_inv_mapping.dill', 'wb') as f:
    dill.dump(items_inv_mapping, f)

## Load model

In [90]:
with open('lightfm.dill', 'rb') as f:
    model = dill.load(f)

## Get recommendations

In [91]:
train.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,3,72.0
1,699317,1659,2021-05-29,3,100.0
2,656683,7107,2021-05-09,1,0.0
3,864613,7638,2021-07-05,3,100.0
4,964868,9506,2021-04-30,3,100.0


In [92]:
user_id = 699317
user_inner_idx = users_mapping[user_id]
user_biases, user_embedding = model.get_user_representations()[0][user_inner_idx], model.get_user_representations()[1][user_inner_idx]
assert user_embedding.shape[0] == 32

items_biases, items_embedding = model.get_item_representations()
user_embedding = np.hstack((user_biases, np.ones(user_biases.size), user_embedding))
items_embedding = np.hstack((np.ones((items_biases.size, 1)), items_biases[:, np.newaxis], items_embedding))

In [93]:
scores = items_embedding @ user_embedding

In [94]:
top_score_ids = scores.argsort()[-10:][::-1]
items_to_recommend = [items_inv_mapping[item] for item in top_score_ids]
items_to_recommend

[10440, 15297, 13865, 4151, 9728, 3734, 142, 12585, 99, 6809]