In [1]:
import multiprocessing
import os
import pickle
import warnings
from collections import Counter

import numpy as np
import optuna
import pandas as pd
import requests
from implicit.als import AlternatingLeastSquares
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import calc_metrics, MAP, MeanInvUserFreq, Serendipity
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.models.popular import PopularModel
from tqdm.auto import tqdm

In [2]:
warnings.filterwarnings('ignore')

# Get KION dataset

In [3]:
# download dataset by chunks
url = (
    'https://storage.yandexcloud.net/'
    'itmo-recsys-public-data/kion_train.zip'
)

req = requests.get(url, stream=True)

with open('kion_train.zip', 'wb') as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(
        desc='kion dataset download',
        total=total_size_in_bytes,
        unit='iB',
        unit_scale=True,
    )
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [4]:
!unzip -o kion_train.zip

Archive:  kion_train.zip
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


# Data utils

In [5]:
def read_data():
    interactions = pd.read_csv('kion_train/interactions.csv')
    users = pd.read_csv('kion_train/users.csv')
    items = pd.read_csv('kion_train/items.csv')

    interactions.rename(
        columns={'last_watch_dt': Columns.Datetime},
        inplace=True,
    )

    interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])
    interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)
    
    users.fillna('Unknown', inplace=True)

    items['genre_list'] = items['genres'].apply(lambda genres: genres.split(', '))
    genre_counter = Counter(items['genre_list'].explode())
    genre_names = list(map(lambda x: x[0], genre_counter.most_common(10)))
    for genre_name in genre_names:
        is_current_genre = []
        for genre_list in items['genres']:
            is_current_genre.append(int(genre_name in genre_list))
        items[genre_name] = is_current_genre

    return interactions, users, items

In [6]:
def train_test_split(df, filter_cold_users, time_delta):
    max_date = df[Columns.Datetime].max()
    train = df[df[Columns.Datetime] < max_date - time_delta].copy()
    test = df[df[Columns.Datetime] >= max_date - time_delta].copy()
    
    if filter_cold_users:
        cold_users = set(test[Columns.User]) - set(train[Columns.User])
        test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)
        
    return train, test

In [7]:
def get_user_features(users, features):
    user_features_frames = []
    for feature in features:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ['id', 'value']
        feature_frame['feature'] = feature
        user_features_frames.append(feature_frame)
    return pd.concat(user_features_frames)

def get_item_features(items, features):
    item_features_frames = []
    for feature in features:
        feature_frame = items.reindex(columns=[Columns.Item, feature])
        feature_frame.columns = ['id', 'value']
        feature_frame['feature'] = feature
        item_features_frames.append(feature_frame)
    return pd.concat(item_features_frames)

# Objective class

In [8]:
class Objective(object):
    def __init__(
        self,
        dataset,
        train,
        test,
        metrics,
        objective_metric,
        seed=42,
    ):
        self.dataset = dataset
        self.train = train
        self.test = test
        self.metrics = metrics
        self.objective_metric = objective_metric
        self.seed = seed
        self.trials = []
        self.catalog = self.train[Columns.Item].unique()

    def __call__(self, trial):
        model = self.get_model(trial)
        
        for past_trial in trial.study.trials:
            if past_trial.state != optuna.trial.TrialState.COMPLETE:
                continue

            if past_trial.params == trial.params:
                return past_trial.value
        
        print('Params: {0}'.format(trial.params))
        
        model.fit(self.dataset)

        test_users = self.test[Columns.User].unique()

        recos = model.recommend(
            users=test_users,
            dataset=self.dataset,
            k=10,
            filter_viewed=True,
        )

        metric_values = calc_metrics(
            self.metrics, recos, self.test, self.train, self.catalog,
        )

        report = trial.params.copy()
        report.update(metric_values)

        self.trials.append(report)

        return metric_values[self.objective_metric]

    def get_model(self, trial):
        model_type = trial.suggest_categorical('model', ['ALS', 'FM', 'Popular'])
        if model_type == 'ALS':
            return self._get_als_model(trial)
        elif model_type == 'FM':
            return self._get_fm_model(trial)
        else:
            return PopularModel()

    def _get_als_model(self, trial):
        n_components = trial.suggest_int('n_components', 1, 64)
        fit_features_together = trial.suggest_categorical(
            'fit_features_together', [True, False],
        )
        
        regularization = trial.suggest_float('regularization', 0, 0.1)

        return ImplicitALSWrapperModel(
            model=AlternatingLeastSquares(
                factors=n_components,
                regularization=regularization,
                random_state=self.seed,
                num_threads=multiprocessing.cpu_count(),
            ),
            fit_features_together=fit_features_together,
        )

    def _get_fm_model(self, trial):
        n_components = trial.suggest_int('n_components', 1, 64)
        loss = trial.suggest_categorical(
            'loss', ['bpr', 'warp'],
        )
        
        user_alpha = trial.suggest_float('user_alpha', 0, 0.1)
        item_alpha = trial.suggest_float('item_alpha', 0, 0.1)

        return LightFMWrapperModel(
            LightFM(
                no_components=n_components,
                loss=loss,
                random_state=self.seed,
                user_alpha=user_alpha,
                item_alpha=item_alpha,
            ),
            epochs=1,
            num_threads=multiprocessing.cpu_count(),
        )

# Model saving

In [9]:
def save_model(model, dirname, model_type, dataset, interactions):
    if model_type == 'ALS':
        user_embeddings, item_embeddings = best_model.get_vectors()
    else:
        user_embeddings, item_embeddings = best_model.get_vectors(dataset)
        
    if not os.path.isdir(dirname):
        os.mkdir(dirname)
        
    attr_values = [user_embeddings, item_embeddings, interactions]
    attr_names = ['user_embeddings', 'item_embeddings', 'interactions']
    
    for attr_value, attr_name in zip(attr_values, attr_names):
        attr_filename = os.path.join(
            dirname, '{0}.pickle'.format(attr_name),
        )
        with open(attr_filename, 'wb') as attr_file:
            pickle.dump(attr_value, attr_file)

In [10]:
interactions, users, items = read_data()

In [11]:
# Создадим автаров
avatar_items = [
    # Отечественные комедийные сериалы
    ['Кухня', 'Сваты', 'Ивановы'],
    # Детский контент
    ['Ми-ми-мишки', 'Мама для мамонтёнка', 'Дядюшка Ау'],
    # Боевики, выпущенные до 2000 года
    ['Терминатор', 'Рэмбо: Первая кровь', 'Робокоп']
]
avatar_ids = []


avatar_interactions = []
for idx in range(len(avatar_items)):
    avatar_id = interactions[Columns.User].max() + idx + 1
    avatar_ids.append(avatar_id)
    for title in avatar_items[idx]:
        datetime = interactions[Columns.Datetime].min()
        item_id = items[items['title'] == title][Columns.Item].iloc[0]
        avatar_interactions.append([avatar_id, item_id, datetime, 0, 0, 3])

# Добавим аватров в датафрейм
interactions = pd.concat(
    [
        interactions,
        pd.DataFrame(avatar_interactions, columns=interactions.columns),
    ],
    ignore_index=True,
)

In [12]:
# Разобьем выборку
train, test = train_test_split(interactions, True, pd.Timedelta(days=7))

# Получим признаки для итемов и пользователей
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

user_features = get_user_features(users, ['sex', 'age', 'income'])
item_features = get_item_features(items, ['content_type'] + items.columns[-10:].tolist())

# Создадим датасет только из тренировочной части данных
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=['sex', 'age', 'income'],
    item_features_df=item_features,
    cat_item_features=['content_type'] + items.columns[-10:].tolist(),
)

In [13]:
# Подбор параметров
metrics = {
    'MAP@10': MAP(k=10),
    'MeanInvUserFreq': MeanInvUserFreq(k=10),
    'Serendipity': Serendipity(k=10),
}

objective = Objective(
    dataset, train, test, metrics, 'MAP@10',
)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[32m[I 2022-12-13 21:59:48,714][0m A new study created in memory with name: no-name-7d8a9d76-ca67-406e-abdd-83f807708f8c[0m


Params: {'model': 'Popular'}


[32m[I 2022-12-13 21:59:55,173][0m Trial 0 finished with value: 0.07895709839298103 and parameters: {'model': 'Popular'}. Best is trial 0 with value: 0.07895709839298103.[0m
[32m[I 2022-12-13 21:59:55,174][0m Trial 1 finished with value: 0.07895709839298103 and parameters: {'model': 'Popular'}. Best is trial 0 with value: 0.07895709839298103.[0m
[32m[I 2022-12-13 21:59:55,175][0m Trial 2 finished with value: 0.07895709839298103 and parameters: {'model': 'Popular'}. Best is trial 0 with value: 0.07895709839298103.[0m


Params: {'model': 'FM', 'n_components': 63, 'loss': 'warp', 'user_alpha': 0.004719979864040203, 'item_alpha': 0.046488221793319295}


[32m[I 2022-12-13 22:03:32,795][0m Trial 3 finished with value: 0.00030275175685626004 and parameters: {'model': 'FM', 'n_components': 63, 'loss': 'warp', 'user_alpha': 0.004719979864040203, 'item_alpha': 0.046488221793319295}. Best is trial 0 with value: 0.07895709839298103.[0m
[32m[I 2022-12-13 22:03:32,796][0m Trial 4 finished with value: 0.07895709839298103 and parameters: {'model': 'Popular'}. Best is trial 0 with value: 0.07895709839298103.[0m


Params: {'model': 'ALS', 'n_components': 46, 'fit_features_together': False, 'regularization': 0.07304523882398964}


[32m[I 2022-12-13 22:05:25,259][0m Trial 5 finished with value: 0.06544143743608777 and parameters: {'model': 'ALS', 'n_components': 46, 'fit_features_together': False, 'regularization': 0.07304523882398964}. Best is trial 0 with value: 0.07895709839298103.[0m


Params: {'model': 'FM', 'n_components': 29, 'loss': 'bpr', 'user_alpha': 0.008891914860545825, 'item_alpha': 0.03835102412240964}


[32m[I 2022-12-13 22:07:09,693][0m Trial 6 finished with value: 0.0 and parameters: {'model': 'FM', 'n_components': 29, 'loss': 'bpr', 'user_alpha': 0.008891914860545825, 'item_alpha': 0.03835102412240964}. Best is trial 0 with value: 0.07895709839298103.[0m


Params: {'model': 'FM', 'n_components': 20, 'loss': 'warp', 'user_alpha': 0.06443418986097918, 'item_alpha': 0.011348532028018499}


[32m[I 2022-12-13 22:08:36,657][0m Trial 7 finished with value: 0.07670489706245115 and parameters: {'model': 'FM', 'n_components': 20, 'loss': 'warp', 'user_alpha': 0.06443418986097918, 'item_alpha': 0.011348532028018499}. Best is trial 0 with value: 0.07895709839298103.[0m
[32m[I 2022-12-13 22:08:36,659][0m Trial 8 finished with value: 0.07895709839298103 and parameters: {'model': 'Popular'}. Best is trial 0 with value: 0.07895709839298103.[0m


Params: {'model': 'FM', 'n_components': 11, 'loss': 'warp', 'user_alpha': 0.010056565400803453, 'item_alpha': 0.055898006020603325}


[32m[I 2022-12-13 22:09:45,309][0m Trial 9 finished with value: 0.0333803283274062 and parameters: {'model': 'FM', 'n_components': 11, 'loss': 'warp', 'user_alpha': 0.010056565400803453, 'item_alpha': 0.055898006020603325}. Best is trial 0 with value: 0.07895709839298103.[0m


Params: {'model': 'ALS', 'n_components': 5, 'fit_features_together': True, 'regularization': 0.004382992604813327}


[32m[I 2022-12-13 22:10:45,985][0m Trial 10 finished with value: 0.08045767529939321 and parameters: {'model': 'ALS', 'n_components': 5, 'fit_features_together': True, 'regularization': 0.004382992604813327}. Best is trial 10 with value: 0.08045767529939321.[0m


Params: {'model': 'ALS', 'n_components': 4, 'fit_features_together': True, 'regularization': 0.003027686351279978}


[32m[I 2022-12-13 22:11:46,939][0m Trial 11 finished with value: 0.08057808825090312 and parameters: {'model': 'ALS', 'n_components': 4, 'fit_features_together': True, 'regularization': 0.003027686351279978}. Best is trial 11 with value: 0.08057808825090312.[0m


Params: {'model': 'ALS', 'n_components': 1, 'fit_features_together': True, 'regularization': 0.0002480831373961271}


[32m[I 2022-12-13 22:12:45,253][0m Trial 12 finished with value: 0.080279767511658 and parameters: {'model': 'ALS', 'n_components': 1, 'fit_features_together': True, 'regularization': 0.0002480831373961271}. Best is trial 11 with value: 0.08057808825090312.[0m


Params: {'model': 'ALS', 'n_components': 1, 'fit_features_together': True, 'regularization': 0.0023987648656378056}


[32m[I 2022-12-13 22:13:41,661][0m Trial 13 finished with value: 0.08105730133559175 and parameters: {'model': 'ALS', 'n_components': 1, 'fit_features_together': True, 'regularization': 0.0023987648656378056}. Best is trial 13 with value: 0.08105730133559175.[0m


Params: {'model': 'ALS', 'n_components': 17, 'fit_features_together': True, 'regularization': 0.029058039098230903}


[32m[I 2022-12-13 22:14:54,199][0m Trial 14 finished with value: 0.07970530741585087 and parameters: {'model': 'ALS', 'n_components': 17, 'fit_features_together': True, 'regularization': 0.029058039098230903}. Best is trial 13 with value: 0.08105730133559175.[0m


Params: {'model': 'ALS', 'n_components': 1, 'fit_features_together': True, 'regularization': 0.03486005015079774}


[32m[I 2022-12-13 22:15:52,229][0m Trial 15 finished with value: 0.08121831189440205 and parameters: {'model': 'ALS', 'n_components': 1, 'fit_features_together': True, 'regularization': 0.03486005015079774}. Best is trial 15 with value: 0.08121831189440205.[0m


Params: {'model': 'ALS', 'n_components': 31, 'fit_features_together': True, 'regularization': 0.04845603352116515}


[32m[I 2022-12-13 22:17:24,841][0m Trial 16 finished with value: 0.07900839611532838 and parameters: {'model': 'ALS', 'n_components': 31, 'fit_features_together': True, 'regularization': 0.04845603352116515}. Best is trial 15 with value: 0.08121831189440205.[0m


Params: {'model': 'ALS', 'n_components': 44, 'fit_features_together': False, 'regularization': 0.03233576255784106}


[32m[I 2022-12-13 22:19:13,857][0m Trial 17 finished with value: 0.0652430513611751 and parameters: {'model': 'ALS', 'n_components': 44, 'fit_features_together': False, 'regularization': 0.03233576255784106}. Best is trial 15 with value: 0.08121831189440205.[0m


Params: {'model': 'ALS', 'n_components': 16, 'fit_features_together': True, 'regularization': 0.09828574070541736}


[32m[I 2022-12-13 22:20:25,869][0m Trial 18 finished with value: 0.08009869987538755 and parameters: {'model': 'ALS', 'n_components': 16, 'fit_features_together': True, 'regularization': 0.09828574070541736}. Best is trial 15 with value: 0.08121831189440205.[0m


Params: {'model': 'ALS', 'n_components': 23, 'fit_features_together': True, 'regularization': 0.02493167335696917}


[32m[I 2022-12-13 22:21:47,309][0m Trial 19 finished with value: 0.07909353103382936 and parameters: {'model': 'ALS', 'n_components': 23, 'fit_features_together': True, 'regularization': 0.02493167335696917}. Best is trial 15 with value: 0.08121831189440205.[0m


In [14]:
report = pd.DataFrame(objective.trials).fillna('-')
report.sort_values(objective.objective_metric, inplace=True, ascending=False)

metric_columns = ['Serendipity', 'MeanInvUserFreq', 'MAP@10']

columns = []
for column in report.columns:
    if column not in metric_columns:
        columns.append(column)
columns.extend(metric_columns)

report = report[columns]

def highlight_metric(row):
    return 'background-color: lightcyan'

report.head(20).style.applymap(
    highlight_metric, 
    subset=pd.IndexSlice[:, [objective.objective_metric]],
)

Unnamed: 0,model,n_components,loss,user_alpha,item_alpha,fit_features_together,regularization,Serendipity,MeanInvUserFreq,MAP@10
11,ALS,1.000000,-,-,-,True,0.034860,8e-06,4.183457,0.081218
9,ALS,1.000000,-,-,-,True,0.002399,9e-06,4.178566,0.081057
7,ALS,4.000000,-,-,-,True,0.003028,5e-06,3.947866,0.080578
6,ALS,5.000000,-,-,-,True,0.004383,5e-06,3.934947,0.080458
8,ALS,1.000000,-,-,-,True,0.000248,7e-06,4.153913,0.08028
14,ALS,16.000000,-,-,-,True,0.098286,5e-06,3.920305,0.080099
10,ALS,17.000000,-,-,-,True,0.029058,5e-06,3.94804,0.079705
15,ALS,23.000000,-,-,-,True,0.024932,5e-06,3.994778,0.079094
12,ALS,31.000000,-,-,-,True,0.048456,5e-06,3.992444,0.079008
0,Popular,-,-,-,-,-,-,2e-06,3.712316,0.078957


In [15]:
# Сохраним лучшую модель обученную только на train
best_model = objective.get_model(study.best_trial)
best_model.fit(dataset)

save_model(
    best_model,
    'partial_best_model',
    study.best_trial.params['model'],
    dataset,
    train,
)

In [16]:
# Посмотрим на рекомендации для аватаров от модели с лучшим показателем MAP@10
colors = ['lavender', 'wheat', 'mistyrose']
color_map = {avatar_id: color for avatar_id, color in zip(avatar_ids, colors)}

def highlight_avatars(row):
    color = color_map[row[0]]
    return ['background-color: {0}'.format(color)] * len(row.values)

avatar_recs = best_model.recommend(
    users=avatar_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
).merge(
    items[[Columns.Item, 'title']],
    on=Columns.Item,
).sort_values(
    [Columns.User, Columns.Rank],
)

avatar_recs.style.apply(highlight_avatars, axis=1)

Unnamed: 0,user_id,item_id,score,rank,title
0,1097558,15297,0.378224,1,Клиника счастья
3,1097558,10440,0.335288,2,Хрустальный
6,1097558,4151,0.257319,3,Секреты семейной жизни
8,1097558,3734,0.255944,4,Прабабушка легкого поведения
10,1097558,9728,0.223026,5,Гнев человеческий
13,1097558,2657,0.150892,6,Подслушано
15,1097558,13865,0.13814,7,Девятаев
17,1097558,7571,0.087007,8,100% волк
18,1097558,14431,0.086529,9,Приворот. Чёрное венчание
19,1097558,4880,0.072548,10,Афера


In [17]:
# Посмотрим на рекомендации для аватаров от модели с лучшим показателем MeanInvUserFreq
trial_idx = report.sort_values('MeanInvUserFreq', ascending=False).index[0]
model = objective.get_model(study.trials[trial_idx])
model.fit(dataset)

avatar_recs = model.recommend(
    users=avatar_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
).merge(
    items[[Columns.Item, 'title']],
    on=Columns.Item,
).sort_values(
    [Columns.User, Columns.Rank],
)

avatar_recs.style.apply(highlight_avatars, axis=1)

Unnamed: 0,user_id,item_id,score,rank,title
0,1097558,15902,432461.303414,1,Морские паразиты (с тифлокомментарием)
2,1097558,13329,224356.116613,2,Тихая Одесса
5,1097558,1138,224137.245815,3,Два мгновения любви
8,1097558,10432,201723.211539,4,Притворись моим мужем
11,1097558,2669,158271.433597,5,Войцек
14,1097558,1595,30753.210829,6,Счастье в конверте
16,1097558,1999,7312.630618,7,Метод Гринберри
17,1097558,8486,3131.67857,8,Мой создатель
19,1097558,6645,2593.988666,9,"Беги, Лола, беги"
20,1097558,14776,2126.725363,10,Заблудившийся


In [18]:
# Посмотрим на рекомендации для аватаров от модели с лучшим показателем Serendipity
trial_idx = report.sort_values('Serendipity', ascending=False).index[0]
model = objective.get_model(study.trials[trial_idx])
model.fit(dataset)

avatar_recs = model.recommend(
    users=avatar_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
).merge(
    items[[Columns.Item, 'title']],
    on=Columns.Item,
).sort_values(
    [Columns.User, Columns.Rank],
)

avatar_recs.style.apply(highlight_avatars, axis=1)

Unnamed: 0,user_id,item_id,score,rank,title
0,1097558,10440,187877.0,1,Хрустальный
3,1097558,15297,178630.0,2,Клиника счастья
6,1097558,9728,117779.0,3,Гнев человеческий
9,1097558,13865,113875.0,4,Девятаев
12,1097558,4151,85117.0,5,Секреты семейной жизни
15,1097558,3734,68835.0,6,Прабабушка легкого поведения
18,1097558,2657,66017.0,7,Подслушано
21,1097558,4880,52909.0,8,Афера
24,1097558,142,42466.0,9,Маша
27,1097558,6809,39320.0,10,Дуров


In [19]:
# Создадим полный датасет
interactions, users, items = read_data()

users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()
items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()

user_features = get_user_features(users, ['sex', 'age', 'income'])
item_features = get_item_features(items, ['content_type'] + items.columns[-10:])

dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=['sex', 'age', 'income'],
    item_features_df=item_features,
    cat_item_features=['content_type'] + items.columns[-10:],
)

In [20]:
# Обучим модель на полном датасете и сохраним ее
best_model.fit(dataset)

save_model(
    best_model,
    'best_model',
    study.best_trial.params['model'],
    dataset,
    interactions,
)