In [1]:
import os
import pickle
import warnings
from collections import Counter
from typing import Dict, List, Optional, Tuple, Type, TypeVar

import numpy as np
import optuna
import pandas as pd
import requests
import scipy as sp
from implicit.nearest_neighbours import (
    CosineRecommender,
    ItemItemRecommender,
    TFIDFRecommender,
)
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, calc_metrics
from rectools.model_selection import TimeRangeSplit
from rectools.models.popular import PopularModel
from tqdm.auto import tqdm

In [2]:
# Suppress every warning for the rest of the session to keep cell output
# compact. Note that this also hides deprecation notices from libraries.
warnings.filterwarnings('ignore')

In [3]:
# Seed NumPy's global RNG so that the user sample drawn later with
# np.random.choice is the same on every Restart & Run All.
np.random.seed(42)

# Get KION dataset

In [4]:
# Stream the KION dataset archive to disk in 1 MiB chunks.
url = (
    'https://storage.yandexcloud.net/'
    'itmo-recsys-public-data/kion_train.zip'
)

# Using the response as a context manager releases the connection once the
# download is finished (or fails).
with requests.get(url, stream=True) as req:
    # Fail early instead of saving an HTTP error page under a .zip name.
    req.raise_for_status()
    # Content-Length may be absent; tqdm then shows a bar with total 0.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    # tqdm as a context manager closes (and finally redraws) the bar even
    # when the download is interrupted.
    with open('kion_train.zip', 'wb') as fd, tqdm(
        desc='kion dataset download',
        total=total_size_in_bytes,
        unit='iB',
        unit_scale=True,
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            progress_bar.update(len(chunk))
            fd.write(chunk)

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [5]:
# Extract the downloaded archive; -o overwrites already extracted files
# without prompting, so the cell can be re-run safely.
!unzip -o kion_train.zip

Archive:  kion_train.zip
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


# Get Kinopoisk dataset

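The next cell expects `archive.zip` (the archive of the Kaggle dataset linked below) to already be in the working directory; this notebook does not download it.

https://www.kaggle.com/datasets/alexandertesemnikov/kinopoisktop250russiandataset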

In [6]:
# Unpack the Kinopoisk archive. archive.zip is not fetched by any cell of
# this notebook, so it must already be present in the working directory.
!unzip -o archive.zip

Archive:  archive.zip
  inflating: kinopoisk-top250.csv    


# Model

In [7]:
T = TypeVar('T', bound='UserKnn')
Mapping = Dict[int, int]


class UserKnn(object):
    """User-based KNN recommender with a popular-items fallback.

    Warm users receive items watched by their nearest neighbours, scored
    by the aggregated neighbour similarity multiplied by the item IDF and
    padded with popular items. All other users receive popular items only.
    Items the user has already watched are never returned.
    """

    # Attributes persisted by save() and restored by load(). Everything
    # else (watch lists, ID mappings, IDF table) is rebuilt from
    # `interactions` when the model is loaded.
    attrs_to_save = [
        'user_knn',
        'k_users',
        'cold_user_threshold',
        'cold_item_threshold',
        'popular',
        'user_column',
        'item_column',
        'weight_column',
        'ranking',
        'interactions',
    ]

    def __init__(
        self,
        user_knn: ItemItemRecommender,
        k_users: int,
        cold_user_threshold: int,
        cold_item_threshold: int,
        popular: pd.DataFrame,
        user_column: str = 'user_id',
        item_column: str = 'item_id',
        weight_column: Optional[str] = None,
        ranking: str = 'max',
    ):
        """Create user KNN model.

        Args:
            user_knn: Base nearest-neighbours model; must provide
                ``fit`` and ``similar_items``.
            k_users: Number of neighbours requested per user.
            cold_user_threshold: Minimal number of interactions for a
                user to be kept in training and served by the KNN.
            cold_item_threshold: Minimal number of interactions for an
                item to be kept in training.
            popular: Data frame with popular items; must contain
                ``item_column``. Row order is the recommendation order.
            user_column: User column in dataset.
            item_column: Item column in dataset.
            weight_column: Column used as interaction weight. When it is
                not set every interaction has weight 1.
            ranking: Aggregation applied to neighbour scores per item
                (passed to ``groupby(...).transform``).
        """
        self.user_knn = user_knn
        self.k_users = k_users
        self.cold_user_threshold = cold_user_threshold
        self.cold_item_threshold = cold_item_threshold
        self.popular = popular
        self.user_column = user_column
        self.item_column = item_column
        self.weight_column = weight_column
        self.ranking = ranking
        self.interactions = pd.DataFrame()
        self._watched = pd.Series(dtype=int)
        self._item_idf = pd.DataFrame()
        self._users_mapping: Mapping = {}
        self._users_inv_mapping: Mapping = {}
        self._items_mapping: Mapping = {}
        self._items_inv_mapping: Mapping = {}
        self.is_fitted = False

    def fit(self, interactions: pd.DataFrame) -> None:
        """Train model.

        Cold items and cold users are removed first; the remaining
        interactions are stored on the model and used to fit the base
        nearest-neighbours model.

        Args:
            interactions: Dataset for training on.
        """
        interactions = self._filter(interactions)
        self._preprocess(interactions)
        interaction_matrix = self._get_interaction_matrix(
            self.interactions,
        )

        self.user_knn.fit(interaction_matrix)
        self.is_fitted = True

    def predict(
        self, user_id: int, k_recs: int = 10,
    ) -> List[int]:
        """Predict recommendations for user with given ID.

        Args:
            user_id: User ID.
            k_recs: Number of recommendations.

        Returns:
            List of at most ``k_recs`` item IDs, best first.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError(
                'Model not fitted, call fit before predicting.',
            )
        user_watched = []
        if user_id in self._watched:
            user_watched = self._watched.loc[user_id]
        # The KNN branch needs an internal index for the user. Checking
        # the mapping keeps unknown users on the popular fallback even
        # when cold_user_threshold <= 0 (this used to raise KeyError).
        is_warm = (
            user_id in self._users_mapping
            and len(user_watched) >= self.cold_user_threshold
        )
        if is_warm:
            similar_user_ids, scores = self._get_similar_users(
                user_id, self.k_users,
            )
            recs = pd.DataFrame(
                {
                    self.user_column: similar_user_ids,
                    self.item_column: self._watched.loc[
                        similar_user_ids].values,
                    'score': scores,
                },
            )

            # One row per (neighbour, watched item) pair.
            recs = recs.explode(self.item_column)
            # Aggregate neighbour scores per item.
            recs['agg_score'] = recs.groupby(
                self.item_column,
            )['score'].transform(self.ranking)
            recs = recs.sort_values(
                'agg_score', ascending=False,
            ).drop_duplicates(
                self.item_column,
            ).merge(
                self._item_idf,
                left_on=self.item_column,
                right_on='index',
                how='left',
            )
            # Weight the aggregated score by the item IDF.
            recs['agg_score'] *= recs['idf']
            recs = recs.sort_values('agg_score', ascending=False)
            # Pad with popular items; KNN results keep priority because
            # drop_duplicates keeps the first occurrence.
            recs = pd.concat([recs, self.popular]).drop_duplicates(
                self.item_column,
            )
            # Drop everything the user has already watched.
            mask = recs[self.item_column].isin(user_watched)
            recs = recs[~mask]
        else:
            # Cold and unknown users get popular items they have not
            # watched yet.
            mask = self.popular[self.item_column].isin(user_watched)
            recs = self.popular[~mask]
        # max() keeps a negative k_recs returning an empty list.
        return recs.head(max(k_recs, 0))[self.item_column].tolist()

    @classmethod
    def load(cls: Type[T], dirpath: str) -> T:
        """Load model state.

        SECURITY: the state is read with ``pickle.load``, which can
        execute arbitrary code. Only load directories you trust.

        Args:
            dirpath: Path to the dir with the saved model.

        Returns:
            Loaded model.
        """
        attrs_dict = {}
        for attr_name in cls.attrs_to_save:
            attr_filepath = os.path.join(
                dirpath, '{0}.pickle'.format(attr_name),
            )
            with open(attr_filepath, 'rb') as attr_file:
                attr_value = pickle.load(attr_file)
            attrs_dict[attr_name] = attr_value
        # `interactions` is not a constructor argument: it is used to
        # rebuild the derived state instead.
        interactions = attrs_dict.pop('interactions')
        model = cls(**attrs_dict)
        model._preprocess(interactions)
        model.is_fitted = True
        return model

    def save(self, dirpath: str) -> None:
        """Save model state.

        Every attribute from ``attrs_to_save`` is pickled into its own
        ``<name>.pickle`` file.

        Args:
            dirpath: Path to the dir to save the model.
        """
        os.makedirs(dirpath, exist_ok=True)
        for attr_name in self.attrs_to_save:
            attr_filepath = os.path.join(
                dirpath, '{0}.pickle'.format(attr_name),
            )
            with open(attr_filepath, 'wb') as attr_file:
                attr_value = getattr(self, attr_name)
                pickle.dump(attr_value, attr_file)

    def _preprocess(self, interactions: pd.DataFrame) -> None:
        """Preprocess interactions dataset and set it as an attribute.

        Rebuilds the watch lists, the ID mappings and the item IDF table.

        Args:
            interactions: Dataset with interactions.
        """
        self.interactions = interactions
        self._build_watched(self.interactions)
        self._build_mappings(self.interactions)
        self._calculate_item_idf(self.interactions)

    def _filter(self, interactions: pd.DataFrame) -> pd.DataFrame:
        """Filter cold users and cold items.

        Items are filtered first, then users are filtered on what is
        left. This is a single pass: items are not re-checked after the
        user filter.

        Args:
            interactions: Dataset with interactions.

        Returns:
            Filtered dataset.
        """
        interactions = self._keep_frequent(
            interactions, self.item_column, self.cold_item_threshold,
        )
        return self._keep_frequent(
            interactions, self.user_column, self.cold_user_threshold,
        )

    @staticmethod
    def _keep_frequent(
        interactions: pd.DataFrame, column: str, min_count: int,
    ) -> pd.DataFrame:
        """Keep rows whose ``column`` value occurs at least ``min_count`` times.

        Vectorised replacement for ``groupby(column).filter(lambda ...)``,
        which calls a Python function once per group.

        Args:
            interactions: Dataset with interactions.
            column: Column whose values are counted.
            min_count: Minimal number of rows a value needs to be kept.

        Returns:
            Rows of ``interactions`` that pass the threshold, in the
            original order and with the original index.
        """
        occurrences = interactions[column].map(
            interactions[column].value_counts(),
        )
        return interactions[occurrences >= min_count]

    def _get_interaction_matrix(
        self, interactions: pd.DataFrame,
    ) -> sp.sparse.coo_matrix:
        """Get interaction matrix from data frame.

        Rows are internal item indices and columns are internal user
        indices, as produced by ``_build_mappings``.

        Args:
            interactions: Dataset with interactions.

        Returns:
            Sparse matrix with data from interactions.
        """
        if self.weight_column:
            weights = interactions[self.weight_column].astype(np.float32)
        else:
            weights = np.ones(len(interactions), dtype=np.float32)

        n_items = len(interactions[self.item_column].unique())
        n_users = len(interactions[self.user_column].unique())

        return sp.sparse.coo_matrix(
            (
                weights,
                (
                    interactions[self.item_column].map(
                        self._items_mapping.get,
                    ),
                    interactions[self.user_column].map(
                        self._users_mapping.get,
                    ),
                ),
            ),
            shape=(n_items, n_users),
        )

    def _calculate_item_idf(self, interactions: pd.DataFrame) -> None:
        """Calculate items IDF and store it in ``_item_idf``.

        The stored frame has the columns ``index`` (item ID),
        ``doc_freq`` (number of interaction rows with the item) and
        ``idf``.

        Args:
            interactions: Dataset with interactions.
        """
        n_interactions = len(interactions)
        item_idf = pd.DataFrame.from_dict(
            Counter(interactions[self.item_column].values),
            orient='index',
            columns=['doc_freq'],
        ).reset_index()
        item_idf['idf'] = item_idf['doc_freq'].apply(
            lambda x: self._idf(n_interactions, x),
        )
        self._item_idf = item_idf

    def _idf(self, n: int, x: float) -> float:
        """Calculate IDF as ``log((1 + n) / (1 + x) + 1)``.

        Args:
            n: Total count the frequency is related to; the number of
                interaction rows when called by ``_calculate_item_idf``.
            x: Number of item occurrences.

        Returns:
            IDF value.
        """
        return np.log((1 + n) / (1 + x) + 1)

    def _build_mappings(self, interactions: pd.DataFrame) -> None:
        """Build mappings between external IDs and internal indices.

        Internal indices follow the order of first appearance of each
        ID in ``interactions``.

        Args:
            interactions: Dataset with interactions.
        """
        self._users_inv_mapping = dict(
            enumerate(interactions[self.user_column].unique()),
        )
        self._users_mapping = {
            v: k for k, v in self._users_inv_mapping.items()
        }

        self._items_inv_mapping = dict(
            enumerate(interactions[self.item_column].unique()),
        )
        self._items_mapping = {
            v: k for k, v in self._items_inv_mapping.items()
        }

    def _build_watched(self, interactions: pd.DataFrame) -> None:
        """Build watch lists for each user.

        The result is a Series indexed by user ID whose values are lists
        of item IDs.

        Args:
            interactions: Dataset with interactions.
        """
        user_groups = interactions.groupby(self.user_column)
        self._watched = user_groups[self.item_column].apply(list)

    def _get_similar_users(
        self, user_id: int, k_users: int,
    ) -> Tuple[List[int], List[float]]:
        """Get similar users with KNN model.

        Args:
            user_id: Query user ID; must be present in the user mapping.
            k_users: Number of similar users to get.

        Returns:
            List of similar user IDs and similarity scores.
        """
        user_knn_user_id = self._users_mapping[user_id]
        # The base model was fitted on an item-by-user matrix, so its
        # "items" are queried with internal user indices here.
        similar_user_ids, scores = self.user_knn.similar_items(
            user_knn_user_id, N=k_users,
        )
        return (
            list(map(self._users_inv_mapping.get, similar_user_ids)),
            scores,
        )

# Read data

In [8]:
def read_data():
    """Load the KION tables from the ``kion_train`` directory.

    The interactions columns ``last_watch_dt``, ``total_dur`` and
    ``watched_pct`` are renamed to ``Columns.Datetime``,
    ``Columns.Weight`` and ``weight_norm``; the datetime column is parsed
    with ``pd.to_datetime``.

    Returns:
        Tuple ``(interactions, users, items)`` of data frames.
    """
    renamed_columns = {
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
        'watched_pct': 'weight_norm',
    }
    interactions = pd.read_csv(
        'kion_train/interactions.csv',
    ).rename(columns=renamed_columns)
    interactions[Columns.Datetime] = pd.to_datetime(
        interactions[Columns.Datetime],
    )

    users = pd.read_csv('kion_train/users.csv')
    items = pd.read_csv('kion_train/items.csv')

    return interactions, users, items

In [9]:
# Load the three KION tables; `interactions` comes back with the renamed
# datetime / weight columns and a parsed datetime column.
interactions, users, items = read_data()

# Get popular

In [10]:
# Load the Kinopoisk top-250 list. Its `movie` column is renamed to `title`
# so that it can be joined with the KION items table on that column.
kinopoisk = pd.read_csv(
    'kinopoisk-top250.csv',
).rename(columns={'movie': 'title'})
# Vectorised strip: trims surrounding whitespace and leaves a missing title
# as NaN instead of raising on a non-string value.
kinopoisk['title'] = kinopoisk['title'].str.strip()

In [11]:
# Keep only the Kinopoisk top-250 films that are also present in the KION
# catalogue (inner join on the title), ordered by the Kinopoisk `rating`
# column in ascending order.
# NOTE(review): presumably `rating` is the position in the top-250, so
# ascending order puts the best films first; confirm against the CSV.
popular_kinopoisk = pd.merge(
    items, kinopoisk, on='title',
).sort_values('rating')[[Columns.Item]]

In [12]:
# Wrap the interactions into a rectools Dataset without user/item features.
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None,
)

pop = PopularModel()
pop.fit(dataset)

# KION popular items: recommendations are requested for one (the first)
# user with viewed items kept, and as many items are taken as there are in
# the Kinopoisk list.
popular_kion = pop.recommend(
    users=dataset.user_id_map.external_ids[:1],
    dataset=dataset,
    k=len(popular_kinopoisk),
    filter_viewed=False,
)[[Columns.Item]]

In [13]:
# Interleave the two popularity lists (Kinopoisk, KION, Kinopoisk, ...).
# zip stops at the shorter list; duplicated items keep their first position.
popular_common = [
    item_id
    for pair in zip(popular_kinopoisk[Columns.Item], popular_kion[Columns.Item])
    for item_id in pair
]
popular = pd.DataFrame({Columns.Item: popular_common}).drop_duplicates()

# User sample

In [14]:
# Cross-validation is run on a random subset of users only.
# NOTE: this cell rebinds `users` (previously the users table) to an array
# of user IDs and narrows `interactions` to the sampled users; re-running it
# samples from the already narrowed data.
sample_fraction = 0.33
users = interactions['user_id'].unique()
size = int(len(users) * sample_fraction)
users_sample = np.random.choice(users, size=size, replace=False)

interactions = interactions.loc[
    interactions[Columns.User].isin(users_sample)
]

# CV

In [15]:
def get_cv_splitter(
    last_date,
    n_folds,
    unit,
    n_units,
    filter_cold_users,
    filter_cold_items,
    filter_already_seen,
):
    """Build a ``TimeRangeSplit`` with ``n_folds`` folds.

    The fold borders are ``n_folds + 1`` dates spaced ``n_units`` units
    apart, starting ``n_folds * n_units + 1`` units before ``last_date``.

    Args:
        last_date: Timestamp the borders are counted back from.
        n_folds: Number of folds.
        unit: Pandas time unit used for both the offset and the frequency.
        n_units: Fold length in ``unit``s.
        filter_cold_users: Passed through to ``TimeRangeSplit``.
        filter_cold_items: Passed through to ``TimeRangeSplit``.
        filter_already_seen: Passed through to ``TimeRangeSplit``.

    Returns:
        Configured ``TimeRangeSplit`` instance.
    """
    lookback = pd.Timedelta(n_folds * n_units + 1, unit=unit)
    fold_borders = pd.date_range(
        start=last_date - lookback,
        periods=n_folds + 1,
        freq='{0}{1}'.format(n_units, unit),
        tz=last_date.tz,
    )
    return TimeRangeSplit(
        date_range=fold_borders,
        filter_cold_users=filter_cold_users,
        filter_cold_items=filter_cold_items,
        filter_already_seen=filter_already_seen,
    )

# Hyperparameter optimization

In [16]:
def select_params(trial):
    """Sample ``UserKnn`` constructor arguments from an Optuna trial.

    Args:
        trial: Object providing ``suggest_categorical`` and ``suggest_int``.

    Returns:
        Dict of keyword arguments for ``UserKnn``. The popular set is taken
        from the notebook-level ``popular`` / ``popular_kion`` frames.
    """
    params = {}

    # Type of the base nearest-neighbours model.
    knn_kind = trial.suggest_categorical('user_knn', ['tf_idf', 'cosine'])
    params['user_knn'] = (
        TFIDFRecommender() if knn_kind == 'tf_idf' else CosineRecommender()
    )

    # Number of neighbours.
    params['k_users'] = trial.suggest_int('k_users', 10, 150)

    # Thresholds below which users / items are treated as cold.
    params['cold_user_threshold'] = trial.suggest_int(
        'cold_user_threshold', 1, 5,
    )
    params['cold_item_threshold'] = trial.suggest_int(
        'cold_item_threshold', 1, 5,
    )

    # Which popular list is used as the fallback.
    popular_kind = trial.suggest_categorical('popular', ['merged', 'kion'])
    params['popular'] = popular if popular_kind == 'merged' else popular_kion

    # Column used as the interaction weight.
    params['weight_column'] = trial.suggest_categorical(
        'weight_column', ['weight', 'weight_norm'],
    )

    # Aggregation of neighbour scores.
    params['ranking'] = trial.suggest_categorical('ranking', ['max', 'mean'])

    return params

In [17]:
def objective(trial):
    """Optuna objective: mean MAP@10 of ``UserKnn`` over 3 time folds.

    Reads the notebook-level ``interactions``. After each fold the score is
    reported to the trial, which may then be pruned.

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        Mean MAP@10 over the folds.

    Raises:
        optuna.TrialPruned: When the trial asks to be pruned after a fold.
    """
    cv = get_cv_splitter(
        last_date=interactions[Columns.Datetime].max().normalize(),
        n_folds=3,
        unit='D',
        n_units=5,
        filter_cold_users=False,
        filter_cold_items=True,
        filter_already_seen=True,
    )
    metrics = {'map@10': MAP(k=10)}

    fold_scores = []
    folds = cv.split(interactions, collect_fold_stats=True)
    for fold_idx, (train_ids, test_ids, _fold_info) in enumerate(folds):
        df_train = interactions.iloc[train_ids].copy()
        df_test = interactions.iloc[test_ids][Columns.UserItem].copy()
        catalog = df_train[Columns.Item].unique()

        # A fresh model is built and fitted for every fold.
        model = UserKnn(**select_params(trial))
        model.fit(df_train)

        # Collect recommendations of all test users in long format.
        rec_users, rec_items, rec_ranks = [], [], []
        for user_id in df_test[Columns.User].unique():
            user_recs = model.predict(user_id)
            n_recs = len(user_recs)
            rec_users.extend([user_id] * n_recs)
            rec_items.extend(user_recs)
            rec_ranks.extend(np.arange(1, n_recs + 1))

        recs = pd.DataFrame(
            {
                Columns.User: rec_users,
                Columns.Item: rec_items,
                Columns.Rank: rec_ranks,
            },
        )

        score = calc_metrics(
            metrics,
            reco=recs,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )['map@10']

        # Report the intermediate value so that the pruner can stop
        # unpromising trials before the remaining folds are evaluated.
        trial.report(score, fold_idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

        fold_scores.append(score)

    return np.mean(fold_scores)

In [18]:
# Tune the model hyperparameters: 20 Optuna trials maximising the value
# returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[32m[I 2022-12-09 20:50:54,071][0m A new study created in memory with name: no-name-244cf884-0cf6-40fc-af15-0a14e20235fb[0m


  0%|          | 0/101134 [00:00<?, ?it/s]

  0%|          | 0/106747 [00:00<?, ?it/s]

  0%|          | 0/112710 [00:00<?, ?it/s]

[32m[I 2022-12-09 21:00:50,514][0m Trial 0 finished with value: 0.04301375936666239 and parameters: {'user_knn': 'cosine', 'k_users': 107, 'cold_user_threshold': 4, 'cold_item_threshold': 2, 'popular': 'merged', 'weight_column': 'weight', 'ranking': 'mean'}. Best is trial 0 with value: 0.04301375936666239.[0m


  0%|          | 0/170718 [00:00<?, ?it/s]

  0%|          | 0/179153 [00:00<?, ?it/s]

  0%|          | 0/188038 [00:00<?, ?it/s]

[32m[I 2022-12-09 21:15:31,330][0m Trial 1 finished with value: 0.08104656465130837 and parameters: {'user_knn': 'cosine', 'k_users': 141, 'cold_user_threshold': 2, 'cold_item_threshold': 1, 'popular': 'kion', 'weight_column': 'weight_norm', 'ranking': 'max'}. Best is trial 1 with value: 0.08104656465130837.[0m


  0%|          | 0/272547 [00:00<?, ?it/s]

  0%|          | 0/285268 [00:00<?, ?it/s]

  0%|          | 0/298903 [00:00<?, ?it/s]

[32m[I 2022-12-09 21:33:26,179][0m Trial 2 finished with value: 0.09123544203638616 and parameters: {'user_knn': 'tf_idf', 'k_users': 90, 'cold_user_threshold': 1, 'cold_item_threshold': 5, 'popular': 'kion', 'weight_column': 'weight_norm', 'ranking': 'mean'}. Best is trial 2 with value: 0.09123544203638616.[0m


  0%|          | 0/101199 [00:00<?, ?it/s]

  0%|          | 0/106817 [00:00<?, ?it/s]

  0%|          | 0/112775 [00:00<?, ?it/s]

[32m[I 2022-12-09 21:43:42,605][0m Trial 3 finished with value: 0.09037402679959715 and parameters: {'user_knn': 'tf_idf', 'k_users': 50, 'cold_user_threshold': 4, 'cold_item_threshold': 1, 'popular': 'kion', 'weight_column': 'weight', 'ranking': 'max'}. Best is trial 2 with value: 0.09123544203638616.[0m


  0%|          | 0/272783 [00:00<?, ?it/s]

  0%|          | 0/285495 [00:00<?, ?it/s]

  0%|          | 0/299129 [00:00<?, ?it/s]

[32m[I 2022-12-09 22:02:15,779][0m Trial 4 finished with value: 0.04594019287513732 and parameters: {'user_knn': 'tf_idf', 'k_users': 110, 'cold_user_threshold': 1, 'cold_item_threshold': 2, 'popular': 'merged', 'weight_column': 'weight_norm', 'ranking': 'mean'}. Best is trial 2 with value: 0.09123544203638616.[0m


  0%|          | 0/170584 [00:00<?, ?it/s]

[32m[I 2022-12-09 22:06:39,595][0m Trial 5 pruned. [0m


  0%|          | 0/272547 [00:00<?, ?it/s]

[32m[I 2022-12-09 22:12:14,958][0m Trial 6 pruned. [0m


  0%|          | 0/82724 [00:00<?, ?it/s]

[32m[I 2022-12-09 22:14:55,091][0m Trial 7 pruned. [0m


  0%|          | 0/170443 [00:00<?, ?it/s]

[32m[I 2022-12-09 22:19:12,407][0m Trial 8 pruned. [0m


  0%|          | 0/272547 [00:00<?, ?it/s]

[32m[I 2022-12-09 22:24:48,241][0m Trial 9 pruned. [0m


  0%|          | 0/127592 [00:00<?, ?it/s]

  0%|          | 0/134366 [00:00<?, ?it/s]

  0%|          | 0/141521 [00:00<?, ?it/s]

[32m[I 2022-12-09 22:36:08,836][0m Trial 10 finished with value: 0.08703230343620173 and parameters: {'user_knn': 'tf_idf', 'k_users': 13, 'cold_user_threshold': 3, 'cold_item_threshold': 4, 'popular': 'kion', 'weight_column': 'weight_norm', 'ranking': 'mean'}. Best is trial 2 with value: 0.09123544203638616.[0m


  0%|          | 0/101064 [00:00<?, ?it/s]

  0%|          | 0/106675 [00:00<?, ?it/s]

  0%|          | 0/112638 [00:00<?, ?it/s]

[32m[I 2022-12-09 22:45:55,769][0m Trial 11 finished with value: 0.09069593816713019 and parameters: {'user_knn': 'tf_idf', 'k_users': 95, 'cold_user_threshold': 4, 'cold_item_threshold': 3, 'popular': 'kion', 'weight_column': 'weight_norm', 'ranking': 'max'}. Best is trial 2 with value: 0.09123544203638616.[0m


  0%|          | 0/101064 [00:00<?, ?it/s]

  0%|          | 0/106675 [00:00<?, ?it/s]

  0%|          | 0/112638 [00:00<?, ?it/s]

[32m[I 2022-12-09 22:55:42,146][0m Trial 12 finished with value: 0.09069593816713019 and parameters: {'user_knn': 'tf_idf', 'k_users': 97, 'cold_user_threshold': 4, 'cold_item_threshold': 3, 'popular': 'kion', 'weight_column': 'weight_norm', 'ranking': 'max'}. Best is trial 2 with value: 0.09123544203638616.[0m


  0%|          | 0/82521 [00:00<?, ?it/s]

  0%|          | 0/87326 [00:00<?, ?it/s]

  0%|          | 0/92433 [00:00<?, ?it/s]

[32m[I 2022-12-09 23:03:51,414][0m Trial 13 finished with value: 0.09373076819710141 and parameters: {'user_knn': 'tf_idf', 'k_users': 93, 'cold_user_threshold': 5, 'cold_item_threshold': 4, 'popular': 'kion', 'weight_column': 'weight_norm', 'ranking': 'mean'}. Best is trial 13 with value: 0.09373076819710141.[0m


  0%|          | 0/82521 [00:00<?, ?it/s]

  0%|          | 0/87326 [00:00<?, ?it/s]

  0%|          | 0/92433 [00:00<?, ?it/s]

[32m[I 2022-12-09 23:12:00,419][0m Trial 14 finished with value: 0.09373076819710141 and parameters: {'user_knn': 'tf_idf', 'k_users': 122, 'cold_user_threshold': 5, 'cold_item_threshold': 4, 'popular': 'kion', 'weight_column': 'weight_norm', 'ranking': 'mean'}. Best is trial 13 with value: 0.09373076819710141.[0m


  0%|          | 0/82521 [00:00<?, ?it/s]

  0%|          | 0/87326 [00:00<?, ?it/s]

  0%|          | 0/92433 [00:00<?, ?it/s]

[32m[I 2022-12-09 23:20:07,957][0m Trial 15 finished with value: 0.09373076819710141 and parameters: {'user_knn': 'tf_idf', 'k_users': 123, 'cold_user_threshold': 5, 'cold_item_threshold': 4, 'popular': 'kion', 'weight_column': 'weight_norm', 'ranking': 'mean'}. Best is trial 13 with value: 0.09373076819710141.[0m


  0%|          | 0/82521 [00:00<?, ?it/s]

  0%|          | 0/87326 [00:00<?, ?it/s]

  0%|          | 0/92433 [00:00<?, ?it/s]

[32m[I 2022-12-09 23:28:15,382][0m Trial 16 finished with value: 0.09373076819710141 and parameters: {'user_knn': 'tf_idf', 'k_users': 149, 'cold_user_threshold': 5, 'cold_item_threshold': 4, 'popular': 'kion', 'weight_column': 'weight_norm', 'ranking': 'mean'}. Best is trial 13 with value: 0.09373076819710141.[0m


  0%|          | 0/82521 [00:00<?, ?it/s]

  0%|          | 0/87326 [00:00<?, ?it/s]

  0%|          | 0/92433 [00:00<?, ?it/s]

[32m[I 2022-12-09 23:36:22,106][0m Trial 17 finished with value: 0.09373076819710141 and parameters: {'user_knn': 'tf_idf', 'k_users': 150, 'cold_user_threshold': 5, 'cold_item_threshold': 4, 'popular': 'kion', 'weight_column': 'weight_norm', 'ranking': 'mean'}. Best is trial 13 with value: 0.09373076819710141.[0m


  0%|          | 0/127665 [00:00<?, ?it/s]

[32m[I 2022-12-09 23:39:54,985][0m Trial 18 pruned. [0m


  0%|          | 0/127592 [00:00<?, ?it/s]

[32m[I 2022-12-09 23:43:25,497][0m Trial 19 pruned. [0m


# Train

In [19]:
def train(trial):
    """Fit ``UserKnn`` with the trial's parameters and save it.

    The model is fitted on the notebook-level ``interactions`` and written
    to the ``best_model`` directory.

    Args:
        trial: Trial the hyperparameters are taken from.
    """
    best_model = UserKnn(**select_params(trial))
    best_model.fit(interactions)
    best_model.save('best_model')

In [20]:
# Re-read the data from disk: the user-sampling cell narrowed `interactions`
# to a subset of users and rebound `users`, so the full tables are restored.
interactions, users, items = read_data()

In [21]:
# Train the model with the best hyperparameter set found by the study on
# the full data.
train(study.best_trial)

  0%|          | 0/302310 [00:00<?, ?it/s]