In [2]:
from SLIM import SLIM, SLIMatrix

In [3]:
import pandas as pd
import numpy as np

In [4]:
import scipy as sp
import joblib
import requests
from tqdm.auto import tqdm

from rectools import Columns
from rectools.models.popular import PopularModel
from rectools.dataset import Interactions, Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender, ItemItemRecommender

from typing import Dict
from collections import Counter

from sklearn.preprocessing import MinMaxScaler

In [5]:
import requests
from copy import deepcopy
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.models.popular import PopularModel, Popularity
from rectools.models.popular_in_category import PopularInCategoryModel, RatioStrategy, MixingStrategy
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import MAP, Recall, calc_metrics

from scipy.stats import mode
import scipy.sparse as sps
from scipy.sparse import csr_matrix

from sklearn.linear_model import ElasticNet

from itertools import product

In [6]:
import helpers

In [7]:
datasets = helpers.load_datasets('kion_train')

In [8]:
users, items, interactions = datasets.users, datasets.items, datasets.interactions
users = users[:50000].reset_index(drop=True)
interactions = interactions[interactions['user_id'].isin(users['user_id'])].reset_index(drop=True)
items = items[items['item_id'].isin(interactions['item_id'])].reset_index(drop=True)

In [9]:
def create_interaction_matrix(df, user_column='user_id', item_column='item_id'):
    users = CategoricalDtype(sorted(df[user_column].unique()), ordered=True)
    items = CategoricalDtype(sorted(df[item_column].unique()), ordered=True)
    df['score'] = 5
    row = df[user_column].astype(users).cat.codes
    col = df[item_column].astype(items).cat.codes
    sparse_matrix = csr_matrix((df['score'], (row, col)), \
                               shape=(users.categories.size, items.categories.size))
    return sparse_matrix

In [10]:
metrics = {
    "map@10": MAP(k=10),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

In [11]:
n_folds = 4
unit = "W"
n_units = 1

last_date = interactions['datetime'].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)
periods = n_folds + 1
freq = f"{n_units}{unit}"

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)

In [12]:
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

## А ЗДЕСЬ БЫЛА ПОПЫТКА ПОТЮНИТЬ SLIM, НО У МЕНЯ ПОСТОЯННО УМИРАЛО ЯДРО НОУТБУКОВ

In [13]:
from tqdm.auto import tqdm


interactions_df = Interactions(interactions)

def objective(trial):
    fold_iterator = cv.split(interactions_df, collect_fold_stats=True)
    
    buff = []
    
    for i_fold, (train_indices, test_indices, fold_info) in enumerate(fold_iterator):
        
        print(f"\n==================== Fold {i_fold}")
        print(fold_info)
        
        df_train = interactions_df.df.iloc[train_indices].copy()
        df_test = interactions_df.df.iloc[test_indices].copy()
        
        catalog = df_train['item_id'].unique()
        
        nnbrs = trial.suggest_int('nnbrs', 0, 60)
        simtype = trial.suggest_categorical('simtype', ['cos', 'jac', 'dotp'])
        algo = trial.suggest_categorical('algo', ['admm', 'cd'])
        niters = trial.suggest_int('niters', 50, 70)
        l1r = trial.suggest_float('l1r', 0.1, 1)
        l2r = trial.suggest_float('l2r', 0.1, 1)
        
        params = {
            'algo': algo,
            'simtype': simtype,
            'niters': niters,
            'nthreads': 2,
            'l1r': round(l1r, 3),
            'l2r': round(l2r, 3),
        }
        
        print(params)
        
        interaction_matrix_train = create_interaction_matrix(df_train)
        trainmat = SLIMatrix(interaction_matrix_train)
        
        model = SLIM()
        model.train(params, trainmat)
        
        test = df_test[['user_id', 'item_id']]
        test['score'] = 5
        
        nrcmds = trial.suggest_int('nrcmds', 10, 50)
        
        testmat = SLIMatrix(test, oldmat=model)
        pred = model.predict(testmat, nrcmds=nrcmds)
        
        dataset = Dataset.construct(
            interactions_df=df_train,
            user_features_df=None,
            item_features_df=None
        )
        popular = PopularModel()
        popular.fit(dataset)
        
        recos_popular = popular.recommend(
            users=df_train[Columns.User].unique(),
            dataset=dataset,
            k=10,
            filter_viewed=True,
        )
        recos_popular = recos_popular[recos_popular['rank'] <= 10]

        user_ids = list(pred.keys())
        item_ids = list(v.tolist() for v in pred.values())
        pred_df = pd.DataFrame({'user_id': user_ids, 'item_id': item_ids})
        pred_df = pred_df.explode('item_id')
        pred_df.drop_duplicates(inplace=True)

        count_recs_by_users = pred_df.user_id.value_counts()
        users_without_recos = list(count_recs_by_users[count_recs_by_users < 10].index)

        completion = recos_popular[recos_popular['user_id'].isin(users_without_recos)]
        pred_df = pd.concat([pred_df, completion[['user_id', 'item_id']]], ignore_index=True)

        failed_ids = set(recos_popular['user_id']) - set(pred_df['user_id'])
        pred_df = pd.concat(
            [
                pred_df,
                recos_popular[recos_popular['user_id'].isin(failed_ids)][['user_id', 'item_id']]
            ]
        )

        pred_df['rank'] = pred_df.groupby('user_id').cumcount() + 1
        pred_df = pred_df[pred_df['rank'] <= 10]

        f_metrics = calc_metrics(
            metrics,
            reco=pred_df,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog
        )
        
        print(f_metrics)

        buff.append(f_metrics['map@10'])
        
    return np.array(buff).mean()

In [14]:
import optuna

In [15]:
study = optuna.create_study(direction="maximize", storage='sqlite:///slim.db')

[32m[I 2022-12-09 15:18:40,546][0m A new study created in RDB with name: no-name-9896dece-d963-4ace-a906-674bf82f038e[0m


In [None]:
study.optimize(objective, n_trials=50, timeout=1000)


{'Start date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'Train': 169161, 'Train users': 31982, 'Train items': 6968, 'Test': 11257, 'Test users': 4190, 'Test items': 2342}
{'algo': 'cd', 'simtype': 'dotp', 'niters': 67, 'nthreads': 2, 'l1r': 0.249, 'l2r': 0.919}
Learning takes 10.616 secs.
6642 of the events fall out of the range of oldmat. Partial entries collected.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['score'] = 5


{'prec@10': 0.012792362768496419, 'recall@10': 0.09994658483918627, 'map@10': 0.034867743809271254, 'novelty': 4.539872203707487, 'serendipity': 2.3334020909151255e-08}

{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 185626, 'Train users': 34257, 'Train items': 7124, 'Test': 12258, 'Test users': 4592, 'Test items': 2368}
{'algo': 'cd', 'simtype': 'dotp', 'niters': 67, 'nthreads': 2, 'l1r': 0.249, 'l2r': 0.919}
Learning takes 12.728 secs.
7277 of the events fall out of the range of oldmat. Partial entries collected.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['score'] = 5


{'prec@10': 0.01367595818815331, 'recall@10': 0.10624818524970964, 'map@10': 0.03593756481251037, 'novelty': 4.553769038509297, 'serendipity': 4.466355208459775e-08}

{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 203620, 'Train users': 36692, 'Train items': 7248, 'Test': 12410, 'Test users': 4719, 'Test items': 2308}
{'algo': 'cd', 'simtype': 'dotp', 'niters': 67, 'nthreads': 2, 'l1r': 0.249, 'l2r': 0.919}
Learning takes 13.977 secs.
7293 of the events fall out of the range of oldmat. Partial entries collected.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['score'] = 5


{'prec@10': 0.01184573002754821, 'recall@10': 0.08940201212928486, 'map@10': 0.031219723037904854, 'novelty': 4.514619238980268, 'serendipity': 3.534583811930851e-08}

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 221881, 'Train users': 39079, 'Train items': 7469, 'Test': 13046, 'Test users': 4855, 'Test items': 2431}
{'algo': 'cd', 'simtype': 'dotp', 'niters': 67, 'nthreads': 2, 'l1r': 0.249, 'l2r': 0.919}
Learning takes 15.754 secs.
7225 of the events fall out of the range of oldmat. Partial entries collected.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['score'] = 5
[32m[I 2022-12-09 15:19:43,397][0m Trial 0 finished with value: 0.033751660979454444 and parameters: {'nnbrs': 58, 'simtype': 'dotp', 'algo': 'cd', 'niters': 67, 'l1r': 0.24898769321789194, 'l2r': 0.9193556382555251, 'nrcmds': 41}. Best is trial 0 with value: 0.033751660979454444.[0m
