### Load libs

In [417]:
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, MAP, calc_metrics
from rectools import Columns
from rectools.models.popular import PopularModel
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset, Interactions
from userknn import UserKnn
import warnings
import pickle

warnings.filterwarnings("ignore")
pd.set_option('display.float_format', lambda x: f'{x:,.6f}')

In [415]:
!cd .. && dir

 Volume in drive C has no label.
 Volume Serial Number is 26D9-E12B

 Directory of c:\repos\RecSys

27.11.2023  12:23    <DIR>          .
27.11.2023  12:23    <DIR>          ..
15.11.2023  19:34                87 .dockerignore
15.11.2023  19:34               377 .editorconfig
15.11.2023  19:34    <DIR>          .github
26.11.2023  21:17               977 .gitignore
15.11.2023  19:34            15 982 .pylintrc
15.11.2023  23:03    <DIR>          .pytest_cache
22.11.2023  14:05    <DIR>          .venv
15.11.2023  21:51    <DIR>          .vscode
15.11.2023  19:34               730 Dockerfile
15.11.2023  19:34             3 790 gunicorn.config.py
15.11.2023  19:34               327 main.py
15.11.2023  19:34             1 058 Makefile
27.11.2023  03:39    <DIR>          notebooks
26.11.2023  15:11           167 708 poetry.lock
15.11.2023  19:34                49 poetry.toml
26.11.2023  15:11               963 pyproject.toml
15.11.2023  19:34             8 973 README.md
26.11.2023  15:20   

In [414]:
!dir

 Volume in drive C has no label.
 Volume Serial Number is 26D9-E12B

 Directory of c:\repos\RecSys\notebooks

27.11.2023  03:39    <DIR>          .
27.11.2023  03:39    <DIR>          ..
27.11.2023  03:39                 0 cv_res.pkl
26.11.2023  15:23    <DIR>          data_original
27.11.2023  12:22            43 669 knn_experiments.ipynb
26.11.2023  15:11            96 159 metric_eval.ipynb
26.11.2023  17:12    <DIR>          __pycache__
               3 File(s)        139 828 bytes
               4 Dir(s)  384 205 750 272 bytes free


### Load data

In [393]:
interactions = pd.read_csv('data_original/interactions.csv')
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

In [395]:
interactions = interactions.rename(columns={'total_dur': Columns.Weight, 
                                            'last_watch_dt': Columns.Datetime}) # change columns for rectools

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [396]:
interactions.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [7]:
users.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0


In [8]:
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


### Split data

In [34]:
max_date = interactions['datetime'].max()

train = interactions[(interactions['datetime'] < max_date - pd.Timedelta(days=7))] # We use 7 days as a trend
test = interactions[(interactions['datetime'] >= max_date - pd.Timedelta(days=7))]


# оставляем только теплых пользователей в тесте
test = test[test['user_id'].isin(train['user_id'].unique())]



print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 5)
test: (349088, 5)


### Set up Popular model

In [10]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=None,
    item_features_df=None
)

In [27]:
pop = PopularModel()
pop.fit(dataset);

In [52]:
pop_recs = pop.recommend(
    dataset.user_id_map.external_ids,
    dataset=dataset,
    k=10,
    filter_viewed=False  # True - удаляет просмотренные айтемы из рекомендаций 
)

pop_recs.head()

Unnamed: 0,user_id,item_id,score,rank
0,176549,10440,187877.0,1
1,176549,15297,178630.0,2
2,176549,9728,117779.0,3
3,176549,13865,113875.0,4
4,176549,4151,85117.0,5


In [63]:
# Our baseline response for cold user
pop_recs['item_id'].value_counts().index

Int64Index([10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809], dtype='int64')

In [67]:
with open("most_popular.pkl", "wb") as file:
    pickle.dump(pop_recs, file)

In [109]:
def get_popular_rec(user_id, pop_recs):
    if user_id in pop_recs['user_id'].unique(): # If we want to filter viewed
        recs = pop_recs[pop_recs['user_id'] == user_id]['item_id'].to_list() # get 10 items for specific user
        return recs
    
    else:
        return pop_recs['item_id'].value_counts().index.to_list() # return baseline

In [316]:
# User to test models
random_user = np.random.randint(100000)

print(get_popular_rec(random_user, pop_recs))

[10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809]


In [69]:
# with open("most_popular.pkl", "rb") as file:
#     pop_recs = pickle.load(file)

### Set up custom (from seminar) UserKNN model

In [418]:
knn = UserKnn(model=TFIDFRecommender(K=30))
knn.fit(train)

100%|██████████| 896791/896791 [23:12<00:00, 643.89it/s] 


In [419]:
knn_preds = knn.predict(test)

#### Let's save dict user_id: List[item_id1, item_id2] to use in offline model

In [420]:
knn_preds = (knn_preds.groupby(['user_id'])
        .agg({'item_id': lambda x: x.tolist()})
        .reset_index()
        .set_index('user_id')
        .to_dict()['item_id']
)

In [421]:
with open("../service/recmodels/knn_preds.pkl", "wb") as file:
    pickle.dump(knn_preds, file)

#### And save model for online

In [422]:
with open("../service/recmodels/tfidf_knn.pkl", "wb") as file:
    pickle.dump(knn, file)

In [187]:
def get_knn_reco(user_id, knn):
    if user_id in list(knn.users_mapping):
        recs = knn.predict(pd.DataFrame([user_id], columns=['user_id'])) # user_id -> [item_id1, item_id2...]
        
        return recs['item_id'].to_list()
    else:
        return []

#### To process cold users and recos not fulfilled we will use popular_based approach 

In [349]:
def get_reco(user_id, knn, pop_recs):
        reco = get_knn_reco(user_id, knn)
        if not reco:
            reco = get_popular_rec(user_id, pop_recs)
        if len(reco) < 10:
            reco = list(pd.unique(reco + get_popular_rec(user_id, pop_recs)))[:10]
            
        return reco

In [None]:
# Only one item predicted
get_knn_reco(1, knn)

[10440]

In [364]:
print(get_reco(1, knn, pop_recs))

[10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809]


In [365]:
print(get_reco(random_user, knn, pop_recs))

[3518, 2451, 12828, 9696, 12837, 10732, 1819, 288, 496, 849]


### Check parameters and models, count metrics

In [408]:
def metrics_count(interactions, models, metrics, cv, K_RECOS, n_splits=3):
    # For each fold generate train and test part of dataset
    # Then fit every model, generate recommendations and calculate metrics

    results = []

    fold_iterator = cv.split(interactions, collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=n_splits):
        print(f"\n==================== Fold {fold_info['i_split']} ====================")
        pprint(fold_info)

        df_train = interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        # Catalog is set of items that we recommend.
        # Sometimes we recommend not all items from train.
        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            model.fit(df_train)
            recos = model.predict(df_train)
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            res = {"fold": fold_info["i_split"], "model": model_name}
            res.update(metric_values)
            results.append(res)
        
    pivot_results = pd.DataFrame(results).drop(columns="fold").groupby(["model"], sort=False).agg(["mean"])
    mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
    
    pivot_results = pivot_results.style \
        .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0) \
        .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
    
    
    display(pivot_results)
            
    return pivot_results

In [409]:
# calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    'map@10': MAP(k=10),
    'novelty': MeanInvUserFreq(k=10),
    "prec@10": Precision(k=10),
    "recall": Recall(k=10),
    "serendipity": Serendipity(k=10),
}

# few simple models to compare
models = {
    '10k_cosine_userknn': UserKnn(CosineRecommender(K=10)), 
    '30k_cosine_userknn': UserKnn(CosineRecommender(K=30)),
    '10k_tfidf_userknn': UserKnn(TFIDFRecommender(K=10)),
    '30k_tfidf_userknn': UserKnn(TFIDFRecommender(K=30))
}

K_RECOS = 10

n_splits = 3

cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [410]:
results = metrics_count(Interactions(interactions), models, metrics, cv, K_RECOS)

  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}


100%|██████████| 797423/797423 [17:38<00:00, 753.03it/s] 
100%|██████████| 797423/797423 [18:00<00:00, 738.00it/s] 
100%|██████████| 797423/797423 [17:46<00:00, 747.74it/s] 
100%|██████████| 797423/797423 [17:42<00:00, 750.80it/s] 
 33%|███▎      | 1/3 [1:20:56<2:41:52, 4856.01s/it]


{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}


100%|██████████| 850489/850489 [20:39<00:00, 686.37it/s] 
100%|██████████| 850489/850489 [20:31<00:00, 690.58it/s] 
100%|██████████| 850489/850489 [21:26<00:00, 661.30it/s] 
100%|██████████| 850489/850489 [21:24<00:00, 662.06it/s] 
 67%|██████▋   | 2/3 [2:55:35<1:29:00, 5340.57s/it]


{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}


100%|██████████| 906071/906071 [23:44<00:00, 636.08it/s] 
100%|██████████| 906071/906071 [23:56<00:00, 630.59it/s] 
100%|██████████| 906071/906071 [23:52<00:00, 632.55it/s] 
100%|██████████| 906071/906071 [23:20<00:00, 646.95it/s] 
100%|██████████| 3/3 [4:41:56<00:00, 5638.74s/it]  


Unnamed: 0_level_0,prec@10,recall,map@10,novelty,serendipity
Unnamed: 0_level_1,mean,mean,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
10k_cosine_userknn,0.004142,0.020041,0.003958,6.384824,6e-06
30k_cosine_userknn,0.003965,0.019435,0.00358,6.881218,8e-06
10k_tfidf_userknn,0.00589,0.029503,0.005659,6.494855,7e-06
30k_tfidf_userknn,0.006559,0.033585,0.006422,6.57516,9e-06


<style type="text/css">
#T_27fe2_row0_col3, #T_27fe2_row0_col4, #T_27fe2_row1_col0, #T_27fe2_row1_col1, #T_27fe2_row1_col2 {
  background-color: lightcoral;
}
#T_27fe2_row1_col3, #T_27fe2_row3_col0, #T_27fe2_row3_col1, #T_27fe2_row3_col2, #T_27fe2_row3_col4 {
  background-color: lightgreen;
}
</style>
<table id="T_27fe2">
  <thead>
    <tr>
      <th class="blank level0" >&nbsp;</th>
      <th id="T_27fe2_level0_col0" class="col_heading level0 col0" >prec@10</th>
      <th id="T_27fe2_level0_col1" class="col_heading level0 col1" >recall</th>
      <th id="T_27fe2_level0_col2" class="col_heading level0 col2" >map@10</th>
      <th id="T_27fe2_level0_col3" class="col_heading level0 col3" >novelty</th>
      <th id="T_27fe2_level0_col4" class="col_heading level0 col4" >serendipity</th>
    </tr>
    <tr>
      <th class="blank level1" >&nbsp;</th>
      <th id="T_27fe2_level1_col0" class="col_heading level1 col0" >mean</th>
      <th id="T_27fe2_level1_col1" class="col_heading level1 col1" >mean</th>
      <th id="T_27fe2_level1_col2" class="col_heading level1 col2" >mean</th>
      <th id="T_27fe2_level1_col3" class="col_heading level1 col3" >mean</th>
      <th id="T_27fe2_level1_col4" class="col_heading level1 col4" >mean</th>
    </tr>
    <tr>
      <th class="index_name level0" >model</th>
      <th class="blank col0" >&nbsp;</th>
      <th class="blank col1" >&nbsp;</th>
      <th class="blank col2" >&nbsp;</th>
      <th class="blank col3" >&nbsp;</th>
      <th class="blank col4" >&nbsp;</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th id="T_27fe2_level0_row0" class="row_heading level0 row0" >10k_cosine_userknn</th>
      <td id="T_27fe2_row0_col0" class="data row0 col0" >0.004142</td>
      <td id="T_27fe2_row0_col1" class="data row0 col1" >0.020041</td>
      <td id="T_27fe2_row0_col2" class="data row0 col2" >0.003958</td>
      <td id="T_27fe2_row0_col3" class="data row0 col3" >6.384824</td>
      <td id="T_27fe2_row0_col4" class="data row0 col4" >0.000006</td>
    </tr>
    <tr>
      <th id="T_27fe2_level0_row1" class="row_heading level0 row1" >30k_cosine_userknn</th>
      <td id="T_27fe2_row1_col0" class="data row1 col0" >0.003965</td>
      <td id="T_27fe2_row1_col1" class="data row1 col1" >0.019435</td>
      <td id="T_27fe2_row1_col2" class="data row1 col2" >0.003580</td>
      <td id="T_27fe2_row1_col3" class="data row1 col3" >6.881218</td>
      <td id="T_27fe2_row1_col4" class="data row1 col4" >0.000008</td>
    </tr>
    <tr>
      <th id="T_27fe2_level0_row2" class="row_heading level0 row2" >10k_tfidf_userknn</th>
      <td id="T_27fe2_row2_col0" class="data row2 col0" >0.005890</td>
      <td id="T_27fe2_row2_col1" class="data row2 col1" >0.029503</td>
      <td id="T_27fe2_row2_col2" class="data row2 col2" >0.005659</td>
      <td id="T_27fe2_row2_col3" class="data row2 col3" >6.494855</td>
      <td id="T_27fe2_row2_col4" class="data row2 col4" >0.000007</td>
    </tr>
    <tr>
      <th id="T_27fe2_level0_row3" class="row_heading level0 row3" >30k_tfidf_userknn</th>
      <td id="T_27fe2_row3_col0" class="data row3 col0" >0.006559</td>
      <td id="T_27fe2_row3_col1" class="data row3 col1" >0.033585</td>
      <td id="T_27fe2_row3_col2" class="data row3 col2" >0.006422</td>
      <td id="T_27fe2_row3_col3" class="data row3 col3" >6.575160</td>
      <td id="T_27fe2_row3_col4" class="data row3 col4" >0.000009</td>
    </tr>
  </tbody>
</table>


#### TF_IDF KNN на 30к соседей имеет лучший скор по всем метрикам, кроме новизны. Интересно, что кнн на косинусной близости с 10к соседями побил себя же, но на 30к