In [1]:
from src.dataset import Dataset
from src.postprocess_data import MinInteractionsFilter, IdsEncoder, SplitTrainValTest
import os
import faiss
import random
import numpy as np
from collections import defaultdict
from implicit.bpr import BayesianPersonalizedRanking
from src.metrics import user_recall, user_ap

os.environ['KMP_DUPLICATE_LIB_OK']='True'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = Dataset('ml-1m').get_data()

min_iterations_filter = MinInteractionsFilter()
ids_encoder = IdsEncoder()

data = min_iterations_filter.transform(data)
data = ids_encoder.fit_transform(data)

n_users = len(data['user_id'].unique())
n_items = len(data['item_id'].unique())
n_interactions = len(data)
print(f'users: {n_users}')
print(f'items: {n_items}')
print(f'interactions: {n_interactions}')
print(f'density: {(n_interactions / n_users / n_items) * 100:.2f}%')

splitter = SplitTrainValTest()
train_df, val_df, test_df = splitter.transform(data)

data

users: 6034
items: 3125
interactions: 574376
density: 3.05%


user_id,item_id,event_ts
i64,i64,i64
0,924,978300760
0,2685,978300275
0,1835,978824291
0,1015,978302039
0,2218,978300719
0,515,978302268
0,714,978301368
0,516,978824268
0,733,978301752
0,1876,978302281


In [3]:
from src.postprocess_data import create_sparse_dataset, create_positives_dataset

train_dataset = create_sparse_dataset(train_df)
train_positives = create_positives_dataset(train_df)
val_positives = create_positives_dataset(val_df)
test_positives = create_positives_dataset(test_df)

print(f'{len(train_positives.keys())} users in train')
print(f'{len(val_positives.keys())} users in val')
print(f'{len(test_positives.keys())} users in test')

4006 users in train
1579 users in val
1784 users in test


## Линейный learning rate

In [10]:
def linear_lr(epoch, n_epochs, lr_start, lr_end):
    p = epoch / n_epochs
    return lr_start * (1 - p) + lr_end * p

def cyclic_lr(epoch, n_epochs, lr_start, lr_end):
    t_i = 1 / RESTART_EPOCHS * ( (epoch - 1) % RESTART_EPOCHS + 1)
    return lr_start * (1 - t_i) + lr_end * t_i

class SWACallback:
    def __init__(self, type_lr="linear_lr"):
        self.user_factors_swa = None
        self.item_factors_swa = None
        self.recall_history = defaultdict(list)
        
        self.type_lr = type_lr
        
        self.n_models = 0
        
    def callback_fn(self, epoch, *_):
        if self.type_lr == "linear_lr":
            model.learning_rate = linear_lr(epoch, model.iterations, LR_START, LR_END)
        elif self.type_lr == "cyclic_lr":
            model.learning_rate = cyclic_lr(epoch, model.iterations, LR_START, LR_END)
        
        if (epoch + 1) % RESTART_EPOCHS == 0:
            if self.user_factors_swa is None:
                self.user_factors_swa = model.user_factors
                self.item_factors_swa = model.item_factors
            else:
                self.n_models = epoch // RESTART_EPOCHS
                self.user_factors_swa = (self.user_factors_swa * self.n_models + model.user_factors) / (self.n_models + 1)
                self.item_factors_swa = (self.item_factors_swa * self.n_models + model.item_factors) / (self.n_models + 1)
        
        index = faiss.IndexFlatIP(DIM)
        index.add(model.item_factors)
        recs = index.search(model.user_factors, 200)[1]
        
        for k in [1, 10, 20]:
            recall_list = []
            for user_id, y_true in val_positives.items():
                y_pred = [
                    item_id for item_id in recs[user_id]
                    if item_id not in train_positives.get(user_id, set())
                ]
                recall_list.append(user_recall(y_pred, y_true, k))
#             self.recall_history[k].append(np.mean(recall_list))

def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)

def get_model(seed):
    return BayesianPersonalizedRanking(
        iterations=EPOCHS, factors=(DIM - 1), random_state=seed,
        learning_rate=LR_START, regularization=REG_FACTOR
    )


## Линейный LR

In [5]:
results = dict()
results_swa = dict()

for i in range(10): 
    results[i] = dict()
    results_swa[i] = dict()

    train_df, val_df, test_df = splitter.transform(data)

    train_dataset = create_sparse_dataset(train_df)
    train_positives = create_positives_dataset(train_df)
    val_positives = create_positives_dataset(val_df)
    test_positives = create_positives_dataset(test_df)

    SEED = i
    DIM = 128
    LR_START = 1e-2
    LR_END = 1e-3
    EPOCHS = 150
    RESTART_EPOCHS = 50
    REG_FACTOR = 1e-2

    set_seed(SEED)
    model = get_model(SEED)
    fit_callback = SWACallback()
    model.fit(train_dataset, callback=fit_callback.callback_fn)

    # usual
    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]

            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        results[i][f'Recall@{k}'] = round(np.mean(recall_list), 4)
        results[i][f'MAP@{k}'] = round(np.nanmean(map_list), 4)
        
    # swa
    index = faiss.IndexFlatIP(DIM)
    index.add(fit_callback.item_factors_swa)
    recs = index.search(fit_callback.user_factors_swa, 200)[1]

    for k in [5, 10]:
        recall_list = []
        map_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        results_swa[i][f'Recall@{k}'] = round(np.mean(recall_list), 4)
        results_swa[i][f'MAP@{k}'] = round(np.nanmean(map_list), 4)

100%|███████| 150/150 [01:38<00:00,  1.52it/s, train_auc=89.64%, skipped=19.61%]
  return np.sum([


Recall@5 0.0261
MAP@5 0.1743
Recall@10 0.0473
MAP@10 0.1392
Recall@5 0.027
MAP@5 0.1729
Recall@10 0.048
MAP@10 0.1412


100%|███████| 150/150 [01:37<00:00,  1.54it/s, train_auc=89.59%, skipped=19.57%]


Recall@5 0.0265
MAP@5 0.1749
Recall@10 0.0472
MAP@10 0.1403
Recall@5 0.0278
MAP@5 0.175
Recall@10 0.0484
MAP@10 0.1401


100%|███████| 150/150 [01:37<00:00,  1.54it/s, train_auc=89.59%, skipped=19.57%]


Recall@5 0.0256
MAP@5 0.1756
Recall@10 0.0467
MAP@10 0.1401
Recall@5 0.0267
MAP@5 0.1752
Recall@10 0.0478
MAP@10 0.1388


100%|███████| 150/150 [01:38<00:00,  1.52it/s, train_auc=89.65%, skipped=19.61%]


Recall@5 0.0257
MAP@5 0.1757
Recall@10 0.0467
MAP@10 0.14
Recall@5 0.0264
MAP@5 0.1765
Recall@10 0.0483
MAP@10 0.1403


100%|███████| 150/150 [01:37<00:00,  1.54it/s, train_auc=89.58%, skipped=19.56%]


Recall@5 0.0264
MAP@5 0.1739
Recall@10 0.0463
MAP@10 0.1386
Recall@5 0.0273
MAP@5 0.1733
Recall@10 0.0477
MAP@10 0.1386


100%|███████| 150/150 [01:37<00:00,  1.54it/s, train_auc=89.56%, skipped=19.46%]


Recall@5 0.0264
MAP@5 0.1739
Recall@10 0.0454
MAP@10 0.1376
Recall@5 0.0269
MAP@5 0.1758
Recall@10 0.0473
MAP@10 0.1398


100%|███████| 150/150 [01:39<00:00,  1.51it/s, train_auc=89.59%, skipped=19.47%]


Recall@5 0.0267
MAP@5 0.1737
Recall@10 0.0456
MAP@10 0.1382
Recall@5 0.0267
MAP@5 0.1745
Recall@10 0.0474
MAP@10 0.1383


100%|███████| 150/150 [01:37<00:00,  1.54it/s, train_auc=89.57%, skipped=19.59%]


Recall@5 0.0261
MAP@5 0.1752
Recall@10 0.0457
MAP@10 0.1397
Recall@5 0.0267
MAP@5 0.1766
Recall@10 0.0471
MAP@10 0.14


100%|███████| 150/150 [01:36<00:00,  1.55it/s, train_auc=89.67%, skipped=19.58%]


Recall@5 0.0254
MAP@5 0.1699
Recall@10 0.0465
MAP@10 0.1378
Recall@5 0.0265
MAP@5 0.1756
Recall@10 0.0479
MAP@10 0.1395


100%|███████| 150/150 [01:39<00:00,  1.51it/s, train_auc=89.62%, skipped=19.50%]


Recall@5 0.0268
MAP@5 0.1729
Recall@10 0.0456
MAP@10 0.1382
Recall@5 0.027
MAP@5 0.1751
Recall@10 0.0472
MAP@10 0.1393


In [6]:
import json

dataset = "m1-1m"

json_dict = json.dumps(results_swa)
with open(f"result_swa_{dataset}.json", "w") as outfile:
    json.dump(json_dict, outfile)
    
json_dict = json.dumps(results)
with open(f"result_usual_{dataset}.json", "w") as outfile:
    json.dump(json_dict, outfile)

## Цикличный learning rate

In [7]:
results = dict()
results_swa = dict()

for i in range(10): 
    results[i] = dict()
    results_swa[i] = dict()

    train_df, val_df, test_df = splitter.transform(data)

    train_dataset = create_sparse_dataset(train_df)
    train_positives = create_positives_dataset(train_df)
    val_positives = create_positives_dataset(val_df)
    test_positives = create_positives_dataset(test_df)

    SEED = i
    DIM = 128
    LR_START = 1e-2
    LR_END = 1e-3
    EPOCHS = 150
    RESTART_EPOCHS = 50
    REG_FACTOR = 1e-2

    set_seed(SEED)
    model = get_model(SEED)
    fit_callback = SWACallback("cyclic_lr")
    model.fit(train_dataset, callback=fit_callback.callback_fn)

    # usual
    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]

            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        results[i][f'Recall@{k}'] = round(np.mean(recall_list), 4)
        results[i][f'MAP@{k}'] = round(np.nanmean(map_list), 4)
        
    # swa
    index = faiss.IndexFlatIP(DIM)
    index.add(fit_callback.item_factors_swa)
    recs = index.search(fit_callback.user_factors_swa, 200)[1]

    for k in [5, 10]:
        recall_list = []
        map_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        results_swa[i][f'Recall@{k}'] = round(np.mean(recall_list), 4)
        results_swa[i][f'MAP@{k}'] = round(np.nanmean(map_list), 4)

100%|███████| 150/150 [01:37<00:00,  1.55it/s, train_auc=89.41%, skipped=19.61%]


Recall@5 0.026
MAP@5 0.1745
Recall@10 0.0465
MAP@10 0.1391
Recall@5 0.0281
MAP@5 0.1808
Recall@10 0.0492
MAP@10 0.1432


100%|███████| 150/150 [01:37<00:00,  1.53it/s, train_auc=89.38%, skipped=19.57%]


Recall@5 0.0264
MAP@5 0.1762
Recall@10 0.0475
MAP@10 0.1405
Recall@5 0.029
MAP@5 0.1777
Recall@10 0.0492
MAP@10 0.1428


100%|███████| 150/150 [01:37<00:00,  1.54it/s, train_auc=89.39%, skipped=19.57%]


Recall@5 0.026
MAP@5 0.1764
Recall@10 0.047
MAP@10 0.1406
Recall@5 0.0275
MAP@5 0.1782
Recall@10 0.048
MAP@10 0.1414


100%|███████| 150/150 [01:36<00:00,  1.55it/s, train_auc=89.40%, skipped=19.61%]


Recall@5 0.0263
MAP@5 0.1747
Recall@10 0.047
MAP@10 0.1394
Recall@5 0.0283
MAP@5 0.1792
Recall@10 0.0491
MAP@10 0.142


100%|███████| 150/150 [01:38<00:00,  1.52it/s, train_auc=89.35%, skipped=19.56%]


Recall@5 0.0261
MAP@5 0.1736
Recall@10 0.046
MAP@10 0.1383
Recall@5 0.0287
MAP@5 0.1763
Recall@10 0.0488
MAP@10 0.1412


100%|███████| 150/150 [01:37<00:00,  1.54it/s, train_auc=89.32%, skipped=19.46%]


Recall@5 0.0262
MAP@5 0.1741
Recall@10 0.0456
MAP@10 0.1384
Recall@5 0.028
MAP@5 0.1781
Recall@10 0.049
MAP@10 0.1425


100%|███████| 150/150 [01:36<00:00,  1.56it/s, train_auc=89.38%, skipped=19.47%]


Recall@5 0.0271
MAP@5 0.1732
Recall@10 0.0463
MAP@10 0.14
Recall@5 0.0286
MAP@5 0.177
Recall@10 0.0484
MAP@10 0.1413


100%|███████| 150/150 [01:38<00:00,  1.53it/s, train_auc=89.35%, skipped=19.59%]


Recall@5 0.0263
MAP@5 0.1749
Recall@10 0.0461
MAP@10 0.1395
Recall@5 0.0275
MAP@5 0.1792
Recall@10 0.0483
MAP@10 0.1419


100%|███████| 150/150 [01:39<00:00,  1.51it/s, train_auc=89.47%, skipped=19.58%]


Recall@5 0.0256
MAP@5 0.1697
Recall@10 0.0461
MAP@10 0.1378
Recall@5 0.0281
MAP@5 0.1776
Recall@10 0.048
MAP@10 0.1409


100%|███████| 150/150 [01:36<00:00,  1.55it/s, train_auc=89.38%, skipped=19.50%]


Recall@5 0.0268
MAP@5 0.1733
Recall@10 0.0457
MAP@10 0.1388
Recall@5 0.0276
MAP@5 0.1787
Recall@10 0.0491
MAP@10 0.1423


In [8]:
import json

dataset = "m1-1m"

json_dict = json.dumps(results_swa)
with open(f"result_swa_cycle_lr_{dataset}.json", "w") as outfile:
    json.dump(json_dict, outfile)
    
json_dict = json.dumps(results)
with open(f"result_usual_cycle_lr_{dataset}.json", "w") as outfile:
    json.dump(json_dict, outfile)

## BCR

In [13]:
dataset = "bcr"

from src.dataset import Dataset
from src.postprocess_data import MinInteractionsFilter, IdsEncoder, SplitTrainValTest

data = Dataset(dataset).get_data()

min_iterations_filter = MinInteractionsFilter()
ids_encoder = IdsEncoder()

data = min_iterations_filter.transform(data)
data = ids_encoder.fit_transform(data)

n_users = len(data['user_id'].unique())
n_items = len(data['item_id'].unique())
n_interactions = len(data)

splitter = SplitTrainValTest()

user_id,item_id
i64,i64
11072,3741
11072,4804
11072,5032
11073,7622
11073,1295
11073,5406
11073,6400
11074,6000
11074,4381
11074,11937


In [16]:
import faiss
import random
import numpy as np
from collections import defaultdict
from implicit.bpr import BayesianPersonalizedRanking

from src.metrics import user_recall, user_ap


def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)

def get_model(seed):
    return BayesianPersonalizedRanking(
        iterations=EPOCHS, factors=(DIM - 1), random_state=seed,
        learning_rate=LR_START, regularization=REG_FACTOR
    )

In [18]:
results = dict()
results_swa = dict()

for i in range(10): 
    results[i] = dict()
    results_swa[i] = dict()

    train_df, val_df, test_df = splitter.transform(data)

    train_dataset = create_sparse_dataset(train_df)
    train_positives = create_positives_dataset(train_df)
    val_positives = create_positives_dataset(val_df)
    test_positives = create_positives_dataset(test_df)

    SEED = i
    DIM = 128
    LR_START = 1e-2
    LR_END = 1e-3
    EPOCHS = 150
    RESTART_EPOCHS = 50
    REG_FACTOR = 1e-2

    set_seed(SEED)
    model = get_model(SEED)
    fit_callback = SWACallback()
    model.fit(train_dataset, callback=fit_callback.callback_fn)

    # usual
    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]

            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        results[i][f'Recall@{k}'] = round(np.mean(recall_list), 4)
        results[i][f'MAP@{k}'] = round(np.nanmean(map_list), 4)
        
    # swa
    index = faiss.IndexFlatIP(DIM)
    index.add(fit_callback.item_factors_swa)
    recs = index.search(fit_callback.user_factors_swa, 200)[1]

    for k in [5, 10]:
        recall_list = []
        map_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        results_swa[i][f'Recall@{k}'] = round(np.mean(recall_list), 4)
        results_swa[i][f'MAP@{k}'] = round(np.nanmean(map_list), 4)

ValueError: negative column index found