In [1]:
from src.dataset import Dataset
from src.postprocess_data import MinInteractionsFilter, IdsEncoder, SplitTrainValTest
import os
import faiss
import random
import numpy as np
from collections import defaultdict
from implicit.bpr import BayesianPersonalizedRanking
from src.metrics import user_recall, user_ap

os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
data = Dataset('ml-1m').get_data()

min_iterations_filter = MinInteractionsFilter()
ids_encoder = IdsEncoder()

data = min_iterations_filter.transform(data)
data = ids_encoder.fit_transform(data)

n_users = len(data['user_id'].unique())
n_items = len(data['item_id'].unique())
n_interactions = len(data)
print(f'users: {n_users}')
print(f'items: {n_items}')
print(f'interactions: {n_interactions}')
print(f'density: {(n_interactions / n_users / n_items) * 100:.2f}%')

splitter = SplitTrainValTest()
train_df, val_df, test_df = splitter.transform(data)

data

users: 6034
items: 3125
interactions: 574376
density: 3.05%


user_id,item_id,event_ts
i64,i64,i64
0,924,978300760
0,2685,978300275
0,1835,978824291
0,1015,978302039
0,2218,978300719
0,515,978302268
0,714,978301368
0,516,978824268
0,733,978301752
0,1876,978302281


In [3]:
from src.postprocess_data import create_sparse_dataset, create_positives_dataset

train_dataset = create_sparse_dataset(train_df)
train_positives = create_positives_dataset(train_df)
val_positives = create_positives_dataset(val_df)
test_positives = create_positives_dataset(test_df)

print(f'{len(train_positives.keys())} users in train')
print(f'{len(val_positives.keys())} users in val')
print(f'{len(test_positives.keys())} users in test')

4006 users in train
1579 users in val
1784 users in test


## Линейный learning rate

In [4]:
def linear_lr(epoch, n_epochs, lr_start, lr_end):
    p = epoch / n_epochs
    return lr_start * (1 - p) + lr_end * p

def cyclic_lr(epoch, n_epochs, lr_start, lr_end):
    t_i = 1 / RESTART_EPOCHS * ( (epoch - 1) % RESTART_EPOCHS + 1)
    return lr_start * (1 - t_i) + lr_end * t_i

class SWACallback:
    def __init__(self, type_lr="linear_lr"):
        self.user_factors_swa = None
        self.item_factors_swa = None
        self.recall_history = defaultdict(list)
        
        self.type_lr = type_lr
        
        self.n_models = 0
        
    def callback_fn(self, epoch, *_):
        if self.type_lr == "linear_lr":
            model.learning_rate = linear_lr(epoch, model.iterations, LR_START, LR_END)
        elif self.type_lr == "cyclic_lr":
            model.learning_rate = cyclic_lr(epoch, model.iterations, LR_START, LR_END)
        
        if (epoch + 1) % RESTART_EPOCHS == 0:
            if self.user_factors_swa is None:
                self.user_factors_swa = model.user_factors
                self.item_factors_swa = model.item_factors
            else:
                self.n_models = epoch // RESTART_EPOCHS
                self.user_factors_swa = (self.user_factors_swa * self.n_models + model.user_factors) / (self.n_models + 1)
                self.item_factors_swa = (self.item_factors_swa * self.n_models + model.item_factors) / (self.n_models + 1)
        
        index = faiss.IndexFlatIP(DIM)
        index.add(model.item_factors)
        recs = index.search(model.user_factors, 200)[1]
        
        for k in [1, 10, 20]:
            recall_list = []
            for user_id, y_true in val_positives.items():
                y_pred = [
                    item_id for item_id in recs[user_id]
                    if item_id not in train_positives.get(user_id, set())
                ]
                recall_list.append(user_recall(y_pred, y_true, k))
            self.recall_history[k].append(np.mean(recall_list))

def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)

def get_model(seed):
    return BayesianPersonalizedRanking(
        iterations=EPOCHS, factors=(DIM - 1), random_state=seed,
        learning_rate=LR_START, regularization=REG_FACTOR
    )


## Линейный LR

In [13]:
results = dict()
results_swa = dict()

for i in range(10): 
    results[i] = dict()
    results_swa[i] = dict()

    train_df, val_df, test_df = splitter.transform(data)

    train_dataset = create_sparse_dataset(train_df)
    train_positives = create_positives_dataset(train_df)
    val_positives = create_positives_dataset(val_df)
    test_positives = create_positives_dataset(test_df)

    SEED = i
    DIM = 128
    LR_START = 1e-2
    LR_END = 1e-3
    EPOCHS = 150
    RESTART_EPOCHS = 50
    REG_FACTOR = 1e-2

    set_seed(SEED)
    model = get_model(SEED)
    fit_callback = SWACallback()
    model.fit(train_dataset, callback=fit_callback.callback_fn)

    # usual
    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]

            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        results[i][f'Recall@{k}'] = round(np.mean(recall_list), 4)
        results[i][f'MAP@{k}'] = round(np.nanmean(map_list), 4)
        
    # swa
    index = faiss.IndexFlatIP(DIM)
    index.add(fit_callback.item_factors_swa)
    recs = index.search(fit_callback.user_factors_swa, 200)[1]

    for k in [5, 10]:
        recall_list = []
        map_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        results_swa[i][f'Recall@{k}'] = round(np.mean(recall_list), 4)
        results_swa[i][f'MAP@{k}'] = round(np.nanmean(map_list), 4)

  0%|          | 0/150 [00:00<?, ?it/s]

Recall@5 0.0258
MAP@5 0.1731
Recall@10 0.0453
MAP@10 0.1378
Recall@5 0.0268
MAP@5 0.1761
Recall@10 0.0467
MAP@10 0.1381


  0%|          | 0/150 [00:00<?, ?it/s]

Recall@5 0.0253
MAP@5 0.1734
Recall@10 0.046
MAP@10 0.1395
Recall@5 0.0262
MAP@5 0.1747
Recall@10 0.0471
MAP@10 0.1388


  0%|          | 0/150 [00:00<?, ?it/s]

Recall@5 0.0257
MAP@5 0.1741
Recall@10 0.0466
MAP@10 0.1395
Recall@5 0.0265
MAP@5 0.176
Recall@10 0.0468
MAP@10 0.1381


  0%|          | 0/150 [00:00<?, ?it/s]

Recall@5 0.0265
MAP@5 0.172
Recall@10 0.0464
MAP@10 0.1392
Recall@5 0.027
MAP@5 0.1765
Recall@10 0.0472
MAP@10 0.1398


  0%|          | 0/150 [00:00<?, ?it/s]

Recall@5 0.0264
MAP@5 0.1751
Recall@10 0.0471
MAP@10 0.141
Recall@5 0.0266
MAP@5 0.1706
Recall@10 0.0482
MAP@10 0.1396


  0%|          | 0/150 [00:00<?, ?it/s]

Recall@5 0.0261
MAP@5 0.1728
Recall@10 0.0464
MAP@10 0.1385
Recall@5 0.0261
MAP@5 0.1753
Recall@10 0.0481
MAP@10 0.1412


  0%|          | 0/150 [00:00<?, ?it/s]

Recall@5 0.026
MAP@5 0.1732
Recall@10 0.0466
MAP@10 0.1403
Recall@5 0.0263
MAP@5 0.1761
Recall@10 0.0487
MAP@10 0.1404


  0%|          | 0/150 [00:00<?, ?it/s]

Recall@5 0.0266
MAP@5 0.1746
Recall@10 0.0456
MAP@10 0.1384
Recall@5 0.0261
MAP@5 0.171
Recall@10 0.0467
MAP@10 0.1374


  0%|          | 0/150 [00:00<?, ?it/s]

Recall@5 0.0268
MAP@5 0.1727
Recall@10 0.0459
MAP@10 0.138
Recall@5 0.0267
MAP@5 0.1754
Recall@10 0.0476
MAP@10 0.1386


  0%|          | 0/150 [00:00<?, ?it/s]

Recall@5 0.0255
MAP@5 0.1694
Recall@10 0.0463
MAP@10 0.1382
Recall@5 0.0265
MAP@5 0.1731
Recall@10 0.0458
MAP@10 0.1365


In [14]:
import json

dataset = "m1-1m"

json_dict = json.dumps(results_swa)
with open(f"result_swa_{dataset}.json", "w") as outfile:
    json.dump(json_dict, outfile)
    
json_dict = json.dumps(results)
with open(f"result_usual_{dataset}.json", "w") as outfile:
    json.dump(json_dict, outfile)

## Цикличный learning rate

In [None]:
results = dict()
results_swa = dict()

for i in range(10): 
    results[i] = dict()
    results_swa[i] = dict()

    train_df, val_df, test_df = splitter.transform(data)

    train_dataset = create_sparse_dataset(train_df)
    train_positives = create_positives_dataset(train_df)
    val_positives = create_positives_dataset(val_df)
    test_positives = create_positives_dataset(test_df)

    SEED = i
    DIM = 128
    LR_START = 1e-2
    LR_END = 1e-3
    EPOCHS = 150
    RESTART_EPOCHS = 50
    REG_FACTOR = 1e-2

    set_seed(SEED)
    model = get_model(SEED)
    fit_callback = SWACallback("cyclic_lr")
    model.fit(train_dataset, callback=fit_callback.callback_fn)

    # usual
    index = faiss.IndexFlatIP(DIM)
    index.add(model.item_factors)
    recs = index.search(model.user_factors, 50)[1]

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]

            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        results[i][f'Recall@{k}'] = round(np.mean(recall_list), 4)
        results[i][f'MAP@{k}'] = round(np.nanmean(map_list), 4)
        
    # swa
    index = faiss.IndexFlatIP(DIM)
    index.add(fit_callback.item_factors_swa)
    recs = index.search(fit_callback.user_factors_swa, 200)[1]

    for k in [5, 10]:
        recall_list = []
        map_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        results_swa[i][f'Recall@{k}'] = round(np.mean(recall_list), 4)
        results_swa[i][f'MAP@{k}'] = round(np.nanmean(map_list), 4)

## ADD W_swd + t d

In [None]:
def linear_lr(epoch, n_epochs, lr_start, lr_end):
    p = epoch / n_epochs
    return lr_start * (1 - p) + lr_end * p

def cyclic_lr(epoch, n_epochs, lr_start, lr_end):
    t_i = 1 / RESTART_EPOCHS * ( (epoch - 1) % RESTART_EPOCHS + 1)
    return lr_start * (1 - t_i) + lr_end * t_i

class SWACallback:
    def __init__(self, type_lr="linear_lr"):
        self.user_factors_swa = None
        self.item_factors_swa = None
        self.recall_history = defaultdict(list)
        
        self.type_lr = type_lr
        
        self.n_models = 0
        
    def callback_fn(self, epoch, *_):
        if self.type_lr == "linear_lr":
            model.learning_rate = linear_lr(epoch, model.iterations, LR_START, LR_END)
        elif self.type_lr == "cyclic_lr":
            model.learning_rate = cyclic_lr(epoch, model.iterations, LR_START, LR_END)
        
        if (epoch + 1) % RESTART_EPOCHS == 0:
            if self.user_factors_swa is None:
                self.user_factors_swa = model.user_factors
                self.item_factors_swa = model.item_factors
            else:
                self.n_models = epoch // RESTART_EPOCHS
                self.user_factors_swa = (self.user_factors_swa * self.n_models + model.user_factors) / (self.n_models + 1)
                self.item_factors_swa = (self.item_factors_swa * self.n_models + model.item_factors) / (self.n_models + 1)
        
        index = faiss.IndexFlatIP(DIM)
        index.add(model.item_factors)
        recs = index.search(model.user_factors, 200)[1]
        
        for k in [1, 10, 20]:
            recall_list = []
            for user_id, y_true in val_positives.items():
                y_pred = [
                    item_id for item_id in recs[user_id]
                    if item_id not in train_positives.get(user_id, set())
                ]
                recall_list.append(user_recall(y_pred, y_true, k))
            self.recall_history[k].append(np.mean(recall_list))