In [7]:
import sys
sys.path.append("/home/ifilipov/stable-embeddings/src") 

In [8]:
dataset = "bcr"

In [9]:
from dataset import Dataset
from postprocess_data import MinInteractionsFilter, IdsEncoder, SplitTrainValTest

data = Dataset(dataset).get_data()

min_iterations_filter = MinInteractionsFilter()
ids_encoder = IdsEncoder()

data = min_iterations_filter.transform(data)
data = ids_encoder.fit_transform(data)

n_users = len(data['user_id'].unique())
n_items = len(data['item_id'].unique())
n_interactions = len(data)

splitter = SplitTrainValTest()

In [10]:
import faiss
import random
import numpy as np
from collections import defaultdict
from implicit.bpr import BayesianPersonalizedRanking

from metrics import user_recall, user_ap

def linear_lr(epoch, n_epochs, lr_start, lr_end):
    p = epoch / n_epochs
    return lr_start * (1 - p) + lr_end * p

def cosine_annealing_warm_restart(epoch, n_epochs, lr_start, lr_end, t_i):
    # https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.CosineAnnealingWarmRestarts.html#torch.optim.lr_scheduler.CosineAnnealingWarmRestarts
    t_cur = epoch % t_i
    # modification to make lr after restart closer to lr_end 
    linear_coef = (n_epochs - epoch) / n_epochs
    return lr_end + linear_coef * (lr_start - lr_end) * (1 + np.cos(np.pi * t_cur / t_i)) / 2

class CallbackClass:
    def __init__(self):
        pass
        
    def callback_fn(self, epoch, *_):        
        pass

def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)

def get_model(seed):
    return BayesianPersonalizedRanking(
        iterations=EPOCHS, factors=(DIM - 1), random_state=seed,
        learning_rate=LR_START, regularization=REG_FACTOR
    )

In [11]:
from postprocess_data import create_sparse_dataset, create_positives_dataset

In [None]:
results = dict()

for i in range(10): 
    results[i] = dict()
    
    train_df, val_df, test_df = splitter.transform(data)
    
    train_dataset = create_sparse_dataset(train_df)
    train_positives = create_positives_dataset(train_df)
    val_positives = create_positives_dataset(val_df)
    test_positives = create_positives_dataset(test_df)
    
    SEED = i
    DIM = 128
    LR_START = 1e-2
    LR_END = 1e-3
    EPOCHS = 300
    RESTART_EPOCHS = 100
    REG_FACTOR = 1e-2

    set_seed(SEED)
    model = get_model(SEED)
    model.iterations = 0
    model.fit(train_dataset, callback=CallbackClass().callback_fn)

    user_factors = model.user_factors.copy()
    item_factors = model.item_factors.copy()
    
    recs_by_seed = []
    models_by_seed = []
    
    seeds = np.random.choice(10_000, 10)
    
    for seed in seeds:
        set_seed(seed)
        
        model = get_model(seed)
        model.user_factors = user_factors.copy()
        model.item_factors = item_factors.copy()

        fit_callback = CallbackClass()
        model.fit(train_dataset, callback=fit_callback.callback_fn)

        index = faiss.IndexFlatIP(DIM)
        index.add(model.item_factors)
        recs = index.search(model.user_factors, 50)[1]

        recs_by_seed.append(recs)
        models_by_seed.append(model)

        for k in [5, 10]:
            map_list = []
            recall_list = []
            for user_id, y_true in test_positives.items():
                y_pred = [
                    item_id for item_id in recs[user_id]
                    if item_id not in train_positives.get(user_id, set())
                ]
                
                map_list.append(user_ap(y_pred, y_true, k))
                recall_list.append(user_recall(y_pred, y_true, k))
        
        user_tau = [model.user_factors - user_factors for model in models_by_seed]
        item_tau = [model.item_factors - item_factors for model in models_by_seed]
        
    best_init_alpha, best_score = None, None
    for init_alpha in np.linspace(0.01, 0.5, 100):
        new_item_factors = item_factors.copy()
        new_user_factors = user_factors.copy()

        alpha = init_alpha
        for tau in item_tau:
            new_item_factors += alpha * tau

        alpha = init_alpha
        for tau in user_tau:
            new_user_factors += alpha * tau

        index = faiss.IndexFlatIP(DIM)
        index.add(new_item_factors)
        recs = index.search(new_user_factors, 30)[1]
        
        for k in [5, 10]:
            recall_list = []
            for user_id, y_true in val_positives.items():
                y_pred = [
                    item_id for item_id in recs[user_id]
                    if item_id not in train_positives.get(user_id, set())
                ]
                recall_list.append(user_recall(y_pred, y_true, k))

            if k == 10:
                if best_score is None or best_score < np.mean(recall_list):
                    best_init_alpha, best_score = init_alpha, np.mean(recall_list)
    
    results[i]["best_alpha"] = best_init_alpha
    results[i]["best_score"] = best_score

    new_item_factors = item_factors.copy()
    new_user_factors = user_factors.copy()

    for tau in item_tau:
        new_item_factors += best_init_alpha * tau
    for tau in user_tau:
        new_user_factors += best_init_alpha * tau

    index = faiss.IndexFlatIP(DIM)
    index.add(new_item_factors)
    recs = index.search(new_user_factors, 50)[1]

    print("Test TAUS")    
    
    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
            
        results[i][f"Recall@{k}_taus"] = round(np.mean(recall_list), 4)
        results[i][f"MAP@{k}_taus"] = round(np.nanmean(map_list), 4)
        
    print("Test each seed")
    
    recalls_10 = []
    for j, model in enumerate(models_by_seed):
        index = faiss.IndexFlatIP(DIM)
        index.add(model.item_factors)
        recs = index.search(model.user_factors, 50)[1]

        for k in [5, 10]:
            map_list = []
            recall_list = []
            for user_id, y_true in test_positives.items():
                y_pred = [
                    item_id for item_id in recs[user_id]
                    if item_id not in train_positives.get(user_id, set())
                ]
                map_list.append(user_ap(y_pred, y_true, k))
                recall_list.append(user_recall(y_pred, y_true, k))
            
            results[i][f"Recall@{k}_{j}"] = round(np.mean(recall_list), 4)
            results[i][f"MAP@{k}_{j}"] = round(np.nanmean(map_list), 4)
            
            if k == 10:
                recalls_10.append(round(np.mean(recall_list), 4))
                    
    print("Test ensemble")
    
    def softmax(x):
        return np.exp(x)/sum(np.exp(x))
    
    weights = softmax(recalls_10)
    new_user_factors = np.average([model.user_factors for model in models_by_seed], axis=0, weights=weights)
    new_item_factors = np.average([model.item_factors for model in models_by_seed], axis=0, weights=weights)

    index = faiss.IndexFlatIP(DIM)
    index.add(new_item_factors)
    recs = index.search(new_user_factors, 50)[1]

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in recs[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        
        results[i][f"Recall@{k}_mean"] = round(np.mean(recall_list), 4)
        results[i][f"MAP@{k}_mean"] = round(np.nanmean(map_list), 4)


0it [00:00, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

Test TAUS
Test each seed
Test ensemble


0it [00:00, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

Test TAUS
Test each seed
Test ensemble


0it [00:00, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

Test TAUS
Test each seed
Test ensemble


0it [00:00, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

Test TAUS
Test each seed
Test ensemble


0it [00:00, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

Test TAUS
Test each seed
Test ensemble


0it [00:00, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

Test TAUS
Test each seed
Test ensemble


0it [00:00, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
import json

json_dict = json.dumps(results)
with open(f"result_{dataset}.json", "w") as outfile:
    json.dump(json_dict, outfile)