In [1]:
from src.dataset import Dataset
from src.postprocess_data import MinInteractionsFilter, IdsEncoder, SplitTrainValTest

data = Dataset('ml-1m').get_data()

min_iterations_filter = MinInteractionsFilter()
ids_encoder = IdsEncoder()

data = min_iterations_filter.transform(data)
data = ids_encoder.fit_transform(data)

n_users = len(data['user_id'].unique())
n_items = len(data['item_id'].unique())
n_interactions = len(data)
print(f'users: {n_users}')
print(f'items: {n_items}')
print(f'interactions: {n_interactions}')
print(f'density: {(n_interactions / n_users / n_items) * 100:.2f}%')

splitter = SplitTrainValTest()
train_df, val_df, test_df = splitter.transform(data)

data

users: 6034
items: 3125
interactions: 574376
density: 3.05%


user_id,item_id,event_ts
i64,i64,i64
0,924,978300760
0,2685,978300275
0,1835,978824291
0,1015,978302039
0,2218,978300719
0,515,978302268
0,714,978301368
0,516,978824268
0,733,978301752
0,1876,978302281


In [2]:
from src.postprocess_data import create_sparse_dataset, create_positives_dataset

train_dataset = create_sparse_dataset(train_df)
train_positives = create_positives_dataset(train_df)
val_positives = create_positives_dataset(val_df)
test_positives = create_positives_dataset(test_df)

print(f'{len(train_positives.keys())} users in train')
print(f'{len(val_positives.keys())} users in val')
print(f'{len(test_positives.keys())} users in test')

4006 users in train
1579 users in val
1784 users in test


In [3]:
# import logging

# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [75]:
train_sessions = create_positives_dataset(train_df, apply_set=False)

In [86]:
import numpy as np
from gensim.models import Word2Vec

SEED = 42
window_len = 2
base_model = Word2Vec(window=window_len, sg=1, hs=0, min_count=0, vector_size=256, negative=15,
         ns_exponent=-0.5, alpha=0.03, min_alpha=0.003, seed=SEED)
base_model.build_vocab([seq for _, seq in train_sessions.items()], progress_per=1000000000)

In [87]:
import random
from tqdm.notebook import tqdm
from collections import defaultdict
from gensim.models import Word2Vec
from src.metrics import user_recall, user_ap
from copy import deepcopy


SEED = 42

def set_seed():
    np.random.seed(SEED)
    random.seed(SEED)


def fit_model(sequences):
    model = deepcopy(base_model)
    model.train(sequences, total_examples=model.corpus_count, epochs=10, report_delay=1000000)
    return model


recs_by_seed = []
models_by_seed = []

seeds = np.random.choice(10_000, 5)
print(seeds)

for SEED in seeds:
    print('-'*25)
    print(f'SEED = {SEED}')
    set_seed()
    
    model = fit_model([seq for _, seq in train_sessions.items()])
    
    w2v_recommendations = defaultdict(list)
    for user_id in tqdm(test_positives.keys()):
        preds = model.predict_output_word(
            context_words_list=train_sessions.get(user_id, [0])[-window_len:], topn=40)
        if preds is not None:
            w2v_recommendations[user_id] = list(map(lambda x: x[0], preds))
        
    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in val_positives.items():
            y_pred = [
                item_id for item_id in w2v_recommendations[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        
    recs_by_seed.append(w2v_recommendations)
    models_by_seed.append(model)

[4905 7344  269 6850 3109]
-------------------------
SEED = 4905



Seeding based on hashing is deprecated
since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.



  0%|          | 0/1784 [00:00<?, ?it/s]

Recall@5 0.0101
MAP@5 0.2011
Recall@10 0.0152
MAP@10 0.144
-------------------------
SEED = 7344


  0%|          | 0/1784 [00:00<?, ?it/s]

Recall@5 0.0099
MAP@5 0.2009
Recall@10 0.0166
MAP@10 0.1639
-------------------------
SEED = 269


  0%|          | 0/1784 [00:00<?, ?it/s]

Recall@5 0.0101
MAP@5 0.2015
Recall@10 0.0154
MAP@10 0.1505
-------------------------
SEED = 6850


  0%|          | 0/1784 [00:00<?, ?it/s]

Recall@5 0.0098
MAP@5 0.1924
Recall@10 0.0161
MAP@10 0.1494
-------------------------
SEED = 3109


  0%|          | 0/1784 [00:00<?, ?it/s]

Recall@5 0.0102
MAP@5 0.2072
Recall@10 0.0152
MAP@10 0.1455


In [88]:
vectors_tau = [model.wv.vectors - base_model.wv.vectors for model in models_by_seed]
syn1neg_tau = [model.syn1neg - base_model.syn1neg for model in models_by_seed]

def get_model(wv_vectors, syn1neg_vectors):
    model = deepcopy(base_model)
    model.wv.vectors = wv_vectors.copy()
    model.syn1neg = syn1neg_vectors.copy()
    return model

In [91]:
recall_at_k = defaultdict(list)

best_init_alpha, best_score = None, None
for init_alpha in tqdm(np.linspace(0.01, 1.0, 20)):
    new_wv_vectors = base_model.wv.vectors.copy()
    new_syn1neg = base_model.syn1neg.copy()

    alpha = init_alpha
    for tau in vectors_tau:
        new_wv_vectors += alpha * tau
        
    for tau in syn1neg_tau:
        new_syn1neg += alpha * tau
        
    model = get_model(new_wv_vectors, new_syn1neg)
    
    w2v_recommendations = defaultdict(list)
    for user_id in test_positives.keys():
        preds = model.predict_output_word(context_words_list=train_sessions.get(user_id, [0])[-1:], topn=40)
        if preds is not None:
            w2v_recommendations[user_id] = list(map(lambda x: x[0], preds))

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in val_positives.items():
            y_pred = [
                item_id for item_id in w2v_recommendations[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
#         print(f'Recall@{k}', round(np.mean(recall_list), 4))
#         print(f'MAP@{k}', round(np.nanmean(map_list), 4))
        recall_at_k[k].append(np.mean(recall_list))
        
        if k == 10:
            if best_score is None or best_score < np.mean(recall_list):
                best_init_alpha, best_score = init_alpha, np.mean(recall_list)

  0%|          | 0/20 [00:00<?, ?it/s]


overflow encountered in exp


invalid value encountered in divide



In [92]:
import pandas as pd
import plotly.express as px

df = pd.concat([
    pd.DataFrame({
        'init_alpha': np.linspace(0.01, 1.0, 20),
        'recall': v,
        'k': [k for _ in v]
    })
    for k, v in recall_at_k.items()
])
fig = px.line(
    df, x="init_alpha", y="recall", color='k',
    title='recall@k on validation set', log_y=True
)
fig.show()

In [93]:
init_alpha = best_init_alpha
print(f'init_alpha = {best_init_alpha}')

new_wv_vectors = base_model.wv.vectors.copy()
new_syn1neg = base_model.syn1neg.copy()

alpha = init_alpha
for tau in vectors_tau:
    new_wv_vectors += alpha * tau

for tau in syn1neg_tau:
    new_syn1neg += alpha * tau

model = get_model(new_wv_vectors, new_syn1neg)

w2v_recommendations = defaultdict(list)
for user_id in tqdm(test_positives.keys()):
    preds = model.predict_output_word(context_words_list=train_sessions.get(user_id, [0])[-1:], topn=40)
    if preds is not None:
        w2v_recommendations[user_id] = list(map(lambda x: x[0], preds))

for k in [5, 10]:
    map_list = []
    recall_list = []
    for user_id, y_true in test_positives.items():
        y_pred = [
            item_id for item_id in w2v_recommendations[user_id]
            if item_id not in train_positives.get(user_id, set())
        ]
        map_list.append(user_ap(y_pred, y_true, k))
        recall_list.append(user_recall(y_pred, y_true, k))
    print(f'Recall@{k}', round(np.mean(recall_list), 4))
    print(f'MAP@{k}', round(np.nanmean(map_list), 4))

init_alpha = 0.16631578947368422


  0%|          | 0/1784 [00:00<?, ?it/s]

Recall@5 0.0189
MAP@5 0.1112
Recall@10 0.0327
MAP@10 0.0854


In [94]:
for model in models_by_seed:
    w2v_recommendations = defaultdict(list)
    for user_id in tqdm(test_positives.keys()):
        preds = model.predict_output_word(context_words_list=train_sessions.get(user_id, [0])[-1:], topn=40)
        if preds is not None:
            w2v_recommendations[user_id] = list(map(lambda x: x[0], preds))

    for k in [5, 10]:
        map_list = []
        recall_list = []
        for user_id, y_true in test_positives.items():
            y_pred = [
                item_id for item_id in w2v_recommendations[user_id]
                if item_id not in train_positives.get(user_id, set())
            ]
            map_list.append(user_ap(y_pred, y_true, k))
            recall_list.append(user_recall(y_pred, y_true, k))
        print(f'Recall@{k}', round(np.mean(recall_list), 4))
        print(f'MAP@{k}', round(np.nanmean(map_list), 4))
    print('-'*25)

  0%|          | 0/1784 [00:00<?, ?it/s]

Recall@5 0.0188
MAP@5 0.1106
Recall@10 0.0304
MAP@10 0.0816
-------------------------


  0%|          | 0/1784 [00:00<?, ?it/s]

Recall@5 0.0184
MAP@5 0.1127
Recall@10 0.035
MAP@10 0.0902
-------------------------


  0%|          | 0/1784 [00:00<?, ?it/s]

Recall@5 0.0194
MAP@5 0.1118
Recall@10 0.0312
MAP@10 0.0835
-------------------------


  0%|          | 0/1784 [00:00<?, ?it/s]

Recall@5 0.0187
MAP@5 0.1089
Recall@10 0.0319
MAP@10 0.0841
-------------------------


  0%|          | 0/1784 [00:00<?, ?it/s]

Recall@5 0.0204
MAP@5 0.1167
Recall@10 0.031
MAP@10 0.0823
-------------------------
