In [1]:
import gc
import implicit
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import sys
import torch
import umap
import wandb


from implicit.evaluation import ndcg_at_k, leave_k_out_split
from random import randint
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid

from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
from tqdm.notebook import tqdm

# Prepare data

In [4]:
train = pd.read_parquet('data/train.parquet.gzip')

items_meta = pd.read_parquet('data/items_meta.parquet.gzip')

In [25]:
num_items = len(items_meta.item_id.unique())
num_users = len(train.user_id.unique())

### Add post features

In [3]:
items_stats = train.groupby('item_id').agg(
    avg_time  = pd.NamedAgg(column = 'timespent', aggfunc = 'mean'),
    users_cnt = pd.NamedAgg(column = 'user_id',   aggfunc = lambda x: len(set(x))),
    likes     = pd.NamedAgg(column = 'reaction',  aggfunc = lambda x: (x == 1).sum()),
    dislikes  = pd.NamedAgg(column = 'reaction',  aggfunc = lambda x: (x == -1).sum()),
    rating    = pd.NamedAgg(column = 'reaction',  aggfunc = lambda x: x.sum()/len(x))
)

In [5]:
embeddings = np.array(items_meta['embeddings'].tolist())
extended_embeddings = np.hstack([embeddings, items_stats[['avg_time', 'users_cnt', 'likes', 'dislikes']].values])
extended_embeddings.shape

(227606, 316)

### Scale features

In [6]:
scaler = StandardScaler()
extended_embeddings = scaler.fit_transform(extended_embeddings)
awesome_embeddings = extended_embeddings[:,:-4]

In [7]:
del items_stats
gc.collect()

3415

# Train | Val split

In [8]:
def leave_last_k_out_split(user_item_df, k = 5, train_ratio=0.8):
    df_gr = user_item_df.groupby('user_id').agg(
        item_ids = pd.NamedAgg(column = 'item_id', aggfunc = lambda x: x.tolist()),
        timespents = pd.NamedAgg(column = 'timespent', aggfunc = lambda x: x.tolist()),
        reactions = pd.NamedAgg(column = 'reaction', aggfunc = lambda x: x.tolist()))
    
    train_user_item_df = pd.DataFrame(columns=df_gr.columns)
    train_user_item_df['user_id'] = df_gr.index
    train_user_item_df.set_index('user_id')
    
    test_user_item_df = pd.DataFrame(columns=df_gr.columns)
    test_user_item_df['user_id'] = df_gr.index
    test_user_item_df.set_index('user_id')
    
    df_gr['train_size'] = df_gr.item_ids.apply(lambda ids: max(len(ids)-k, int(len(ids)*train_ratio + 1)))
    
    train_user_item_df['item_ids'] = df_gr.apply(lambda row: row.item_ids[:row.train_size], axis=1)
    test_user_item_df['item_ids'] = df_gr.apply(lambda row: row.item_ids[row.train_size:], axis=1)
    
    train_user_item_df['timespents'] = df_gr.apply(lambda row: row.timespents[:row.train_size], axis=1)
    test_user_item_df['timespents'] = df_gr.apply(lambda row: row.timespents[row.train_size:], axis=1)
    
    train_user_item_df['reactions'] = df_gr.apply(lambda row: row.reactions[:row.train_size], axis=1)
    test_user_item_df['reactions'] = df_gr.apply(lambda row: row.reactions[row.train_size:], axis=1)
    
    del df_gr
    
    return train_user_item_df, test_user_item_df

In [9]:
%%time
train_df, val_df = leave_last_k_out_split(train, k = 5)

CPU times: user 1min 29s, sys: 3.64 s, total: 1min 33s
Wall time: 1min 33s


In [11]:
train_df.head(3)

Unnamed: 0,item_ids,timespents,reactions,user_id
0,"[35236, 186864, 58724, 155390, 153029, 28510, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,"[46502, 93689, 78282, 33169, 48174, 124953, 47...","[0, 2, 1, 1, 0, 7, 1, 0, 0, 0, 4, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
2,"[68210, 64859, 180839, 214379, 31131, 195952, ...","[0, 2, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 10, 0, 2,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",2


In [12]:
val_df.head(3)

Unnamed: 0,item_ids,timespents,reactions,user_id
0,"[167951, 219579, 54416, 198166, 167423]","[0, 2, 0, 0, 0]","[0, 0, 0, 0, 0]",0
1,"[48065, 17060, 221256, 106414, 198151]","[0, 0, 0, 0, 1]","[0, 0, 0, 0, 0]",1
2,"[100402, 213002, 224226, 125070, 48363]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]",2


### Create inverse index

In [14]:
inv_index = train_df['item_ids'].map(lambda x: dict((post_id, i) for i, post_id in enumerate(x)))
train_df['inv_index'] = inv_index.values
inv_index = val_df['item_ids'].map(lambda x: dict((post_id, i) for i, post_id in enumerate(x)))
val_df['inv_index'] = inv_index.values

In [15]:
train_df.head(3)

Unnamed: 0,item_ids,timespents,reactions,user_id,inv_index
0,"[35236, 186864, 58724, 155390, 153029, 28510, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"{35236: 0, 186864: 1, 58724: 2, 155390: 3, 153..."
1,"[46502, 93689, 78282, 33169, 48174, 124953, 47...","[0, 2, 1, 1, 0, 7, 1, 0, 0, 0, 4, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"{46502: 0, 93689: 1, 78282: 2, 33169: 3, 48174..."
2,"[68210, 64859, 180839, 214379, 31131, 195952, ...","[0, 2, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 10, 0, 2,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",2,"{68210: 0, 64859: 1, 180839: 2, 214379: 3, 311..."


### Split interaction set on test and val

In [16]:
is_train = []
post_ids = train['item_id'].values
user_ids = train['user_id'].values
inv_index = train_df['inv_index'].values
for i in tqdm(range(len(user_ids))):
    post_id = post_ids[i]
    is_train.append(post_id in inv_index[user_ids[i]])
    
train['is_train'] = is_train

  0%|          | 0/144440015 [00:00<?, ?it/s]

In [17]:
train_u_i = train[train['is_train'] == True].reset_index(drop=True).drop(columns=['is_train'])
val_u_i = train[train['is_train'] == False].reset_index(drop=True).drop(columns=['is_train'])

In [18]:
del is_train, post_ids, user_ids, inv_index
gc.collect()

565

# Matrix Factorization ALS

In [19]:
%env MKL_NUM_THREADS=1

env: MKL_NUM_THREADS=1


In [20]:
def flatten(l):
    return [item for sublist in l for item in sublist]

def get_feedback(timespent, reaction):
    feedback = np.log(timespent + 1).astype(np.float32)
    return feedback

def create_feedback_matrix(train_df, num_users, num_items):
    indices = np.array(flatten(train_df.item_ids.to_list()))
    indptr = np.concatenate((np.array([0]), 
                            train_df.item_ids.apply(lambda ids: len(ids)).values.cumsum()))

    timespent = np.array(flatten(train_df.timespents.to_list()))
    reaction = np.array(flatten(train_df.reactions.to_list()))
    data = get_feedback(timespent, reaction)
    del timespent, reaction

    feedback = csr_matrix((data, indices, indptr), shape=(num_users, num_items))
    return feedback

In [23]:
emb_dim = awesome_embeddings.shape[1]

In [26]:
user_item_train = create_feedback_matrix(train_df, num_users, num_items)

In [27]:
user_item_val = create_feedback_matrix(val_df, num_users, num_items)

In [151]:
parameters = ParameterGrid({
        'regularization' : [0.0, 0.1, 0.2, 0.3], 
        'iterations':[10, 20], 
        'alpha' : [0.1, 1.0, 10., 100., 1000., 10000.]})

In [None]:
best = -1.
for params in list(parameters):
    mfm = implicit.als.AlternatingLeastSquares(
        factors=emb_dim,
        calculate_training_loss=True,
        num_threads=12,
        **params
    )
    mfm.item_factors = implicit.gpu.Matrix(awesome_embeddings.astype(np.float32))
    mfm.fit(user_item_train, show_progress=True)
    
    val_score = ndcg_at_k(mfm, user_item_train, user_item_val, K=5, num_threads=12)
    
    if best < val_score:
        best = val_score
        best_params = params
    print(f"reg={params['regularization']: <10} it={params['iterations']: <10} alpha={params['alpha']: <10} score:{val_score:>15.5f}")
    
print(f"""Best params: 
    regularization :{best_params['regularization']: >6}
    iterations     :{best_params['iterations']: >6}
    alpha          :{best_params['alpha']: >6}""")

In [28]:
model_mf = implicit.als.AlternatingLeastSquares(
    factors=emb_dim,
    regularization=0.1,
    alpha=100.0,
    iterations=15,
    calculate_training_loss=True,
    num_threads=12,
)

model_mf.item_factors = implicit.gpu.Matrix(awesome_embeddings.astype(np.float32))

In [29]:
model_mf.fit(user_item_train, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [30]:
val_score = ndcg_at_k(model_mf, user_item_train, user_item_val, K=5, num_threads=12)
val_score

  0%|          | 0/1000183 [00:00<?, ?it/s]

0.03376290824772198

In [85]:
# model_mf.save('models/model_mf.npz')

In [167]:
model_mf = model_mf.load('models/model_mf.npz')

In [31]:
user_embs = model_mf.user_factors.to_numpy()
item_embs = model_mf.item_factors.to_numpy()

# Train scoring model

In [33]:
CFG = {
    'seed': 0,
    'iters': 500,
    'train_bs': 2**7, 
    'valid_bs': 2**7,
    'lr': 1e-4, 
    'weight_decay': 1e-7,
    'num_workers': 16,
    'num_samples': 2**20
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
# Additional features:
    # Self-interaction
# User features    
    # User activity (# of posts)
    # Number of likes
    # Number of dislikes
    # Average time on post ()
# Post features
    # Number of likes
    # Number of dislikes
    # Post popularity (# of users)
    # Average timespent

item_extra = 4
user_extra = 4
hidden_dim = 64
lstm_num_layers = 1
drop_rate = 0.2

seq_len = 10

In [38]:
class TimespentRegressor(nn.Module):
    def __init__(self):        
        super(TimespentRegressor, self).__init__()
        self.lstm_in_dim = emb_dim + item_extra + 2 # Item emb + stats + user feedback
        self.lstm_out_dim = emb_dim + item_extra + 2
        self.lstm_num_layers = lstm_num_layers
    
        self.lstm = nn.LSTM(input_size=self.lstm_in_dim, hidden_size=self.lstm_out_dim,
                          num_layers=lstm_num_layers, batch_first=True) #lstm
        self.bn0 = nn.LayerNorm(self.lstm_out_dim)
        self.relu0 = nn.LeakyReLU()
        self.dropout0 = nn.Dropout(drop_rate)
        
        self.fc1 = nn.Linear(emb_dim * 2 + user_extra + item_extra + self.lstm_out_dim + 1, hidden_dim, bias=False)
        self.bn1 = nn.LayerNorm(hidden_dim)
        self.relu1 = nn.LeakyReLU()
        self.dropout1 = nn.Dropout(drop_rate)
        
        self.fc2 = nn.Linear(hidden_dim, 1)
        
    def forward(self, history, other):
        h_0 = torch.zeros(self.lstm_num_layers, history.size(0), self.lstm_out_dim).to(device)
        c_0 = torch.zeros(self.lstm_num_layers, history.size(0), self.lstm_out_dim).to(device)
        
        # Propagate input through LSTM
        _, (hn, cn) = self.lstm(history, (h_0, c_0))
        hn = hn.view(-1, self.lstm_out_dim) 
        lstm_out = self.bn0(hn)
        lstm_out = self.relu0(lstm_out)
        lstm_out = self.dropout0(lstm_out)
        
        out = self.fc1(torch.cat([lstm_out, other], axis=1))
        out = self.bn1(out)    
        out = self.relu1(out)  
        out = self.dropout1(out)
        
        out = self.fc2(out)    
        return out

In [39]:
class MyDataset(Dataset):
    def __init__(self, df_u_i, df, items_meta, user_embs, item_embs, seq_len=5, mode='train'):
        # Interactions
        self.df_u = df_u_i['user_id'].values
        self.df_i = df_u_i['item_id'].values
        
        # Historical repr
        self.histories = df['item_ids'].values
        self.timespents = df['timespents'].values
        self.reactions = df['reactions'].values
        self.inv_index = df['inv_index'].values
        
        # User and item embeddings
        self.user_embs = user_embs
        self.item_embs = item_embs
        
        # Item source-user ids
        self.items_sources = items_meta['source_id'].values
        
        # Dataset mode and history len
        self.seq_len = seq_len
        self.mode = mode
    
    def __len__(self):
        return len(self.df_u)
   
    def __getitem__(self, idx):
        iid = self.df_i[idx]
        uid = self.df_u[idx]
        
        idx = uid
        target_pos = self.inv_index[uid][iid]
        
        
        history = self.histories[idx]
        timespents = self.timespents[idx]
        reactions = self.reactions[idx]
        
        # Generate length of history(activities) that would represent user
        hist_len = self.seq_len + randint(-3, 0)
        hist_len = min(hist_len, target_pos)
        
        # Takes all available history
        if self.mode == 'eval':
            hist_len = target_pos
        
        target_id = iid
        # Time spent on target post
        label = timespents[target_pos]
        # History features
        history_ids = history[target_pos - hist_len:target_pos]
        
        # Use embeddings to create features
        history_embs = self.item_embs[history_ids]
        ts = np.array([timespents[target_pos - hist_len:target_pos]])
        rs = np.array([reactions[target_pos - hist_len:target_pos]])
        history_features = np.hstack([history_embs, ts.T, rs.T])
        
        # Checks for self interaction
        self_interaction = int(self.items_sources[target_id] == idx)
        
        # Pack to tensors
        history_features = torch.FloatTensor(history_features)
        other = torch.FloatTensor(np.concatenate([ \
              self.user_embs[idx], \
              self.item_embs[target_id], \
              [self_interaction]]))
        label = torch.FloatTensor([label])
        
        return  history_features, other, label

In [40]:
class ImbalancedDatasetSampler(Sampler):
    def __init__(self, train_u_i, num_samples=2**18):        
        grouped = train_u_i.groupby('timespent')
        time_freqs = grouped.size()
        ws = [1. / 61 for i in range(61)]
        for i in range(61):
            if i not in grouped.indices:
                ws[i] = 0.
                
        self.cat = torch.distributions.categorical.Categorical(torch.tensor(ws))
        self.time_freqs = time_freqs
        self.num_samples = num_samples
        self.indices = grouped.indices
        
        
    def __iter__(self):
        times = self.cat.sample((self.num_samples, ))
        ids = []
        
        rands = torch.rand(len(times))
        for i, time in enumerate(times):
            time = time.item()
            ids.append(self.indices[time][int((self.time_freqs[time] * rands[i]).item())])
        return (i for i in ids)  
        
    def __len__(self):
        return self.num_samples

In [41]:
def pad_collate(batch):
    (hist, emb, label) = zip(*batch)
    bs = len(emb)
    
    hist_pad = torch.nn.utils.rnn.pad_sequence(list(hist), batch_first=True)

    return hist_pad, torch.cat(emb).reshape(bs, -1), torch.cat(label).reshape(bs, 1)

In [42]:
# Squared error
def se(preds, target):
    return ((preds - target)**2).sum()

In [58]:
def train_model(model, train_loader, val_loader, lr=1e-5, l2=1e-8, max_lr=1e-3, 
                iters=100, save_rate=10, prefix='default'):
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_lr, 
                                                    steps_per_epoch=len(train_loader), epochs=iters)
    
    dists = torch.zeros(61)
    for it in range(iters):
        print(f'Iteration {it}')
        # Train
        for i, batch in tqdm(enumerate(train_loader), total=len(train_loader)):
            # forward
            preds = model(batch[0].to(device), batch[1].to(device))
            
            loss = loss_fn(preds, batch[2].to(device))
            
            for time in batch[2].detach().flatten().tolist():
                dists[int(time)] += 1
            
            wandb.log({
                "loss": loss.item()
            })
            
            # backward
            loss.backward()
            # update
            optimizer.step()
            scheduler.step()
            
            # zerograd
            optimizer.zero_grad()
        
        # Calculate validation mse
        if it % 3 == 0:
            model.eval()
            mse = 0.
            with torch.no_grad():
                for i, batch in tqdm(enumerate(val_loader), total=len(val_loader)):
                    preds = model(batch[0].to(device), batch[1].to(device))
                    mse += se(preds.flatten(), batch[2].flatten().to('cuda')).item()

            mse /= (len(val_loader) * CFG['valid_bs'])
            model.train()
            
            wandb.log({"val_mse" : mse})
        
        # Log class distribution
        data = [[i, dists[i]] for i in range(61)]
        table = wandb.Table(data=data, columns = ["class", "samples"])
        wandb.log({"class_distribution" : wandb.plot.bar(table, "class", "samples",
                               title="Class distribution")})
        
        # Make ckpt
        if it % save_rate == 0:
            path = f"models/{prefix}-ckpt_iteration_{it}.pt"
            torch.save(model.state_dict(), path)

In [48]:
# Create additional features for user
users_stats = train.groupby('user_id').agg(
    avg_time  = pd.NamedAgg(column = 'timespent', aggfunc = 'mean'),
    items_cnt = pd.NamedAgg(column = 'item_id',   aggfunc = lambda x: len(set(x))),
    likes     = pd.NamedAgg(column = 'reaction',  aggfunc = lambda x: (x == 1).sum()),
    dislikes  = pd.NamedAgg(column = 'reaction',  aggfunc = lambda x: (x == -1).sum())
)
users_extended = np.hstack([user_embs, users_stats[['avg_time', 'items_cnt', 'likes', 'dislikes']].values])

del train, users_stats, user_embs
gc.collect()

4421

In [51]:
train_dataset = MyDataset(train_u_i, train_df, items_meta, 
                          users_extended, extended_embeddings, 
                          seq_len=seq_len, mode='train')

val_dataset = MyDataset(val_u_i, val_df, items_meta, 
                        users_extended, extended_embeddings, 
                        seq_len=seq_len, mode='eval')

In [52]:
train_loader = DataLoader(
    train_dataset, 
    sampler=ImbalancedDatasetSampler(train_u_i, num_samples=CFG['num_samples']),
    batch_size=CFG['train_bs'],
    collate_fn=pad_collate,
    num_workers=CFG['num_workers'],
    pin_memory=True,
    pin_memory_device='cuda'
)

val_loader = DataLoader(
    val_dataset,
    sampler=ImbalancedDatasetSampler(val_u_i, num_samples=CFG['num_samples']),
    batch_size=CFG['valid_bs'],
    collate_fn=pad_collate,
    num_workers=CFG['num_workers'],
    pin_memory=True,
    pin_memory_device='cuda'
)    

In [56]:
hypers = {
    'lr' : CFG['lr'],
    'max_lr' : CFG['lr'] * 2.,
    'l2' : CFG['weight_decay'],
    'iters' : CFG['iters']
}

In [None]:
wandb.init(
    project="VK-cup-recommender",

    config = {
        **hypers
    }
)

prefix = 'balanced'
model = TimespentRegressor().to(device)
wandb.watch(model)

train_model(model, train_loader, val_loader, prefix=prefix, save_rate=15,  **hypers)

wandb.finish()

path = f"models/{prefix}_iteration_{CFG['iters']}.pt"
torch.save(model.state_dict(), path)

del model
gc.collect()
torch.cuda.empty_cache()

In [45]:
# model = TimespentRegressor().to(device)
# model.load_state_dict(torch.load('models/balanced-ckpt_iteration_250.pt'))

<All keys matched successfully>

In [62]:
from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params+=params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    
count_parameters(model)

+-------------------+------------+
|      Modules      | Parameters |
+-------------------+------------+
| lstm.weight_ih_l0 |   404496   |
| lstm.weight_hh_l0 |   404496   |
|  lstm.bias_ih_l0  |    1272    |
|  lstm.bias_hh_l0  |    1272    |
|     bn0.weight    |    318     |
|      bn0.bias     |    318     |
|     fc1.weight    |   60864    |
|     bn1.weight    |     64     |
|      bn1.bias     |     64     |
|     fc2.weight    |     64     |
|      fc2.bias     |     1      |
+-------------------+------------+
Total Trainable Params: 873229


873229

# Inference

In [8]:
processed_ds, _ = leave_last_k_out_split(train, k = 0)

candidate_items = candidates_df.item_id.values

hists = processed_ds['item_ids'].values
timespents = processed_ds['timespents'].values
reactions = processed_ds['reactions'].values
items_sources = items_meta['source_id'].values

In [3]:
def predict(test_ids, predictions, mf_scores):
    final_preds = []
    for i, user_id in tqdm(enumerate(test_ids), total = len(test_ids)):
        user_pred = []
        scrs = score_items(model, user_id, predictions[i]).flatten()
        
        # NN score
        scrs /= 60.
        scrs = scrs.to('cpu')
        # score + similarity
        scrs += mf_scores[i]
        
        t_scores = scrs.tolist()
        ss = sorted(t_scores)[::-1]
        for i in range(20):
            idx = t_scores.index(ss[i])
            user_pred.append(predictions[i][idx])

        final_preds.append(user_pred)
    return final_preds

In [4]:
def score_items(model, user_id, item_ids):
    history = hists[user_id]
    times = timespents[user_id]
    likes = reactions[user_id]

    target_pos = len(history)

    hist_len = seq_len
    hist_len = min(hist_len, target_pos)

    history_ids = history[target_pos - hist_len:target_pos]

    history_embs = extended_embeddings[history_ids]
    ts = np.array([times[target_pos - hist_len:target_pos]])
    rs = np.array([likes[target_pos - hist_len:target_pos]])
    history_features = np.hstack([history_embs, ts.T, rs.T])

    others = []
    for item_id in item_ids:
        self_interaction = int(items_sources[item_id] == user_id)
        others.append(
            np.concatenate(
                [users_extended[user_id], 
                extended_embeddings[item_id], 
                [self_interaction]]
            ))
    
    history_features = torch.FloatTensor(history_features).unsqueeze(0).repeat(len(item_ids), 1, 1)
    other = torch.FloatTensor(np.array(others))

    with torch.no_grad():
        model.eval()
        scores = model(history_features.to(device), other.to(device))
    
    return scores

In [169]:
predictions = []
scores = []
test_ids = test.user_id.values
bs = 512

for i in tqdm(range((len(test_ids) + bs - 1) // bs)):
    user_ids = test_ids[i*bs:(i + 1)*bs]
    
    ids, scrs = model_mf.recommend(user_ids, user_item_train[user_ids],
                                  N=20,
                                  filter_already_liked_items=True,
                                  items=candidate_items
                                  )
    
    scores.append(scrs)
    predictions.append(ids)
    
scores = np.concatenate(scores)
predictions = np.concatenate(predictions)

  0%|          | 0/391 [00:00<?, ?it/s]

In [170]:
predictions = predict(test_ids, predictions, scores)

  0%|          | 0/200000 [00:00<?, ?it/s]

In [150]:
test['predictions'] = predictions
test

Unnamed: 0,user_id,predictions
0,7,"[63017, 49912, 29054, 63388, 200250, 11787, 46..."
1,8,"[221001, 53828, 221344, 4305, 87797, 56900, 46..."
2,9,"[190377, 49912, 64601, 148826, 155056, 54559, ..."
3,11,"[221001, 97654, 108363, 2915, 2216, 217361, 46..."
4,18,"[221001, 88573, 29054, 131294, 120767, 11787, ..."
...,...,...
199995,1000160,"[190377, 88573, 63495, 2915, 87797, 35482, 131..."
199996,1000165,"[221001, 53828, 29054, 68855, 155056, 124525, ..."
199997,1000166,"[221001, 40628, 227299, 74088, 197397, 139838,..."
199998,1000168,"[190377, 49912, 29054, 148826, 200250, 11787, ..."


In [151]:
test.to_parquet('sample_submission.parquet.gzip', compression='gzip')