In [1]:
# -*- coding: utf-8 -*-
import os
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
from time import time
import gc
import warnings
warnings.filterwarnings("ignore")
import logging
import random
from utils import set_random_seed, get_logger, ensure_dir, str2bool, str2float

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

exp_id = int(random.SystemRandom().random() * 100000)

logging.basicConfig(
    filename="log/narm_{}.log".format(exp_id),
    level=logging.INFO,
    format='[%(asctime)s] - %(message)s'
)
model_name = 'narm'

df_sess = pd.read_csv('data/sessions_train.csv')
df_test = pd.read_csv('data/phase2/sessions_test_task1.csv')
print(f'df_sess.shape = {df_sess.shape}, df_test.shape = {df_test.shape}')
logging.info(f'df_sess.shape = {df_sess.shape}, df_test.shape = {df_test.shape}')
products = pd.read_csv('data/products_train.csv')
print(f'products.shape = {products.shape}')
logging.info(f'products.shape = {products.shape}')
product2idx = dict(zip(products['id'].unique(), range(1, products['id'].nunique()+1)))
idx2product = dict(zip(range(1, products['id'].nunique()+1), products['id'].unique()))
product_num = products['id'].nunique() + 1

product_dict = dict()
locales = ['UK', 'DE', 'JP', 'IT', 'FR', 'ES']

for locale in locales:
    product_dict[locale] = [product2idx[x] for x in list(products[products['locale']==locale]['id'].unique())]
    
def str2list(x):
    x = x.replace('[', '').replace(']', '').replace("'", '').replace('\n', ' ').replace('\r', ' ')
    l = [product2idx[i] for i in x.split() if i]
    return l


df_sess['prev_items'] = df_sess['prev_items'].apply(lambda x: str2list(x))
df_test['prev_items'] = df_test['prev_items'].apply(lambda x: str2list(x))

df_sess['next_item'] = df_sess['next_item'].apply(lambda x: product2idx[x])

df_train, df_valid, _, _ = train_test_split(
    df_sess, df_sess['locale'], test_size=0.1, random_state=2023, stratify=df_sess['locale'])

print(f'df_train.shape = {df_train.shape}, df_valid.shape = {df_valid.shape}')
logging.info(f'df_train.shape = {df_train.shape}, df_valid.shape = {df_valid.shape}')
train = (list(df_train["prev_items"]), list(df_train["next_item"]))
valid = (list(df_valid["prev_items"]), list(df_valid["next_item"]))


def collate_fn_train(data):
    data.sort(key=lambda x: len(x[0]), reverse=True)  #  按长度排序
    lens = [len(hist) for hist, _ in data]
    labels = []
    padded_seq = torch.zeros(len(data), max(lens)).long()
    for i, (hist, label) in enumerate(data):
        padded_seq[i, :lens[i]] = torch.LongTensor(hist)
        labels.append(label)

    return padded_seq, torch.tensor(labels).long(), lens

class TrainDataset(Dataset):

    def __init__(self, data):
        self.data = data

    def __getitem__(self, index):
        return self.data[0][index], self.data[1][index]

    def __len__(self):
        return len(self.data[0])

def TrainDataLoader(data, bs=512):
    data_set = TrainDataset(data)
    data_loader = DataLoader(data_set, batch_size=bs, shuffle=True, collate_fn=collate_fn_train, drop_last=True)

    return data_loader


class Model(nn.Module):

    def __init__(self, n_items, hidden_size, embedding_dim, n_layers=1):
        super(Model, self).__init__()
        self.n_items = n_items
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.embedding_dim = embedding_dim
        self.emb = nn.Embedding(self.n_items + 1, self.embedding_dim, padding_idx=0)
        self.emb_dropout = nn.Dropout(0.25) # 0.25
        self.gru = nn.GRU(self.embedding_dim, self.hidden_size, self.n_layers, batch_first=True)
        self.a_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.a_2 = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.v_t = nn.Linear(self.hidden_size, 1, bias=False)
        self.ct_dropout = nn.Dropout(0.5) # 0.5
        self.b = nn.Linear(self.embedding_dim, 2 * self.hidden_size, bias=False)
        # self.sf = nn.Softmax()
        self.device = device

        # self.tanh = nn.Tanh()

    def forward(self, seq, lengths):
        hidden = self.init_hidden(seq.size(0))
        embs = self.emb_dropout(self.emb(seq))
        embs = pack_padded_sequence(embs, lengths, batch_first=True)
        gru_out, hidden = self.gru(embs, hidden)
        gru_out, lengths = pad_packed_sequence(gru_out, batch_first=True)

        # fetch the last hidden state of last timestamp
        ht = hidden[-1]
        # gru_out = gru_out.permute(1, 0, 2)

        c_global = ht
        q1 = self.a_1(gru_out.contiguous().view(-1, self.hidden_size)).view(gru_out.size())
        q2 = self.a_2(ht)
        mask = torch.where(seq > 0, torch.tensor([1.], device=self.device),
                           torch.tensor([0.], device=self.device))
        q2_expand = q2.unsqueeze(1).expand_as(q1)
        q2_masked = mask.unsqueeze(2).expand_as(q1) * q2_expand

        alpha = self.v_t(torch.sigmoid(q1 + q2_masked).view(-1, self.hidden_size)).view(mask.size())
        c_local = torch.sum(alpha.unsqueeze(2).expand_as(gru_out) * gru_out, 1)

        c_t = torch.cat([c_local, c_global], 1)
        c_t = self.ct_dropout(c_t)

        # c_t = self.tanh(c_t)

        item_embs = self.emb(torch.arange(self.n_items + 1).to(self.device))
        scores = torch.matmul(c_t, self.b(item_embs).permute(1, 0))
        # scores = self.sf(scores)

        return scores

    def init_hidden(self, batch_size):
        return torch.zeros((self.n_layers, batch_size, self.hidden_size), requires_grad=True).to(self.device)

class Loss(nn.Module):
    def __init__(self, reg=0, eps=1e-6):
        super(Loss, self).__init__()
        self.reg = reg
        self.eps = eps

    def forward(self, p, n):
        p = torch.exp(p)
        n = torch.exp(n)
        prob = - torch.log(p / (p + torch.sum(n, dim=1, keepdim=True)) + self.eps)

        return prob.sum() + self.reg


def evaluate(rec_matrix, targets, match_num):
    # (B, 100), (B,), (100, )
    target_repeats = torch.repeat_interleave(targets.view(-1, 1), dim=1, repeats=match_num)  # (B, 100)
    judge = torch.where(rec_matrix - target_repeats == 0)
    hit = len(judge[0])
    mrr = 0
    ndcg = 0
    for pos in judge[1]:
        mrr += 1 / (pos.float() + 1)
        ndcg += 1 / torch.log2(pos.float() + 2)
    return hit, ndcg, mrr


item_nuniq = product_num
emb_dim = 64
epochs = 50
lr = 1e-3
hidden_size = 100
n_layers = 1
match_num = 100
gamma = 1e-5
mix_recall_num = 100
bs = 128

  from .autonotebook import tqdm as notebook_tqdm


df_sess.shape = (3606249, 3), df_test.shape = (316972, 2)
products.shape = (1551057, 11)
df_train.shape = (3245624, 3), df_valid.shape = (360625, 3)


In [2]:
model = Model(item_nuniq, hidden_size, emb_dim, n_layers).to(device)
print(model)
for name, param in model.named_parameters():
    print(str(name) + '\t' + str(param.shape) + '\t' +
                              str(param.device) + '\t' + str(param.requires_grad))
total_num = sum([param.nelement() for param in model.parameters()])
print(total_num)

Model(
  (emb): Embedding(1410677, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.25, inplace=False)
  (gru): GRU(64, 100, batch_first=True)
  (a_1): Linear(in_features=100, out_features=100, bias=False)
  (a_2): Linear(in_features=100, out_features=100, bias=False)
  (v_t): Linear(in_features=100, out_features=1, bias=False)
  (ct_dropout): Dropout(p=0.5, inplace=False)
  (b): Linear(in_features=64, out_features=200, bias=False)
)
emb.weight	torch.Size([1410677, 64])	cuda:0	True
gru.weight_ih_l0	torch.Size([300, 64])	cuda:0	True
gru.weight_hh_l0	torch.Size([300, 100])	cuda:0	True
gru.bias_ih_l0	torch.Size([300])	cuda:0	True
gru.bias_hh_l0	torch.Size([300])	cuda:0	True
a_1.weight	torch.Size([100, 100])	cuda:0	True
a_2.weight	torch.Size([100, 100])	cuda:0	True
v_t.weight	torch.Size([1, 100])	cuda:0	True
b.weight	torch.Size([200, 64])	cuda:0	True
90366028


In [3]:
model.load_state_dict(torch.load('./ckpt/62307/62307_narm.pt', map_location='cpu'))

<All keys matched successfully>

In [4]:
model.device

device(type='cuda', index=0)

In [5]:
def collate_fn_test(data):
    data.sort(key=lambda x: len(x), reverse=True)  #  按长度排序
    lens = [len(hist) for hist in data]
    padded_seq = torch.zeros(len(data), max(lens)).long()
    for i, (hist) in enumerate(data):
        padded_seq[i, :lens[i]] = torch.LongTensor(hist)
    return padded_seq, lens

In [6]:
class TestDataset(Dataset):

    def __init__(self, data):
        self.data = data

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

def TestDataLoader(data, bs=512):
    data_set = TestDataset(data)
    data_loader = DataLoader(data_set, batch_size=bs, shuffle=True, collate_fn=collate_fn_test, drop_last=False)
    return data_loader

In [7]:
test = list(df_test["prev_items"])

In [8]:
test_loader = TestDataLoader(test, bs=bs)

In [9]:
preds_res = []
model.eval()
for hist_click, lens in tqdm(test_loader, total=len(test_loader)):
    # (B, len)
    hist_click = hist_click.to(device)
    candidates_score = F.softmax(model(hist_click, lens)[:, 1:], dim=1)
    candidate_argsort = candidates_score.argsort(dim=1, descending=True)
    rec_matrix = candidate_argsort[:, :match_num] + 1  # (B, 100)
    # print(hist_click.shape, rec_matrix.shape)
    preds_res.append(rec_matrix)

preds_res = torch.cat(preds_res, axis=0)
print(preds_res.shape)

100%|██████████| 2477/2477 [02:47<00:00, 14.79it/s]

torch.Size([316972, 100])





In [17]:
df_test_origin = pd.read_csv('data/phase2/sessions_test_task1.csv')
preds_res = preds_res.cpu().numpy()
assert len(preds_res) == len(df_test_origin)
test_res_unencoded = []
for ind, x in tqdm(enumerate(preds_res), total=len(preds_res)):
    x_unencoded = [idx2product[id_] for id_ in x]
    test_res_unencoded.append(x_unencoded)

100%|██████████| 316972/316972 [00:24<00:00, 12788.31it/s]


In [22]:
df_test_origin['next_item_prediction'] = test_res_unencoded
df_test_origin[['locale', 'next_item_prediction']].to_parquet('output/{}_{}.parquet'.format(exp_id, model_name), engine='pyarrow')

In [23]:
df_test_origin[['locale', 'next_item_prediction']]

Unnamed: 0,locale,next_item_prediction
0,DE,"[B07FCL6SJS, B08GKYXSXV, B01LYL85UF, B0BFRPVB8..."
1,DE,"[B07KBFZC3K, B0B4MPKSKF, B009SKHAOC, B009S7401..."
2,DE,"[B01M7SABBH, B09BVFPXF9, B071P67LCY, B09HCH96J..."
3,DE,"[B08F2V295C, B08K99ZQK8, B0B79JGGDG, B09C254YJ..."
4,DE,"[B09SPDBCSN, B0BD3QMH91, B07BNF842T, B09GB6F7L..."
...,...,...
316967,UK,"[B000VCVPY2, B0BC1DBK44, B09NVYSYYJ, B0772WYFP..."
316968,UK,"[B09YJVTLBN, B0BDF6TQ2L, B0000TZ4XE, B0B6154H6..."
316969,UK,"[B00Q4TNDWS, B00Q4TMZ9K, B09V1L3SXP, B01E6XBBE..."
316970,UK,"[B08NSX3R2H, B06XPD3BP2, B07ZHTNGPD, B07F84MRM..."


In [24]:
a = pd.read_parquet('output/2023_1634_MatchModelV2withATTMatchFold0_99.parquet')

In [25]:
a

Unnamed: 0,locale,next_item_prediction
0,DE,"[B07SDFLVKD, B093X59B31, B0BGC82WVW, B091CK241..."
1,DE,"[B084CB7GX9, B07BDNST44, B0024NKBQE, B004P4QFJ..."
2,DE,"[B09Z4PZQBF, B0936P3P8D, B09Z4PYG8Q, B0936K9LT..."
3,DE,"[B07T6Y2HG7, B07Y1KLF25, B07HQ83TFF, B07T5XY2C..."
4,DE,"[B08SHZHRQ7, B08P94RML3, B09C89S7WG, B08YK8FQJ..."
...,...,...
316967,UK,"[B07GKP2LCF, B07GKYSHB4, B07GKM97YF, B006DDGCI..."
316968,UK,"[B00M35Y326, B08B395NHL, B000FHC0QK, B091DWY6C..."
316969,UK,"[B08VD5DC5L, B08VDHH6QF, B0BK7QC4H3, B08VDGMBG..."
316970,UK,"[B0B7M72LFQ, B09WCQYGX8, B08W2JJZBM, B0B6C8MZD..."


In [26]:
b = pd.read_parquet('output/phase2_submission_task1_rule_next_one_zhaohui_new.parquet')

In [27]:
b

Unnamed: 0,locale,next_item_prediction
0,DE,"[B07SDFLVKD, B091CK241X, B0B9GJLV2D, B0BGC82WV..."
1,DE,"[B084CB7GX9, B08XW4W667, B09YD8XV6M, B004P4OF1..."
2,DE,"[B09Z4T2GJ3, B09Z3FBXMB, B09Z4PZQBF, B0936K9LT..."
3,DE,"[B07T2NBLX9, B07Y1KLF25, B07T6Y2HG7, B07HQ83TF..."
4,DE,"[B0B2DRKZ6X, B0B2JY9THB, B08YK8FQJ8, B08SHZHRQ..."
...,...,...
316967,UK,"[B07GKP2LCF, B07GKYSHB4, B016RAAUEM, B006DDGCI..."
316968,UK,"[B00M35Y326, B000FHC0QK, B08B395NHL, B091DWY6C..."
316969,UK,"[B08VDGMBGP, B08VDSL596, B08VD5DC5L, B08VDNCZT..."
316970,UK,"[B089CZWB4C, B08T1ZJYHV, B08DTYFYGP, B08W2JJZB..."
