In [50]:
import torch
from torch import nn, Tensor

class TransformerModel2(nn.Module):

    def __init__(self, ntoken: int, d_model: int, max_len: int, nhead: int, nlayers: int, src_padding_idx: int, dropout: int,device):
        super().__init__()
        self.src_word_embedding = nn.Embedding(ntoken, d_model)
        self.src_postional_embedding = nn.Embedding(max_len,d_model)
        self.trg_word_embedding = nn.Embedding(ntoken, d_model)
        self.trg_postional_embedding = nn.Embedding(d_model,d_model)
        self.device = device
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=nlayers,
            num_decoder_layers=nlayers,
            dropout=dropout,
            )
        self.output_layer = nn.Linear(d_model,ntoken)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_padding_idx

    def get_src_padding_mask(self, src):
        #src shape: [src_length, batch_size]
        src_mask = src.transpose(0,1) == self.src_pad_idx
        #return shape:[batch_size,src_length]
        return src_padding_mask


    def forward(self, src, trg):
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            trg: Tensor, shape ``[seq_len, batch_size]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        src_seq_length, N = src.size()
        trg_seq_length, N = trg.size()
        src_positon = (
            torch.arange(0,src_seq_length).unsqueeze(1).expand(src_seq_length,N).to(self.device)
        )
        trg_positon = (
            torch.arange(0,trg_seq_length).unsqueeze(1).expand(trg_seq_length,N).to(self.device)
        )
        embed_src = self.dropout((self.src_word_embedding(src) + self.src_postional_embedding(src_positon)))
        embed_trg = self.dropout((self.trg_word_embedding(trg) + self.trg_postional_embedding(trg_positon)))
        src_padding_mask = self.get_src_padding_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)
        output = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask = src_padding_mask,
            tgt_mask = trg_mask,
            
            
        )
        output = self.output_layer(output)
        return output

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
from zipfile import ZipFile
import pandas as pd
import numpy as np
zf = ZipFile("order_products__prior.csv.zip")
train_df = pd.read_csv(zf.extract("order_products__prior.csv"))

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 989.8 MB


In [6]:
unique_product = train_df.product_id.unique()

In [7]:
sorted_product = np.sort(unique_product)

In [8]:
sorted_product

array([    1,     2,     3, ..., 49686, 49687, 49688], dtype=int64)

In [9]:
len(unique_product)

49677

In [10]:
zf = ZipFile("order_products__train.csv.zip")
val_df = pd.read_csv(zf.extract("order_products__train.csv"))

In [11]:
val_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [12]:
train_products = train_df.product_id.values
products, counts = np.unique(train_products,return_counts=True)

In [13]:
count_sort_idx = np.argsort(-counts)

In [14]:
sorted_products = products[count_sort_idx] #sort products by how many times they appear in the data

In [15]:
sorted_products

array([24852, 13176, 21137, ..., 38045, 21428, 10270], dtype=int64)

In [16]:
product_counts = np.array(list(train_df.product_id.value_counts().items()))

In [17]:
product_counts

array([[ 24852, 472565],
       [ 13176, 379450],
       [ 21137, 264683],
       ...,
       [ 31254,      1],
       [ 13397,      1],
       [ 23624,      1]])

In [18]:
product_counts[:,1][:4999].sum()/product_counts[:,1].sum()

0.8153107637983752

In [19]:
#encode the top k products into indices from 1 to k
k = 4999
product_to_idx = {product:i for i,product in enumerate(sorted_products[:k],start=1)}

In [20]:
#create new columns "idx" and fill all non top k products with 0
train_df['idx'] = train_df.product_id.map(product_to_idx)

In [21]:
#do the same thing to validation dataframe
val_df['idx'] = val_df.product_id.map(product_to_idx)

In [22]:
#an reversed dictionary for decode the product indices
idx_to_product = {value:key for (key,value) in product_to_idx.items()}

In [23]:
train_df.fillna(0,inplace=True)

In [24]:
val_df.fillna(0,inplace=True)

In [25]:
train_df.head(50)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,idx
0,2,33120,1,1,200.0
1,2,28985,2,1,37.0
2,2,9327,3,0,860.0
3,2,45918,4,1,0.0
4,2,30035,5,0,0.0
5,2,17794,6,1,31.0
6,2,40141,7,1,4413.0
7,2,1819,8,1,2352.0
8,2,43668,9,0,0.0
9,3,33754,1,1,114.0


In [26]:
train_df.idx = train_df.idx.astype(int)

In [27]:
train_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,idx
0,2,33120,1,1,200
1,2,28985,2,1,37
2,2,9327,3,0,860
3,2,45918,4,1,0
4,2,30035,5,0,0
...,...,...,...,...,...
32434484,3421083,39678,6,1,3168
32434485,3421083,11352,7,0,747
32434486,3421083,4600,8,0,2182
32434487,3421083,24852,9,1,1


In [97]:
val_df.idx = val_df.idx.astype(int)
val_df.head(50)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,idx
0,1,49302,1,1,0
1,1,11109,2,1,1254
2,1,10246,3,0,151
3,1,49683,4,0,16
4,1,43633,5,1,0
5,1,13176,6,0,2
6,1,47209,7,0,5
7,1,22035,8,1,44
8,36,39612,1,0,0
9,36,19660,2,1,49


In [29]:
train_df.idx.value_counts()

idx
0       5990301
1        472565
2        379450
3        264683
4        241921
         ...   
4998       1014
4995       1014
4997       1014
4996       1014
4999       1013
Name: count, Length: 5000, dtype: int64

In [30]:
val_df.idx.value_counts()

idx
0       280770
1        18726
2        15480
3        10894
4         9784
         ...  
4329        13
4546        13
4520        13
4653        13
4380        11
Name: count, Length: 5000, dtype: int64

In [31]:
#customized way to create dataset

def create_dataset(df,max_len,max_sequence_start=0, max_sequence_end=100000):
    df_values = df[['order_id','idx']].values
    catch_index = df_values[0][0]
    one_row = [0]*max_len
    data = []
    idx = 0
    df_leng = len(df)
    for row in range(df_leng-1):
            
        if df_values[row][1] > 0 and idx < max_len:
            one_row[idx] = df_values[row][1]
            idx += 1
            
        if df_values[row+1][0] != catch_index and one_row != [0]*max_len:
            data.append(torch.tensor(one_row,dtype = torch.long))
            del one_row
            torch.cuda.empty_cache()
            one_row = [0]*max_len
            catch_index = df_values[row+1][0]
            idx = 0

        if row == df_leng -2 and df_values[row+1][1] > 0:
            one_row.append(df_values[row+1][1])
            data.append(torch.tensor(one_row,dtype = torch.long))
            del one_row
            torch.cuda.empty_cache()
            catch_index = df_values[row+1][0]
  
    return torch.stack(data[max_sequence_start:max_sequence_end]).to(device)

In [32]:
# del train_data
# torch.cuda.empty_cache()
seq_length = 10
train_data = create_dataset(train_df,seq_length,max_sequence_start=0,max_sequence_end=-1)

In [33]:
train_data = train_data.t().contiguous()

In [34]:
train_data.size()

torch.Size([10, 3211011])

In [35]:
# del val_data
# torch.cuda.empty_cache()
val_data = create_dataset(val_df,seq_length,max_sequence_start=0,max_sequence_end=-1)

In [45]:
val_data

tensor([[1254,   49, 2867,  ..., 1075, 1571,   25],
        [ 151,   25, 1257,  ...,  100,  613, 3146],
        [  16,  270,  592,  ...,    0, 4477,  311],
        ...,
        [   0,    0,  325,  ...,    0,    0,    0],
        [   0,    0, 1971,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0]], device='cuda:0')

In [37]:
val_data.size()

torch.Size([130985, 10])

In [38]:
val_data = val_data.t().contiguous()

In [59]:
# del model,TransformerModel2
# torch.cuda.empty_cache()

In [55]:
ntokens = k+1 # size of vocabulary
d_model = 512  # embedding dimension
nlayers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 4  # number of heads in ``nn.MultiheadAttention``
dropout = 0.1  # dropout probability
src_pad_idx = 0
max_len = seq_length-1
model = TransformerModel2(ntoken=ntokens,
                          d_model=d_model,
                          max_len=max_len,
                          nhead=nhead,
                          nlayers=nlayers,
                          dropout=dropout,
                          src_padding_idx=src_pad_idx,
                         device=device).to(device)

In [56]:
from typing import Tuple
bptt = 512
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        i: int

    Returns:
        tuple (data, target), where data has shape ``[seq_len, batch_size]`` and
        target has shape ``[seq_len * batch_size]``
    """
    data = source[: -1, i: i+bptt]
    target = source[1:, i: i+bptt]
    return data, target

In [57]:
import time

criterion = nn.CrossEntropyLoss(ignore_index=src_pad_idx)
lr = 3.0 # learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=lr,weight_decay=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       mode='min',
                                                       factor =0.8,
                                                       patience=5,
                                                       threshold=0.001)

def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 300
    start_time = time.time()
    seq_length = train_data.size(0)-1

    num_batches = train_data.size(-1) // bptt
    for batch, i in enumerate(range(0, train_data.size(-1) - bptt , bptt)):
        data, targets = get_batch(train_data, i)
        output = model(data,targets)
        output_flat = output.reshape(-1, output.shape[2])
        targets_flat = targets.reshape(-1)
        loss = criterion(output_flat, targets_flat)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        total_loss += loss.item()
        

        
        if batch % log_interval == 0 and batch > 0:
            lr = optimizer.param_groups[0]['lr']
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            scheduler.step(cur_loss)
            # print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
            #       f'lr {lr:02.4f} | ms/batch {ms_per_batch:5.2f} | '
            #       f'loss {cur_loss:5.4f} | ppl {ppl:8.4f}')
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.4f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.4f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    seq_length = val_data.size(0)-1
    with torch.no_grad():
        for i in range(0, eval_data.size(-1) - bptt, bptt):
            data, targets = get_batch(eval_data, i)
            seq_len = data.size(0)

            output = model(data, targets)
            target_flat = targets.reshape(-1)
            output_flat = output.view(-1, ntokens)
            total_loss += criterion(output_flat, target_flat).item()

    return total_loss / (eval_data.size(1) - 1)

In [58]:
import os
from tempfile import TemporaryDirectory
best_val_loss = float('inf')
epochs = 30

with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model)
        val_loss = evaluate(model, val_data)
        # val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
              f'valid loss {val_loss:5.2f}')
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)
        
        scheduler.step(best_val_loss)
    model.load_state_dict(torch.load(best_model_params_path)) # load best model states

| epoch   1 |   300/ 6271 batches | lr 3.0000 | ms/batch 123.60 | loss 529.1801
| epoch   1 |   600/ 6271 batches | lr 3.0000 | ms/batch 124.70 | loss 722.1259
| epoch   1 |   900/ 6271 batches | lr 3.0000 | ms/batch 125.70 | loss 620.0053
| epoch   1 |  1200/ 6271 batches | lr 3.0000 | ms/batch 126.08 | loss 831.1685
| epoch   1 |  1500/ 6271 batches | lr 3.0000 | ms/batch 126.58 | loss 791.0096
| epoch   1 |  1800/ 6271 batches | lr 3.0000 | ms/batch 126.93 | loss 858.0040
| epoch   1 |  2100/ 6271 batches | lr 3.0000 | ms/batch 126.87 | loss 1067.7777
| epoch   1 |  2400/ 6271 batches | lr 2.4000 | ms/batch 127.02 | loss 730.9357
| epoch   1 |  2700/ 6271 batches | lr 2.4000 | ms/batch 127.01 | loss 670.9440
| epoch   1 |  3000/ 6271 batches | lr 2.4000 | ms/batch 126.96 | loss 757.6661
| epoch   1 |  3300/ 6271 batches | lr 2.4000 | ms/batch 126.98 | loss 653.1396
| epoch   1 |  3600/ 6271 batches | lr 2.4000 | ms/batch 127.05 | loss 635.1509
| epoch   1 |  3900/ 6271 batches | lr 

In [59]:
torch.save(model,"trans4rec2_2.pth")

In [77]:
def test(model: nn.Module, eval_data: Tensor, i: int) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    with torch.no_grad():
        data, targets = get_batch(eval_data, i)
        seq_length = data.size(0)
        output = model(data,targets)
        output_flat = output.reshape(-1, output.shape[2])
        targets_flat = targets.reshape(-1)
    return data, output, targets_flat, output_flat

In [136]:
data, targets = get_batch(val_data, 0)

In [137]:
targets.shape

torch.Size([9, 512])

In [138]:
targets

tensor([[ 151,   25, 1257,  ...,   11,  288,   22],
        [  16,  270,  592,  ...,   23,  258,   19],
        [   2, 4787,   30,  ...,    0,   72, 4493],
        ...,
        [   0,    0,  325,  ...,    0,    0,    0],
        [   0,    0, 1971,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0]], device='cuda:0')

In [139]:
output = model(data,targets)

In [133]:
topk_results = torch.topk(output.reshape(-1,ntokens),5)[1]


In [134]:
topk_results

tensor([[ 151,  112, 4462, 4852, 4682],
        [  25, 4106,    9,   35, 4872],
        [1257, 1887, 2512, 1381, 1954],
        ...,
        [1606, 1038, 1501, 1444, 2412],
        [1038, 1444, 1606, 1501, 2239],
        [1038, 1606, 1444, 1501, 2239]], device='cuda:0')

In [118]:
for row in range(9):
    print

tensor([[ 151,   25, 1257,  ...,   11,  288,   22],
        [  16,  270,  592,  ...,   23,  258,   19],
        [   2, 4787,   30,  ...,    0,   72, 4493],
        ...,
        [   0,    0,  325,  ...,    0,    0,    0],
        [   0,    0, 1971,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0]], device='cuda:0')

In [71]:
output

tensor([[[ -3.2129,   3.7318,  -2.0477,  ..., -11.2380,  -1.0535,  -3.2966],
         [ -8.9984,   2.4188,   2.8738,  ...,  -3.8900,   3.6188,  -4.4709],
         [  1.2564,   3.8848,   4.5695,  ...,  -0.4588,   7.0649,   0.3766],
         ...,
         [  2.7206,   1.8285,  -4.1161,  ...,  -1.8436,  -2.2552,  -4.0761],
         [  5.5080,   3.9570,  -2.3059,  ...,  -2.7890,  -3.7426, -11.5296],
         [ -9.4433, -14.0306,   5.1443,  ...,  -4.7098,  -1.7035,  -0.8511]],

        [[ -6.7897,  -8.2017,  -7.2549,  ...,  -2.1213,  -0.6391,  -2.0576],
         [  0.7875,   2.9819,  -0.1782,  ...,  -1.1121,  -1.4375,  -1.8682],
         [ -1.8948,  -5.9607,  -2.3729,  ...,   7.6164,   0.3351,  -3.1175],
         ...,
         [  2.1742,   6.3812,   3.2649,  ...,  -3.5295,   7.6317,  -2.6995],
         [  1.6297,   7.8104,  -0.9956,  ...,   1.8467,  13.0602,  -9.8371],
         [ -8.9232,  -7.8414,   1.9285,  ...,  -1.4150,  -4.2047,  -5.5328]],

        [[  2.8453,   6.5733,  35.3299,  ...

In [87]:
samples =2000
input = inpt_data[:samples].t().reshape(-1,1)
true = targets[:samples]
prob_pred = predictions[:samples,:]
_,pred = torch.topk(prob_pred, 5)
for i in range(samples):
    print(f'input: {input[i]}| target: {true[i]}| predictions: {pred[i]}')

input: tensor([1254], device='cuda:0')| target: 151| predictions: tensor([ 151,  112, 4462, 4852, 4682], device='cuda:0')
input: tensor([151], device='cuda:0')| target: 25| predictions: tensor([  25, 4106,    9,   35, 4872], device='cuda:0')
input: tensor([16], device='cuda:0')| target: 1257| predictions: tensor([1257, 1887, 2512, 1381, 1954], device='cuda:0')
input: tensor([2], device='cuda:0')| target: 22| predictions: tensor([22, 39, 18, 21, 33], device='cuda:0')
input: tensor([5], device='cuda:0')| target: 1815| predictions: tensor([1815, 2281, 1169,  351, 3290], device='cuda:0')
input: tensor([44], device='cuda:0')| target: 364| predictions: tensor([ 364,  991,  777, 1176,  366], device='cuda:0')
input: tensor([0], device='cuda:0')| target: 259| predictions: tensor([ 259,  158,  179, 1816,  246], device='cuda:0')
input: tensor([0], device='cuda:0')| target: 2284| predictions: tensor([2284, 1454, 1644, 2729, 1872], device='cuda:0')
input: tensor([0], device='cuda:0')| target: 1| pr

In [135]:
predictions

tensor([[-12.8072,   3.8082,   3.6361,  ...,  -1.6357,  -1.5952,  -1.6449],
        [-12.8072,   3.8082,   3.6361,  ...,  -1.6357,  -1.5952,  -1.6449],
        [-12.8072,   3.8082,   3.6361,  ...,  -1.6357,  -1.5952,  -1.6449],
        ...,
        [-12.8072,   3.8082,   3.6361,  ...,  -1.6357,  -1.5952,  -1.6449],
        [-12.8072,   3.8082,   3.6361,  ...,  -1.6357,  -1.5952,  -1.6449],
        [-12.8072,   3.8082,   3.6361,  ...,  -1.6357,  -1.5952,  -1.6449]],
       device='cuda:0')