In [1]:
!nvidia-smi

Wed Dec 21 13:06:05 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:85:00.0 Off |                    0 |
| N/A   37C    P0    44W / 163W |      0MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:8A:00.0 Off |                    0 |
| N/A   45C    P0   148W / 163W |  20540MiB / 32510MiB |     72%      Default |
|       

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"]="0"

import cudf
import glob
import gc

import pandas as pd

from tqdm import tqdm
from transformers4rec import torch as tr

from merlin_standard_lib import Schema
import torch 

from custom_t4r import *

                not been set for this class (NDCGAt). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchmetrics.utilities import check_forward_full_state_property`
                that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
                default for now) or if `full_state_update=False` can be used safely.
                
                not been set for this class (DCGAt). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchmetrics.utilities import c

In [3]:
class CustomTabularSequenceFeatures(tr.TabularSequenceFeatures):
    def forward(self, inputs, training=False, testing=False, **kwargs):
        self.to_merge.categorical_module.type_seq = inputs['type_']
        outputs = super(CustomTabularSequenceFeatures, self).forward(inputs, training=training, testing=testing, **kwargs)
        return outputs

In [4]:
# Training Param
batch_size = 1024
lr = 0.0005
lr_scheduler = 'constant' # cosine
num_train_epochs = 1
using_test = True
using_type = True
bl_shuffle = True

# Transformer Architecture
d_model = 64
n_head = 8
n_layer = 3
proj_num = 2
act_mlp = torch.nn.ReLU
act_mlp = None

# Next Item Prediction
item_correction = True
neg_factor=1
label_smoothing=0.0
temperature=1.0
remove_false_neg = False
item_correction_factor = 0.1

In [5]:
params = {
    'batch_size': batch_size,
    'lr': lr,
    'lr_scheduler': lr_scheduler,
    'num_train_epochs': num_train_epochs,
    'using_test': using_test,
    'using_type':using_type,
    'bl_shuffle': bl_shuffle,
    'd_model': d_model,
    'n_head': n_head,
    'n_layer': n_layer,
    'proj_num': proj_num,
    'act_mlp': act_mlp,
    'item_correction': item_correction,
    'neg_factor': neg_factor,
    'label_smoothing': label_smoothing,
    'temperature': temperature,
    'remove_false_neg': remove_false_neg,
    'item_correction_factor': item_correction_factor
}

In [6]:
params = {
    'batch_size': 1024,
    'lr': 0.0005,
    'lr_scheduler': 'constant',
    'num_train_epochs': 1,
    'using_test': False,
    'using_type': True,
    'bl_shuffle': False,
    'masking': 'mlm',
    'd_model': 256,
    'n_head': 4,
    'n_layer': 4,
    'proj_num': 1,
    'act_mlp': 'ReLu',
    'item_correction': False,
    'neg_factor': 8,
    'label_smoothing': 0.0,
    'temperature': 1.962964566678596,
    'remove_false_neg': False, # Should be True
    'item_correction_factor': 0.0978173779982345,
    'top20': True,
    'loss_types': True
}

In [7]:
if act_mlp == 'None':
    act_mlp = None
else:
    act_mlp = torch.nn.ReLU

In [8]:
df_aid = pd.read_parquet('./data/t4r_map_aid.parquet')
df_aid['count'] = df_aid['count']/df_aid['count'].sum()
df_aid = pd.concat([
    pd.DataFrame({'aid_': [0,1], 'aid':[-1, -1], 'count': 0.00001}),
    df_aid]
).sort_values(['aid_'])
item_probs = torch.Tensor(df_aid['count'].values).cuda()

In [9]:
schema = Schema().from_proto_text('test.pb')
if not using_type:
    schema1 = schema.select_by_name(['aid_'])
    projection = None
else:
    schema1 = schema.select_by_name(['aid_'])
    projection = tr.MLPBlock([params['d_model']]*proj_num)
    prediction = None

schema2 = schema.select_by_name(['aid_', 'type_'])

inputs = CustomTabularSequenceFeatures.from_schema(
    schema1,
    max_sequence_length=20,
    masking=params['masking'],
    embedding_dims={
        'aid_': params['d_model'],
        'type_': 16
    },
    projection=projection
)

In [10]:
def custom_loss(x, y, types):
    loss = torch.nn.CrossEntropyLoss(
        label_smoothing=params['label_smoothing'],
        reduce=False
    )(x, y)
    #loss = loss*(types==1.0)
    loss = torch.mean(loss)
    return(loss)

In [11]:
transformer_config = tr.XLNetConfig.build(
    d_model=params['d_model'], n_head=params['n_head'], n_layer=params['n_layer'], total_seq_length=20
)
body = tr.SequentialBlock(
    inputs, 
    tr.MLPBlock([params['d_model']], activation=act_mlp), 
    tr.TransformerBlock(transformer_config, masking=inputs.masking)
)
head = tr.Head(
    body,
    CustomNextItemPredictionTask(
        loss = custom_loss,
        weight_tying=True, 
        item_probs=item_probs, 
        item_correction=params['item_correction'],
        neg_factor=params['neg_factor'],
        temperature=params['temperature'],
        remove_false_neg=params['remove_false_neg'],
        item_correction_factor=params['item_correction_factor'],
        loss_types=params['loss_types']
    ),
    inputs=inputs,
)
model = tr.Model(head)
print(model)
from transformers4rec.config.trainer import T4RecTrainingArguments


# Set hyperparameters for training 

train_args = T4RecTrainingArguments(
    data_loader_engine='nvtabular', 
    dataloader_drop_last = True,
    gradient_accumulation_steps = 1,
    per_device_train_batch_size = params['batch_size'], 
    per_device_eval_batch_size = 128,
    output_dir = "./tmp-test", 
    learning_rate=params['lr'],
    lr_scheduler_type=params['lr_scheduler'], 
    learning_rate_num_cosine_cycles_by_epoch=1.5,
    num_train_epochs=1,
    max_sequence_length=20, 
    report_to = [],
    logging_steps=1000,
    save_steps=1000000,
    no_cuda=False,
    #resume_from_checkpoint='./tmp_2/checkpoint-50000'
)

trainer = CustomTrainer(
    model=model,
    args=train_args,
    schema=schema2,
    compute_metrics=True,
)

trainer.set_shuffle(shuffle=params['bl_shuffle'])

Model(
  (heads): ModuleList(
    (0): Head(
      (body): SequentialBlock(
        (0): CustomTabularSequenceFeatures(
          (to_merge): ModuleDict(
            (categorical_module): SequenceEmbeddingFeatures(
              (filter_features): FilterFeatures()
              (embedding_tables): ModuleDict(
                (aid_): Embedding(1825326, 256, padding_idx=0)
              )
            )
          )
          (_aggregation): ConcatFeatures()
          (projection_module): SequentialBlock(
            (0): DenseBlock(
              (0): Linear(in_features=256, out_features=256, bias=True)
              (1): ReLU(inplace=True)
            )
            (1): DenseBlock(
              (0): Linear(in_features=256, out_features=256, bias=True)
              (1): ReLU(inplace=True)
            )
          )
          (_masking): MaskedLanguageModeling()
        )
        (1): SequentialBlock(
          (0): DenseBlock(
            (0): Linear(in_features=256, out_features=256, bi

In [12]:
if params['using_test']:
    trainer.train_dataset_or_path = ['./data/t4r_train.parquet'] + ['./data/t4r_xgb_train_x.parquet']
else:
    trainer.train_dataset_or_path = ['./data/t4r_train.parquet']


In [13]:
outout = {}
for epoch in range(1):
    trainer.reset_lr_scheduler()
    trainer.train()

***** Running training *****
  Num examples = 10584064
  Num Epochs = 1
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 10336


Step,Training Loss
1000,8.9075
2000,8.046
3000,7.2375
4000,6.7315
5000,6.2251
6000,5.8497
7000,5.5826
8000,5.282
9000,5.073
10000,4.8698


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training completed. Do not forget to share your model on huggingface.co/models =)




In [55]:
ds = trainer.get_train_dataloader()

In [56]:
batch = next(iter(ds))

In [59]:
inputs(batch)

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [3, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')


In [60]:
inputs.to_merge.categorical_module.type_seq

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [3, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')

In [None]:
def custom_loss(x, y, types):
    

In [8]:
schema = Schema().from_proto_text('test.pb')
if using_type:
    schema1 = schema.select_by_name(['aid_'])
    projection = None
else:
    schema1 = schema.select_by_name(['aid_', 'type_'])
    projection = tr.MLPBlock([params['d_model']]*proj_num)

schema2 = schema.select_by_name(['aid_', 'type_'])

df_aid = pd.read_parquet('./data/t4r_map_aid.parquet')
df_aid['count'] = df_aid['count']/df_aid['count'].sum()
df_aid = pd.concat([
    pd.DataFrame({'aid_': [0,1], 'aid':[-1, -1], 'count': 0.00001}),
    df_aid]
).sort_values(['aid_'])
item_probs = torch.Tensor(df_aid['count'].values).cuda()

inputs = tr.TabularSequenceFeatures.from_schema(
    schema1,
    max_sequence_length=20,
    masking=params['masking'],
    embedding_dims={
        'aid_': params['d_model'],
        'type_': 16
    },
    projection=projection
)
transformer_config = tr.XLNetConfig.build(
    d_model=params['d_model'], n_head=params['n_head'], n_layer=params['n_layer'], total_seq_length=20
)
body = tr.SequentialBlock(
    inputs, 
    tr.MLPBlock([params['d_model']], activation=act_mlp), 
    tr.TransformerBlock(transformer_config, masking=inputs.masking)
)
head = tr.Head(
    body,
    CustomNextItemPredictionTask(
        loss = torch.nn.CrossEntropyLoss(label_smoothing=params['label_smoothing']),
        weight_tying=True, 
        item_probs=item_probs, 
        item_correction=params['item_correction'],
        neg_factor=params['neg_factor'],
        temperature=params['temperature'],
        remove_false_neg=params['remove_false_neg'],
        item_correction_factor=params['item_correction_factor']
    ),
    inputs=inputs,
)
model = tr.Model(head)
print(model)
from transformers4rec.config.trainer import T4RecTrainingArguments


# Set hyperparameters for training 

train_args = T4RecTrainingArguments(
    data_loader_engine='nvtabular', 
    dataloader_drop_last = True,
    gradient_accumulation_steps = 1,
    per_device_train_batch_size = params['batch_size'], 
    per_device_eval_batch_size = 128,
    output_dir = "./tmp-test", 
    learning_rate=params['lr'],
    lr_scheduler_type=params['lr_scheduler'], 
    learning_rate_num_cosine_cycles_by_epoch=1.5,
    num_train_epochs=1,
    max_sequence_length=20, 
    report_to = [],
    logging_steps=1000,
    save_steps=1000000,
    no_cuda=False,
    #resume_from_checkpoint='./tmp_2/checkpoint-50000'
)

trainer = CustomTrainer(
    model=model,
    args=train_args,
    schema=schema2,
    compute_metrics=True,
)

trainer.set_shuffle(shuffle=params['bl_shuffle'])

Model(
  (heads): ModuleList(
    (0): Head(
      (body): SequentialBlock(
        (0): TabularSequenceFeatures(
          (to_merge): ModuleDict(
            (categorical_module): SequenceEmbeddingFeatures(
              (filter_features): FilterFeatures()
              (embedding_tables): ModuleDict(
                (aid_): Embedding(1825326, 256, padding_idx=0)
              )
            )
          )
          (_aggregation): ConcatFeatures()
          (_masking): MaskedLanguageModeling()
        )
        (1): SequentialBlock(
          (0): DenseBlock(
            (0): Linear(in_features=256, out_features=256, bias=True)
            (1): ReLU(inplace=True)
          )
        )
        (2): TansformerBlock(
          (transformer): XLNetModel(
            (word_embedding): Embedding(1, 256)
            (layer): ModuleList(
              (0): XLNetLayer(
                (rel_attn): XLNetRelativeAttention(
                  (layer_norm): LayerNorm((256,), eps=0.03, elementwise_af

In [9]:
!ls ./data/

candidates		 test				 xgb_test_x.parquet
t4r_map_aid.parquet	 top_15_buy2buy_v3.parquet	 xgb_test_y.parquet
t4r_train.parquet	 top_15_carts_orders_v3.parquet  xgb_train_x.parquet
t4r_xgb_test_x.parquet	 top_20_clicks_v3.parquet	 xgb_train_y.parquet
t4r_xgb_train_x.parquet  train


In [10]:
outout = {}
for epoch in range(1):
    if params['using_test']:
        trainer.train_dataset_or_path = ['./data/t4r_train.parquet'] + ['./data/t4r_xgb_train_x.parquet']
    else:
        trainer.train_dataset_or_path = ['./data/t4r_train.parquet']
    trainer.reset_lr_scheduler()
    trainer.train()

***** Running training *****
  Num examples = 10584064
  Num Epochs = 1
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 10336


Step,Training Loss
1000,8.1348
2000,6.6677
3000,5.9621
4000,5.6553
5000,5.3442
6000,5.1003
7000,4.9177
8000,4.6953
9000,4.5298
10000,4.368




Training completed. Do not forget to share your model on huggingface.co/models =)




In [14]:
eval_paths = ['./data/t4r_xgb_test_x.parquet']
trainer.args.predict_top_k = 20
trainer.test_dataset_or_path = ['./data/t4r_xgb_test_x.parquet']
prediction = trainer.predict(eval_paths)

import pandas as pd

df = pd.read_parquet('./data/t4r_xgb_test_x.parquet', columns=['session'])
df = df.head(prediction[0][0].shape[0])
df['pred'] = prediction[0][0].tolist()
df_aid = pd.read_parquet('./data/t4r_map_aid.parquet')
map_aid = df_aid.to_records()
map_aids = {}

for r in map_aid:
    map_aids[r[1]] = r[2]

map_aids[1] = -1
map_aids[0] = -1

df['pred'] = df['pred'].apply(lambda x: [map_aids[y] for y in x])
df.columns = ['session', 'labels']

In [15]:
recalls = {}

for target_type in ['clicks']:
    test_labels = pd.read_parquet('./data/xgb_test_y.parquet')
    test_labels = test_labels[['session', 'aid', 'type']].groupby(['session', 'type']).agg(list).reset_index()
    test_labels = test_labels.loc[test_labels['type']==target_type]
    test_labels = test_labels.merge(df, how='left', on=['session'])
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.aid).intersection(set(df.labels))), axis=1)
    test_labels['gt_count'] = test_labels.aid.str.len().clip(0,20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    recalls[target_type] = recall


In [16]:
recalls

{'clicks': 0.3197804815458813}

In [24]:
recalls

{'clicks': 0.31261183922079927}

In [25]:
recalls

{'clicks': 0.31261183922079927}

In [13]:
!mkdir -p ./data/candidates/test/t4r
!mkdir -p ./data/candidates/train/t4r

In [14]:
topk = 40
eval_paths = ['./data/t4r_xgb_test_x.parquet']
trainer.args.predict_top_k = topk
trainer.test_dataset_or_path = ['./data/t4r_xgb_test_x.parquet']
prediction = trainer.predict(eval_paths)

In [15]:
df_aid = pd.read_parquet('./data/t4r_map_aid.parquet')
map_aid = df_aid.to_records()
map_aids = {}

for r in map_aid:
    map_aids[r[1]] = r[2]

map_aids[1] = -1
map_aids[0] = -1

In [16]:
%%time

import pandas as pd

df = pd.read_parquet('./data/t4r_xgb_test_x.parquet', columns=['session'])
df = df.head(prediction[0][0].shape[0])
df['pred'] = prediction[0][0].tolist()
df['pred'] = df['pred'].apply(lambda x: [map_aids[y] for y in x])
df = cudf.from_pandas(df)
for i in range(topk):
    df['rec_' + str(i)] = df.pred.list.get(i, default=-1)
df.drop(['pred'], inplace=True, axis=1)
df = cudf.melt(df, id_vars=['session'], value_vars=['rec_' + str(i) for i in range(topk)])
df['t4r_score'] = 1/(1+df['variable'].cat.codes)
df[['session', 'value', 't4r_score']].rename(
    columns={'value': 'cand'}
).to_parquet('./data/candidates/test/t4r/cand.parquet')

CPU times: user 1min 39s, sys: 6.8 s, total: 1min 46s
Wall time: 1min 46s


In [17]:
import gc

del df
gc.collect()

0

In [18]:
topk = 40
eval_paths = ['./data/t4r_xgb_train_x.parquet']
trainer.args.predict_top_k = topk
trainer.test_dataset_or_path = ['./data/t4r_xgb_train_x.parquet']
prediction = trainer.predict(eval_paths)

In [19]:
%%time

import pandas as pd

df = pd.read_parquet('./data/t4r_xgb_train_x.parquet', columns=['session'])
df = df.head(prediction[0][0].shape[0])
df['pred'] = prediction[0][0].tolist()
df['pred'] = df['pred'].apply(lambda x: [map_aids[y] for y in x])
df = cudf.from_pandas(df)
for i in range(topk):
    df['rec_' + str(i)] = df.pred.list.get(i, default=-1)
df.drop(['pred'], inplace=True, axis=1)
df = cudf.melt(df, id_vars=['session'], value_vars=['rec_' + str(i) for i in range(topk)])
df['t4r_score'] = 1/(1+df['variable'].cat.codes)
df[['session', 'value', 't4r_score']].rename(
    columns={'value': 'cand'}
).to_parquet('./data/candidates/train/t4r/cand.parquet')

CPU times: user 48.4 s, sys: 2.59 s, total: 51 s
Wall time: 51.4 s


In [21]:
recall

0.40670810927130596

In [48]:
df = df.to_pandas()

In [58]:
df['t4r_score'] = 1/(1+df['variable'].cat.codes)

In [59]:
df['t4r_score'] 

0           1.000
1           1.000
2           1.000
3           1.000
4           1.000
            ...  
71349475    0.025
71349476    0.025
71349477    0.025
71349478    0.025
71349479    0.025
Name: t4r_score, Length: 71349480, dtype: float32

In [29]:
df.drop(['variable'], inplace=True, axis=1)

In [None]:
df_items

In [None]:

recalls = {}

for target_type in ['clicks', 'carts', "orders"]:
    test_labels = pd.read_parquet(path + '/test_labels.parquet')
    test_labels = test_labels[['session', 'aid', 'type']].groupby(['session', 'type']).agg(list).reset_index()
    test_labels = test_labels.loc[test_labels['type']==target_type]
    test_labels = test_labels.merge(df, how='left', on=['session'])
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.aid).intersection(set(df.labels))), axis=1)
    test_labels['gt_count'] = test_labels.aid.str.len().clip(0,20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    recalls[target_type] = recall
print(epoch, recalls)
outout[epoch] = recalls

In [13]:
for epoch in range(10):
    if params['using_test']:
        trainer.train_dataset_or_path = [path + 'data/train.parquet'] + [path + 'data/test.parquet']
    else:
        trainer.train_dataset_or_path = [path + 'data/train.parquet']
    trainer.reset_lr_scheduler()
    trainer.train()

    eval_paths = [path + 'data/test.parquet']
    trainer.args.predict_top_k = 20
    trainer.test_dataset_or_path = [path + 'data/test.parquet']
    prediction = trainer.predict(eval_paths)

    import pandas as pd

    df = pd.read_parquet(path + 'data/test.parquet', columns=['session'])
    df = df.head(prediction[0][0].shape[0])
    df['pred'] = prediction[0][0].tolist()
    df_aid = pd.read_parquet(path + '/data/map_aid.parquet')
    map_aid = df_aid.to_records()
    map_aids = {}

    for r in map_aid:
        map_aids[r[1]] = r[2]

    map_aids[1] = -1
    map_aids[0] = -1

    df['pred'] = df['pred'].apply(lambda x: [map_aids[y] for y in x])
    df.columns = ['session', 'labels']

    recalls = {}

    for target_type in ['clicks', 'carts', "orders"]:
        test_labels = pd.read_parquet(path + '/test_labels.parquet')
        test_labels = test_labels[['session', 'aid', 'type']].groupby(['session', 'type']).agg(list).reset_index()
        test_labels = test_labels.loc[test_labels['type']==target_type]
        test_labels = test_labels.merge(df, how='left', on=['session'])
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.aid).intersection(set(df.labels))), axis=1)
        test_labels['gt_count'] = test_labels.aid.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        recalls[target_type] = recall
    print(epoch, recalls)
    outout[epoch+13] = recalls

***** Running training *****
  Num examples = 10584064
  Num Epochs = 1
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 10336


Step,Training Loss
1000,3.4084
2000,3.3447
3000,3.5014
4000,3.4986
5000,3.4184
6000,3.3633
7000,3.2752
8000,3.1778
9000,3.1296
10000,3.0021


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training completed. Do not forget to share your model on huggingface.co/models =)




0 {'clicks': 0.4395167740131393, 'carts': 0.2949520388133773, 'orders': 0.40411926106773977}


***** Running training *****
  Num examples = 10584064
  Num Epochs = 1
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 10336


Step,Training Loss
1000,3.5412
2000,3.4515
3000,3.478
4000,3.4476
5000,3.3633
6000,3.3131
7000,3.2278
8000,3.127
9000,3.0798
10000,2.9546




Training completed. Do not forget to share your model on huggingface.co/models =)




1 {'clicks': 0.4388637192704659, 'carts': 0.2944088490549733, 'orders': 0.403273278199697}


***** Running training *****
  Num examples = 10584064
  Num Epochs = 1
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 10336


Step,Training Loss
1000,3.5757
2000,3.4839
3000,3.4629
4000,3.4241
5000,3.3432
6000,3.2893
7000,3.2009
8000,3.1044
9000,3.0541
10000,2.9279




Training completed. Do not forget to share your model on huggingface.co/models =)




2 {'clicks': 0.43934416042476754, 'carts': 0.2944776295763296, 'orders': 0.4030834949707444}


***** Running training *****
  Num examples = 10584064
  Num Epochs = 1
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 10336


Step,Training Loss
1000,3.5868
2000,3.4904
3000,3.447



KeyboardInterrupt



In [19]:
outout

{0: {'clicks': 0.4093698108039996,
  'carts': 0.27327735657056795,
  'orders': 0.3745581106597058},
 1: {'clicks': 0.42505175530758016,
  'carts': 0.28394186458702586,
  'orders': 0.38732183697298966},
 2: {'clicks': 0.4333475643647302,
  'carts': 0.29088693364090573,
  'orders': 0.398335697582033},
 3: {'clicks': 0.4368677308102597,
  'carts': 0.29359759162359067,
  'orders': 0.40266854519896683},
 4: {'clicks': 0.43769282376267704,
  'carts': 0.2933806684408514,
  'orders': 0.4010473460906263},
 5: {'clicks': 0.4364407998683534,
  'carts': 0.29198036757016055,
  'orders': 0.39820703098613297},
 6: {'clicks': 0.4379183721848162,
  'carts': 0.29383567804367033,
  'orders': 0.40181291233623156},
 7: {'clicks': 0.4384862708905595,
  'carts': 0.2939256218023671,
  'orders': 0.4013593625856839},
 8: {'clicks': 0.43876072649607073,
  'carts': 0.29437357699273925,
  'orders': 0.40322824489113196},
 9: {'clicks': 0.4382302274011413,
  'carts': 0.2940085111486171,
  'orders': 0.403807244572682

In [15]:
eval_paths = [path + 'data/test.parquet']
trainer.args.predict_top_k = 20
trainer.test_dataset_or_path = [path + 'data/test.parquet']
prediction = trainer.predict(eval_paths)

In [16]:
prediction

PredictionOutput(predictions=(array([[ 64741,  36670,  24906, ...,   6683, 217820,  12693],
       [135681,  57674,  61132, ...,  48042,  23134,  12618],
       [  5233,    465,   5202, ...,    454,  76046,   2601],
       ...,
       [167262,  18758,  19317, ...,  78228,   2796,   9786],
       [ 15513,  10764,  13103, ...,  10983, 117849,  14583],
       [120020, 117659,  56312, ..., 238944, 140828, 406793]]), array([[24.68053 , 23.489492, 22.749798, ..., 20.471788, 20.45895 ,
        20.447681],
       [34.522373, 23.81918 , 23.412258, ..., 20.423279, 20.341911,
        20.282814],
       [28.116419, 22.63007 , 22.254469, ..., 18.253366, 18.046333,
        17.892616],
       ...,
       [32.94166 , 21.69962 , 20.959791, ..., 19.10243 , 19.097216,
        19.021849],
       [27.69482 , 27.61845 , 26.228548, ..., 19.584465, 18.8672  ,
        18.861032],
       [38.39622 , 30.486069, 26.92591 , ..., 19.255077, 19.011198,
        18.970572]], dtype=float32)), label_ids=None, metrics={'

In [17]:
import pandas as pd

df = pd.read_parquet(path + 'data/test.parquet', columns=['session'])
df = df.head(prediction[0][0].shape[0])
df['pred'] = prediction[0][0].tolist()
df_aid = pd.read_parquet(path + '/data/map_aid.parquet')
map_aid = df_aid.to_records()

map_aids = {}

for r in map_aid:
    map_aids[r[1]] = r[2]

map_aids[1] = -1
map_aids[0] = -1

df['pred'] = df['pred'].apply(lambda x: [map_aids[y] for y in x])
df['score'] = prediction[0][1].tolist()
df.to_parquet('./data/pred_transformer.parquet')

In [6]:
schema = Schema().from_proto_text('test.pb')

In [7]:
if using_type:
    schema1 = schema.select_by_name(['aid_'])
    projection = None
else:
    schema1 = schema.select_by_name(['aid_', 'type_'])
    projection = tr.MLPBlock([d_model]*proj_num)

schema2 = schema.select_by_name(['aid_', 'type_'])

In [8]:
df_aid = pd.read_parquet(path + '/data/map_aid.parquet')
df_aid['count'] = df_aid['count']/df_aid['count'].sum()
df_aid = pd.concat([
    pd.DataFrame({'aid_': [0,1], 'aid':[-1, -1], 'count': 0.00001}),
    df_aid]
).sort_values(['aid_'])
item_probs = torch.Tensor(df_aid['count'].values).cuda()

In [9]:
inputs = tr.TabularSequenceFeatures.from_schema(
    schema1,
    max_sequence_length=20,
    masking="mlm",
    embedding_dims={
        'aid_': d_model,
        'type_': 16
    },
    projection=projection
)

In [10]:
transformer_config = tr.XLNetConfig.build(
    d_model=d_model, n_head=n_head, n_layer=n_layer, total_seq_length=20
)
# Define the model block including: inputs, masking, projection and transformer block.
body = tr.SequentialBlock(
    inputs, 
    tr.MLPBlock([d_model], activation=act_mlp), 
    tr.TransformerBlock(transformer_config, masking=inputs.masking)
)

# Define the evaluation top-N metrics and the cut-offs


# Define a head related to next item prediction task 
head = tr.Head(
    body,
    CustomNextItemPredictionTask(
        loss = torch.nn.CrossEntropyLoss(label_smoothing=label_smoothing),
        weight_tying=True, 
        item_probs=item_probs, 
        item_correction=item_correction,
        neg_factor=neg_factor,
        temperature=temperature,
        remove_false_neg=remove_false_neg,
        item_correction_factor=item_correction_factor
    ),
    inputs=inputs,
)

# Get the end-to-end Model class 
model = tr.Model(head)


In [11]:
model

Model(
  (heads): ModuleList(
    (0): Head(
      (body): SequentialBlock(
        (0): TabularSequenceFeatures(
          (to_merge): ModuleDict(
            (categorical_module): SequenceEmbeddingFeatures(
              (filter_features): FilterFeatures()
              (embedding_tables): ModuleDict(
                (aid_): Embedding(1825326, 64, padding_idx=0)
              )
            )
          )
          (_aggregation): ConcatFeatures()
          (_masking): CausalLanguageModeling()
        )
        (1): SequentialBlock(
          (0): DenseBlock(
            (0): Linear(in_features=64, out_features=64, bias=True)
          )
        )
        (2): TansformerBlock(
          (transformer): XLNetModel(
            (word_embedding): Embedding(1, 64)
            (layer): ModuleList(
              (0): XLNetLayer(
                (rel_attn): XLNetRelativeAttention(
                  (layer_norm): LayerNorm((64,), eps=0.03, elementwise_affine=True)
                  (dropout): D

In [12]:
from transformers4rec.config.trainer import T4RecTrainingArguments


# Set hyperparameters for training 

train_args = T4RecTrainingArguments(
    data_loader_engine='nvtabular', 
    dataloader_drop_last = True,
    gradient_accumulation_steps = 1,
    per_device_train_batch_size = batch_size, 
    per_device_eval_batch_size = 128,
    output_dir = "./tmp-test", 
    learning_rate=lr,
    lr_scheduler_type=lr_scheduler, 
    learning_rate_num_cosine_cycles_by_epoch=1.5,
    num_train_epochs=num_train_epochs,
    max_sequence_length=20, 
    report_to = [],
    logging_steps=1000,
    save_steps=1000000,
    no_cuda=False,
    #resume_from_checkpoint='./tmp_2/checkpoint-50000'
)

In [13]:
trainer = CustomTrainer(
    model=model,
    args=train_args,
    schema=schema2,
    compute_metrics=True,
)

In [14]:
trainer.set_shuffle(shuffle=bl_shuffle)

In [15]:
if using_test:
    trainer.train_dataset_or_path = [path + 'data/train.parquet'] + [path + 'data/test.parquet']
else:
    trainer.train_dataset_or_path = [path + 'data/train.parquet']
trainer.reset_lr_scheduler()
trainer.train()
#trainer.state.global_step +=1

***** Running training *****
  Num examples = 12367872
  Num Epochs = 1
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 12078


Step,Training Loss
1000,7.0833
2000,5.4582
3000,4.85
4000,4.157
5000,3.7905
6000,3.5598
7000,3.3723
8000,3.2097
9000,3.065


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)


KeyboardInterrupt



In [18]:
eval_paths = [path + 'data/test.parquet']
trainer.args.predict_top_k = 100
trainer.test_dataset_or_path = [path + 'data/test.parquet']
prediction = trainer.predict(eval_paths)

In [19]:
import pandas as pd

df = pd.read_parquet(path + 'data/test.parquet', columns=['session'])
df = df.head(prediction[0][0].shape[0])
df['pred'] = prediction[0][0].tolist()
df_aid = pd.read_parquet(path + '/data/map_aid.parquet')
map_aid = df_aid.to_records()
map_aids = {}

for r in map_aid:
    map_aids[r[1]] = r[2]
    
map_aids[1] = -1
map_aids[0] = -1

df['pred'] = df['pred'].apply(lambda x: [map_aids[y] for y in x])
df.columns = ['session', 'labels']

In [20]:
recalls = {}

for target_type in ['clicks', 'carts', "orders"]:
    test_labels = pd.read_parquet(path + '/test_labels.parquet')
    test_labels = test_labels[['session', 'aid', 'type']].groupby(['session', 'type']).agg(list).reset_index()
    test_labels = test_labels.loc[test_labels['type']==target_type]
    test_labels = test_labels.merge(df, how='left', on=['session'])
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.aid).intersection(set(df.labels))), axis=1)
    test_labels['gt_count'] = test_labels.aid.str.len().clip(0,20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    recalls[target_type] = recall

In [18]:
recalls

{'clicks': 0.43934416042476754,
 'carts': 0.2944776295763296,
 'orders': 0.4030834949707444}

In [34]:
test_labels = pd.read_parquet(path + '/test_labels.parquet')


In [35]:
test_labels.head()

Unnamed: 0,session,aid,type
0,11098528,796572,clicks
1,11098528,92401,orders
2,11098528,1561739,orders
3,11098528,950341,orders
4,11098529,1105029,clicks


In [21]:
prediction[0][1]

array([[ 9.603529 ,  9.584609 ,  9.4937725, ...,  8.045933 ,  8.042598 ,
         8.034368 ],
       [ 9.104875 ,  7.8496995,  7.3927402, ...,  5.7704325,  5.769855 ,
         5.7587805],
       [14.364313 , 13.633856 , 13.207083 , ...,  8.756059 ,  8.704534 ,
         8.700096 ],
       ...,
       [10.337594 ,  7.860083 ,  7.7989454, ...,  6.551793 ,  6.548812 ,
         6.546137 ],
       [14.379154 , 12.915765 , 12.796582 , ...,  7.0632195,  7.0531487,
         7.0494847],
       [13.049289 ,  9.941049 ,  6.932467 , ...,  5.827901 ,  5.823332 ,
         5.818209 ]], dtype=float32)

In [24]:
df = cudf.from_pandas(df)
df = df.to_pandas()

In [32]:
df

Unnamed: 0,session,pred
0,11098528,"[1289327, 1033148, 846545, 1722334, 1631509, 4..."
1,11098529,"[1105029, 258321, 1383767, 1171006, 644001, 12..."
2,11098530,"[264500, 409236, 877496, 583026, 1058185, 3641..."
3,11098531,"[1271998, 702610, 396199, 1565726, 524159, 448..."
4,11098532,"[108125, 1673641, 470541, 317695, 7651, 123224..."
...,...,...
1783732,12899774,"[33035, 31490, 1226691, 771913, 1539309, 13596..."
1783733,12899775,"[1743151, 1760714, 1550204, 1163166, 155954, 1..."
1783734,12899776,"[548599, 1470447, 1048957, 1598714, 1364783, 4..."
1783735,12899777,"[384045, 1688215, 1308634, 1281056, 395762, 14..."


In [33]:
df.drop(['score'], axis=1, inplace=True)

KeyError: "['score'] not found in axis"

In [49]:
df.columns = ['session', 'labels']

In [50]:
import pandas as pd

test_labels = pd.read_parquet(path + '/test_labels.parquet')
test_labels = test_labels[['session', 'aid', 'type']].groupby(['session', 'type']).agg(list).reset_index()
test_labels = test_labels.loc[test_labels['type']=='clicks']


In [51]:
test_labels

Unnamed: 0,session,type,aid
0,11098528,clicks,[796572]
2,11098529,clicks,[1105029]
4,11098530,clicks,[264500]
7,11098532,clicks,[461190]
9,11098533,clicks,[1697666]
...,...,...,...
2189388,12899774,clicks,[1399483]
2189389,12899775,clicks,[1760714]
2189390,12899776,clicks,[1737908]
2189391,12899777,clicks,[384045]


In [52]:
test_labels = test_labels.merge(df, how='left', on=['session'])

In [None]:
test_labels = test_labels.merge(df, how='left', on=['session'])
test_labels['hits'] = test_labels.apply(lambda df: len(set(df.aid).intersection(set(df.labels))), axis=1)
test_labels['gt_count'] = test_labels.aid.str.len().clip(0,20)
recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()


In [54]:
test_labels['gt_count'] = test_labels.aid.str.len().clip(0,20)
recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()


In [55]:
recall

0.4810194098226338

In [147]:
test_labels['hits'].sum()

306536

In [74]:
prediction[0][0]

array([[  503,  1255,  1009, ...,   270,  4844,   467],
       [    1,  1591,   511, ...,  2942, 12201,   288],
       [  465,   962,   454, ...,  5244,  2192,  3930],
       ...,
       [    1,  4442,  6283, ..., 55952,  1295,  3330],
       [ 2259,   569,  8203, ...,  3419,  3655,   662],
       [    1,   102,   858, ...,  1394,    40,   245]])

In [75]:
test_labels = cudf.read_parquet(path + '/test_labels.parquet')

In [76]:
test_labels = test_labels.merge(
    test_labels.groupby(['session', 'type']).count().reset_index().rename(columns={'aid': 'no_gt'}),
    how='left',
    on=['session', 'type']
)


In [64]:
test_labels[test_labels['type']=='clicks'][['session', 'no_gt']].groupby(['session']).max().no_gt.clip(0,20).sum()

1737986

In [77]:
df_aid = cudf.read_parquet(path + '/data/map_aid.parquet')

In [78]:
df = df.merge(
    df_aid,
    how='left',
    on='pred_' + str(i) + '_'
)

KeyError: 'pred_19_'

In [79]:
df

Unnamed: 0,session
0,11098528
1,11098529
2,11098530
3,11098531
4,11098532
...,...
1783732,12899774
1783733,12899775
1783734,12899776
1783735,12899777


In [80]:
df_aid

Unnamed: 0,aid_,aid
0,2,1460571
1,3,29735
2,4,108125
3,5,1733943
4,6,832192
...,...,...
1562189,1562191,838504
1562190,1562192,1774445
1562191,1562193,816230
1562192,1562194,1628178


In [81]:
for i in range(20):
    df['pred_' + str(i) + '_'] = prediction[0][0][:, i]
    df_aid.columns = ['pred_' + str(i) + '_', 'pred_' + str(i)]
    df = df.merge(
        df_aid,
        how='left',
        on='pred_' + str(i) + '_'
    )
    df['pred_' + str(i)] = df['pred_' + str(i)].fillna(-1)
    df.drop(['pred_' + str(i) + '_'], inplace=True, axis=1)

In [34]:
df = cudf.melt(df, id_vars=['session'], value_vars=['pred_' + str(i) for i in range(20)])

In [35]:
df.drop(['variable'], axis=1, inplace=True)

In [36]:
import gc
gc.collect()

776

In [37]:
df = df.to_pandas()

In [38]:
df = df.merge(
    test_labels.to_pandas(),
    how='left',
    on=['session']
)

In [47]:
df['type'].drop_duplicates()

0     clicks
8     orders
11     carts
Name: type, dtype: object

In [54]:
df_clicks = df[df['type']=='carts']

In [55]:
df_clicks['hit'] = (df_clicks['aid']==df_clicks['value']).astype('int8')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clicks['hit'] = (df_clicks['aid']==df_clicks['value']).astype('int8')


In [56]:
df_clicks = df_clicks.groupby(['session']).agg({'no_gt': max, 'hit': sum})

In [57]:
df_clicks.no_gt.clip(0,20).sum()

567021

In [58]:
df_clicks.hit.sum()

19326

In [140]:
84380/1737802

0.048555589186800335

In [148]:
df_clicks['no_gt'].max()

1

In [113]:
test_labels

Unnamed: 0,session,aid,type,no_gt
0,11103159,1107392,carts,16
1,11103159,1105859,carts,16
2,11103159,518675,carts,16
3,11103159,157592,carts,16
4,11103159,1667358,carts,16
...,...,...,...,...
2621075,12899752,429407,clicks,1
2621076,12899754,922732,clicks,1
2621077,12899755,605,clicks,1
2621078,12899748,1503117,clicks,1


In [56]:
test_labels

Unnamed: 0,session,aid,type
0,11098528,796572,clicks
1,11098528,92401,orders
2,11098528,1561739,orders
3,11098528,950341,orders
4,11098529,1105029,clicks
...,...,...,...
2621075,12899774,1399483,clicks
2621076,12899775,1760714,clicks
2621077,12899776,1737908,clicks
2621078,12899777,384045,clicks


In [58]:
test_labels['no_gt'] = test_labels.groupby(['session', 'type']).count()

TypeError: incompatible index of inserted column with frame index

In [None]:
test_labels

In [43]:
df

(1783737, 5)

In [40]:
ds.to_ddf()

<merlin.io.dataset.Dataset at 0x7f2805bab6a0>

In [None]:
??trainer.get_train_dataloader

In [42]:
next(iter(dl))

({'session': tensor([    0,     1,     2,  ..., 66480, 66481, 66482], device='cuda:0',
         dtype=torch.int32),
  'type_': (tensor([1, 1, 1,  ..., 1, 2, 1], device='cuda:0'),
   tensor([[     0],
           [    20],
           [    40],
           ...,
           [858002],
           [858004],
           [858024]], device='cuda:0', dtype=torch.int32)),
  'aid_': (tensor([1092338,  712999,   29865,  ..., 1624604,  181850,  181850],
          device='cuda:0'),
   tensor([[     0],
           [    20],
           [    40],
           ...,
           [858002],
           [858004],
           [858024]], device='cuda:0', dtype=torch.int32)),
  'dummy': (tensor([1, 1, 1,  ..., 1, 1, 1], device='cuda:0'),
   tensor([[     0],
           [    20],
           [    40],
           ...,
           [858002],
           [858004],
           [858024]], device='cuda:0', dtype=torch.int32)),
  'rank': (tensor([1, 2, 3,  ..., 5, 6, 7], device='cuda:0'),
   tensor([[     0],
           [    20],
   

In [31]:
??trainer.train

In [19]:
cp test.pb ../../preprocess/data/schema.pb

In [28]:
from merlin.io import Dataset
from merlin.loader.torch import Loader

ds = Dataset([path + 'data/train.parquet'])
loader = Loader(ds, batch_size=65536)

({'session': tensor([    0,     1,     2,  ..., 66480, 66481, 66482], device='cuda:0',
         dtype=torch.int32),
  'type_': (tensor([1, 1, 1,  ..., 1, 2, 1], device='cuda:0'),
   tensor([[     0],
           [    20],
           [    40],
           ...,
           [858002],
           [858004],
           [858024]], device='cuda:0', dtype=torch.int32)),
  'aid_': (tensor([1092338,  712999,   29865,  ..., 1624604,  181850,  181850],
          device='cuda:0'),
   tensor([[     0],
           [    20],
           [    40],
           ...,
           [858002],
           [858004],
           [858024]], device='cuda:0', dtype=torch.int32)),
  'dummy': (tensor([1, 1, 1,  ..., 1, 1, 1], device='cuda:0'),
   tensor([[     0],
           [    20],
           [    40],
           ...,
           [858002],
           [858004],
           [858024]], device='cuda:0', dtype=torch.int32)),
  'rank': (tensor([1, 2, 3,  ..., 5, 6, 7], device='cuda:0'),
   tensor([[     0],
           [    20],
   

In [None]:
ds