In [1]:
#!pip install torch torchmetrics==0.10.0

In [3]:
import os

os.environ["CUDA_VISIBLE_DEVICES"]="0"

import cudf
import glob
import gc

import pandas as pd

from tqdm import tqdm
from transformers4rec import torch as tr

from merlin_standard_lib import Schema
import torch 

from custom_t4r import *

                not been set for this class (NDCGAt). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchmetrics.utilities import check_forward_full_state_property`
                that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
                default for now) or if `full_state_update=False` can be used safely.
                
                not been set for this class (DCGAt). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchmetrics.utilities import c

In [4]:
class CustomTabularSequenceFeatures(tr.TabularSequenceFeatures):
    def forward(self, inputs, training=False, testing=False, **kwargs):
        self.to_merge.categorical_module.type_seq = inputs['type_']
        outputs = super(CustomTabularSequenceFeatures, self).forward(inputs, training=training, testing=testing, **kwargs)
        return outputs

In [5]:
# # Training Param
# batch_size = 1024
# lr = 0.0005
# lr_scheduler = 'constant' # cosine
# num_train_epochs = 1
# using_test = True
# using_type = True
# bl_shuffle = True

# # Transformer Architecture
# d_model = 64
# n_head = 8
# n_layer = 3
# proj_num = 2
# act_mlp = torch.nn.ReLU
# act_mlp = None

# # Next Item Prediction
# item_correction = True
# neg_factor=1
# label_smoothing=0.0
# temperature=1.0
# remove_false_neg = False
# item_correction_factor = 0.1

In [6]:
# params = {
#      'batch_size': 1024,
#      'lr': 0.0005,
#      'lr_scheduler': 'constant',
#      'num_train_epochs': 1,
#      'using_test': False,
#      'using_type': True,
#      'bl_shuffle': False,
#      'masking': 'mlm',
#      'd_model': 256,
#      'n_head': 16,
#      'n_layer': 3,
#      'proj_num': 1,
#      'act_mlp': 'None',
#      'item_correction': True,
#      'neg_factor': 4,
#      'label_smoothing': 0.0,
#      'temperature': 1.0,
#      'remove_false_neg': False,
#      'item_correction_factor': 0.1,
#      'transformer_dropout': 0.1,
#      'mlm_probability': 0.25,
#      'top20': True,
#      'loss_types': True,
#      'loss_types_type': 'Weighted',
#      'multi_task_emb': 16,
#      'mt_num_layers': 1,
#      'use_tanh': True,
#      'seq_len': 50,
#      'split': 0
# }

params = {
    'batch_size': 1024,
    'lr': 0.0005,
    'lr_scheduler': 'cosine',
    'num_train_epochs': 1,
    'using_test': True,
    'using_type': False,
    'bl_shuffle': True,
    'masking': 'mlm',
    'd_model': 256,
    'n_head': 32,
    'n_layer': 3,
    'proj_num': 1,
    'act_mlp': 'None',
    'item_correction': False,
    'neg_factor': 4,
    'label_smoothing': 0.0,
    'temperature': 1.5734215681668653,
    'remove_false_neg': True,
    'item_correction_factor': 0.04152252077012748,
    'transformer_dropout': 0.05096800263401626,
    'mlm_probability': 0.35044384745899415,
    'top20': True,
    'loss_types': True,
    'loss_types_type': 'Simple',
    'multi_task_emb': 0,
    'mt_num_layers': 1,
    'use_tanh': False,
    'seq_len': 20,
    'split': 0
}

In [7]:
if params['act_mlp'] == 'None':
    act_mlp = None
else:
    act_mlp = torch.nn.ReLU
    
from transformers4rec.torch.masking import MaskedLanguageModeling

masking = 'mlm'
if params['masking']=='mlm':
    masking = MaskedLanguageModeling(
        hidden_size=params['d_model'], 
        mlm_probability=params['mlm_probability']
    )
else:
    masking = params['masking']
    
if params['loss_types']:
    if params['loss_types']=='Simple':
        def custom_loss(x, y, types):
            loss = torch.nn.CrossEntropyLoss(
                label_smoothing=params['label_smoothing'],
                reduce=False
            )(x, y)
            #loss = loss*(types==1.0)
            loss = torch.mean(loss)
            return(loss)
        loss_fn = custom_loss
    else:
        def custom_loss(x, y, types):
            loss = torch.nn.CrossEntropyLoss(
                label_smoothing=params['label_smoothing'],
                reduce=False
            )(x, y)
            loss = loss*(types==1.0)+loss*(types==2.0)*5.0+loss*(types==3.0)*10.0
            loss = torch.mean(loss)
            return(loss)
        loss_fn = custom_loss
else:
    loss_fn = torch.nn.CrossEntropyLoss(
        label_smoothing=params['label_smoothing']
    )
    
if params['mt_num_layers']==1:
    mt_tower = torch.nn.Sequential(
        torch.nn.Linear(
            params['d_model']+params['multi_task_emb'], 
            params['d_model']
        )
    )
else:
    mt_tower = torch.nn.Sequential(
            torch.nn.Linear(
                params['d_model']+params['multi_task_emb'], 
                params['d_model']
            ),
            torch.nn.ReLU(),
            torch.nn.Linear(
                params['d_model'], 
                params['d_model']
            ),
        )

In [8]:
seq_len = params['seq_len']
split = params['split']

df_aid = pd.read_parquet('./data/t4r/t4r_map_aid_' + str(seq_len) + '_' + str(split) + '.parquet')
df_aid['count'] = df_aid['count']/df_aid['count'].sum()
df_aid = pd.concat([
    pd.DataFrame({'aid_': [0,1], 'aid':[-1, -1], 'count': 0.00001}),
    df_aid]
).sort_values(['aid_'])
item_probs = torch.Tensor(df_aid['count'].values).cuda()

In [9]:
schema = Schema().from_proto_text('test.pb')
if not params['using_type']:
    schema1 = schema.select_by_name(['aid_'])
    projection = None
else:
    schema1 = schema.select_by_name(['aid_', 'type_'])
    projection = tr.MLPBlock([params['d_model']]*params['proj_num'])

schema2 = schema.select_by_name(['aid_', 'type_'])

inputs = CustomTabularSequenceFeatures.from_schema(
    schema1,
    max_sequence_length=params['seq_len'],
    masking=masking,
    embedding_dims={
        'aid_': params['d_model'],
        'type_': 16
    },
    projection=projection
)

In [10]:
transformer_config = tr.XLNetConfig.build(
    d_model=params['d_model'], 
    n_head=params['n_head'], 
    n_layer=params['n_layer'], 
    dropout=params['transformer_dropout'],
    total_seq_length=params['seq_len']
)

body = tr.SequentialBlock(
    inputs, 
    tr.MLPBlock([params['d_model']], activation=act_mlp), 
    tr.TransformerBlock(transformer_config, masking=inputs.masking)
)

head = tr.Head(
    body,
    CustomNextItemPredictionTask(
        loss = loss_fn,
        weight_tying=True, 
        item_probs=item_probs, 
        item_correction=params['item_correction'],
        neg_factor=params['neg_factor'],
        temperature=params['temperature'],
        remove_false_neg=params['remove_false_neg'],
        item_correction_factor=params['item_correction_factor'],
        loss_types = params['loss_types'],
        multi_task = params['multi_task_emb'],
        mt_tower = mt_tower,
        eval_task = 1,
        d_model = params['d_model'],
        use_tanh = params['use_tanh']
    ),
    inputs=inputs,
)
model = tr.Model(head)

print(model)

from transformers4rec.config.trainer import T4RecTrainingArguments


# Set hyperparameters for training 

train_args = T4RecTrainingArguments(
    data_loader_engine='nvtabular', 
    dataloader_drop_last = True,
    gradient_accumulation_steps = 1,
    per_device_train_batch_size = params['batch_size'], 
    per_device_eval_batch_size = 128,
    output_dir = "./tmp-test", 
    learning_rate=params['lr'],
    lr_scheduler_type=params['lr_scheduler'], 
    learning_rate_num_cosine_cycles_by_epoch=1.5,
    num_train_epochs=1,
    max_sequence_length=params['seq_len'], 
    report_to = [],
    logging_steps=1000,
    save_steps=1000000,
    no_cuda=False,
    #resume_from_checkpoint='./tmp_2/checkpoint-50000'
)

trainer = CustomTrainer(
    model=model,
    args=train_args,
    schema=schema2,
    compute_metrics=True,
)

trainer.set_max_seq_len(max_seq_len=params['seq_len'])
trainer.set_shuffle(shuffle=params['bl_shuffle'])

Model(
  (heads): ModuleList(
    (0): Head(
      (body): SequentialBlock(
        (0): CustomTabularSequenceFeatures(
          (to_merge): ModuleDict(
            (categorical_module): SequenceEmbeddingFeatures(
              (filter_features): FilterFeatures()
              (embedding_tables): ModuleDict(
                (aid_): Embedding(1840501, 256, padding_idx=0)
              )
            )
          )
          (_aggregation): ConcatFeatures()
          (_masking): MaskedLanguageModeling()
        )
        (1): SequentialBlock(
          (0): DenseBlock(
            (0): Linear(in_features=256, out_features=256, bias=True)
          )
        )
        (2): TansformerBlock(
          (transformer): XLNetModel(
            (word_embedding): Embedding(1, 256)
            (layer): ModuleList(
              (0): XLNetLayer(
                (rel_attn): XLNetRelativeAttention(
                  (layer_norm): LayerNorm((256,), eps=0.03, elementwise_affine=True)
                  (

In [11]:
if params['using_test']:
    trainer.train_dataset_or_path = [
        './data/t4r/t4r_train_' + str(seq_len) + '_' + str(split) + '.parquet'
    ] + [
        './data/t4r/t4r_xgb_train_x_' + str(seq_len) + '_' + str(split) + '.parquet',
        './data/t4r/t4r_xgb_sub_x_' + str(seq_len) + '_' + str(split) + '.parquet'
    ]
else:
    trainer.train_dataset_or_path = [
        './data/t4r/t4r_train_' + str(seq_len) + '_' + str(split) + '.parquet',
        './data/t4r/t4r_xgb_sub_x_' + str(seq_len) + '_' + str(split) + '.parquet'
    ]

In [12]:
df_aid = pd.read_parquet('./data/t4r/t4r_map_aid_' + str(seq_len) + '_' + str(split) + '.parquet')
map_aid = df_aid.to_records()
map_aids = {}

for r in map_aid:
    map_aids[r[1]] = r[2]

map_aids[1] = -1
map_aids[0] = -1

In [13]:
trainer.reset_lr_scheduler()
recall_hist = []
for e in range(4):
    if e==3:
        trainer.train_dataset_or_path = [
            './data/t4r/t4r_xgb_train_x_' + str(seq_len) + '_' + str(split) + '.parquet',
            './data/t4r/t4r_xgb_sub_x_' + str(seq_len) + '_' + str(split) + '.parquet'
        ]
    trainer.train()
    os.system('mkdir -p ./data/t4r/checkpoints/checkpoint_' + str(e))
    trainer.save_model(output_dir = './data/t4r/checkpoints/checkpoint_' + str(e) + '/')
    
    ### Clicks
    eval_paths = ['./data/t4r/t4r_xgb_train_x_' + str(seq_len) + '_' + str(split) + '.parquet']
    trainer.args.predict_top_k = 20
    trainer.test_dataset_or_path = ['./data/t4r/t4r_xgb_test_x_' + str(seq_len) + '_' + str(split) + '.parquet']
    trainer.model.wrapper_module.heads[0].prediction_task_dict['next-item'].eval_task = torch.Tensor([1]).long()
    prediction = trainer.predict(eval_paths)

    import pandas as pd
    
    df = pd.read_parquet('./data/t4r/t4r_xgb_train_x_' + str(seq_len) + '_' + str(split) + '.parquet', columns=['session'])
    df = df.head(prediction[0][0].shape[0])
    df['pred'] = prediction[0][0].tolist()
    df['pred'] = df['pred'].apply(lambda x: [map_aids[y] for y in x if y in map_aids])
    df.columns = ['session', 'labels']

    recalls1 = {}
    recalls2 = {}
    recalls3 = {}

    for target_type in ['clicks', 'carts', 'orders']:
        test_labels = pd.read_parquet('./data/xgb_train_y.parquet')
        test_labels = test_labels[['session', 'aid', 'type']].groupby(['session', 'type']).agg(list).reset_index()
        test_labels = test_labels.loc[test_labels['type']==target_type]
        test_labels = test_labels.merge(df, how='left', on=['session'])
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.aid).intersection(set(df.labels))), axis=1)
        test_labels['gt_count'] = test_labels.aid.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        recalls1[target_type] = recall

#     ### Carts
#     eval_paths = ['./data/t4r/t4r_xgb_train_x_' + str(seq_len) + '_' + str(split) + '.parquet']
#     trainer.args.predict_top_k = 20
#     trainer.test_dataset_or_path = ['./data/t4r/t4r_xgb_train_x_' + str(seq_len) + '_' + str(split) + '.parquet']
#     trainer.model.wrapper_module.heads[0].prediction_task_dict['next-item'].eval_task = torch.Tensor([2]).long()
#     prediction = trainer.predict(eval_paths)

#     import pandas as pd
    
#     df = pd.read_parquet('./data/t4r/t4r_xgb_train_x_' + str(seq_len) + '_' + str(split) + '.parquet', columns=['session'])
#     df = df.head(prediction[0][0].shape[0])
#     df['pred'] = prediction[0][0].tolist()
#     df['pred'] = df['pred'].apply(lambda x: [map_aids[y] for y in x if y in map_aids])
#     df.columns = ['session', 'labels']

#     recalls2 = {}

#     for target_type in ['clicks', 'carts', 'orders']:
#         test_labels = pd.read_parquet('./data/xgb_train_y.parquet')
#         test_labels = test_labels[['session', 'aid', 'type']].groupby(['session', 'type']).agg(list).reset_index()
#         test_labels = test_labels.loc[test_labels['type']==target_type]
#         test_labels = test_labels.merge(df, how='left', on=['session'])
#         test_labels['hits'] = test_labels.apply(lambda df: len(set(df.aid).intersection(set(df.labels))), axis=1)
#         test_labels['gt_count'] = test_labels.aid.str.len().clip(0,20)
#         recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
#         recalls2[target_type] = recall
        
#     ### Orders
#     eval_paths = ['./data/t4r/t4r_xgb_train_x_' + str(seq_len) + '_' + str(split) + '.parquet']
#     trainer.args.predict_top_k = 20
#     trainer.test_dataset_or_path = ['./data/t4r/t4r_xgb_train_x_' + str(seq_len) + '_' + str(split) + '.parquet']
#     trainer.model.wrapper_module.heads[0].prediction_task_dict['next-item'].eval_task = torch.Tensor([3]).long()
#     prediction = trainer.predict(eval_paths)

#     import pandas as pd
    
#     df = pd.read_parquet('./data/t4r/t4r_xgb_train_x_' + str(seq_len) + '_' + str(split) + '.parquet', columns=['session'])
#     df = df.head(prediction[0][0].shape[0])
#     df['pred'] = prediction[0][0].tolist()
#     df['pred'] = df['pred'].apply(lambda x: [map_aids[y] for y in x if y in map_aids])
#     df.columns = ['session', 'labels']

#     recalls3 = {}

#     for target_type in ['clicks', 'carts', 'orders']:
#         test_labels = pd.read_parquet('./data/xgb_train_y.parquet')
#         test_labels = test_labels[['session', 'aid', 'type']].groupby(['session', 'type']).agg(list).reset_index()
#         test_labels = test_labels.loc[test_labels['type']==target_type]
#         test_labels = test_labels.merge(df, how='left', on=['session'])
#         test_labels['hits'] = test_labels.apply(lambda df: len(set(df.aid).intersection(set(df.labels))), axis=1)
#         test_labels['gt_count'] = test_labels.aid.str.len().clip(0,20)
#         recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
#         recalls3[target_type] = recall
    recall_hist.append({
        'e': e,
        'recalls1': recalls1,
        'recalls2': recalls2,
        'recalls3': recalls3
    })
    print(recall_hist)
    
    os.system('mkdir -p ./data/t4r/train_pred/pred_' + str(e))
    
    topk = 50
    eval_paths = ['./data/t4r/t4r_xgb_train_x_' + str(seq_len) + '_' + str(split) + '.parquet']
    trainer.args.predict_top_k = topk
    trainer.test_dataset_or_path = ['./data/t4r/t4r_xgb_test_x_' + str(seq_len) + '_' + str(split) + '.parquet']
    trainer.model.wrapper_module.heads[0].prediction_task_dict['next-item'].eval_task = torch.Tensor([1]).long()
    prediction = trainer.predict(eval_paths)
    
    import pandas as pd
    df = pd.read_parquet(eval_paths[0], columns=['session'])
    assert df.shape[0]==prediction[0][0].shape[0]
    df['pred'] = prediction[0][0].tolist()
    df['pred'] = df['pred'].apply(lambda x: [map_aids[y] for y in x if y in map_aids])
    df = cudf.from_pandas(df)
    for i in range(topk):
        df['rec_' + str(i)] = df.pred.list.get(i, default=-1)
    df.drop(['pred'], inplace=True, axis=1)
    df = cudf.melt(df, id_vars=['session'], value_vars=['rec_' + str(i) for i in range(topk)])
    df['order'] = df['variable'].cat.codes.astype('int')
    df[['session', 'value', 'order']].to_parquet('./data/t4r/train_pred/pred_' + str(e) + '/rec.parquet')
    del df
    gc.collect()

    import pandas as pd
    df = pd.read_parquet(eval_paths[0], columns=['session'])
    assert df.shape[0]==prediction[0][0].shape[0]
    pred2 = torch.nn.Softmax(dim=1)(torch.Tensor(prediction[0][1]))
    pred2 = pred2.numpy()
    df['pred'] = pred2.tolist()
    df = cudf.from_pandas(df)
    for i in range(topk):
        df['rec_' + str(i)] = df.pred.list.get(i, default=-1)
    df.drop(['pred'], inplace=True, axis=1)
    df = cudf.melt(df, id_vars=['session'], value_vars=['rec_' + str(i) for i in range(topk)])
    df['order'] = df['variable'].cat.codes.astype('int')
    df[['session', 'value', 'order']].to_parquet('./data/t4r/train_pred/pred_' + str(e) + '/score.parquet')
    del df, pred2
    gc.collect()
    
    os.system('mkdir -p ./data/t4r/sub_pred/pred_' + str(e))
    
    eval_paths = ['./data/t4r/t4r_xgb_sub_x_' + str(seq_len) + '_' + str(split) + '.parquet']
    trainer.args.predict_top_k = topk
    trainer.test_dataset_or_path = ['./data/t4r/t4r_xgb_sub_x_' + str(seq_len) + '_' + str(split) + '.parquet']
    trainer.model.wrapper_module.heads[0].prediction_task_dict['next-item'].eval_task = torch.Tensor([1]).long()
    prediction = trainer.predict(eval_paths)
    
    import pandas as pd
    df = pd.read_parquet(eval_paths[0], columns=['session'])
    assert df.shape[0]==prediction[0][0].shape[0]
    df['pred'] = prediction[0][0].tolist()
    df['pred'] = df['pred'].apply(lambda x: [map_aids[y] for y in x if y in map_aids])
    df = cudf.from_pandas(df)
    for i in range(topk):
        df['rec_' + str(i)] = df.pred.list.get(i, default=-1)
    df.drop(['pred'], inplace=True, axis=1)
    df = cudf.melt(df, id_vars=['session'], value_vars=['rec_' + str(i) for i in range(topk)])
    df['order'] = df['variable'].cat.codes.astype('int')
    df[['session', 'value', 'order']].to_parquet('./data/t4r/sub_pred/pred_' + str(e) + '/rec.parquet')
    del df
    gc.collect()

    import pandas as pd
    df = pd.read_parquet(eval_paths[0], columns=['session'])
    assert df.shape[0]==prediction[0][0].shape[0]
    pred2 = torch.nn.Softmax(dim=1)(torch.Tensor(prediction[0][1]))
    pred2 = pred2.numpy()
    df['pred'] = pred2.tolist()
    df = cudf.from_pandas(df)
    for i in range(topk):
        df['rec_' + str(i)] = df.pred.list.get(i, default=-1)
    df.drop(['pred'], inplace=True, axis=1)
    df = cudf.melt(df, id_vars=['session'], value_vars=['rec_' + str(i) for i in range(topk)])
    df['order'] = df['variable'].cat.codes.astype('int')
    df[['session', 'value', 'order']].to_parquet('./data/t4r/sub_pred/pred_' + str(e) + '/score.parquet')
    del df, pred2
    gc.collect()

***** Running training *****
  Num examples = 14039040
  Num Epochs = 1
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 13710


Step,Training Loss
1000,11.2242
2000,8.7371
3000,7.6011
4000,7.1405
5000,7.0489
6000,7.0007
7000,6.786
8000,6.3605
9000,5.9023
10000,5.5312




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./data/t4r/checkpoints/checkpoint_0/
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


[{'e': 0, 'recalls1': {'clicks': 0.447318908207546, 'carts': 0.3209616575047485, 'orders': 0.4708972243398599}, 'recalls2': {}, 'recalls3': {}}]


***** Running training *****
  Num examples = 14039040
  Num Epochs = 1
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 13710


Step,Training Loss
1000,4.6016
2000,4.5127
3000,4.4724
4000,4.4404
5000,4.4228
6000,4.3638
7000,4.2569
8000,4.1777
9000,4.1315
10000,4.1071




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./data/t4r/checkpoints/checkpoint_1/
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[{'e': 0, 'recalls1': {'clicks': 0.447318908207546, 'carts': 0.3209616575047485, 'orders': 0.4708972243398599}, 'recalls2': {}, 'recalls3': {}}, {'e': 1, 'recalls1': {'clicks': 0.4552056230602548, 'carts': 0.3312011371712864, 'orders': 0.4912973131198111}, 'recalls2': {}, 'recalls3': {}}]


***** Running training *****
  Num examples = 14039040
  Num Epochs = 1
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 13710


Step,Training Loss
1000,3.9702
2000,3.8956
3000,3.8168
4000,3.7691
5000,3.7431
6000,3.7318
7000,3.7487
8000,3.7921
9000,3.8698
10000,3.8949




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./data/t4r/checkpoints/checkpoint_2/
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


[{'e': 0, 'recalls1': {'clicks': 0.447318908207546, 'carts': 0.3209616575047485, 'orders': 0.4708972243398599}, 'recalls2': {}, 'recalls3': {}}, {'e': 1, 'recalls1': {'clicks': 0.4552056230602548, 'carts': 0.3312011371712864, 'orders': 0.4912973131198111}, 'recalls2': {}, 'recalls3': {}}, {'e': 2, 'recalls1': {'clicks': 0.47509416071245686, 'carts': 0.3464527768812795, 'orders': 0.5114272020483722}, 'recalls2': {}, 'recalls3': {}}]


***** Running training *****
  Num examples = 3454976
  Num Epochs = 1
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 3374


Step,Training Loss
1000,1.8688
2000,2.008
3000,2.3037




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./data/t4r/checkpoints/checkpoint_3/
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


[{'e': 0, 'recalls1': {'clicks': 0.447318908207546, 'carts': 0.3209616575047485, 'orders': 0.4708972243398599}, 'recalls2': {}, 'recalls3': {}}, {'e': 1, 'recalls1': {'clicks': 0.4552056230602548, 'carts': 0.3312011371712864, 'orders': 0.4912973131198111}, 'recalls2': {}, 'recalls3': {}}, {'e': 2, 'recalls1': {'clicks': 0.47509416071245686, 'carts': 0.3464527768812795, 'orders': 0.5114272020483722}, 'recalls2': {}, 'recalls3': {}}, {'e': 3, 'recalls1': {'clicks': 0.4689640768107453, 'carts': 0.342050823514473, 'orders': 0.5086737368961114}, 'recalls2': {}, 'recalls3': {}}]
