In [1]:
import sys
sys.path.append('../')

import collections
import os
import random
from pathlib import Path
import logging
import shutil
import time
from packaging import version
from collections import defaultdict

from tqdm import tqdm
import numpy as np
import gzip
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.backends.cudnn as cudnn

from src.param import parse_args
from src.utils import LossMeter
from src.dist_utils import reduce_dict
from transformers import T5Tokenizer, T5TokenizerFast
from src.tokenization import P5Tokenizer, P5TokenizerFast
from src.pretrain_model import P5Pretraining

from torch.utils.data import DataLoader, Dataset
from src.pretrain_data import get_loader

from torch.utils.data import DataLoader, Dataset, Sampler
from src.pretrain_data import get_loader
from evaluate.utils import rouge_score, bleu_score, unique_sentence_percent, root_mean_square_error, mean_absolute_error, feature_detect, feature_matching_ratio, feature_coverage_ratio, feature_diversity
from evaluate.metrics4rec import evaluate_all

_use_native_amp = False
_use_apex = False

# Check if Pytorch version >= 1.6 to switch between Native AMP and Apex
if version.parse(torch.__version__) < version.parse("1.6"):
    from transormers.file_utils import is_apex_available
    if is_apex_available():
        from apex import amp
    _use_apex = True
else:
    _use_native_amp = True
    from torch.cuda.amp import autocast

from src.trainer_base import TrainerBase

import pickle

def load_pickle(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)


def save_pickle(data, filename):
    with open(filename, "wb") as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
        
import json

def load_json(file_path):
    with open(file_path, "r") as f:
        return json.load(f)
    
def ReadLineFromFile(path): # This function is used to read lines from a file and return a list of lines
    lines = []
    with open(path,'r') as fd:
        for line in fd:
            lines.append(line.rstrip('\n'))
    return lines

def parse(path):  # This function is used to parse the data file into a list of dictionaries for each review text
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)

P5/src/tokenization.py


In [11]:
class DotDict(dict):  # This class is used to convert a dictionary into a class with attributes. This allows for easy access to the dictionary values using the dot notation (e.g. dict.key instead of dict['key'])
    def __init__(self, **kwds):
        self.update(kwds)
        self.__dict__ = self
        
args = DotDict()

args.distributed = False
args.multiGPU = True
args.fp16 = True
args.train = "beauty"
args.valid = "beauty"
args.test = "beauty"
args.batch_size = 32
args.optim = 'adamw' 
args.warmup_ratio = 0.05
args.lr = 1e-3
args.num_workers = 4
args.clip_grad_norm = 1.0
args.losses = 'rating,sequential,explanation,review,traditional'
args.backbone = 't5-small' # small or base
args.output = 'snap/beauty-small'
args.epoch = 10
args.local_rank = 0

args.comment = ''
args.train_topk = -1
args.valid_topk = -1
args.dropout = 0.1

args.tokenizer = 'p5'
args.max_text_length = 512
args.do_lower_case = False
args.word_mask_rate = 0.15
args.gen_max_length = 64

args.weight_decay = 0.01
args.adam_eps = 1e-6
args.gradient_accumulation_steps = 1

'''
Set seeds
'''
args.seed = 2022
torch.manual_seed(args.seed)
random.seed(args.seed)
np.random.seed(args.seed)

'''
Whole word embedding
'''
args.whole_word_embed = True

cudnn.benchmark = True
ngpus_per_node = torch.cuda.device_count()
args.world_size = ngpus_per_node

LOSSES_NAME = [f'{name}_loss' for name in args.losses.split(',')]
if args.local_rank in [0, -1]:
    print(LOSSES_NAME)
LOSSES_NAME.append('total_loss') # total loss

args.LOSSES_NAME = LOSSES_NAME

gpu = 0 # Change GPU ID
args.gpu = gpu
args.rank = gpu
print(f'Process Launching at GPU {gpu}')

torch.cuda.set_device('cuda:{}'.format(gpu))

comments = []
dsets = []
if 'toys' in args.train:
    dsets.append('toys')
if 'beauty' in args.train:
    dsets.append('beauty')
if 'sports' in args.train:
    dsets.append('sports')
comments.append(''.join(dsets))
if args.backbone:
    comments.append(args.backbone)
comments.append(''.join(args.losses.split(',')))
if args.comment != '':
    comments.append(args.comment)
comment = '_'.join(comments)

if args.local_rank in [0, -1]:
    print(args)

['rating_loss', 'sequential_loss', 'explanation_loss', 'review_loss', 'traditional_loss']
Process Launching at GPU 0
{'distributed': False, 'multiGPU': True, 'fp16': True, 'train': 'beauty', 'valid': 'beauty', 'test': 'beauty', 'batch_size': 32, 'optim': 'adamw', 'warmup_ratio': 0.05, 'lr': 0.001, 'num_workers': 4, 'clip_grad_norm': 1.0, 'losses': 'rating,sequential,explanation,review,traditional', 'backbone': 't5-small', 'output': 'snap/beauty-small', 'epoch': 10, 'local_rank': 0, 'comment': '', 'train_topk': -1, 'valid_topk': -1, 'dropout': 0.1, 'tokenizer': 'p5', 'max_text_length': 512, 'do_lower_case': False, 'word_mask_rate': 0.15, 'gen_max_length': 64, 'weight_decay': 0.01, 'adam_eps': 1e-06, 'gradient_accumulation_steps': 1, 'seed': 2022, 'whole_word_embed': True, 'world_size': 8, 'LOSSES_NAME': ['rating_loss', 'sequential_loss', 'explanation_loss', 'review_loss', 'traditional_loss', 'total_loss'], 'gpu': 0, 'rank': 0}


In [12]:
def create_config(args):
    from transformers import T5Config, BartConfig

    if 't5' in args.backbone:
        config_class = T5Config
    else:
        return None

    config = config_class.from_pretrained(args.backbone)
    config.dropout_rate = args.dropout
    config.dropout = args.dropout
    config.attention_dropout = args.dropout
    config.activation_dropout = args.dropout
    config.losses = args.losses

    return config


def create_tokenizer(args):
    from transformers import T5Tokenizer, T5TokenizerFast
    from src.tokenization import P5Tokenizer, P5TokenizerFast

    if 'p5' in args.tokenizer:
        tokenizer_class = P5Tokenizer

    tokenizer_name = args.backbone
    
    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name,
        max_length=args.max_text_length,
        do_lower_case=args.do_lower_case,
    )

    print(tokenizer_class, tokenizer_name)
    
    return tokenizer


def create_model(model_class, config=None):
    print(f'Building Model at GPU {args.gpu}')

    model_name = args.backbone

    model = model_class.from_pretrained(
        model_name,
        config=config
    )
    return model

In [13]:
config = create_config(args)

if args.tokenizer is None:
    args.tokenizer = args.backbone
    
tokenizer = create_tokenizer(args)

model_class = P5Pretraining
model = create_model(model_class, config)

model = model.cuda()

if 'p5' in args.tokenizer:
    model.resize_token_embeddings(tokenizer.vocab_size)
    
model.tokenizer = tokenizer

# I wat to see the model structure
print(model)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'P5Tokenizer'.


<class 'src.tokenization.P5Tokenizer'> t5-small
Building Model at GPU 0


Some weights of P5Pretraining were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.whole_word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


P5Pretraining(
  (shared): Embedding(32100, 512)
  (encoder): JointEncoder(
    (embed_tokens): Embedding(32100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dropout(p=0.

#### Load Model

In [14]:
args.load = "../snap/beauty-small.pth"

# Load Checkpoint
from src.utils import load_state_dict, LossMeter, set_global_logging_level
from pprint import pprint

def load_checkpoint(ckpt_path):
    state_dict = load_state_dict(ckpt_path, 'cpu')
    results = model.load_state_dict(state_dict, strict=False)
    print('Model loaded from ', ckpt_path)
    pprint(results)

ckpt_path = args.load
load_checkpoint(ckpt_path)

from src.all_amazon_templates import all_tasks as task_templates

Model loaded from  ../snap/beauty-small.pth
<All keys matched successfully>


In [7]:
# Lets do the train loader here for fine tuning P5 on rating data
data_splits = load_pickle('../../Data/data/beauty/rating_splits_augmented.pkl')
train_data = data_splits['train']
valid_data = data_splits['val']

# Write the dataloader 
from torch.utils.data import DataLoader, Dataset
from src.pretrain_data import get_loader

test_task_list = {'rating': ['1-10'] # or '1-6'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

# def get_loader(args, task_list, sample_numbers, split='toys', mode='train', 
#             #    batch_size=16, workers=4, distributed=False): # this is the original function
train_loader = get_loader(args,test_task_list, test_sample_numbers, 'beauty', 'train', batch_size=16, workers=4, distributed=False)
valid_loader = get_loader(args,test_task_list, test_sample_numbers, 'beauty', 'val', batch_size=16, workers=4, distributed=False)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'P5Tokenizer'.


Data sources:  ['beauty']
compute_datum_info


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'P5Tokenizer'.


Data sources:  ['beauty']
compute_datum_info


## Fine TUNING CODE FOR RATING  PREDICTION

In [7]:

# define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, eps=args.adam_eps, weight_decay=args.weight_decay)
# This is a rating prediction task so choose appropriate loss function
criterion = nn.MSELoss()

# Write the training loop
def train_one_epoch(model, optimizer, criterion, train_loader, device, epoch, args):
    model.train()
    losses = []
    tk0 = tqdm(train_loader, total=len(train_loader))
    for bi, batch in enumerate(tk0):
        # for key in batch:
        #     batch[key] = batch[key].to(device)
        # i want to check the keys in the batch
        # print(batch.keys())
        optimizer.zero_grad()
        outputs = model.train_step(batch)
        # print(outputs.keys())
        loss = outputs['loss']
        # print(loss.item())
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    
    return np.mean(losses)

# Write the validation loop
def valid_one_epoch(model, criterion, valid_loader, device, epoch, args):
    model.eval()
    losses = []
    with torch.no_grad():
        tk0 = tqdm(valid_loader, total=len(valid_loader))
        for bi, batch in enumerate(tk0):
            outputs = model.valid_step(batch)
            loss = outputs['loss']
            losses.append(loss.item())
        
            
    return np.mean(losses)

# Write the main training loop
for epoch in range(args.epoch):
    train_loss = train_one_epoch(model, optimizer, criterion, train_loader, torch.device('cuda:{}'.format(gpu)), epoch, args)
    valid_loss = valid_one_epoch(model, criterion, valid_loader, torch.device('cuda:{}'.format(gpu)), epoch, args)
    print(f'Epoch {epoch} Train Loss: {train_loss} Valid Loss: {valid_loss}')
    

# Save the final model 

torch.save(model.state_dict(), f'../snap/beauty-small-finetuned.pth')


100%|██████████| 9926/9926 [23:46<00:00,  6.96it/s]
100%|██████████| 1241/1241 [02:53<00:00,  7.16it/s]


Epoch 0 Train Loss: 1.0236993594537598 Valid Loss: 6.830050754316577


#### Check Test Split

In [8]:
data_splits = load_pickle('../../Data/data/beauty/rating_splits_augmented.pkl')
test_review_data = data_splits['test']

In [9]:
len(test_review_data)

19850

In [10]:
test_review_data[0]

{'reviewerID': 'A2QKXW3LDQ66P5',
 'asin': 'B005X2F7KI',
 'reviewerName': 'stephanie',
 'helpful': [5, 6],
 'reviewText': 'Absolutely great product.  I bought this for my fourteen year old niece for Christmas and of course I had to try it out, then I tried another one, and another one and another one.  So much fun!  I even contemplated keeping a few for myself!',
 'overall': 5.0,
 'summary': 'Perfect!',
 'unixReviewTime': 1352937600,
 'reviewTime': '11 15, 2012',
 'explanation': 'Absolutely great product',
 'feature': 'product'}

In [11]:
data_maps = load_json(os.path.join('../../Data/data', 'beauty', 'datamaps.json'))
print(len(data_maps['user2id'])) # number of users
print(len(data_maps['item2id'])) # number of items

22363
12101


### Test P5

In [12]:
from torch.utils.data import DataLoader, Dataset, Sampler
from src.pretrain_data import get_loader
from evaluate.utils import rouge_score, bleu_score, unique_sentence_percent, root_mean_square_error, mean_absolute_error, feature_detect, feature_matching_ratio, feature_coverage_ratio, feature_diversity
from evaluate.metrics4rec import evaluate_all

#### Evaluation - Rating

In [13]:
test_task_list = {'rating': ['1-10'] # or '1-6'
}
test_sample_numbers = {'rating': 3, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))  # noof datasamples/batch size which is 19850/16 which is  equal to 1241

gt_ratings = []
pred_ratings = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    with torch.no_grad():
        results = model.generate_step(batch)
        gt_ratings.extend(batch['target_text'])
        pred_ratings.extend(results)
        # print(f"result for batch {i} is {results} and ground truth is {batch['target_text']}")
        
        
        
predicted_rating = [(float(r), float(p)) for (r, p) in zip(gt_ratings, pred_ratings) if p in [str(i/10.0) for i in list(range(10, 50))]]
RMSE = root_mean_square_error(predicted_rating, 5.0, 1.0)
print('RMSE {:7.4f}'.format(RMSE))
MAE = mean_absolute_error(predicted_rating, 5.0, 1.0)
print('MAE {:7.4f}'.format(MAE))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'P5Tokenizer'.


Data sources:  ['beauty']
compute_datum_info
931


602it [01:42,  4.30it/s]

In [None]:
test_task_list = {'rating': ['1-6'] # or '1-10'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))

gt_ratings = []
pred_ratings = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    with torch.no_grad():
        results = model.generate_step(batch)
        gt_ratings.extend(batch['target_text'])
        pred_ratings.extend(results)
        
predicted_rating = [(float(r), float(p)) for (r, p) in zip(gt_ratings, pred_ratings) if p in [str(i/10.0) for i in list(range(10, 50))]]
RMSE = root_mean_square_error(predicted_rating, 5.0, 1.0)
print('RMSE {:7.4f}'.format(RMSE))
MAE = mean_absolute_error(predicted_rating, 5.0, 1.0)
print('MAE {:7.4f}'.format(MAE))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'P5Tokenizer'.


Data sources:  ['beauty']
compute_datum_info
311


311it [00:35,  8.76it/s]


RMSE  1.2915
MAE  0.8516


## Fine tuning code for sequential Recommendation

In [15]:
test_task_list = {'sequential': ['2-13'] # or '2-3'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

train_loader = get_loader(args,test_task_list, test_sample_numbers, 'beauty', 'train', batch_size=16, workers=4, distributed=False)
val_loader = get_loader(args,test_task_list, test_sample_numbers, 'beauty', 'val', batch_size=16, workers=4, distributed=False)
test_loader = get_loader(args,test_task_list, test_sample_numbers, 'beauty', 'test', batch_size=16, workers=4, distributed=False)



The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'P5Tokenizer'.


Data sources:  ['beauty']
compute_datum_info


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'P5Tokenizer'.


Data sources:  ['beauty']
compute_datum_info


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'P5Tokenizer'.


Data sources:  ['beauty']
compute_datum_info


In [16]:
optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, eps=args.adam_eps, weight_decay=args.weight_decay)
# this is a sequential recommendation task so choose appropriate loss function
criterion = nn.CrossEntropyLoss()


# Now write the training loop
def train_one_epoch(model, optimizer, criterion, train_loader, device, epoch, args):
    model.train()
    losses = []
    tk0 = tqdm(train_loader, total=len(train_loader))
    for bi, batch in enumerate(tk0):
        optimizer.zero_grad()
        outputs = model.train_step(batch)
        loss = outputs['loss']
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    
    return np.mean(losses)

def valid_one_epoch(model, criterion, valid_loader, device, epoch, args):
    model.eval()
    losses = []
    with torch.no_grad():
        tk0 = tqdm(valid_loader, total=len(valid_loader))
        for bi, batch in enumerate(tk0):
            outputs = model.valid_step(batch)
            loss = outputs['loss']
            losses.append(loss.item())
        
            
    return np.mean(losses)


print(f"Noof epochs is {args.epoch}")
for epoch in range(args.epoch):
    train_loss = train_one_epoch(model, optimizer, criterion, train_loader, torch.device('cuda:{}'.format(gpu)), epoch, args)
    valid_loss = valid_one_epoch(model, criterion, val_loader, torch.device('cuda:{}'.format(gpu)), epoch, args)
    print(f'Epoch {epoch} Train Loss: {train_loss} Valid Loss: {valid_loss}')

torch.save(model.state_dict(), f'../snap/beauty-sequential-small-finetuned.pth')

Noof epochs is 10


100%|██████████| 1398/1398 [02:29<00:00,  9.33it/s]
100%|██████████| 1398/1398 [04:43<00:00,  4.94it/s]


Epoch 0 Train Loss: 2.6386192579978864 Valid Loss: 2.7751061532630428


100%|██████████| 1398/1398 [03:01<00:00,  7.69it/s]
100%|██████████| 1398/1398 [04:35<00:00,  5.07it/s]


Epoch 1 Train Loss: 2.638275221864212 Valid Loss: 2.7769726044971375


100%|██████████| 1398/1398 [02:58<00:00,  7.84it/s]
100%|██████████| 1398/1398 [04:25<00:00,  5.27it/s]


Epoch 2 Train Loss: 2.621647774661559 Valid Loss: 2.7808673326379068


100%|██████████| 1398/1398 [02:57<00:00,  7.88it/s]
100%|██████████| 1398/1398 [04:13<00:00,  5.52it/s]


Epoch 3 Train Loss: 2.5905724807358608 Valid Loss: 2.7943229624982897


100%|██████████| 1398/1398 [02:45<00:00,  8.45it/s]
100%|██████████| 1398/1398 [02:55<00:00,  7.96it/s]


Epoch 4 Train Loss: 2.5582109034828875 Valid Loss: 2.7731038693535823


100%|██████████| 1398/1398 [02:10<00:00, 10.71it/s]
100%|██████████| 1398/1398 [02:47<00:00,  8.32it/s]


Epoch 5 Train Loss: 2.525574745879494 Valid Loss: 2.771753679188195


100%|██████████| 1398/1398 [02:08<00:00, 10.84it/s]
100%|██████████| 1398/1398 [02:51<00:00,  8.16it/s]


Epoch 6 Train Loss: 2.4839056124844094 Valid Loss: 2.796595169068747


100%|██████████| 1398/1398 [02:11<00:00, 10.66it/s]
100%|██████████| 1398/1398 [02:51<00:00,  8.14it/s]


Epoch 7 Train Loss: 2.4402393956211674 Valid Loss: 2.8147759070041696


100%|██████████| 1398/1398 [02:14<00:00, 10.36it/s]
100%|██████████| 1398/1398 [02:54<00:00,  7.99it/s]


Epoch 8 Train Loss: 2.389599316365729 Valid Loss: 2.803485835485363


100%|██████████| 1398/1398 [02:37<00:00,  8.86it/s]
100%|██████████| 1398/1398 [03:51<00:00,  6.05it/s]


Epoch 9 Train Loss: 2.3358529626214626 Valid Loss: 2.8028240330911673


In [17]:
# Now load the model and evaluate it
model.load_state_dict(torch.load(f'../snap/beauty-sequential-small-finetuned.pth'))


<All keys matched successfully>

#### Evaluation - Sequential

In [18]:
test_task_list = {'sequential': ['2-13'] # or '2-3'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))

all_info = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    with torch.no_grad():
        results = model.generate_step(batch)
        beam_outputs = model.generate(
                batch['input_ids'].to('cuda'), 
                max_length=50, 
                num_beams=20,
                no_repeat_ngram_size=0, 
                num_return_sequences=20,
                early_stopping=True
        )
        generated_sents = model.tokenizer.batch_decode(beam_outputs, skip_special_tokens=True)
        for j, item in enumerate(zip(results, batch['target_text'], batch['source_text'])):
            new_info = {}
            new_info['target_item'] = item[1]
            new_info['gen_item_list'] = generated_sents[j*20: (j+1)*20]
            all_info.append(new_info)
            
gt = {}
ui_scores = {}
for i, info in enumerate(all_info):
    gt[i] = [int(info['target_item'])]
    pred_dict = {}
    for j in range(len(info['gen_item_list'])):
        try:
            pred_dict[int(info['gen_item_list'][j])] = -(j+1)
        except:
            pass
    ui_scores[i] = pred_dict
    
evaluate_all(ui_scores, gt, 5)
evaluate_all(ui_scores, gt, 10)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'P5Tokenizer'.


Data sources:  ['beauty']
compute_datum_info
699


699it [28:31,  2.45s/it]



NDCG@5	Rec@5	Hits@5	Prec@5	MAP@5	MRR@5
0.0370	0.0489	0.0489	0.0098	0.0331	0.0331

NDCG@10	Rec@10	Hits@10	Prec@10	MAP@10	MRR@10
0.0413	0.0621	0.0621	0.0062	0.0349	0.0349


('\nNDCG@10\tRec@10\tHits@10\tPrec@10\tMAP@10\tMRR@10\n0.0413\t0.0621\t0.0621\t0.0062\t0.0349\t0.0349',
 {'ndcg': 0.04133068891141221,
  'map': 0.03487996399949179,
  'recall': 0.06211152349863614,
  'precision': 0.006211152349863453,
  'mrr': 0.03487996399949179,
  'hit': 0.06211152349863614})

In [19]:
    
evaluate_all(ui_scores, gt, 5)
evaluate_all(ui_scores, gt, 10)


NDCG@5	Rec@5	Hits@5	Prec@5	MAP@5	MRR@5
0.0370	0.0489	0.0489	0.0098	0.0331	0.0331

NDCG@10	Rec@10	Hits@10	Prec@10	MAP@10	MRR@10
0.0413	0.0621	0.0621	0.0062	0.0349	0.0349


('\nNDCG@10\tRec@10\tHits@10\tPrec@10\tMAP@10\tMRR@10\n0.0413\t0.0621\t0.0621\t0.0062\t0.0349\t0.0349',
 {'ndcg': 0.04133068891141221,
  'map': 0.03487996399949179,
  'recall': 0.06211152349863614,
  'precision': 0.006211152349863453,
  'mrr': 0.03487996399949179,
  'hit': 0.06211152349863614})

In [20]:
test_task_list = {'sequential': ['2-3'] # or '2-13'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))

all_info = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    with torch.no_grad():
        results = model.generate_step(batch)
        beam_outputs = model.generate(
                batch['input_ids'].to('cuda'), 
                max_length=50, 
                num_beams=20,
                no_repeat_ngram_size=0, 
                num_return_sequences=20,
                early_stopping=True
        )
        generated_sents = model.tokenizer.batch_decode(beam_outputs, skip_special_tokens=True)
        for j, item in enumerate(zip(results, batch['target_text'], batch['source_text'])):
            new_info = {}
            new_info['target_item'] = item[1]
            new_info['gen_item_list'] = generated_sents[j*20: (j+1)*20]
            all_info.append(new_info)
            
gt = {}
ui_scores = {}
for i, info in enumerate(all_info):
    gt[i] = [int(info['target_item'])]
    pred_dict = {}
    for j in range(len(info['gen_item_list'])):
        try:
            pred_dict[int(info['gen_item_list'][j])] = -(j+1)
        except:
            pass
    ui_scores[i] = pred_dict
    
evaluate_all(ui_scores, gt, 5)
evaluate_all(ui_scores, gt, 10)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'P5Tokenizer'.


Data sources:  ['beauty']
compute_datum_info
699


699it [23:12,  1.99s/it]



NDCG@5	Rec@5	Hits@5	Prec@5	MAP@5	MRR@5
0.0372	0.0491	0.0491	0.0098	0.0333	0.0333

NDCG@10	Rec@10	Hits@10	Prec@10	MAP@10	MRR@10
0.0416	0.0627	0.0627	0.0063	0.0351	0.0351


('\nNDCG@10\tRec@10\tHits@10\tPrec@10\tMAP@10\tMRR@10\n0.0416\t0.0627\t0.0627\t0.0063\t0.0351\t0.0351',
 {'ndcg': 0.04162398098999096,
  'map': 0.03508729342472579,
  'recall': 0.06273755757277646,
  'precision': 0.006273755757277482,
  'mrr': 0.03508729342472579,
  'hit': 0.06273755757277646})

#### Evaluation - Explanation

In [None]:
test_task_list = {'explanation': ['3-12'] # or '3-9' or '3-3'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))

tokens_predict = []
tokens_test = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    with torch.no_grad():
        outputs = model.generate(
                batch['input_ids'].to('cuda'), 
                min_length=9,
                num_beams=12,
                num_return_sequences=1,
                num_beam_groups=3,
                repetition_penalty=0.7
        )
        results = model.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        tokens_predict.extend(results) 
        tokens_test.extend(batch['target_text'])
        
new_tokens_predict = [l.split() for l in tokens_predict]
new_tokens_test = [ll.split() for ll in tokens_test]
BLEU1 = bleu_score(new_tokens_test, new_tokens_predict, n_gram=1, smooth=False)
BLEU4 = bleu_score(new_tokens_test, new_tokens_predict, n_gram=4, smooth=False)
ROUGE = rouge_score(tokens_test, tokens_predict)

print('BLEU-1 {:7.4f}'.format(BLEU1))
print('BLEU-4 {:7.4f}'.format(BLEU4))
for (k, v) in ROUGE.items():
    print('{} {:7.4f}'.format(k, v))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'P5Tokenizer'.


Data sources:  ['beauty']
compute_datum_info
839


0it [00:00, ?it/s]


ValueError: `diversity_penalty` is not 0.0 or `num_beam_groups` is not 1, triggering group beam search. In this generation mode, `diversity_penalty` should be greater than `0.0`, otherwise your groups will be identical.

In [None]:
test_task_list = {'explanation': ['3-9'] # or '3-12' or '3-3'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))

tokens_predict = []
tokens_test = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    with torch.no_grad():
        outputs = model.generate(
                batch['input_ids'].to('cuda'), 
                min_length=10,
                num_beams=12,
                num_return_sequences=1,
                num_beam_groups=3
        )
        results = model.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        tokens_predict.extend(results) 
        tokens_test.extend(batch['target_text'])
        
new_tokens_predict = [l.split() for l in tokens_predict]
new_tokens_test = [ll.split() for ll in tokens_test]
BLEU1 = bleu_score(new_tokens_test, new_tokens_predict, n_gram=1, smooth=False)
BLEU4 = bleu_score(new_tokens_test, new_tokens_predict, n_gram=4, smooth=False)
ROUGE = rouge_score(tokens_test, tokens_predict)

print('BLEU-1 {:7.4f}'.format(BLEU1))
print('BLEU-4 {:7.4f}'.format(BLEU4))
for (k, v) in ROUGE.items():
    print('{} {:7.4f}'.format(k, v))

Data sources:  ['beauty']
compute_datum_info
839


839it [09:12,  1.52it/s]


BLEU-1 19.9696
BLEU-4  2.7584
rouge_1/f_score 24.9963
rouge_1/r_score 26.8389
rouge_1/p_score 28.0289
rouge_2/f_score  5.1078
rouge_2/r_score  6.1455
rouge_2/p_score  5.5334
rouge_l/f_score 18.4491
rouge_l/r_score 23.0867
rouge_l/p_score 22.6628


In [None]:
test_task_list = {'explanation': ['3-3'] # or '3-12' or '3-9'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))

tokens_predict = []
tokens_test = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    with torch.no_grad():
        outputs = model.generate(
                batch['input_ids'].to('cuda'), 
                min_length=10
        )
        results = model.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        tokens_predict.extend(results) 
        tokens_test.extend(batch['target_text'])
        
new_tokens_predict = [l.split() for l in tokens_predict]
new_tokens_test = [ll.split() for ll in tokens_test]
BLEU1 = bleu_score(new_tokens_test, new_tokens_predict, n_gram=1, smooth=False)
BLEU4 = bleu_score(new_tokens_test, new_tokens_predict, n_gram=4, smooth=False)
ROUGE = rouge_score(tokens_test, tokens_predict)

print('BLEU-1 {:7.4f}'.format(BLEU1))
print('BLEU-4 {:7.4f}'.format(BLEU4))
for (k, v) in ROUGE.items():
    print('{} {:7.4f}'.format(k, v))

Data sources:  ['beauty']
compute_datum_info
839


839it [03:27,  4.03it/s]


BLEU-1 15.5223
BLEU-4  0.9783
rouge_1/f_score 17.0412
rouge_1/r_score 18.2074
rouge_1/p_score 18.9502
rouge_2/f_score  1.8962
rouge_2/r_score  2.3611
rouge_2/p_score  2.0044
rouge_l/f_score 12.1709
rouge_l/r_score 15.3009
rouge_l/p_score 14.4041


#### Evaluation - Review

Since T0 & GPT-2 checkpoints hosted on Hugging Face platform are slow to conduct inference, we only perform evaluation on the first 800 instances for prompts in Task Family 4.

In [None]:
test_task_list = {'review': ['4-4'] # or '4-2'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))

gt_ratings = []
pred_ratings = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    if i > 50:
        break
    with torch.no_grad():
        results = model.generate_step(batch)
        gt_ratings.extend(batch['target_text'])
        pred_ratings.extend(results)
        
predicted_rating = [(float(r), round(float(p))) for (r, p) in zip(gt_ratings, pred_ratings)]
RMSE = root_mean_square_error(predicted_rating, 5.0, 1.0)
print('RMSE {:7.4f}'.format(RMSE))
MAE = mean_absolute_error(predicted_rating, 5.0, 1.0)
print('MAE {:7.4f}'.format(MAE))

Data sources:  ['beauty']
compute_datum_info
1241


51it [00:02, 19.18it/s]

RMSE  0.6262
MAE  0.3113





In [None]:
test_task_list = {'review': ['4-2'] # or '4-4'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))

gt_ratings = []
pred_ratings = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    if i > 50:
        break
    with torch.no_grad():
        results = model.generate_step(batch)
        gt_ratings.extend(batch['target_text'])
        pred_ratings.extend(results)
        
predicted_rating = [(float(r), round(float(p))) for (r, p) in zip(gt_ratings, pred_ratings)]
RMSE = root_mean_square_error(predicted_rating, 5.0, 1.0)
print('RMSE {:7.4f}'.format(RMSE))
MAE = mean_absolute_error(predicted_rating, 5.0, 1.0)
print('MAE {:7.4f}'.format(MAE))

Data sources:  ['beauty']
compute_datum_info
1241


51it [00:02, 19.43it/s]


RMSE  0.6233
MAE  0.3051


In [None]:
test_task_list = {'review': ['4-1']
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))

tokens_predict = []
tokens_test = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    if i > 50:
        break
    with torch.no_grad():
        results = model.generate_step(batch)
        tokens_predict.extend(results) 
        tokens_test.extend(batch['target_text'])
        
new_tokens_predict = [l.split() for l in tokens_predict]
new_tokens_test = [ll.split() for ll in tokens_test]
BLEU2 = bleu_score(new_tokens_test, new_tokens_predict, n_gram=2, smooth=False)
ROUGE = rouge_score(tokens_test, tokens_predict)

print('BLEU-2 {:7.4f}'.format(BLEU2))
for (k, v) in ROUGE.items():
    print('{} {:7.4f}'.format(k, v))

Data sources:  ['beauty']
compute_datum_info
1241


51it [00:06,  7.92it/s]


BLEU-2  2.1225
rouge_1/f_score  8.4205
rouge_1/r_score  7.5503
rouge_1/p_score 11.1520
rouge_2/f_score  1.6676
rouge_2/r_score  1.5984
rouge_2/p_score  1.9812
rouge_l/f_score  7.5476
rouge_l/r_score  7.5304
rouge_l/p_score 11.1520


#### Evaluation - Traditional

In [None]:
test_task_list = {'traditional': ['5-8']  # or '5-5'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))

all_info = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    with torch.no_grad():
        results = model.generate_step(batch)
        beam_outputs = model.generate(
                batch['input_ids'].to('cuda'), 
                max_length=50, 
                num_beams=20,
                no_repeat_ngram_size=0, 
                num_return_sequences=20,
                early_stopping=True
        )
        generated_sents = model.tokenizer.batch_decode(beam_outputs, skip_special_tokens=True)
        for j, item in enumerate(zip(results, batch['target_text'], batch['source_text'])):
            new_info = {}
            new_info['target_item'] = item[1]
            new_info['gen_item_list'] = generated_sents[j*20: (j+1)*20]
            all_info.append(new_info)
            
gt = {}
ui_scores = {}
for i, info in enumerate(all_info):
    gt[i] = [int(info['target_item'])]
    pred_dict = {}
    for j in range(len(info['gen_item_list'])):
        try:
            pred_dict[int(info['gen_item_list'][j])] = -(j+1)
        except:
            pass
    ui_scores[i] = pred_dict
    
evaluate_all(ui_scores, gt, 1)
evaluate_all(ui_scores, gt, 5)
evaluate_all(ui_scores, gt, 10)

Data sources:  ['beauty']
compute_datum_info
1398


  next_indices = next_tokens // vocab_size
1398it [17:55,  1.30it/s]



NDCG@1	Rec@1	Hits@1	Prec@1	MAP@1	MRR@1
0.0598	0.0598	0.0598	0.0598	0.0598	0.0598

NDCG@5	Rec@5	Hits@5	Prec@5	MAP@5	MRR@5
0.1101	0.1589	0.1589	0.0318	0.0940	0.0940

NDCG@10	Rec@10	Hits@10	Prec@10	MAP@10	MRR@10
0.1340	0.2332	0.2332	0.0233	0.1039	0.1039


('\nNDCG@10\tRec@10\tHits@10\tPrec@10\tMAP@10\tMRR@10\n0.1340\t0.2332\t0.2332\t0.0233\t0.1039\t0.1039',
 {'ndcg': 0.13398695780876257,
  'map': 0.10386263733533777,
  'recall': 0.23315297589768816,
  'precision': 0.02331529758977105,
  'mrr': 0.10386263733533777,
  'hit': 0.23315297589768816})

In [None]:
test_task_list = {'traditional': ['5-5']  # or '5-8'
}
test_sample_numbers = {'rating': 1, 'sequential': (1, 1, 1), 'explanation': 1, 'review': 1, 'traditional': (1, 1)}

zeroshot_test_loader = get_loader(
        args,
        test_task_list,
        test_sample_numbers,
        split=args.test, 
        mode='test', 
        batch_size=args.batch_size,
        workers=args.num_workers,
        distributed=args.distributed
)
print(len(zeroshot_test_loader))

all_info = []
for i, batch in tqdm(enumerate(zeroshot_test_loader)):
    with torch.no_grad():
        results = model.generate_step(batch)
        beam_outputs = model.generate(
                batch['input_ids'].to('cuda'), 
                max_length=50, 
                num_beams=20,
                no_repeat_ngram_size=0, 
                num_return_sequences=20,
                early_stopping=True
        )
        generated_sents = model.tokenizer.batch_decode(beam_outputs, skip_special_tokens=True)
        for j, item in enumerate(zip(results, batch['target_text'], batch['source_text'])):
            new_info = {}
            new_info['target_item'] = item[1]
            new_info['gen_item_list'] = generated_sents[j*20: (j+1)*20]
            all_info.append(new_info)
            
gt = {}
ui_scores = {}
for i, info in enumerate(all_info):
    gt[i] = [int(info['target_item'])]
    pred_dict = {}
    for j in range(len(info['gen_item_list'])):
        try:
            pred_dict[int(info['gen_item_list'][j])] = -(j+1)
        except:
            pass
    ui_scores[i] = pred_dict
    
evaluate_all(ui_scores, gt, 1)
evaluate_all(ui_scores, gt, 5)
evaluate_all(ui_scores, gt, 10)

Data sources:  ['beauty']
compute_datum_info
1398


1398it [17:42,  1.32it/s]



NDCG@1	Rec@1	Hits@1	Prec@1	MAP@1	MRR@1
0.0595	0.0595	0.0595	0.0595	0.0595	0.0595

NDCG@5	Rec@5	Hits@5	Prec@5	MAP@5	MRR@5
0.1112	0.1606	0.1606	0.0321	0.0949	0.0949

NDCG@10	Rec@10	Hits@10	Prec@10	MAP@10	MRR@10
0.1352	0.2352	0.2352	0.0235	0.1047	0.1047


('\nNDCG@10\tRec@10\tHits@10\tPrec@10\tMAP@10\tMRR@10\n0.1352\t0.2352\t0.2352\t0.0235\t0.1047\t0.1047',
 {'ndcg': 0.13516935746926673,
  'map': 0.10474829455400039,
  'recall': 0.23520994499843492,
  'precision': 0.023520994499845772,
  'mrr': 0.10474829455400039,
  'hit': 0.23520994499843492})