In [1]:
# Set CUDA_VISIBLE_DEVICES to -1 for this kernel before any torch code runs.
# NOTE(review): presumably this hides every GPU so that the later
# `torch.cuda.is_available()` check falls back to "cpu" — confirm that a
# CPU-only evaluation run is what is intended here.
%env CUDA_VISIBLE_DEVICES=-1

env: CUDA_VISIBLE_DEVICES=-1


In [2]:
import argparse
import logging.handlers
import os
import random
from collections import OrderedDict

from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from MultiWOZ import get_batch
from evaluator import evaluateModel_Slow
from tools import *
from transformer.Transformer import RespGenerator, UncertaintyLoss
from tqdm import tqdm
import pandas as pd

In [3]:
# Arguments used when parse_opt() is called without an explicit argv: the
# evaluation run of the BERT-act checkpoint that this notebook documents.
_DEFAULT_NOTEBOOK_ARGV = "--option test --model model/MarCo_BERT --batch_size 384 --max_seq_length 50 --act_source bert".split()


def parse_opt(argv=None):
    """Build the option parser and parse ``argv`` into a namespace.

    Inside a notebook ``sys.argv`` belongs to the kernel, so the arguments are
    passed explicitly instead of being read from the command line.

    Args:
        argv: Optional argument list (or a single whitespace-separated string)
            to parse. When omitted, the notebook's default test configuration
            is used, which is exactly what the original hard-coded call parsed.

    Returns:
        argparse.Namespace holding every option defined below.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--option', type=str, default="train", help="whether to train or test the model", choices=['train', 'test', 'postprocess'])
    parser.add_argument('--emb_dim', type=int, default=128, help="the embedding dimension")
    parser.add_argument('--dropout', type=float, default=0.2, help="dropout rate")
    parser.add_argument('--resume', action='store_true', default=False, help="whether to resume previous run")
    parser.add_argument('--batch_size', type=int, default=3, help="train/dev/test batch size")
    parser.add_argument('--model', type=str, default="model", help="path to save or load models")
    parser.add_argument('--data_dir', type=str, default='data', help="data dir")
    parser.add_argument('--beam_size', type=int, default=2, help="beam size of act/response generator")
    parser.add_argument('--max_seq_length', type=int, default=50, help="max input length")
    parser.add_argument('--ngram', type=int, default=3, help="avoid n gram repeatness")
    parser.add_argument('--layer_num', type=int, default=3, help="transformer layer num")
    parser.add_argument('--evaluate_every', type=int, default=5, help="checkpoints")
    parser.add_argument('--head', type=int, default=4, help="head num for transformer")
    parser.add_argument("--learning_rate", default=1e-3, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--output_file", default='output', type=str, help="path to save generated act/response")
    parser.add_argument("--non_delex", default=False, action="store_true", help="non delex testing")
    parser.add_argument("--hist_num", default=0,type=int, help="turn num of history")
    parser.add_argument('--log', type=str, default='log', help="log file")

    parser.add_argument('--act_source',  type=str, choices=["pred", "bert",'groundtruth'], default='pred', help="action source for validate/test")
    parser.add_argument('--seed', type=int, default=1, help="random seed for initialization")

    if argv is None:
        argv = _DEFAULT_NOTEBOOK_ARGV
    elif isinstance(argv, str):
        # Convenience for notebook use: accept "--a 1 --b 2" as well as a list.
        argv = argv.split()
    return parser.parse_args(list(argv))


args = parse_opt()
print(args)

Namespace(act_source='bert', batch_size=384, beam_size=2, data_dir='data', dropout=0.2, emb_dim=128, evaluate_every=5, head=4, hist_num=0, layer_num=3, learning_rate=0.001, log='log', max_seq_length=50, model='model/MarCo_BERT', ngram=3, non_delex=False, option='test', output_file='output', resume=False, seed=1)


In [4]:
# Decide where this run's log file lives.
if args.option == 'train':
    # In train mode args.model is used as a directory. exist_ok avoids the
    # check-then-create race and keeps the cell re-runnable.
    os.makedirs(args.model, exist_ok=True)
    args.log = os.path.join(args.model, 'train.log')
elif args.option == 'test':
    # In test mode args.model is passed to torch.load later on, i.e. it is a
    # file path; the log goes into the directory that contains it.
    # (Named model_dir rather than `dir` so the builtin is not shadowed for
    # the rest of the kernel session.)
    model_dir = os.path.dirname(args.model)
    args.log = os.path.join(model_dir, 'test.log')
# Any other option keeps the --log value exactly as given.

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same object for the same name, so re-running
# this cell would stack another pair of handlers on it and every record would
# be emitted several times. Drop (and close) whatever a previous run attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Console shows warnings and above only; the file receives everything.
handler1 = logging.StreamHandler()
handler2 = logging.FileHandler(filename=args.log)
handler1.setLevel(logging.WARNING)
handler2.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s")
handler1.setFormatter(formatter)
handler2.setFormatter(formatter)

logger.addHandler(handler1)
logger.addHandler(handler2)

# Use the GPU when torch can see one, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

def setup_seed(seed):
    """Seed every random number generator used by this notebook.

    The same integer is handed, in this order, to ``torch.manual_seed``,
    ``torch.cuda.manual_seed_all``, ``numpy.random.seed`` and ``random.seed``.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        numpy.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
# Fix all RNG seeds before any model or sampler is constructed.
setup_seed(args.seed)

act_ontology = Constants.act_ontology

# Response vocabulary: the 'vocab' and 'rev' entries of vocab.json are handed
# to the Tokenizer as its first two arguments.
with open("{}/vocab.json".format(args.data_dir), 'r') as f:
    vocabulary = json.load(f)
vocab = vocabulary['vocab']
ivocab = vocabulary['rev']
tokenizer = Tokenizer(vocab, ivocab, False)

# Dialogue-act vocabulary: same file layout, separate tokenizer.
with open("{}/act_vocab.json".format(args.data_dir), 'r') as f:
    act_vocabulary = json.load(f)
act_vocab = act_vocabulary['vocab']
act_ivocab = act_vocabulary['rev']
act_tokenizer = Tokenizer(act_vocab, act_ivocab, False)

logger.info("Loading Vocabulary of {} size".format(tokenizer.vocab_len))
# Loading the dataset

def _load_json(path):
    """Parse the JSON file at ``path`` and close the file handle afterwards."""
    with open(path, 'r') as json_file:
        return json.load(json_file)


checkpoint_file = args.model

# get_batch returns a sequence whose last element is kept apart from the
# tensors that go into the TensorDataset (discarded for the training split,
# kept as val_id for the evaluation split).
if 'train' in args.option:
    *train_examples, _ = get_batch(args.data_dir, 'train', tokenizer, act_tokenizer, args.max_seq_length)
    train_data = TensorDataset(*train_examples)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)
    # While training, evaluation uses the validation split.
    *val_examples, val_id = get_batch(args.data_dir, 'val', tokenizer, act_tokenizer, args.max_seq_length)
    dialogs = _load_json('{}/val.json'.format(args.data_dir))
    gt_turns = _load_json('{}/val_reference.json'.format(args.data_dir))
elif 'test' in args.option or 'postprocess' in args.option:
    *val_examples, val_id = get_batch(args.data_dir, 'test', tokenizer, act_tokenizer, args.max_seq_length)
    dialogs = _load_json('{}/test.json'.format(args.data_dir))
    # --non_delex selects the alternative reference file for scoring.
    if args.non_delex:
        gt_turns = _load_json('{}/test_reference_nondelex.json'.format(args.data_dir))
    else:
        gt_turns = _load_json('{}/test_reference.json'.format(args.data_dir))

In [5]:
# Evaluation data is served strictly in dataset order: the evaluation loop
# maps (batch index, position in batch) back to val_id by plain arithmetic,
# which only works with a sequential sampler.
eval_data = TensorDataset(*val_examples)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

# Scorers used for the corpus-level and per-turn metrics further down.
BLEU_calc = BLEUScorer()
F1_calc = F1Scorer()

# Not read by any of the cells visible in this notebook.
best_BLEU = 0

# NOTE(review): the three loss objects in this cell are not used by the
# evaluation cells below — presumably leftovers from the training script;
# confirm before removing.
weighted_loss_func = UncertaintyLoss(2)
weighted_loss_func.to(device)

# The word-embedding width (d_word_vec) and the model width (d_model) are both
# taken from --emb_dim.
resp_generator = RespGenerator(vocab_size=tokenizer.vocab_len,
                               act_vocab_size=act_tokenizer.vocab_len,
                               d_word_vec=args.emb_dim,
                               act_dim=Constants.act_len,
                               n_layers=args.layer_num,
                               d_model=args.emb_dim,
                               n_head=args.head,
                               dropout=args.dropout)

resp_generator.to(device)

bce_loss_func = torch.nn.BCEWithLogitsLoss()
bce_loss_func.to(device)

# Targets equal to Constants.PAD are excluded from the cross-entropy loss.
ce_loss_func = torch.nn.CrossEntropyLoss(ignore_index=Constants.PAD)
ce_loss_func.to(device)


# Concatenation of the function and argument label lists from Constants.
label_list = Constants.functions + Constants.arguments

In [7]:
# Restore the trained weights and switch the model to inference mode.
# checkpoint_file holds args.model; using that one name for both the log line
# and the load keeps the reported path and the loaded path from drifting
# apart. The message is logged first so a failing load is still attributable.
logger.info("Loading model from {}".format(checkpoint_file))
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
resp_generator.load_state_dict(torch.load(checkpoint_file, map_location=device))
# eval() turns off training-only behaviour such as the dropout configured above.
resp_generator.eval()

In [8]:
# Run the model over the evaluation set and collect, keyed by dialogue file:
#   model_turns    - decoded responses, one per turn
#   act_turns      - multi-hot act predictions (filled only for act_source == 'pred')
#   context_inputs - decoded model inputs, one per turn
model_turns = {}
act_turns = {}
context_inputs = {}
TP, TN, FN, FP = 0, 0, 0, 0
# Filled in later by evaluateModel_Slow.
example_success = {}

# Pure inference: no_grad() stops autograd from recording a graph for every
# forward pass, which only costs memory and time here.
with torch.no_grad():
    for batch_step, batch in tqdm(enumerate(eval_dataloader), total=len(eval_dataloader)):
        all_pred = []
        batch = tuple(t.to(device) for t in batch)
        (input_ids, action_masks, rep_in, resp_out, belief_state, bert_act_seq, act_in, act_out, all_label,
         act_input_mask, resp_input_mask, *_) = batch

        # The dataloader is sequential, so this offset plus the position inside
        # the batch indexes the matching entry of val_id.
        batch_offset = batch_step * args.batch_size

        if args.act_source == 'bert':
            # Use the act sequence shipped with the batch.
            act_in = bert_act_seq
        elif args.act_source == 'pred':
            # Let the model decode its own act sequence with beam search.
            hyps, act_logits = resp_generator.act_translate_batch(input_mask=act_input_mask, bs=belief_state,
                                                                  src_seq=input_ids, n_bm=args.beam_size,
                                                                  max_token_seq_len=Constants.ACT_MAX_LEN)
            for hyp_step, hyp in enumerate(hyps):
                # Multi-hot vector over the act labels.
                # NOTE(review): the "- 3" presumably skips three reserved
                # special-token ids at the start of the act vocabulary — confirm.
                pre1 = [0] * Constants.act_len
                for w in hyp:
                    if w not in [Constants.PAD, Constants.EOS]:
                        pre1[w - 3] = 1
                # Right-pad short hypotheses so that hyps can become one tensor.
                if len(hyp) < Constants.ACT_MAX_LEN:
                    hyps[hyp_step] = list(hyps[hyp_step]) + [Constants.PAD] * (Constants.ACT_MAX_LEN - len(hyp))
                all_pred.append(pre1)
                file_name = val_id[batch_offset + hyp_step]
                act_turns.setdefault(file_name, []).append(pre1)

            all_pred = torch.Tensor(all_pred)
            all_label = all_label.cpu()
            TP, TN, FN, FP = obtain_TP_TN_FN_FP(all_pred, all_label, TP, TN, FN, FP)

            act_in = torch.tensor(hyps, dtype=torch.long).to(device)
        # Any other act source ('groundtruth') keeps act_in as delivered by the batch.

        _, _, act_vecs = resp_generator.act_forward(tgt_seq=act_in, src_seq=input_ids, bs=belief_state,
                                                    input_mask=act_input_mask)
        # Mask flags the PAD and EOS positions of the act sequence.
        action_masks = act_in.eq(Constants.PAD) + act_in.eq(Constants.EOS)
        resp_hyps = resp_generator.resp_translate_batch(bs=belief_state, act_vecs=act_vecs, act_mask=action_masks,
                                                        input_mask=resp_input_mask,
                                                        src_seq=input_ids, n_bm=args.beam_size,
                                                        max_token_seq_len=40, gram_num=args.ngram)

        for hyp_step, hyp in enumerate(resp_hyps):
            file_name = val_id[batch_offset + hyp_step]

            pred = tokenizer.convert_id_to_tokens(hyp)
            model_turns.setdefault(file_name, []).append(pred)

            # input_ids is the first tensor of the batch, i.e. the model input.
            context = tokenizer.convert_id_to_tokens(input_ids[hyp_step])
            context_inputs.setdefault(file_name, []).append(context)

100%|██████████| 20/20 [15:28<00:00, 46.45s/it]


In [9]:
# ---- Dialogue-act prediction metrics ----
# The 0.001 terms keep the divisions defined when a count is zero.
# NOTE: TP/FP/FN are only accumulated in the evaluation loop when
# args.act_source == 'pred'; for any other act source they are still 0 and
# all three figures are reported as 0.
precision = TP / (TP + FP + 0.001)
recall = TP / (TP + FN + 0.001)
F1 = 2 * precision * recall / (precision + recall + 0.001)
act_message = "ACT precision is {:.6f} recall is {:.6f} F1 is {:.6f}".format(precision, recall, F1)
print(act_message)
logger.info(act_message)

# ---- Response metrics ----
BLEU = BLEU_calc.score(model_turns, gt_turns)
inform, request, match_success_by_file = evaluateModel_Slow(model_turns, example_success)
response_message = "Test BLEU {:.4f}, inform {:.2f}, request {:.2f}, score {:.2f}".format(
    BLEU, inform, request, (inform + request) / 2 + 100 * BLEU)
print(response_message)
logger.info(response_message)

# Every artefact below is written into args.output_file; create it up front so
# the first write cannot fail on a missing directory.
os.makedirs(args.output_file, exist_ok=True)

# Match and Success stats, one row per dialogue file.
pred_file = os.path.join(args.output_file, 'stats_pred.tsv')
all_match_success = pd.DataFrame.from_records([
    {'file': file_name, 'match': match, 'success': success}
    for file_name, (match, success) in match_success_by_file.items()
])
all_match_success.to_csv(pred_file, sep="\t", index=False)

f1_entity = F1_calc.score(model_turns, gt_turns)*100
entity_message = "Test Entity-F1 {:.4f}".format(f1_entity)
print(entity_message)
logger.info(entity_message)

# Generated responses, sorted by dialogue file. Sorting happens before the
# file is opened so a failure cannot leave a truncated file behind.
model_turns = OrderedDict(sorted(model_turns.items()))
resp_file = os.path.join(args.output_file, 'resp_pred.json')
with open(resp_file, 'w') as fp:
    json.dump(model_turns, fp, indent=2)

# Predicted act vectors, sorted by dialogue file.
act_turns = OrderedDict(sorted(act_turns.items()))
act_file = os.path.join(args.output_file, 'act_pred.json')
with open(act_file, 'w') as fp:
    json.dump(act_turns, fp, indent=2)

# Written next to the other artefacts; with the default --output_file this is
# still output/example_statistic.json.
with open(os.path.join(args.output_file, 'example_statistic.json'), 'w') as fp:
    json.dump(example_success, fp)

# NOTE(review): this stores the (unchanged) weights again under a name derived
# from the metrics, in the hard-coded 'model' directory — confirm this copy is
# wanted for a test-only run.
save_name = 'test-inform-{:.2f}-request-{:.2f}-bleu-{:.4f}'.format(inform, request, BLEU)
os.makedirs('model', exist_ok=True)
torch.save(resp_generator.state_dict(), os.path.join('model', save_name))

ACT precision is 0.000000 recall is 0.000000 F1 is 0.000000
Corpus Inform Success : 92.30%
Corpus Requestable Success : 78.60%
Test BLEU 0.2003, inform 92.30, request 78.60, score 105.48
Test Entity-F1 59.9920


In [10]:
# Build one record per generated turn: the decoded context, the reference
# response, the generated response and the turn-level BLEU / entity-F1.
turn_records = []
for key, preds in model_turns.items():
    for i, t in enumerate(preds):
        if t is None:
            # Say which turn is broken instead of raising a bare Exception(None).
            raise ValueError("missing prediction for dialogue {!r}, turn {}".format(key, i))
        gold = gt_turns[key][i]
        bleu = BLEU_calc.score({key: [t]}, {key: [gold]})*100
        f1 = F1_calc.score({key: [t]}, {key: [gold]})*100
        turn_records.append({
            'file': key,
            # '<br>' replaces the '[SEP]' marker in the decoded context.
            'context': context_inputs[key][i].replace('[SEP]', '<br>'),
            'gold': gold,
            # There are some empty string predictions; store a single space instead.
            'generated': t if len(t) > 0 else " ",
            # Placeholder: no per-turn loss is computed in this notebook.
            'loss': 0,
            'bleu': bleu,
            'f1_entity': f1,
            # last user + 2*i{usr,sys}
            'context_length': 2*i+1
        })

df = pd.DataFrame.from_records(turn_records)

In [11]:
# Spot-check six randomly drawn rows of the per-turn table.
df.sample(6)

Unnamed: 0,file,context,gold,generated,loss,bleu,f1_entity,context_length
4506,PMUL2578,number is [hotel_phone] <br> i would also like...,where are you departing from ? what is your de...,i would be happy to book a taxi for you . wher...,0,34.892157,0.0,15
4498,PMUL2563,name type [UNK] id postcode reference area add...,"thank you for contacting us , have a nice day .","you are welcome , have a great day !",0,0.511282,0.0,11
1162,MUL1028,. phone name type id area address postcode ref...,the phone number for [attraction_name] is [att...,the address for [attraction_name] is [attracti...,0,52.694553,79.92004,9
6100,PMUL4644,. can you look again please ? phone name type ...,[hotel_name] is very nice .,how about [hotel_name] ? it s located at [hote...,0,0.013194,66.57783,5
2702,MUL2439,[restaurant_pricerange] price -s . postcode ty...,the address is [restaurant_address] . it is in...,[restaurant_name] is located in the [restauran...,0,0.375536,74.934407,7
1653,MUL1546,reference <br> it s booked and will cost [valu...,[restaurant_name] is an inexpensive [restauran...,[restaurant_name] is a [restaurant_food] resta...,0,26.773546,74.934407,7


In [12]:
# Write the per-turn table into the run's output directory (--output_file),
# the same directory resp_pred.json and act_pred.json are written to, instead
# of a hard-coded 'output/'. With the default --output_file the location is
# unchanged.
os.makedirs(args.output_file, exist_ok=True)
df.to_csv(os.path.join(args.output_file, 'error_beam_test.tsv'), sep='\t', index=False)

In [13]:
# Reload the predictions that were written to resp_pred.json earlier.
resp_file = os.path.join(args.output_file, 'resp_pred.json')
with open(resp_file, 'r') as f:
    model_turns = json.load(f)

# NOTE(review): nondetokenize comes from `tools` and is not visible here.
# Because model_turns is re-scored and saved under a "non_delex" name right
# after this call, it presumably rewrites model_turns in place using `dialogs`
# — confirm. The returned success_rate is not read by any visible cell.
success_rate = nondetokenize(model_turns, dialogs)
# NOTE(review): gt_turns was loaded from test_reference.json unless --non_delex
# was passed (it was not for this run), so this BLEU compares the rewritten
# responses against that reference file — confirm whether
# test_reference_nondelex.json is the intended reference here.
BLEU = BLEU_calc.score(model_turns, gt_turns)
print(BLEU)

# Save the rewritten responses, sorted by dialogue file, under a separate name.
resp_file = os.path.join(args.output_file, 'resp_non_delex_pred.json')
with open(resp_file, 'w') as fp:
    model_turns = OrderedDict(sorted(model_turns.items()))
    json.dump(model_turns, fp, indent=2)

0.1522129258452236


# Marco-BERT

```
ACT precision is 0.000000 recall is 0.000000 F1 is 0.000000
Corpus Inform Success : 92.30%
Corpus Requestable Success : 78.60%
Test BLEU 0.2003, inform 92.30, request 78.60, score 105.48
Test Entity-F1 59.9920
```

### Marco-PredAct

```

```