multilabel_bert_diff.patch

diff --git a/data_loader.py b/data_loader.py
index dbaa512..a909717 100644
--- a/data_loader.py
+++ b/data_loader.py
@@ -2,11 +2,12 @@ import os
 import copy
 import json
 import logging
+from tqdm import tqdm
 
 import torch
 from torch.utils.data import TensorDataset
 
-from utils import get_intent_labels, get_slot_labels
+from utils import get_violation_labels
 
 logger = logging.getLogger(__name__)
 
@@ -18,15 +19,14 @@ class InputExample(object):
     Args:
         guid: Unique id for the example.
         words: list. The words of the sequence.
-        intent_label: (Optional) string. The intent label of the example.
-        slot_labels: (Optional) list. The slot labels of the example.
+        violation_labels: (Optional) list. The violation labels of the example.
     """
 
-    def __init__(self, guid, words, intent_label=None, slot_labels=None):
+    def __init__(self, guid, conv_id, words, violation_labels=None):
         self.guid = guid
+        self.conv_id = conv_id
         self.words = words
-        self.intent_label = intent_label
-        self.slot_labels = slot_labels
+        self.violation_labels = violation_labels
 
     def __repr__(self):
         return str(self.to_json_string())
@@ -44,12 +44,11 @@ class InputExample(object):
 class InputFeatures(object):
     """A single set of features of data."""
 
-    def __init__(self, input_ids, attention_mask, token_type_ids, intent_label_id, slot_labels_ids):
+    def __init__(self, input_ids, attention_mask, token_type_ids, violation_labels_ids):
         self.input_ids = input_ids
         self.attention_mask = attention_mask
         self.token_type_ids = token_type_ids
-        self.intent_label_id = intent_label_id
-        self.slot_labels_ids = slot_labels_ids
+        self.violation_labels_ids = violation_labels_ids
 
     def __repr__(self):
         return str(self.to_json_string())
@@ -63,46 +62,14 @@ class InputFeatures(object):
         """Serializes this instance to a JSON string."""
         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 
-
-class JointProcessor(object):
-    """Processor for the JointBERT data set """
-
+class MultilabelProcessor(object):
+    """Processor for the MultilabelBERT data set """
+    
     def __init__(self, args):
         self.args = args
-        self.intent_labels = get_intent_labels(args)
-        self.slot_labels = get_slot_labels(args)
-
-        self.input_text_file = 'seq.in'
-        self.intent_label_file = 'label'
-        self.slot_labels_file = 'seq.out'
-
-    @classmethod
-    def _read_file(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding="utf-8") as f:
-            lines = []
-            for line in f:
-                lines.append(line.strip())
-            return lines
-
-    def _create_examples(self, texts, intents, slots, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for i, (text, intent, slot) in enumerate(zip(texts, intents, slots)):
-            guid = "%s-%s" % (set_type, i)
-            # 1. input_text
-            words = text.split()  # Some are spaced twice
-            # 2. intent
-            intent_label = self.intent_labels.index(intent) if intent in self.intent_labels else self.intent_labels.index("UNK")
-            # 3. slot
-            slot_labels = []
-            for s in slot.split():
-                slot_labels.append(self.slot_labels.index(s) if s in self.slot_labels else self.slot_labels.index("UNK"))
-
-            assert len(words) == len(slot_labels)
-            examples.append(InputExample(guid=guid, words=words, intent_label=intent_label, slot_labels=slot_labels))
-        return examples
-
+        self.violation_labels = get_violation_labels(args)
+        self.violation2index = {c:i for i, c in enumerate(self.violation_labels)}
+        
     def get_examples(self, mode):
         """
         Args:
@@ -110,17 +77,78 @@ class JointProcessor(object):
         """
         data_path = os.path.join(self.args.data_dir, self.args.task, mode)
         logger.info("LOOKING AT {}".format(data_path))
-        return self._create_examples(texts=self._read_file(os.path.join(data_path, self.input_text_file)),
-                                     intents=self._read_file(os.path.join(data_path, self.intent_label_file)),
-                                     slots=self._read_file(os.path.join(data_path, self.slot_labels_file)),
-                                     set_type=mode)
+        conversations = json.load(open(os.path.join(data_path, f'{mode}.json'),'r'))
+        
+        examples = []
+        for cid, turns in tqdm(conversations.items()):
+            accumulated_texts = None
+            for turn in turns:
+                accumulated_texts = turn['utterance'].split(' ') if accumulated_texts is None \
+                                    else accumulated_texts + turn['utterance'].split(' ')
+
+                y = [0] * len(self.violation_labels)
+                cnames = {v[1] for v in turn['violations']}  # distinct violation names for this turn
+                for cname in cnames:
+                    y[self.violation2index[cname]] = 1
 
+                gid = mode + '-' + turn['utteranceId']
+                examples.append(InputExample(guid=gid, conv_id=turn['conversationId'], words=accumulated_texts, violation_labels=y))
 
-processors = {
-    "atis": JointProcessor,
-    "snips": JointProcessor
-}
+        return examples
 
+def load_and_cache_examples(args, tokenizer, mode, with_utterance_ids = False):
+    processor = MultilabelProcessor(args)
+    
+    if mode == "train":
+        examples = processor.get_examples("train")
+    elif mode == "dev":
+        examples = processor.get_examples("dev")
+    elif mode == "test":
+        examples = processor.get_examples("test")
+    else:
+        raise Exception("For mode, Only train, dev, test is available")
+    
+    # Conversation IDs are returned so callers can compute the per-conversation correctness metric.
+    conversation_ids = [example.conv_id for example in examples]
+        
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(
+        args.data_dir,
+        'cached_multilabel_{}_{}_{}_{}'.format(
+            mode,
+            args.task,
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            args.max_seq_len
+        )
+    )
+
+    if os.path.exists(cached_features_file):
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        # Load data features from dataset file
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+
+        # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
+        pad_token_label_id = args.ignore_index
+        features = convert_examples_to_features(examples, args.max_seq_len, tokenizer,
+                                                pad_token_label_id=pad_token_label_id)
+        logger.info("Saving features into cached file %s", cached_features_file)
+        torch.save(features, cached_features_file)
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+    all_violation_labels_ids = torch.tensor([f.violation_labels_ids for f in features], dtype=torch.float)
+
+    dataset = TensorDataset(all_input_ids, all_attention_mask,
+                            all_token_type_ids, all_violation_labels_ids)
+    
+    if with_utterance_ids:
+        return dataset, conversation_ids, [example.guid for example in examples]
+    else:
+        return dataset, conversation_ids
 
 def convert_examples_to_features(examples, max_seq_len, tokenizer,
                                  pad_token_label_id=-100,
@@ -141,29 +169,23 @@ def convert_examples_to_features(examples, max_seq_len, tokenizer,
 
         # Tokenize word by word (for NER)
         tokens = []
-        slot_labels_ids = []
-        for word, slot_label in zip(example.words, example.slot_labels):
+        for word in example.words:
             word_tokens = tokenizer.tokenize(word)
             if not word_tokens:
                 word_tokens = [unk_token]  # For handling the bad-encoded word
             tokens.extend(word_tokens)
-            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-            slot_labels_ids.extend([int(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
 
         # Account for [CLS] and [SEP]
         special_tokens_count = 2
-        if len(tokens) > max_seq_len - special_tokens_count:
-            tokens = tokens[:(max_seq_len - special_tokens_count)]
-            slot_labels_ids = slot_labels_ids[:(max_seq_len - special_tokens_count)]
+        if len(tokens) > max_seq_len - special_tokens_count:  # too long: drop tokens from the front, keep the tail
+            tokens = tokens[len(tokens) - (max_seq_len - special_tokens_count):]  # single slice instead of repeated O(n) pop(0)
 
         # Add [SEP] token
         tokens += [sep_token]
-        slot_labels_ids += [pad_token_label_id]
         token_type_ids = [sequence_a_segment_id] * len(tokens)
 
         # Add [CLS] token
         tokens = [cls_token] + tokens
-        slot_labels_ids = [pad_token_label_id] + slot_labels_ids
         token_type_ids = [cls_token_segment_id] + token_type_ids
 
         input_ids = tokenizer.convert_tokens_to_ids(tokens)
@@ -177,14 +199,12 @@ def convert_examples_to_features(examples, max_seq_len, tokenizer,
         input_ids = input_ids + ([pad_token_id] * padding_length)
         attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
         token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
-        slot_labels_ids = slot_labels_ids + ([pad_token_label_id] * padding_length)
 
         assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
         assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
         assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_ids), max_seq_len)
-        assert len(slot_labels_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(len(slot_labels_ids), max_seq_len)
-
-        intent_label_id = int(example.intent_label)
+        
+        violation_labels_ids = example.violation_labels
 
         if ex_index < 5:
             logger.info("*** Example ***")
@@ -193,63 +213,75 @@ def convert_examples_to_features(examples, max_seq_len, tokenizer,
             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
             logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
             logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
-            logger.info("intent_label: %s (id = %d)" % (example.intent_label, intent_label_id))
-            logger.info("slot_labels: %s" % " ".join([str(x) for x in slot_labels_ids]))
+            logger.info("violation_labels: %s" % " ".join([str(x) for x in violation_labels_ids]))
 
         features.append(
             InputFeatures(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
-                          intent_label_id=intent_label_id,
-                          slot_labels_ids=slot_labels_ids
+                          violation_labels_ids=violation_labels_ids
                           ))
 
     return features
 
+    
+# class JointProcessor(object):
+#     """Processor for the JointBERT data set """
+
+#     def __init__(self, args):
+#         self.args = args
+#         self.intent_labels = get_intent_labels(args)
+#         self.slot_labels = get_slot_labels(args)
+
+#         self.input_text_file = 'seq.in'
+#         self.intent_label_file = 'label'
+#         self.slot_labels_file = 'seq.out'
+
+#     @classmethod
+#     def _read_file(cls, input_file, quotechar=None):
+#         """Reads a tab separated value file."""
+#         with open(input_file, "r", encoding="utf-8") as f:
+#             lines = []
+#             for line in f:
+#                 lines.append(line.strip())
+#             return lines
+
+#     def _create_examples(self, texts, intents, slots, set_type):
+#         """Creates examples for the training and dev sets."""
+#         examples = []
+#         for i, (text, intent, slot) in enumerate(zip(texts, intents, slots)):
+#             guid = "%s-%s" % (set_type, i)
+#             # 1. input_text
+#             words = text.split()  # Some are spaced twice
+#             # 2. intent
+#             intent_label = self.intent_labels.index(intent) if intent in self.intent_labels else self.intent_labels.index("UNK")
+#             # 3. slot
+#             slot_labels = []
+#             for s in slot.split():
+#                 slot_labels.append(self.slot_labels.index(s) if s in self.slot_labels else self.slot_labels.index("UNK"))
+
+#             assert len(words) == len(slot_labels)
+#             examples.append(InputExample(guid=guid, words=words, intent_label=intent_label, slot_labels=slot_labels))
+#         return examples
+
+#     def get_examples(self, mode):
+#         """
+#         Args:
+#             mode: train, dev, test
+#         """
+#         data_path = os.path.join(self.args.data_dir, self.args.task, mode)
+#         logger.info("LOOKING AT {}".format(data_path))
+#         return self._create_examples(texts=self._read_file(os.path.join(data_path, self.input_text_file)),
+#                                      intents=self._read_file(os.path.join(data_path, self.intent_label_file)),
+#                                      slots=self._read_file(os.path.join(data_path, self.slot_labels_file)),
+#                                      set_type=mode)
+
+
+# processors = {
+#     "atis": JointProcessor,
+#     "snips": JointProcessor
+# }
 
-def load_and_cache_examples(args, tokenizer, mode):
-    processor = processors[args.task](args)
-
-    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(
-        args.data_dir,
-        'cached_{}_{}_{}_{}'.format(
-            mode,
-            args.task,
-            list(filter(None, args.model_name_or_path.split("/"))).pop(),
-            args.max_seq_len
-        )
-    )
-
-    if os.path.exists(cached_features_file):
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
-    else:
-        # Load data features from dataset file
-        logger.info("Creating features from dataset file at %s", args.data_dir)
-        if mode == "train":
-            examples = processor.get_examples("train")
-        elif mode == "dev":
-            examples = processor.get_examples("dev")
-        elif mode == "test":
-            examples = processor.get_examples("test")
-        else:
-            raise Exception("For mode, Only train, dev, test is available")
 
-        # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
-        pad_token_label_id = args.ignore_index
-        features = convert_examples_to_features(examples, args.max_seq_len, tokenizer,
-                                                pad_token_label_id=pad_token_label_id)
-        logger.info("Saving features into cached file %s", cached_features_file)
-        torch.save(features, cached_features_file)
 
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-    all_intent_label_ids = torch.tensor([f.intent_label_id for f in features], dtype=torch.long)
-    all_slot_labels_ids = torch.tensor([f.slot_labels_ids for f in features], dtype=torch.long)
 
-    dataset = TensorDataset(all_input_ids, all_attention_mask,
-                            all_token_type_ids, all_intent_label_ids, all_slot_labels_ids)
-    return dataset
diff --git a/main.py b/main.py
index eca6fe4..2b52315 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,7 @@
 import argparse
 
 from trainer import Trainer
-from utils import init_logger, load_tokenizer, read_prediction_text, set_seed, MODEL_CLASSES, MODEL_PATH_MAP
+from utils import init_logger, load_tokenizer, set_seed, MODEL_CLASSES, MODEL_PATH_MAP
 from data_loader import load_and_cache_examples
 
 
@@ -10,11 +10,11 @@ def main(args):
     set_seed(args)
     tokenizer = load_tokenizer(args)
 
-    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
-    dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
-    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")
+    train_dataset, train_conv_ids = load_and_cache_examples(args, tokenizer, mode="train")
+    dev_dataset, dev_conv_ids = load_and_cache_examples(args, tokenizer, mode="dev")
+    test_dataset, test_conv_ids = load_and_cache_examples(args, tokenizer, mode="test")
 
-    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
+    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset, train_conv_ids, dev_conv_ids, test_conv_ids)
 
     if args.do_train:
         trainer.train()
@@ -29,16 +29,14 @@ if __name__ == '__main__':
 
     parser.add_argument("--task", default=None, required=True, type=str, help="The name of the task to train")
     parser.add_argument("--model_dir", default=None, required=True, type=str, help="Path to save, load model")
-    parser.add_argument("--data_dir", default="./data", type=str, help="The input data dir")
-    parser.add_argument("--intent_label_file", default="intent_label.txt", type=str, help="Intent Label file")
-    parser.add_argument("--slot_label_file", default="slot_label.txt", type=str, help="Slot Label file")
+    parser.add_argument("--data_dir", default="../data", type=str, help="The input data dir")
 
     parser.add_argument("--model_type", default="bert", type=str, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
 
     parser.add_argument('--seed', type=int, default=1234, help="random seed for initialization")
     parser.add_argument("--train_batch_size", default=32, type=int, help="Batch size for training.")
     parser.add_argument("--eval_batch_size", default=64, type=int, help="Batch size for evaluation.")
-    parser.add_argument("--max_seq_len", default=50, type=int, help="The maximum total input sequence length after tokenization.")
+    parser.add_argument("--max_seq_len", default=150, type=int, help="The maximum total input sequence length after tokenization.")
     parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
     parser.add_argument("--num_train_epochs", default=10.0, type=float, help="Total number of training epochs to perform.")
     parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
@@ -60,11 +58,6 @@ if __name__ == '__main__':
     parser.add_argument("--ignore_index", default=0, type=int,
                         help='Specifies a target value that is ignored and does not contribute to the input gradient')
 
-    parser.add_argument('--slot_loss_coef', type=float, default=1.0, help='Coefficient for the slot loss.')
-
-    # CRF option
-    parser.add_argument("--use_crf", action="store_true", help="Whether to use CRF")
-    parser.add_argument("--slot_pad_label", default="PAD", type=str, help="Pad token for slot label pad (to be ignore when calculate loss)")
 
     args = parser.parse_args()
 
diff --git a/model/__init__.py b/model/__init__.py
index 000c735..fd683f0 100644
--- a/model/__init__.py
+++ b/model/__init__.py
@@ -1,3 +1,4 @@
 from .modeling_jointbert import JointBERT
 from .modeling_jointdistilbert import JointDistilBERT
 from .modeling_jointalbert import JointAlbert
+from .modeling_multilabelbert import MultilabelBERT
\ No newline at end of file
diff --git a/predict.py b/predict.py
index abcdd00..06c5029 100644
--- a/predict.py
+++ b/predict.py
@@ -2,12 +2,13 @@ import os
 import logging
 import argparse
 from tqdm import tqdm, trange
+from data_loader import load_and_cache_examples
 
 import numpy as np
 import torch
 from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
 
-from utils import init_logger, load_tokenizer, get_intent_labels, get_slot_labels, MODEL_CLASSES
+from utils import init_logger, load_tokenizer, get_violation_labels, MODEL_CLASSES
 
 logger = logging.getLogger(__name__)
 
@@ -28,8 +29,7 @@ def load_model(pred_config, args, device):
     try:
         model = MODEL_CLASSES[args.model_type][1].from_pretrained(args.model_dir,
                                                                   args=args,
-                                                                  intent_label_lst=get_intent_labels(args),
-                                                                  slot_label_lst=get_slot_labels(args))
+                                                                  violation_label_lst=get_violation_labels(args))
         model.to(device)
         model.eval()
         logger.info("***** Model Loaded *****")
@@ -38,93 +38,6 @@ def load_model(pred_config, args, device):
 
     return model
 
-
-def read_input_file(pred_config):
-    lines = []
-    with open(pred_config.input_file, "r", encoding="utf-8") as f:
-        for line in f:
-            line = line.strip()
-            words = line.split()
-            lines.append(words)
-
-    return lines
-
-
-def convert_input_file_to_tensor_dataset(lines,
-                                         pred_config,
-                                         args,
-                                         tokenizer,
-                                         pad_token_label_id,
-                                         cls_token_segment_id=0,
-                                         pad_token_segment_id=0,
-                                         sequence_a_segment_id=0,
-                                         mask_padding_with_zero=True):
-    # Setting based on the current model type
-    cls_token = tokenizer.cls_token
-    sep_token = tokenizer.sep_token
-    unk_token = tokenizer.unk_token
-    pad_token_id = tokenizer.pad_token_id
-
-    all_input_ids = []
-    all_attention_mask = []
-    all_token_type_ids = []
-    all_slot_label_mask = []
-
-    for words in lines:
-        tokens = []
-        slot_label_mask = []
-        for word in words:
-            word_tokens = tokenizer.tokenize(word)
-            if not word_tokens:
-                word_tokens = [unk_token]  # For handling the bad-encoded word
-            tokens.extend(word_tokens)
-            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-            slot_label_mask.extend([pad_token_label_id + 1] + [pad_token_label_id] * (len(word_tokens) - 1))
-
-        # Account for [CLS] and [SEP]
-        special_tokens_count = 2
-        if len(tokens) > args.max_seq_len - special_tokens_count:
-            tokens = tokens[: (args.max_seq_len - special_tokens_count)]
-            slot_label_mask = slot_label_mask[:(args.max_seq_len - special_tokens_count)]
-
-        # Add [SEP] token
-        tokens += [sep_token]
-        token_type_ids = [sequence_a_segment_id] * len(tokens)
-        slot_label_mask += [pad_token_label_id]
-
-        # Add [CLS] token
-        tokens = [cls_token] + tokens
-        token_type_ids = [cls_token_segment_id] + token_type_ids
-        slot_label_mask = [pad_token_label_id] + slot_label_mask
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
-        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        padding_length = args.max_seq_len - len(input_ids)
-        input_ids = input_ids + ([pad_token_id] * padding_length)
-        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
-        slot_label_mask = slot_label_mask + ([pad_token_label_id] * padding_length)
-
-        all_input_ids.append(input_ids)
-        all_attention_mask.append(attention_mask)
-        all_token_type_ids.append(token_type_ids)
-        all_slot_label_mask.append(slot_label_mask)
-
-    # Change to Tensor
-    all_input_ids = torch.tensor(all_input_ids, dtype=torch.long)
-    all_attention_mask = torch.tensor(all_attention_mask, dtype=torch.long)
-    all_token_type_ids = torch.tensor(all_token_type_ids, dtype=torch.long)
-    all_slot_label_mask = torch.tensor(all_slot_label_mask, dtype=torch.long)
-
-    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_slot_label_mask)
-
-    return dataset
-
-
 def predict(pred_config):
     # load model and args
     args = get_args(pred_config)
@@ -132,79 +45,43 @@ def predict(pred_config):
     model = load_model(pred_config, args, device)
     logger.info(args)
 
-    intent_label_lst = get_intent_labels(args)
-    slot_label_lst = get_slot_labels(args)
+    violation_label_lst = get_violation_labels(args)
 
     # Convert input file to TensorDataset
     pad_token_label_id = args.ignore_index
     tokenizer = load_tokenizer(args)
-    lines = read_input_file(pred_config)
-    dataset = convert_input_file_to_tensor_dataset(lines, pred_config, args, tokenizer, pad_token_label_id)
+    dataset, conv_ids, utterance_ids = load_and_cache_examples(args, tokenizer, mode=getattr(pred_config, "mode", "test"), with_utterance_ids=True)  # honour --mode; falls back to "test"
 
     # Predict
     sampler = SequentialSampler(dataset)
     data_loader = DataLoader(dataset, sampler=sampler, batch_size=pred_config.batch_size)
 
-    all_slot_label_mask = None
-    intent_preds = None
-    slot_preds = None
+    violation_preds = None
 
     for batch in tqdm(data_loader, desc="Predicting"):
         batch = tuple(t.to(device) for t in batch)
         with torch.no_grad():
             inputs = {"input_ids": batch[0],
                       "attention_mask": batch[1],
-                      "intent_label_ids": None,
-                      "slot_labels_ids": None}
+                      "violation_label_ids": None}
             if args.model_type != "distilbert":
                 inputs["token_type_ids"] = batch[2]
             outputs = model(**inputs)
-            _, (intent_logits, slot_logits) = outputs[:2]
+            _, logits = outputs[:2]
 
-            # Intent Prediction
-            if intent_preds is None:
-                intent_preds = intent_logits.detach().cpu().numpy()
+            # Violation Prediction
+            if violation_preds is None:
+                violation_preds = logits.detach().cpu().numpy()
             else:
-                intent_preds = np.append(intent_preds, intent_logits.detach().cpu().numpy(), axis=0)
-
-            # Slot prediction
-            if slot_preds is None:
-                if args.use_crf:
-                    # decode() in `torchcrf` returns list with best index directly
-                    slot_preds = np.array(model.crf.decode(slot_logits))
-                else:
-                    slot_preds = slot_logits.detach().cpu().numpy()
-                all_slot_label_mask = batch[3].detach().cpu().numpy()
-            else:
-                if args.use_crf:
-                    slot_preds = np.append(slot_preds, np.array(model.crf.decode(slot_logits)), axis=0)
-                else:
-                    slot_preds = np.append(slot_preds, slot_logits.detach().cpu().numpy(), axis=0)
-                all_slot_label_mask = np.append(all_slot_label_mask, batch[3].detach().cpu().numpy(), axis=0)
-
-    intent_preds = np.argmax(intent_preds, axis=1)
-
-    if not args.use_crf:
-        slot_preds = np.argmax(slot_preds, axis=2)
-
-    slot_label_map = {i: label for i, label in enumerate(slot_label_lst)}
-    slot_preds_list = [[] for _ in range(slot_preds.shape[0])]
-
-    for i in range(slot_preds.shape[0]):
-        for j in range(slot_preds.shape[1]):
-            if all_slot_label_mask[i, j] != pad_token_label_id:
-                slot_preds_list[i].append(slot_label_map[slot_preds[i][j]])
+                violation_preds = np.append(violation_preds, logits.detach().cpu().numpy(), axis=0)
+    
+    violation_preds = violation_preds > 0
+    violation_preds = violation_preds.astype(int)
 
     # Write to output file
     with open(pred_config.output_file, "w", encoding="utf-8") as f:
-        for words, slot_preds, intent_pred in zip(lines, slot_preds_list, intent_preds):
-            line = ""
-            for word, pred in zip(words, slot_preds):
-                if pred == 'O':
-                    line = line + word + " "
-                else:
-                    line = line + "[{}:{}] ".format(word, pred)
-            f.write("<{}> -> {}\n".format(intent_label_lst[intent_pred], line.strip()))
+        for utterance_id, pred in zip(utterance_ids, violation_preds):
+            f.write("{}\t{}\n".format(utterance_id, [violation_label_lst[i] for i, v in enumerate(pred) if v]))
 
     logger.info("Prediction Done!")
 
@@ -213,10 +90,9 @@ if __name__ == "__main__":
     init_logger()
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("--input_file", default="sample_pred_in.txt", type=str, help="Input file for prediction")
     parser.add_argument("--output_file", default="sample_pred_out.txt", type=str, help="Output file for prediction")
     parser.add_argument("--model_dir", default="./atis_model", type=str, help="Path to save, load model")
-
+    parser.add_argument("--mode", default="test", type=str, help="train, dev, or test")
     parser.add_argument("--batch_size", default=32, type=int, help="Batch size for prediction")
     parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
 
diff --git a/trainer.py b/trainer.py
index 8e0e9b6..0c6a05d 100644
--- a/trainer.py
+++ b/trainer.py
@@ -1,5 +1,6 @@
 import os
 import logging
+import json
 from tqdm import tqdm, trange
 
 import numpy as np
@@ -7,20 +8,26 @@ import torch
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from transformers import BertConfig, AdamW, get_linear_schedule_with_warmup
 
-from utils import MODEL_CLASSES, compute_metrics, get_intent_labels, get_slot_labels
+from utils import MODEL_CLASSES, compute_metrics, get_violation_labels
 
 logger = logging.getLogger(__name__)
 
 
 class Trainer(object):
-    def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
+    def __init__(self, args, 
+                 train_dataset=None, dev_dataset=None, test_dataset=None,
+                 train_conv_ids=None, dev_conv_ids=None, test_conv_ids=None,
+                ):
         self.args = args
         self.train_dataset = train_dataset
         self.dev_dataset = dev_dataset
         self.test_dataset = test_dataset
+        self.train_conv_ids = train_conv_ids
+        self.dev_conv_ids = dev_conv_ids
+        self.test_conv_ids = test_conv_ids
 
-        self.intent_label_lst = get_intent_labels(args)
-        self.slot_label_lst = get_slot_labels(args)
+        self.violation_label_lst = get_violation_labels(args)
+        
         # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
         self.pad_token_label_id = args.ignore_index
 
@@ -29,8 +36,7 @@ class Trainer(object):
         self.model = self.model_class.from_pretrained(args.model_name_or_path,
                                                       config=self.config,
                                                       args=args,
-                                                      intent_label_lst=self.intent_label_lst,
-                                                      slot_label_lst=self.slot_label_lst)
+                                                      violation_label_lst=self.violation_label_lst)
 
         # GPU or CPU
         self.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
@@ -80,8 +86,7 @@ class Trainer(object):
 
                 inputs = {'input_ids': batch[0],
                           'attention_mask': batch[1],
-                          'intent_label_ids': batch[3],
-                          'slot_labels_ids': batch[4]}
+                          'violation_label_ids': batch[3]}
                 if self.args.model_type != 'distilbert':
                     inputs['token_type_ids'] = batch[2]
                 outputs = self.model(**inputs)
@@ -120,8 +125,10 @@ class Trainer(object):
     def evaluate(self, mode):
         if mode == 'test':
             dataset = self.test_dataset
+            conv_ids = self.test_conv_ids
         elif mode == 'dev':
             dataset = self.dev_dataset
+            conv_ids = self.dev_conv_ids
         else:
             raise Exception("Only dev and test dataset available")
 
@@ -134,11 +141,8 @@ class Trainer(object):
         logger.info("  Batch size = %d", self.args.eval_batch_size)
         eval_loss = 0.0
         nb_eval_steps = 0
-        intent_preds = None
-        slot_preds = None
-        out_intent_label_ids = None
-        out_slot_labels_ids = None
-
+        violation_preds = None
+        out_violation_label_ids = None
         self.model.eval()
 
         for batch in tqdm(eval_dataloader, desc="Evaluating"):
@@ -146,70 +150,44 @@ class Trainer(object):
             with torch.no_grad():
                 inputs = {'input_ids': batch[0],
                           'attention_mask': batch[1],
-                          'intent_label_ids': batch[3],
-                          'slot_labels_ids': batch[4]}
+                          'violation_label_ids': batch[3]}
                 if self.args.model_type != 'distilbert':
                     inputs['token_type_ids'] = batch[2]
                 outputs = self.model(**inputs)
-                tmp_eval_loss, (intent_logits, slot_logits) = outputs[:2]
+                tmp_eval_loss, violation_logits = outputs[:2]
 
                 eval_loss += tmp_eval_loss.mean().item()
             nb_eval_steps += 1
 
-            # Intent prediction
-            if intent_preds is None:
-                intent_preds = intent_logits.detach().cpu().numpy()
-                out_intent_label_ids = inputs['intent_label_ids'].detach().cpu().numpy()
-            else:
-                intent_preds = np.append(intent_preds, intent_logits.detach().cpu().numpy(), axis=0)
-                out_intent_label_ids = np.append(
-                    out_intent_label_ids, inputs['intent_label_ids'].detach().cpu().numpy(), axis=0)
-
-            # Slot prediction
-            if slot_preds is None:
-                if self.args.use_crf:
-                    # decode() in `torchcrf` returns list with best index directly
-                    slot_preds = np.array(self.model.crf.decode(slot_logits))
-                else:
-                    slot_preds = slot_logits.detach().cpu().numpy()
-
-                out_slot_labels_ids = inputs["slot_labels_ids"].detach().cpu().numpy()
+            # violation prediction
+            if violation_preds is None:
+                violation_preds = violation_logits.detach().cpu().numpy()
+                out_violation_label_ids = inputs['violation_label_ids'].detach().cpu().numpy()
             else:
-                if self.args.use_crf:
-                    slot_preds = np.append(slot_preds, np.array(self.model.crf.decode(slot_logits)), axis=0)
-                else:
-                    slot_preds = np.append(slot_preds, slot_logits.detach().cpu().numpy(), axis=0)
-
-                out_slot_labels_ids = np.append(out_slot_labels_ids, inputs["slot_labels_ids"].detach().cpu().numpy(), axis=0)
-
+                violation_preds = np.append(violation_preds, violation_logits.detach().cpu().numpy(), axis=0)
+                out_violation_label_ids = np.append(
+                    out_violation_label_ids, inputs['violation_label_ids'].detach().cpu().numpy(), axis=0)
+        
         eval_loss = eval_loss / nb_eval_steps
         results = {
             "loss": eval_loss
         }
 
-        # Intent result
-        intent_preds = np.argmax(intent_preds, axis=1)
-
-        # Slot result
-        if not self.args.use_crf:
-            slot_preds = np.argmax(slot_preds, axis=2)
-        slot_label_map = {i: label for i, label in enumerate(self.slot_label_lst)}
-        out_slot_label_list = [[] for _ in range(out_slot_labels_ids.shape[0])]
-        slot_preds_list = [[] for _ in range(out_slot_labels_ids.shape[0])]
-
-        for i in range(out_slot_labels_ids.shape[0]):
-            for j in range(out_slot_labels_ids.shape[1]):
-                if out_slot_labels_ids[i, j] != self.pad_token_label_id:
-                    out_slot_label_list[i].append(slot_label_map[out_slot_labels_ids[i][j]])
-                    slot_preds_list[i].append(slot_label_map[slot_preds[i][j]])
-
-        total_result = compute_metrics(intent_preds, out_intent_label_ids, slot_preds_list, out_slot_label_list)
+        # violation result
+        violation_preds = violation_preds > 0
+        violation_preds = violation_preds.astype(int)
+        
+        total_result = compute_metrics(violation_preds, out_violation_label_ids, self.violation_label_lst, conv_ids, mode)
         results.update(total_result)
 
         logger.info("***** Eval results *****")
         for key in sorted(results.keys()):
             logger.info("  %s = %s", key, str(results[key]))
-
+        
+        if mode == 'test':
+            results_to_dump = {'args':vars(self.args), 'results':results}
+            json.dump(results_to_dump, open(os.path.join(self.args.model_dir, f'results_multilabel_{self.args.task}_{self.args.model_type}.json'),'w'))
+            
         return results
 
     def save_model(self):
@@ -230,9 +208,8 @@ class Trainer(object):
 
         try:
             self.model = self.model_class.from_pretrained(self.args.model_dir,
-                                                          args=self.args,
-                                                          intent_label_lst=self.intent_label_lst,
-                                                          slot_label_lst=self.slot_label_lst)
+                                                          args=self.args,                                                       
+                                                          violation_label_lst=self.violation_label_lst)
             self.model.to(self.device)
             logger.info("***** Model Loaded *****")
         except:
diff --git a/utils.py b/utils.py
index 3ca6dc9..aaf7eea 100644
--- a/utils.py
+++ b/utils.py
@@ -1,16 +1,115 @@
 import os
 import random
 import logging
+import json
 
 import torch
 import numpy as np
-from seqeval.metrics import precision_score, recall_score, f1_score
+# from seqeval.metrics import precision_score, recall_score, f1_score
 
-from transformers import BertConfig, DistilBertConfig, AlbertConfig
-from transformers import BertTokenizer, DistilBertTokenizer, AlbertTokenizer
+from transformers import BertConfig
+from transformers import BertTokenizer
+from sklearn.metrics import precision_recall_fscore_support
+from model import MultilabelBERT
 
-from model import JointBERT, JointDistilBERT, JointAlbert
+MODEL_CLASSES = {  # model_type -> (config class, model class, tokenizer class)
+    'bert': (BertConfig, MultilabelBERT, BertTokenizer),
+}
+
+MODEL_PATH_MAP = {  # NOTE(review): presumably model_type -> default pretrained checkpoint name; usage not visible here
+    'bert': 'bert-base-uncased',
+}
+
+def get_violation_labels(args):  # label names: each constraint's 'name', then closedType_<item> per item of distinct_slot_values
+    with open(f'{args.data_dir}/{args.task}/bot_definition_{args.task}.json', 'r') as f:  # context manager closes the handle
+        bot_definition = json.load(f)
+    closed_type_constraints = [f"closedType_{slot_var}" for slot_var in bot_definition['distinct_slot_values']]
+    return [c['name'] for c in bot_definition['constraints']] + closed_type_constraints
+    
+def load_tokenizer(args):  # build the tokenizer class (tuple index 2) registered for args.model_type
+    return MODEL_CLASSES[args.model_type][2].from_pretrained(args.model_name_or_path)
+
+
+def init_logger():  # configure logging via basicConfig: INFO level, timestamped records
+    record_format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s'
+    date_format = '%m/%d/%Y %H:%M:%S'
+    logging.basicConfig(format=record_format, datefmt=date_format, level=logging.INFO)
+
+def set_seed(args):  # seed the Python, NumPy and torch RNGs (plus all CUDA devices when CUDA is in use) from args.seed
+    seed = args.seed
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
+        seed_fn(seed)
+    if not args.no_cuda and torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+def calculate_prf_one_group(all_gts, all_pds):  # set-based precision/recall/F1; None marks an undefined metric
+    common = all_gts.intersection(all_pds)
+    try:
+        precision = len(common) / len(all_pds)
+    except ZeroDivisionError:  # no predictions at all
+        precision = None
+    try:
+        recall = len(common) / len(all_gts)
+    except ZeroDivisionError:  # no ground truths at all
+        recall = None
+    try:
+        f1 = 2 * precision * recall / (precision + recall)
+    except (TypeError, ZeroDivisionError):  # precision or recall is None, or both are 0
+        f1 = None
+        
+    return {
+            'precision': precision,
+            'recall': recall,
+            'f1': f1
+            }
+
+def compute_metrics(violation_preds, violation_labels, violation_names, conv_ids, mode = 'dev'):
+    # Score 0/1 prediction/label matrices: row i belongs to conv_ids[i], column c to violation_names[c].
+    assert violation_preds.shape == violation_labels.shape
+    violation_preds = violation_preds.astype(int)
+    violation_labels = violation_labels.astype(int)
+    results = {}
+    results['accuracy'] = np.sum(violation_preds == violation_labels) / np.prod(violation_preds.shape)
+    # labels=[0, 1] pins index 1 to the positive class even when only one class occurs in the data.
+    p, r, f, _ = precision_recall_fscore_support(violation_labels.flatten(), violation_preds.flatten(), labels=[0, 1])
+    results['precision'] = p[1]
+    results['recall'] = r[1]
+    results['f1'] = f[1]
+    
+    turn_correct = []
+    turn_iou = []
+    conversation_profiles = {conv_id:1 for conv_id in set(conv_ids)}
+    for eid, (pred, label) in enumerate(zip(violation_preds, violation_labels)):
+        if np.array_equal(pred, label):
+            this_turn_correct = 1
+            this_iou = 1
+        else:
+            this_turn_correct = 0
+            pred_idx = {idx for idx, v in enumerate(pred) if v}
+            label_idx = {idx for idx, v in enumerate(label) if v}
+            this_iou = len(label_idx & pred_idx) / len(label_idx | pred_idx)
+        turn_correct.append(this_turn_correct)
+        turn_iou.append(this_iou)
+        conversation_profiles[conv_ids[eid]] *= this_turn_correct  # one wrong turn zeroes the whole conversation
+    results['turn_correct'] = np.mean(turn_correct)
+    results['turn_iou'] = np.mean(turn_iou)
+    results['conversation_correct'] = np.mean(list(conversation_profiles.values()))
+    
+    if mode in ['test', 'dev']:
+        results['constraint_stats'] = dict()
+        for c in range(violation_labels.shape[1]): # For each constraint
+            acc = np.sum(violation_preds[:,c] == violation_labels[:,c]) / violation_preds.shape[0]
+            p, r, f, _ = precision_recall_fscore_support(violation_labels[:,c], violation_preds[:,c], labels=[0, 1])
+            results['constraint_stats'][violation_names[c]] = {
+                'precision':p[1],
+                'recall':r[1],
+                'f1':f[1],
+                'accuracy':acc
+            }
+                
+    return results
 
+"""
 MODEL_CLASSES = {
     'bert': (BertConfig, JointBERT, BertTokenizer),
     'distilbert': (DistilBertConfig, JointDistilBERT, DistilBertTokenizer),
@@ -85,7 +184,7 @@ def read_prediction_text(args):
 
 
 def get_sentence_frame_acc(intent_preds, intent_labels, slot_preds, slot_labels):
-    """For the cases that intent and all the slots are correct (in one sentence)"""
+    # For the cases that intent and all the slots are correct (in one sentence)
     # Get the intent comparison result
     intent_result = (intent_preds == intent_labels)
 
@@ -105,3 +204,4 @@ def get_sentence_frame_acc(intent_preds, intent_labels, slot_preds, slot_labels)
     return {
         "sementic_frame_acc": sementic_acc
     }
+"""
\ No newline at end of file