diff --git a/common/evaluators/BertEvaluator.py b/common/evaluators/BertEvaluator.py deleted file mode 100644 index bded6a1..0000000 --- a/common/evaluators/BertEvaluator.py +++ /dev/null @@ -1,120 +0,0 @@ -import time - -import datetime -import numpy as np -import os -import torch -import tqdm -import torch.nn.functional as F -from tensorboardX import SummaryWriter - -from .trainer import Trainer -from models.bert.args import get_args -from utils.optimization import warmup_linear -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) - - -class BertEvaluator(object): - def __init(self, model, processor) - self.args = get_args() - self.model = model - #self.train_dataloader = train_dataloader - #self.optimizer = optimizer - self.processor = processor - self.eval_examples = None - #self.num_train_optimization_steps = None - self.eval_examples = self.processor.get_dev_examples(args.data_dir) - #self.num_train_optimization_steps = int( - # len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs - #if args.local_rank != -1: - # num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - self.global_step = 0 - self.nb_tr_steps = 0 - self.tr_loss = 0 - self.eval_loss, self.eval_accuracy = 0.0, 0.0 - self.nb_eval_steps, self.nb_eval_examples = 0, 0 - -""" def train_epoch(self): - for step, batch in enumerate(tqdm(self.train_dataloader, desc="Iteration")): - batch = tuple(t.to(device) for t in batch) - input_ids, input_mask, segment_ids, label_ids = batch - loss = self.model(input_ids, segment_ids, input_mask, label_ids) #model no more returns the loss, change this - if n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu. - if self.args.gradient_accumulation_steps > 1: - loss = loss / self.args.gradient_accumulation_steps - - if self.args.fp16: - self.optimizer.backward(loss) - else: - loss.backward() - - self.tr_loss += loss.item() - self.nb_tr_examples += input_ids.size(0) - self.nb_tr_steps += 1 - if (step + 1) % self.args.gradient_accumulation_steps == 0: - if self.args.fp16: - # modify learning rate with special warm up BERT uses - # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = self.args.learning_rate * warmup_linear(self.global_step/self.num_train_optimization_steps, self.args.warmup_proportion) - for param_group in self.optimizer.param_groups: - param_group['lr'] = lr_this_step - self.optimizer.step() - self.optimizer.zero_grad() - self.global_step += 1""" - def evaluate(self, epochs): - eval_features = convert_examples_to_features( - self.eval_examples, label_list, args.max_seq_length, tokenizer) - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(eval_examples)) - logger.info(" Batch size = %d", args.eval_batch_size) - #logger.info(" Num steps = %d", num_train_optimization_steps) - all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) - eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - eval_sampler = SequentialSampler(eval_data) - - #if args.local_rank == -1: - # train_sampler = RandomSampler(train_data) - #else: - # train_sampler = DistributedSampler(train_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) - - model.eval() - - for input_ids, input_mask, segment_ids, label_ids in tqdm(self.eval_dataloader, desc="Evaluating"): - input_ids = input_ids.to(device) - input_mask = input_mask.to(device) - segment_ids = segment_ids.to(device) - label_ids = label_ids.to(device) - - with torch.no_grad(): - tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) - logits = model(input_ids, segment_ids, input_mask) - - logits = logits.detach().cpu().numpy() - label_ids = label_ids.to('cpu').numpy() - tmp_eval_accuracy = accuracy(logits, label_ids) - - eval_loss += tmp_eval_loss.mean().item() - eval_accuracy += tmp_eval_accuracy - - nb_eval_examples += input_ids.size(0) - nb_eval_steps += 1 - - self.eval_loss = self.eval_loss / self.nb_eval_steps - self.eval_accuracy = self.eval_accuracy / self.nb_eval_examples - #loss = tr_loss/nb_tr_steps if args.do_train else None - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy, - 'global_step': global_step} - #'loss': loss} - - output_eval_file = os.path.join(args.output_dir, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) diff --git a/common/evaluators/bert_evaluator.py b/common/evaluators/bert_evaluator.py new file mode 100644 index 0000000..d23ca9c --- /dev/null +++ b/common/evaluators/bert_evaluator.py @@ -0,0 +1,74 @@ +import os + +import torch +from torch.utils.data import DataLoader, SequentialSampler, TensorDataset +from tqdm import tqdm + +from datasets.processors.bert_processor import convert_examples_to_features, accuracy +from utils.tokenization4bert import BertTokenizer + + +class BertEvaluator(object): + def __init__(self, model, processor, args): + self.args = args + self.model = model + self.processor = processor + self.eval_examples = self.processor.get_dev_examples(args.data_dir) + self.tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) + + def evaluate(self): + label_list = self.processor.get_labels() + eval_features = convert_examples_to_features(self.eval_examples, + label_list, + self.args.max_seq_length, + self.tokenizer) + + print("Num. of examples =", len(self.eval_examples)) + print("Batch size = %d", self.args.eval_batch_size) + + all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) + all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) + + eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.args.eval_batch_size) + + self.model.eval() + + eval_loss, eval_accuracy = 0, 0 + nb_eval_steps, nb_eval_examples = 0, 0 + + for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): + input_ids = input_ids.to(self.args.device) + input_mask = input_mask.to(self.args.device) + segment_ids = segment_ids.to(self.args.device) + label_ids = label_ids.to(self.args.device) + + with torch.no_grad(): + tmp_eval_loss = self.model(input_ids, segment_ids, input_mask, label_ids) + logits = self.model(input_ids, segment_ids, input_mask) + + logits = logits.detach().cpu().numpy() + label_ids = label_ids.to('cpu').numpy() + tmp_eval_accuracy = accuracy(logits, label_ids) + + eval_loss += tmp_eval_loss.mean().item() + eval_accuracy += tmp_eval_accuracy + + nb_eval_examples += input_ids.size(0) + nb_eval_steps += 1 + + eval_loss = eval_loss / nb_eval_steps + eval_accuracy = eval_accuracy / nb_eval_examples + + result = {'eval_loss': eval_loss, + 'eval_accuracy': eval_accuracy} + + output_eval_file = os.path.join(self.args.output_dir, "eval_results.txt") + with open(output_eval_file, "w") as writer: + print("***** Eval results *****") + for key in sorted(result.keys()): + print(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) diff --git a/common/train4bert.py b/common/trainers/bert_trainer.py similarity index 51% rename from common/train4bert.py rename to common/trainers/bert_trainer.py index 3b8393b..9aecda9 100644 --- a/common/train4bert.py +++ b/common/trainers/bert_trainer.py @@ -1,44 +1,40 @@ -import time - -import datetime -import numpy as np -import os import torch -import tqdm import torch.nn.functional as F -from tensorboardX import SummaryWriter +from torch.utils.data import DataLoader, RandomSampler, TensorDataset +from torch.utils.data.distributed import DistributedSampler +from tqdm import tqdm +from tqdm import trange -from .trainer import Trainer -from models.bert.args import get_args +from datasets.processors.bert_processor import convert_examples_to_features from utils.optimization import warmup_linear -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) +from utils.tokenization4bert import BertTokenizer class BertTrainer(object): - def __init(self, model, optimizer, processor) - self.args = get_args() + def __init__(self, model, optimizer, processor, args): + self.args = args self.model = model - #self.train_dataloader = train_dataloader self.optimizer = optimizer self.processor = processor - self.train_examples = None - self.num_train_optimization_steps = None + self.tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) + self.train_examples = self.processor.get_train_examples(args.data_dir) self.num_train_optimization_steps = int( - len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs + len(self.train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: - num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() + self.num_train_optimization_steps = args.num_train_optimization_steps // torch.distributed.get_world_size() self.global_step = 0 self.nb_tr_steps = 0 self.tr_loss = 0 - def train_epoch(self): - for step, batch in enumerate(tqdm(self.train_dataloader, desc="Iteration")): - batch = tuple(t.to(device) for t in batch) + def train_epoch(self, train_dataloader): + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): + batch = tuple(t.to(self.args.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch - loss = self.model(input_ids, segment_ids, input_mask, label_ids) #model no more returns the loss, change this - if n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu. + logits = self.model(input_ids, segment_ids, input_mask, label_ids) + loss = F.cross_entropy(logits.view(-1, self.args.num_labels), label_ids.view(-1)) + if self.args.n_gpu > 1: + loss = loss.mean() if self.args.gradient_accumulation_steps > 1: loss = loss / self.args.gradient_accumulation_steps @@ -48,12 +44,9 @@ def train_epoch(self): loss.backward() self.tr_loss += loss.item() - self.nb_tr_examples += input_ids.size(0) self.nb_tr_steps += 1 if (step + 1) % self.args.gradient_accumulation_steps == 0: if self.args.fp16: - # modify learning rate with special warm up BERT uses - # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = self.args.learning_rate * warmup_linear(self.global_step/self.num_train_optimization_steps, self.args.warmup_proportion) for param_group in self.optimizer.param_groups: param_group['lr'] = lr_this_step @@ -61,26 +54,27 @@ def train_epoch(self): self.optimizer.zero_grad() self.global_step += 1 - def train(self, epochs): + def train(self): + label_list = self.processor.get_labels() train_features = convert_examples_to_features( - self.train_examples, label_list, args.max_seq_length, tokenizer) - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_examples)) - logger.info(" Batch size = %d", args.train_batch_size) - logger.info(" Num steps = %d", num_train_optimization_steps) + self.train_examples, label_list, self.args.max_seq_length, self.tokenizer) + print("***** Running training *****") + print(" Num. of examples: ", len(self.train_examples)) + print(" Batch size:", self.args.train_batch_size) + print(" Num of steps:", self.num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - if args.local_rank == -1: + if self.args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) - model.train() - for _ in trange(int(args.num_train_epochs), desc="Epoch"): - tr_loss = 0 - nb_tr_examples, nb_tr_steps = 0, 0 - self.train_epoch() + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=self.args.train_batch_size) + + self.model.train() + + for _ in trange(int(self.args.num_train_epochs), desc="Epoch"): + self.train_epoch(train_dataloader) diff --git a/common/trainers/classification_trainer.py b/common/trainers/classification_trainer.py index e42ad6b..2ce81ac 100644 --- a/common/trainers/classification_trainer.py +++ b/common/trainers/classification_trainer.py @@ -7,7 +7,7 @@ import torch.nn.functional as F from tensorboardX import SummaryWriter -from .trainer import Trainer +from common.trainers.trainer import Trainer class ClassificationTrainer(Trainer): diff --git a/datasets/aapd.py b/datasets/aapd.py index 2702559..8894790 100644 --- a/datasets/aapd.py +++ b/datasets/aapd.py @@ -9,6 +9,7 @@ from datasets.reuters import clean_string, clean_string_fl, split_sents + def char_quantize(string, max_length=1000): identity = np.identity(len(AAPDCharQuantized.ALPHABET)) quantized_string = np.array([identity[AAPDCharQuantized.ALPHABET[char]] for char in list(string.lower()) if char in AAPDCharQuantized.ALPHABET], dtype=np.float32) diff --git a/common/evaluation4bert.py b/datasets/processors/__init__.py similarity index 100% rename from common/evaluation4bert.py rename to datasets/processors/__init__.py diff --git a/datasets/processors/aapd_processor.py b/datasets/processors/aapd_processor.py new file mode 100644 index 0000000..ce50a30 --- /dev/null +++ b/datasets/processors/aapd_processor.py @@ -0,0 +1,33 @@ +import os + +from datasets.processors.bert_processor import BertProcessor, InputExample + + +class AAPDProcessor(BertProcessor): + """Processor for the IMDB dataset""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[1] + label = line[0] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples \ No newline at end of file diff --git a/datasets/data4bert.py b/datasets/processors/bert_processor.py similarity index 50% rename from datasets/data4bert.py rename to datasets/processors/bert_processor.py index 3c34c11..a1cfc64 100644 --- a/datasets/data4bert.py +++ b/datasets/processors/bert_processor.py @@ -1,6 +1,7 @@ import csv -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) -from utils import tokenization4bert + +import sys +import numpy as np class InputExample(object): @@ -32,7 +33,9 @@ def __init__(self, input_ids, input_mask, segment_ids, label_id): self.input_mask = input_mask self.segment_ids = segment_ids self.label_id = label_id -class DataProcessor(object): + + +class BertProcessor(object): """Base class for data converters for sequence classification data sets.""" def get_train_examples(self, data_dir): @@ -55,158 +58,10 @@ def _read_tsv(cls, input_file, quotechar=None): lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) + line = list(str(cell, 'utf-8') for cell in line) lines.append(line) return lines -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - -class ReutersProcessor(DataProcessor): - """"Processor for the Reuters data set""" - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[1] - label = line[0] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - -class IMDBProcessor(DataProcessor): - """Processor for the IMDB dataset""" - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[1] - label = line[0] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class AAPDProcessor(DataProcessor): - """Processor for the IMDB dataset""" - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[1] - label = line[0] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class Yelp2014rocessor(DataProcessor): - """Processor for the IMDB dataset""" - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[1] - label = line[0] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): """Loads a data file into a list of `InputBatch`s.""" @@ -272,15 +127,14 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer label_id = label_map[example.label] if ex_index < 5: - logger.info("*** Example ***") - logger.info("guid: %s" % (example.guid)) - logger.info("tokens: %s" % " ".join( + print("*** Example ***") + print("guid: %s" % (example.guid)) + print("tokens: %s" % " ".join( [str(x) for x in tokens])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - logger.info("label: %s (id = %d)" % (example.label, label_id)) + print("input_ids: %s" % " ".join([str(x) for x in input_ids])) + print("input_mask: %s" % " ".join([str(x) for x in input_mask])) + print("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + print("label: %s (id = %d)" % (example.label, label_id)) features.append( InputFeatures(input_ids=input_ids, @@ -306,6 +160,7 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): else: tokens_b.pop() + def accuracy(out, labels): outputs = np.argmax(out, axis=1) return np.sum(outputs == labels) diff --git a/datasets/processors/imdb_processor.py b/datasets/processors/imdb_processor.py new file mode 100644 index 0000000..5978cf3 --- /dev/null +++ b/datasets/processors/imdb_processor.py @@ -0,0 +1,33 @@ +import os + +from datasets.processors.bert_processor import BertProcessor, InputExample + + +class IMDBProcessor(BertProcessor): + """Processor for the IMDB dataset""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[1] + label = line[0] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples \ No newline at end of file diff --git a/datasets/processors/reuters_processor.py b/datasets/processors/reuters_processor.py new file mode 100644 index 0000000..6e4833b --- /dev/null +++ b/datasets/processors/reuters_processor.py @@ -0,0 +1,33 @@ +import os + +from datasets.processors.bert_processor import BertProcessor, InputExample + + +class ReutersProcessor(BertProcessor): + """"Processor for the Reuters data set""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[1] + label = line[0] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples \ No newline at end of file diff --git a/datasets/processors/sst_processor.py b/datasets/processors/sst_processor.py new file mode 100644 index 0000000..3aa603b --- /dev/null +++ b/datasets/processors/sst_processor.py @@ -0,0 +1,34 @@ +import os + +from datasets.processors.bert_processor import BertProcessor, InputExample + + +class Sst2Processor(BertProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples \ No newline at end of file diff --git a/datasets/processors/yelp_processor.py b/datasets/processors/yelp_processor.py new file mode 100644 index 0000000..4ff8326 --- /dev/null +++ b/datasets/processors/yelp_processor.py @@ -0,0 +1,33 @@ +import os + +from datasets.processors.bert_processor import BertProcessor, InputExample + + +class Yelp2014Processor(BertProcessor): + """Processor for the IMDB dataset""" + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[1] + label = line[0] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples \ No newline at end of file diff --git a/models/bert/__main__.py b/models/bert/__main__.py index f89d843..c4db3e1 100644 --- a/models/bert/__main__.py +++ b/models/bert/__main__.py @@ -1,21 +1,21 @@ -from models.bert.args import get_args -from models.bert.model import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME -from __future__ import absolute_import, division, print_function -from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE -#from datasets.dataset4bert import DataProcessor, Sst2Processor, ReutersProcessor, AAPDProcessor, InputExample, InputFeatures -from datasets import data4bert -from datasets import BertTokenizer -from utils import optimization -from datasets.tokenization4bert import BertTokenizer -from common import train4bert -#from common.train4bert import TrainerFactory -from common.evaluate4bert import EvaluatorFactory +import os +import random + +import numpy as np +import torch +from common.evaluators.bert_evaluator import BertEvaluator +from common.trainers.bert_trainer import BertTrainer +from datasets.processors.sst_processor import Sst2Processor +from models.bert.args import get_args +from models.bert.model import BertForSequenceClassification +from utils.io import PYTORCH_PRETRAINED_BERT_CACHE +from utils.optimization import BertAdam +from utils.tokenization4bert import BertTokenizer if __name__ == '__main__': # Set default configuration in args.py args = get_args() - #logger = get_logger() # Set random seed for reproducibility @@ -26,17 +26,11 @@ ptvsd.wait_for_attach() processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, + "sst-2": Sst2Processor } num_labels_task = { - "cola": 2, - "sst-2": 2, - "mnli": 3, - "mrpc": 2, + "sst-2": 2 } if args.local_rank == -1 or args.no_cuda: @@ -48,14 +42,24 @@ n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') - logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( - device, n_gpu, bool(args.local_rank != -1), args.fp16)) + + print("Device: {} Num. of GPUs: {}, Distributed training: {}, FP16: {}".format(device, + n_gpu, + bool(args.local_rank != -1), + args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) + task_name = args.task_name.lower() + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps + args.device = device + args.n_gpu = n_gpu + args.num_labels = num_labels_task[task_name] random.seed(args.seed) np.random.seed(args.seed) @@ -71,25 +75,27 @@ if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) - task_name = args.task_name.lower() - - if task_name not in processors: - raise ValueError("Task not found: %s" % (task_name)) - processor = processors[task_name]() - num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) + train_examples = None + num_train_optimization_steps = None + if args.do_train: + train_examples = processor.get_train_examples(args.data_dir) + num_train_optimization_steps = int( + len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs + if args.local_rank != -1: + num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() ## Model Preparation # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) model = BertForSequenceClassification.from_pretrained(args.bert_model, - cache_dir=cache_dir, - num_labels = num_labels) + cache_dir=cache_dir, + num_labels=args.num_labels) if args.fp16: model.half() model.to(device) @@ -115,7 +121,7 @@ from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + raise ImportError("Please install Nvidia Apex for distributed and fp16 training") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, @@ -131,19 +137,14 @@ lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) - ############################### - ## Setup trainer - trainer = BertTrainer(model, optimizer, processor) - evaluotor = BertEvaluator(model, processor) - ############################### - - ############################### - ## DataLoader done while setting up the trainer - ############################### - ######################## + + trainer = BertTrainer(model, optimizer, processor, args) + evaluator = BertEvaluator(model, processor, args) + if args.do_train: trainer.train() else: - model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) - if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Dont need the second condition here perhaps - + model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=args.num_labels) + + if args.do_eval: + evaluator.evaluate() diff --git a/models/bert/args.py b/models/bert/args.py index 47d29e4..800f47e 100644 --- a/models/bert/args.py +++ b/models/bert/args.py @@ -1,36 +1,12 @@ import os +from argparse import ArgumentParser import models.args def get_args(): - parser = models.args.get_args() - """ - parser.add_argument('--bidirectional', action='store_true') - parser.add_argument('--bottleneck-layer', action='store_true') - parser.add_argument('--num-layers', type=int, default=2) - parser.add_argument('--hidden-dim', type=int, default=256) - parser.add_argument('--mode', type=str, default='static', choices=['rand', 'static', 'non-static']) - parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014']) - parser.add_argument('--words-dim', type=int, default=300) - parser.add_argument('--embed-dim', type=int, default=300) - parser.add_argument('--epoch-decay', type=int, default=15) - parser.add_argument('--weight-decay', type=float, default=0) + parser = ArgumentParser(description="PyTorch deep learning models for document classification") - parser.add_argument('--dropout', type=float, default=0.5) - parser.add_argument('--wdrop', type=float, default=0.0, help="weight drop") - parser.add_argument('--beta-ema', type=float, default=0, help="temporal averaging") - parser.add_argument('--embed-droprate', type=float, default=0.0, help="embedding dropout") - parser.add_argument('--tar', type=float, default=0.0, help="temporal activation regularization") - parser.add_argument('--ar', type=float, default=0.0, help="activation regularization") - - parser.add_argument('--word-vectors-dir', default=os.path.join(os.pardir, 'Castor-data', 'embeddings', 'word2vec')) - parser.add_argument('--word-vectors-file', default='GoogleNews-vectors-negative300.txt') - parser.add_argument('--save-path', type=str, default=os.path.join('reg_lstm', 'saves')) - parser.add_argument('--resume-snapshot', type=str) - parser.add_argument('--trained-model', type=str) - """ - ## Required parameters parser.add_argument("--data_dir", default=None, type=str, diff --git a/models/bert/model.py b/models/bert/model.py index 9d865d9..21bd403 100644 --- a/models/bert/model.py +++ b/models/bert/model.py @@ -851,4 +851,3 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=No pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) return logits - diff --git a/models/distill/__main__.py b/models/distill/__main__.py index 0c03bc2..a8a6563 100644 --- a/models/distill/__main__.py +++ b/models/distill/__main__.py @@ -1,15 +1,11 @@ from models.bert.args import get_args -from models.bert.model import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME +from models.bert.model import BertForSequenceClassification from __future__ import absolute_import, division, print_function from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE #from datasets.dataset4bert import DataProcessor, Sst2Processor, ReutersProcessor, AAPDProcessor, InputExample, InputFeatures -from datasets import data4bert -from datasets import BertTokenizer -from utils import optimization from datasets.tokenization4bert import BertTokenizer -from common import train4bert + #from common.train4bert import TrainerFactory -from common.evaluate4bert import EvaluatorFactory if __name__ == '__main__': diff --git a/utils/tokenization4bert.py b/utils/tokenization.py similarity index 96% rename from utils/tokenization4bert.py rename to utils/tokenization.py index c549e06..587ec70 100644 --- a/utils/tokenization4bert.py +++ b/utils/tokenization.py @@ -14,15 +14,13 @@ # limitations under the License. """Tokenization classes.""" -from __future__ import absolute_import, division, print_function, unicode_literals - import collections import logging import os import unicodedata from io import open -from .file_utils import cached_path +from utils.io import cached_path logger = logging.getLogger(__name__) @@ -74,13 +72,13 @@ def whitespace_tokenize(text): class BertTokenizer(object): """Runs end-to-end tokenization: punctuation splitting + wordpiece""" - def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, + def __init__(self, vocab_file, is_lowercase=True, max_len=None, do_basic_tokenize=True, never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): """Constructs a BertTokenizer. Args: vocab_file: Path to a one-wordpiece-per-line vocabulary file - do_lower_case: Whether to lower case the input + is_lowercase: Whether to lower case the input Only has an effect when do_wordpiece_only=False do_basic_tokenize: Whether to do basic tokenization before wordpiece. max_len: An artificial maximum length to truncate tokenized sequences to; @@ -99,7 +97,7 @@ def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokeni [(ids, tok) for tok, ids in self.vocab.items()]) self.do_basic_tokenize = do_basic_tokenize if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, + self.basic_tokenizer = BasicTokenizer(is_lowercase=is_lowercase, never_split=never_split) self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) self.max_len = max_len if max_len is not None else int(1e12) @@ -177,14 +175,14 @@ class BasicTokenizer(object): """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" def __init__(self, - do_lower_case=True, + is_lowercase=True, never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): """Constructs a BasicTokenizer. Args: - do_lower_case: Whether to lower case the input. + is_lowercase: Whether to lower case the input. """ - self.do_lower_case = do_lower_case + self.is_lowercase = is_lowercase self.never_split = never_split def tokenize(self, text): @@ -200,7 +198,7 @@ def tokenize(self, text): orig_tokens = whitespace_tokenize(text) split_tokens = [] for token in orig_tokens: - if self.do_lower_case and token not in self.never_split: + if self.is_lowercase and token not in self.never_split: token = token.lower() token = self._run_strip_accents(token) split_tokens.extend(self._run_split_on_punc(token))