Integrate BERT into Castor framework (#17)
* Remove unused classes in models/bert

* Split data4bert module into multiple processors

* Refactor BERT tokenizer
achyudh authored and Ashutosh-Adhikari committed Apr 5, 2019
1 parent af92a55 commit e4244ec
Showing 17 changed files with 348 additions and 408 deletions.
120 changes: 0 additions & 120 deletions common/evaluators/BertEvaluator.py

This file was deleted.

74 changes: 74 additions & 0 deletions common/evaluators/bert_evaluator.py
@@ -0,0 +1,74 @@
import os

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from tqdm import tqdm

from datasets.processors.bert_processor import convert_examples_to_features, accuracy
from utils.tokenization4bert import BertTokenizer


class BertEvaluator(object):
    def __init__(self, model, processor, args):
        self.args = args
        self.model = model
        self.processor = processor
        self.eval_examples = self.processor.get_dev_examples(args.data_dir)
        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    def evaluate(self):
        label_list = self.processor.get_labels()
        eval_features = convert_examples_to_features(self.eval_examples,
                                                     label_list,
                                                     self.args.max_seq_length,
                                                     self.tokenizer)

        print("Num. of examples =", len(self.eval_examples))
        print("Batch size = %d" % self.args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.args.eval_batch_size)

        self.model.eval()

        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(self.args.device)
            input_mask = input_mask.to(self.args.device)
            segment_ids = segment_ids.to(self.args.device)
            label_ids = label_ids.to(self.args.device)

            with torch.no_grad():
                tmp_eval_loss = self.model(input_ids, segment_ids, input_mask, label_ids)
                logits = self.model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy}

        output_eval_file = os.path.join(self.args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            print("***** Eval results *****")
            for key in sorted(result.keys()):
                print("  %s = %s" % (key, str(result[key])))
                writer.write("%s = %s\n" % (key, str(result[key])))
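A minimal sketch of how the new evaluator might be driven, assuming an args namespace carrying the fields referenced above and a model that returns the loss when label_ids are passed and logits otherwise. The pytorch_pretrained_bert model class, the AAPDProcessor choice, and all argument values are illustrative assumptions, not part of this commit.

# Illustrative usage only -- the model class, processor choice, and argument values are assumptions.
import os
from argparse import Namespace

import torch
from pytorch_pretrained_bert import BertForSequenceClassification  # assumed dependency

from common.evaluators.bert_evaluator import BertEvaluator
from datasets.processors.aapd_processor import AAPDProcessor

args = Namespace(
    bert_model='bert-base-uncased', do_lower_case=True,
    data_dir='data/AAPD',            # hypothetical path containing train.tsv / dev.tsv
    max_seq_length=128, eval_batch_size=32, output_dir='out',
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
os.makedirs(args.output_dir, exist_ok=True)

processor = AAPDProcessor()
# Passing labels returns the loss, omitting them returns logits -- matching the calls in evaluate().
model = BertForSequenceClassification.from_pretrained(
    args.bert_model, num_labels=len(processor.get_labels())).to(args.device)

evaluator = BertEvaluator(model, processor, args)
evaluator.evaluate()  # writes eval_results.txt under args.output_dir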
72 changes: 33 additions & 39 deletions common/train4bert.py → common/trainers/bert_trainer.py
@@ -1,44 +1,40 @@
import datetime
import os

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from tqdm import trange

from datasets.processors.bert_processor import convert_examples_to_features
from utils.optimization import warmup_linear
from utils.tokenization4bert import BertTokenizer


class BertTrainer(object):
    def __init__(self, model, optimizer, processor, args):
        self.args = args
        self.model = model
        self.optimizer = optimizer
        self.processor = processor
        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
        self.train_examples = self.processor.get_train_examples(args.data_dir)
        self.num_train_optimization_steps = int(
            len(self.train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            self.num_train_optimization_steps = args.num_train_optimization_steps // torch.distributed.get_world_size()
        self.global_step = 0
        self.nb_tr_steps = 0
        self.tr_loss = 0

    def train_epoch(self, train_dataloader):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(self.args.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            logits = self.model(input_ids, segment_ids, input_mask, label_ids)
            loss = F.cross_entropy(logits.view(-1, self.args.num_labels), label_ids.view(-1))
            if self.args.n_gpu > 1:
                loss = loss.mean()
            if self.args.gradient_accumulation_steps > 1:
                loss = loss / self.args.gradient_accumulation_steps
@@ -48,39 +44,37 @@ def train_epoch(self):
            loss.backward()

            self.tr_loss += loss.item()
            self.nb_tr_steps += 1
            if (step + 1) % self.args.gradient_accumulation_steps == 0:
                if self.args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = self.args.learning_rate * warmup_linear(self.global_step / self.num_train_optimization_steps, self.args.warmup_proportion)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.global_step += 1

    def train(self):
        label_list = self.processor.get_labels()
        train_features = convert_examples_to_features(
            self.train_examples, label_list, self.args.max_seq_length, self.tokenizer)
        print("***** Running training *****")
        print(" Num. of examples: ", len(self.train_examples))
        print(" Batch size:", self.args.train_batch_size)
        print(" Num of steps:", self.num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if self.args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=self.args.train_batch_size)

        self.model.train()
        for _ in trange(int(self.args.num_train_epochs), desc="Epoch"):
            self.train_epoch(train_dataloader)
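The refactored trainer expects a model that returns logits even when label_ids are passed (it computes the cross-entropy itself), plus an optimizer and an args namespace with the fields used above. A minimal wiring sketch follows; the adapter class, the plain Adam optimizer, and the argument values are assumptions for illustration, not code from this commit.

# Illustrative wiring for BertTrainer; everything below except the trainer and processor
# imports introduced by this commit is an assumption.
from argparse import Namespace

import torch
from pytorch_pretrained_bert import BertForSequenceClassification  # assumed dependency

from common.trainers.bert_trainer import BertTrainer
from datasets.processors.aapd_processor import AAPDProcessor


class LogitsOnlyBert(torch.nn.Module):
    """Hypothetical adapter: always hand back logits, since the trainer computes the loss."""

    def __init__(self, bert_model, num_labels):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(bert_model, num_labels=num_labels)

    def forward(self, input_ids, segment_ids, input_mask, label_ids=None):
        return self.bert(input_ids, segment_ids, input_mask)  # ignore label_ids, return logits


args = Namespace(
    bert_model='bert-base-uncased', do_lower_case=True, data_dir='data/AAPD',
    max_seq_length=128, train_batch_size=32, num_train_epochs=3,
    gradient_accumulation_steps=1, learning_rate=2e-5, warmup_proportion=0.1,
    local_rank=-1, n_gpu=1, fp16=False, num_labels=2,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

model = LogitsOnlyBert(args.bert_model, args.num_labels).to(args.device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)  # the real scripts presumably use BertAdam with warmup

trainer = BertTrainer(model, optimizer, AAPDProcessor(), args)
trainer.train()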
2 changes: 1 addition & 1 deletion common/trainers/classification_trainer.py
@@ -7,7 +7,7 @@
 import torch.nn.functional as F
 from tensorboardX import SummaryWriter

-from .trainer import Trainer
+from common.trainers.trainer import Trainer


 class ClassificationTrainer(Trainer):
1 change: 1 addition & 0 deletions datasets/aapd.py
@@ -9,6 +9,7 @@
from datasets.reuters import clean_string, clean_string_fl, split_sents


def char_quantize(string, max_length=1000):
    identity = np.identity(len(AAPDCharQuantized.ALPHABET))
    quantized_string = np.array([identity[AAPDCharQuantized.ALPHABET[char]] for char in list(string.lower()) if char in AAPDCharQuantized.ALPHABET], dtype=np.float32)
File renamed without changes.
33 changes: 33 additions & 0 deletions datasets/processors/aapd_processor.py
@@ -0,0 +1,33 @@
import os

from datasets.processors.bert_processor import BertProcessor, InputExample


class AAPDProcessor(BertProcessor):
"""Processor for the IMDB dataset"""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

def get_labels(self):
"""See base class."""
return ["0", "1"]

def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = line[1]
label = line[0]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
return examples
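For reference, a hedged sketch of the data contract the new processor implies: a skipped first row followed by tab-separated lines with the label in column 0 and the text in column 1, read by the _read_tsv helper inherited from BertProcessor. The data/AAPD path below is illustrative.

# Hypothetical walk-through of the processor contract assumed above; the path is illustrative.
from datasets.processors.aapd_processor import AAPDProcessor

processor = AAPDProcessor()
print(processor.get_labels())                      # ['0', '1']

# data/AAPD/train.tsv is expected to look like (first row is skipped):
#   label\ttext
#   0\tthis paper studies ...
examples = processor.get_train_examples('data/AAPD')
first = examples[0]
print(first.guid, first.label, first.text_a[:50])  # e.g. "train-1 0 this paper studies ..."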
