Skip to content
Permalink
Browse files

Integrate BERT into Castor framework (#17)

* Remove unused classes in models/bert

* Split data4bert module into multiple processors

* Refactor BERT tokenizer
  • Loading branch information...
achyudh authored and Ashutosh-Adhikari committed Apr 5, 2019
1 parent af92a55 commit e4244ec73950d1efb15f706de6a4c77988c821ba

This file was deleted.

Oops, something went wrong.
@@ -0,0 +1,74 @@
import os

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from tqdm import tqdm

from datasets.processors.bert_processor import convert_examples_to_features, accuracy
from utils.tokenization4bert import BertTokenizer


class BertEvaluator(object):
    """Evaluate a fine-tuned BERT classifier on a dataset's dev split.

    The dev examples are loaded through ``processor`` and tokenized with the
    pretrained tokenizer named by ``args.bert_model``. Results are printed
    and also written to ``args.output_dir/eval_results.txt``.
    """

    def __init__(self, model, processor, args):
        self.args = args
        self.model = model
        self.processor = processor
        self.eval_examples = self.processor.get_dev_examples(args.data_dir)
        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    def evaluate(self):
        """Run one full pass over the dev set and report loss/accuracy."""
        label_list = self.processor.get_labels()
        eval_features = convert_examples_to_features(
            self.eval_examples, label_list, self.args.max_seq_length, self.tokenizer)

        print("Num. of examples =", len(self.eval_examples))
        # BUG FIX: original passed a "%d" format string as a separate print
        # argument, so the batch size was never interpolated.
        print("Batch size = %d" % self.args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # SequentialSampler: dev-set order is deterministic, no shuffling.
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.args.eval_batch_size)

        self.model.eval()

        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(self.args.device)
            input_mask = input_mask.to(self.args.device)
            segment_ids = segment_ids.to(self.args.device)
            label_ids = label_ids.to(self.args.device)

            with torch.no_grad():
                # NOTE(review): calling the model with labels appears to return
                # the loss, and without labels the logits — presumably the
                # pytorch-pretrained-bert convention; confirm against the model.
                tmp_eval_loss = self.model(input_ids, segment_ids, input_mask, label_ids)
                logits = self.model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # ROBUSTNESS: guard against an empty dev set (ZeroDivisionError).
        eval_loss = eval_loss / max(nb_eval_steps, 1)
        eval_accuracy = eval_accuracy / max(nb_eval_examples, 1)

        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy}

        output_eval_file = os.path.join(self.args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            print("***** Eval results *****")
            for key in sorted(result.keys()):
                # BUG FIX: print() does not %-interpolate; format explicitly.
                print("  %s = %s" % (key, str(result[key])))
                writer.write("%s = %s\n" % (key, str(result[key])))
@@ -1,44 +1,40 @@
import time

import datetime
import numpy as np
import os
import torch
import tqdm
import torch.nn.functional as F
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from tqdm import trange

from .trainer import Trainer
from models.bert.args import get_args
from datasets.processors.bert_processor import convert_examples_to_features
from utils.optimization import warmup_linear
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from utils.tokenization4bert import BertTokenizer


class BertTrainer(object):
    """Fine-tune a BERT classifier over the training split of a dataset.

    Builds the training feature tensors with the processor/tokenizer, then
    runs ``args.num_train_epochs`` epochs with optional gradient accumulation
    and fp16 learning-rate warmup.
    """

    def __init__(self, model, optimizer, processor, args):
        self.args = args
        self.model = model
        self.optimizer = optimizer
        self.processor = processor
        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

        self.train_examples = self.processor.get_train_examples(args.data_dir)
        # Total optimizer steps = batches per epoch (after accumulation) * epochs.
        self.num_train_optimization_steps = int(
            len(self.train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            # BUG FIX: original read args.num_train_optimization_steps, which
            # does not exist on args; divide the value computed above across
            # distributed workers instead.
            self.num_train_optimization_steps = \
                self.num_train_optimization_steps // torch.distributed.get_world_size()

        # Running training statistics.
        self.global_step = 0
        self.nb_tr_steps = 0
        # BUG FIX: nb_tr_examples is incremented in train_epoch but was never
        # initialized, which would raise AttributeError on the first batch.
        self.nb_tr_examples = 0
        self.tr_loss = 0

    def train_epoch(self, train_dataloader):
        """Run a single epoch over ``train_dataloader``."""
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(self.args.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            # Model returns logits; compute the classification loss here.
            logits = self.model(input_ids, segment_ids, input_mask, label_ids)
            loss = F.cross_entropy(logits.view(-1, self.args.num_labels), label_ids.view(-1))
            if self.args.n_gpu > 1:
                loss = loss.mean()  # average across GPUs under DataParallel
            if self.args.gradient_accumulation_steps > 1:
                loss = loss / self.args.gradient_accumulation_steps

            loss.backward()

            self.tr_loss += loss.item()
            self.nb_tr_examples += input_ids.size(0)
            self.nb_tr_steps += 1
            if (step + 1) % self.args.gradient_accumulation_steps == 0:
                if self.args.fp16:
                    # Modify learning rate with the special warmup BERT uses;
                    # when fp16 is off, BertAdam handles this automatically.
                    lr_this_step = self.args.learning_rate * warmup_linear(
                        self.global_step / self.num_train_optimization_steps,
                        self.args.warmup_proportion)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.global_step += 1

    def train(self):
        """Featurize the training set and run the full training loop."""
        label_list = self.processor.get_labels()
        train_features = convert_examples_to_features(
            self.train_examples, label_list, self.args.max_seq_length, self.tokenizer)
        print("***** Running training *****")
        print("  Num. of examples: ", len(self.train_examples))
        print("  Batch size:", self.args.train_batch_size)
        print("  Num of steps:", self.num_train_optimization_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

        # local_rank == -1 means single-process training; otherwise shard the
        # data across distributed workers.
        if self.args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=self.args.train_batch_size)

        self.model.train()

        for _ in trange(int(self.args.num_train_epochs), desc="Epoch"):
            self.train_epoch(train_dataloader)
@@ -7,7 +7,7 @@
import torch.nn.functional as F
from tensorboardX import SummaryWriter

from .trainer import Trainer
from common.trainers.trainer import Trainer


class ClassificationTrainer(Trainer):
@@ -9,6 +9,7 @@

from datasets.reuters import clean_string, clean_string_fl, split_sents


def char_quantize(string, max_length=1000):
identity = np.identity(len(AAPDCharQuantized.ALPHABET))
quantized_string = np.array([identity[AAPDCharQuantized.ALPHABET[char]] for char in list(string.lower()) if char in AAPDCharQuantized.ALPHABET], dtype=np.float32)
import os

from datasets.processors.bert_processor import BertProcessor, InputExample


class AAPDProcessor(BertProcessor):
    """Processor for the AAPD dataset (binary-labeled TSV files)."""
    # BUG FIX: docstring previously claimed this was the IMDB processor.

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets.

        Skips the header row; each remaining row is (label, text).
        """
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                # First row is the TSV header, not data.
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = line[1]
            label = line[0]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
Oops, something went wrong.

0 comments on commit e4244ec

Please sign in to comment.
You can’t perform that action at this time.