
Add hierarchical BERT model (#25)

achyudh committed Jul 4, 2019
1 parent 3cd54c2 commit 7eb7c89e9d86375fc63b07d7f8e59fb3eca7197b
@@ -7,7 +7,9 @@
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from tqdm import tqdm

from datasets.bert_processors.abstract_processor import convert_examples_to_features
from datasets.bert_processors.abstract_processor import convert_examples_to_features, \
convert_examples_to_hierarchical_features
from utils.preprocessing import pad_input_matrix
from utils.tokenization import BertTokenizer

# Suppress warnings from sklearn.metrics
@@ -26,14 +28,28 @@ def __init__(self, model, processor, args, split='dev'):
self.eval_examples = self.processor.get_dev_examples(args.data_dir)

def get_scores(self, silent=False):
eval_features = convert_examples_to_features(self.eval_examples, self.args.max_seq_length, self.tokenizer)
if self.args.is_hierarchical:
eval_features = convert_examples_to_hierarchical_features(
self.eval_examples, self.args.max_seq_length, self.tokenizer)
else:
eval_features = convert_examples_to_features(
self.eval_examples, self.args.max_seq_length, self.tokenizer)

unpadded_input_ids = [f.input_ids for f in eval_features]
unpadded_input_mask = [f.input_mask for f in eval_features]
unpadded_segment_ids = [f.segment_ids for f in eval_features]

if self.args.is_hierarchical:
pad_input_matrix(unpadded_input_ids, self.args.max_doc_length)
pad_input_matrix(unpadded_input_mask, self.args.max_doc_length)
pad_input_matrix(unpadded_segment_ids, self.args.max_doc_length)

all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
padded_input_ids = torch.tensor(unpadded_input_ids, dtype=torch.long)
padded_input_mask = torch.tensor(unpadded_input_mask, dtype=torch.long)
padded_segment_ids = torch.tensor(unpadded_segment_ids, dtype=torch.long)
label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
eval_data = TensorDataset(padded_input_ids, padded_input_mask, padded_segment_ids, label_ids)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.args.batch_size)
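
A note on `pad_input_matrix`: it is imported from `utils.preprocessing`, but its body is not part of this diff. Based on how it is called here, it is assumed to pad or truncate the sentence dimension of each document in place; a minimal sketch under that assumption:

```
def pad_input_matrix(unpadded_matrix, max_doc_length):
    # Assumed behaviour (not repo code): pad or truncate the sentence dimension
    # of each document in place. `unpadded_matrix` is a list of documents, each
    # a list of per-sentence id lists already padded to a common sequence length.
    for doc_index, doc in enumerate(unpadded_matrix):
        doc = doc[:max_doc_length]  # drop sentences past the document-length limit
        row_length = len(doc[0]) if doc else 0
        # pad short documents with all-zero sentence rows
        doc = doc + [[0] * row_length for _ in range(max_doc_length - len(doc))]
        unpadded_matrix[doc_index] = doc
```

After this step every document has the same number of sentence rows, so the nested lists can be turned into the rectangular `padded_input_ids`, `padded_input_mask` and `padded_segment_ids` tensors above.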

@@ -3,15 +3,16 @@

import torch
import torch.nn.functional as F
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from tqdm import trange

from common.evaluators.bert_evaluator import BertEvaluator
from datasets.bert_processors.abstract_processor import convert_examples_to_features
from datasets.bert_processors.abstract_processor import convert_examples_to_hierarchical_features
from utils.optimization import warmup_linear
from utils.preprocessing import pad_input_matrix
from utils.tokenization import BertTokenizer


@@ -25,7 +26,6 @@ def __init__(self, model, optimizer, processor, args):
self.tokenizer = BertTokenizer.from_pretrained(args.model, is_lowercase=args.is_lowercase)

timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
self.writer = SummaryWriter(log_dir="tensorboard_logs/" + timestamp)
self.snapshot_path = os.path.join(self.args.save_path, self.processor.NAME, '%s.pt' % timestamp)

self.num_train_optimization_steps = int(
@@ -74,18 +74,33 @@ def train_epoch(self, train_dataloader):
self.iterations += 1

def train(self):
train_features = convert_examples_to_features(
self.train_examples, self.args.max_seq_length, self.tokenizer)
if self.args.is_hierarchical:
train_features = convert_examples_to_hierarchical_features(
self.train_examples, self.args.max_seq_length, self.tokenizer)
else:
train_features = convert_examples_to_features(
self.train_examples, self.args.max_seq_length, self.tokenizer)

unpadded_input_ids = [f.input_ids for f in train_features]
unpadded_input_mask = [f.input_mask for f in train_features]
unpadded_segment_ids = [f.segment_ids for f in train_features]

if self.args.is_hierarchical:
pad_input_matrix(unpadded_input_ids, self.args.max_doc_length)
pad_input_matrix(unpadded_input_mask, self.args.max_doc_length)
pad_input_matrix(unpadded_segment_ids, self.args.max_doc_length)

print("Number of examples: ", len(self.train_examples))
print("Batch size:", self.args.batch_size)
print("Num of steps:", self.num_train_optimization_steps)

all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
padded_input_ids = torch.tensor(unpadded_input_ids, dtype=torch.long)
padded_input_mask = torch.tensor(unpadded_input_mask, dtype=torch.long)
padded_segment_ids = torch.tensor(unpadded_segment_ids, dtype=torch.long)
label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

train_data = TensorDataset(padded_input_ids, padded_input_mask, padded_segment_ids, label_ids)

if self.args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
@@ -2,6 +2,7 @@

import sys
import numpy as np
from nltk.tokenize import sent_tokenize


class InputExample(object):
@@ -170,6 +171,69 @@ def convert_examples_to_features(examples, max_seq_length, tokenizer, print_exam
return features


def convert_examples_to_hierarchical_features(examples, max_seq_length, tokenizer, print_examples=False):
"""
Loads a data file into a list of InputBatch objects
:param examples:
:param max_seq_length:
:param tokenizer:
:param print_examples:
:return: a list of InputBatch objects
"""

features = []
for (ex_index, example) in enumerate(examples):
tokens_a = [tokenizer.tokenize(line) for line in sent_tokenize(example.text_a)]
tokens_b = None

if example.text_b:
tokens_b = [tokenizer.tokenize(line) for line in sent_tokenize(example.text_b)]
# Modifies `tokens_a` and `tokens_b` in place so that the total length is less than the specified length
# Account for [CLS], [SEP], [SEP]
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP]
for i0 in range(len(tokens_a)):
if len(tokens_a[i0]) > max_seq_length - 2:
tokens_a[i0] = tokens_a[i0][:(max_seq_length - 2)]

tokens = [["[CLS]"] + line + ["[SEP]"] for line in tokens_a]
segment_ids = [[0] * len(line) for line in tokens]

if tokens_b:
# each text_b sentence becomes its own row, closed with [SEP] and tagged with segment id 1
tokens += [line + ["[SEP]"] for line in tokens_b]
segment_ids += [[1] * (len(line) + 1) for line in tokens_b]

input_ids = list()
for line in tokens:
input_ids.append(tokenizer.convert_tokens_to_ids(line))

# Input mask has 1 for real tokens and 0 for padding tokens
input_mask = [[1] * len(line_ids) for line_ids in input_ids]

# Zero-pad up to the sequence length.
padding = [[0] * (max_seq_length - len(line_ids)) for line_ids in input_ids]
for i0 in range(len(input_ids)):
input_ids[i0] += padding[i0]
input_mask[i0] += padding[i0]
segment_ids[i0] += padding[i0]

label_id = [float(x) for x in example.label]

if print_examples and ex_index < 5:
print("tokens: %s" % " ".join([str(x) for x in tokens]))
print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
print("label: %s" % example.label)

features.append(InputFeatures(input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id))
return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""
Truncates a sequence pair in place to the maximum length
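
For intuition about the nested layout the hierarchical conversion above produces, here is a small, self-contained sketch; a toy whitespace tokenizer stands in for the repo's BertTokenizer and for NLTK's sent_tokenize, so it runs without model files or corpus downloads:

```
max_seq_length = 8

def toy_tokenize(sentence):
    # stand-in for WordPiece tokenization
    return sentence.lower().rstrip('.').split()

def toy_convert_tokens_to_ids(tokens):
    # stand-in for vocabulary lookup
    return [abs(hash(token)) % 1000 for token in tokens]

document = ["Markets rallied on Friday.", "Oil prices fell sharply."]

# one [CLS] ... [SEP] row per sentence, mirroring convert_examples_to_hierarchical_features
tokens = [["[CLS]"] + toy_tokenize(s)[:max_seq_length - 2] + ["[SEP]"] for s in document]
input_ids = [toy_convert_tokens_to_ids(row) + [0] * (max_seq_length - len(row)) for row in tokens]
input_mask = [[1] * len(row) + [0] * (max_seq_length - len(row)) for row in tokens]

print(len(input_ids), len(input_ids[0]))  # 2 8: [num_sentences, max_seq_length]
```

A flat conversion yields a single row of length `max_seq_length` per example; the hierarchical one yields one row per sentence, which is why the extra `pad_input_matrix` call on the document dimension is needed before the tensors can be built.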
@@ -1,5 +1,6 @@
import os
import random
import time

import numpy as np
import torch
@@ -26,7 +27,9 @@

def evaluate_split(model, processor, args, split='dev'):
evaluator = BertEvaluator(model, processor, args, split)
start_time = time.time()
accuracy, precision, recall, f1, avg_loss = evaluator.get_scores(silent=True)[0]
print("Inference time", time.time() - start_time)
print('\n' + LOG_HEADER)
print(LOG_TEMPLATE.format(split.upper(), accuracy, precision, recall, f1, avg_loss))

@@ -86,6 +89,7 @@ def evaluate_split(model, processor, args, split='dev'):

processor = dataset_map[args.dataset]()
args.is_lowercase = 'uncased' in args.model
args.is_hierarchical = False
tokenizer = BertTokenizer.from_pretrained(args.model, is_lowercase=args.is_lowercase)

train_examples = None
@@ -109,7 +113,6 @@ def evaluate_split(model, processor, args, split='dev'):
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Install NVIDIA Apex to use distributed and FP16 training.")

model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
@@ -0,0 +1,48 @@
# Hierarchical BERT

A wrapper around pre-trained [BERT](https://arxiv.org/abs/1810.04805) models for fine-tuning on document classification tasks.

## Quick start

To fine-tune the pre-trained BERT-base model on the Reuters dataset, run the following from the project working directory:

```
python -m models.hbert --dataset Reuters --model bert-base-uncased --max-seq-length 256 --batch-size 16 --lr 2e-5 --epochs 30
```

The best model weights will be saved in:

```
models/hbert/saves/Reuters/best_model.pt
```

To test the model, run the following command:

```
python -m models.hbert --dataset Reuters --model bert-base-uncased --max-seq-length 256 --batch-size 16 --lr 2e-5 --epochs 30 --trained-model models/hbert/saves/Reuters/best_model.pt
```

## Model Types

We support the same pre-trained model types as [huggingface's implementation](https://github.com/huggingface/pytorch-pretrained-BERT.git):
- bert-base-uncased
- bert-large-uncased
- bert-base-cased
- bert-large-cased

## Datasets

We evaluate the model on the following datasets:
- Reuters (ModApte)
- AAPD
- IMDB
- Yelp 2014

## Settings

The fine-tuning procedure is described in:
- [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)
- [DocBERT: BERT for Document Classification](https://arxiv.org/abs/1904.08398v1)

## Acknowledgement
- Our implementation is inspired by [huggingface's implementation](https://github.com/huggingface/pytorch-pretrained-BERT.git).
