## BERT Imports

In [1]:
import torch
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.nn import CrossEntropyLoss, MSELoss

from tqdm import tqdm_notebook, trange
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from multiprocessing import Pool, cpu_count
from tools import *
import convert_examples_to_features

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Constants

In [2]:
# The input data dir. Should contain the .tsv files (or other data files) for the task.
DATA_DIR = "../data/"

# Bert pre-trained model selected in the list: bert-base-uncased, 
# bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,
# bert-base-multilingual-cased, bert-base-chinese.
BERT_MODEL = 'bert-base-cased'

# The name of the task to train.I'm going to name this 'yelp'.
TASK_NAME = 'bias_classification'

# The output directory where the fine-tuned model and checkpoints will be written.
OUTPUT_DIR = f'../outputs/{TASK_NAME}/'

# The directory where the evaluation reports will be written to.
REPORTS_DIR = f'../reports/{TASK_NAME}_evaluation_report/'

# This is where BERT will look for pre-trained models to load parameters from.
CACHE_DIR = '../cache/'

# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
MAX_SEQ_LENGTH = 256

TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3
RANDOM_SEED = 42
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_PROPORTION = 0.1
OUTPUT_MODE = 'classification'

CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"

In [3]:
output_mode = OUTPUT_MODE

cache_dir = CACHE_DIR

In [4]:
if os.path.exists(REPORTS_DIR) and os.listdir(REPORTS_DIR):
        REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
        os.makedirs(REPORTS_DIR)
if not os.path.exists(REPORTS_DIR):
    os.makedirs(REPORTS_DIR)
    REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
    os.makedirs(REPORTS_DIR)

In [5]:
if os.path.exists(OUTPUT_DIR) and os.listdir(OUTPUT_DIR):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(OUTPUT_DIR))
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

## Load The Data

In [6]:
processor = BinaryClassificationProcessor()
train_examples = processor.get_train_examples(DATA_DIR)
train_examples_len = len(train_examples)

['66,1,a,this version is a missed opportunity to narrow the provisions and time limit their applications .']
<class 'list'>
1
['66', '1', 'a', 'this version is a missed opportunity to narrow the provisions and time limit their applications .']
<class 'list'>
4
['2145,1,a,the title vii health professions programs are also the only federal programs designed to train providers in interdisciplinary settings to respond to the needs of special and underserved populations .']
<class 'list'>
1
['2145', '1', 'a', 'the title vii health professions programs are also the only federal programs designed to train providers in interdisciplinary settings to respond to the needs of special and underserved populations .']
<class 'list'>
4
['65,1,a,"mr. chairman , the patriot act was enacted in the wake of the 9/11 terrorist attacks , rushed through the house as a suspension bill the day after it was introduced ."']
<class 'list'>
1
['65', '1', 'a', '"mr. chairman , the patriot act was enacted in the wake

<class 'list'>
1
['524', '1', 'a', 'but would it not be a great message to send to the senate and to the american people by providing them with the estate tax relief they want and need without breaking the bank ?']
<class 'list'>
4
['2794,2,a,"this bill brings clean coal technology , strengthens nuclear power ; and it actually helps renewable power in the aspect of wind power ."']
<class 'list'>
1
['2794', '2', 'a', '"this bill brings clean coal technology , strengthens nuclear power ; and it actually helps renewable power in the aspect of wind power ."']
<class 'list'>
4
['738,2,a,bankruptcy relief for family farmers will be made easier for those to obtain a discharge of their indebtedness .']
<class 'list'>
1
['738', '2', 'a', 'bankruptcy relief for family farmers will be made easier for those to obtain a discharge of their indebtedness .']
<class 'list'>
4
['2447,2,a,"since the end of the cold war , carriers have been kept very busy and have proven their value in numerous operations

['613,2,a,"with this resolution , mr. speaker , the house will take a first step toward enacting these needed economic reforms to help small businesses create not just jobs but long-term , rewarding careers for the american people ."']
<class 'list'>
1
['613', '2', 'a', '"with this resolution , mr. speaker , the house will take a first step toward enacting these needed economic reforms to help small businesses create not just jobs but long-term , rewarding careers for the american people ."']
<class 'list'>
4
['612,1,a,"lost jobs , of course , also leave families without health coverage when they are at their financially most vulnerable ."']
<class 'list'>
1
['612', '1', 'a', '"lost jobs , of course , also leave families without health coverage when they are at their financially most vulnerable ."']
<class 'list'>
4
['2202,1,a,"we saw during hurricane katrina looters in new orleans , but the real looters are the big oil companies ."']
<class 'list'>
1
['2202', '1', 'a', '"we saw during

<class 'list'>
4
['255,2,a,"we do not have to choose between embryonic stem cell research and cord blood , assuming that only embryonic can solve problems ."']
<class 'list'>
1
['255', '2', 'a', '"we do not have to choose between embryonic stem cell research and cord blood , assuming that only embryonic can solve problems ."']
<class 'list'>
4
['2002,1,a,"that is their argument , but they are going to strip the health care benefits away from almost 8 million people that have this kind of coverage ."']
<class 'list'>
1
['2002', '1', 'a', '"that is their argument , but they are going to strip the health care benefits away from almost 8 million people that have this kind of coverage ."']
<class 'list'>
4
['1485,1,a,"instead of considering these bills to weaken osha , we should be strengthening provisions of the occupational safety and health act ."']
<class 'list'>
1
['1485', '1', 'a', '"instead of considering these bills to weaken osha , we should be strengthening provisions of the occup

['2778', '1', 'a', 'there is a better approach .']
<class 'list'>
4
['1304,1,a,the alternative also would have provided more resources for important priorities and would have laid the basis for more responsible tax policy .']
<class 'list'>
1
['1304', '1', 'a', 'the alternative also would have provided more resources for important priorities and would have laid the basis for more responsible tax policy .']
<class 'list'>
4
['2665,1,a,i find it amazing to hear the gentleman from california ( mr. dreier )  give his portrayal of what is going on in america and the world .']
<class 'list'>
1
['2665', '1', 'a', 'i find it amazing to hear the gentleman from california ( mr. dreier )  give his portrayal of what is going on in america and the world .']
<class 'list'>
4
['1608,2,a,the 9/11 commission was constituted in order to tell the american public what we could do to avoid or stave off another attack like the one that occurred on 9/11 .']
<class 'list'>
1
['1608', '2', 'a', 'the 9/11 commi

4
['2338,2,a,"unfortunately , to import liquid natural gas , we have got about three or four facilities today ."']
<class 'list'>
1
['2338', '2', 'a', '"unfortunately , to import liquid natural gas , we have got about three or four facilities today ."']
<class 'list'>
4
['2304,1,a,"in addition to the safety violations that contributed to that explosion , osha cited the company for allowing employees to work at heights of up to 50 feet without fall protection and for requiring employees to stand on a conveyor belt to remove jammed logs without adequate protection against being caught in a machine ."']
<class 'list'>
1
['2304', '1', 'a', '"in addition to the safety violations that contributed to that explosion , osha cited the company for allowing employees to work at heights of up to 50 feet without fall protection and for requiring employees to stand on a conveyor belt to remove jammed logs without adequate protection against being caught in a machine ."']
<class 'list'>
4
['213,1,a,th

In [7]:
label_list = processor.get_labels() # [0, 1] for binary classification
num_labels = len(label_list)

In [8]:
num_train_optimization_steps = int(
    train_examples_len / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS

In [9]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /root/.pytorch_pretrained_bert/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1


In [10]:
label_map = {label: i for i, label in enumerate(label_list)}
train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE) for example in train_examples]

In [11]:
process_count = cpu_count() - 1
if __name__ ==  '__main__':
    print(f'Preparing to convert {train_examples_len} examples..')
    print(f'Spawning {process_count} processes..')
    with Pool(process_count) as p:
        train_features = list(tqdm_notebook(p.imap(convert_examples_to_features.convert_example_to_feature, train_examples_for_processing), total=train_examples_len))

Preparing to convert 2422 examples..
Spawning 7 processes..


HBox(children=(IntProgress(value=0, max=2422), HTML(value='')))




In [12]:
with open(DATA_DIR + "train_features.pkl", "wb") as f:
    pickle.dump(train_features, f)

## To BERT & Beyond

In [13]:
# Load pre-trained model (weights)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, cache_dir=CACHE_DIR, num_labels=num_labels)
# model = BertForSequenceClassification.from_pretrained(CACHE_DIR + 'cased_base_bert_pytorch.tar.gz', cache_dir=CACHE_DIR, num_labels=num_labels)

INFO:pytorch_pretrained_bert.modeling:loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz from cache at ../cache/a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d160550d9c
INFO:pytorch_pretrained_bert.modeling:extracting archive file ../cache/a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d160550d9c to temp dir /tmp/tmpvw31e8wn
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

INFO:pytorch_pretrained_bert.modeling:Weights of BertForSequenceClassification not initialized from pretrained model: 

In [14]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, eleme

In [15]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

In [16]:
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=LEARNING_RATE,
                     warmup=WARMUP_PROPORTION,
                     t_total=num_train_optimization_steps)

In [17]:
global_step = 0
nb_tr_steps = 0
tr_loss = 0

In [18]:
logger.info("***** Running training *****")
logger.info("  Num examples = %d", train_examples_len)
logger.info("  Batch size = %d", TRAIN_BATCH_SIZE)
logger.info("  Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

if OUTPUT_MODE == "classification":
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
elif OUTPUT_MODE == "regression":
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

INFO:root:***** Running training *****
INFO:root:  Num examples = 2422
INFO:root:  Batch size = 16
INFO:root:  Num steps = 453


In [19]:
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

In [20]:
model.train()
for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm_notebook(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        logits = model(input_ids, segment_ids, input_mask, labels=None)

        if OUTPUT_MODE == "classification":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        elif OUTPUT_MODE == "regression":
            loss_fct = MSELoss()
            loss = loss_fct(logits.view(-1), label_ids.view(-1))

        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()
        print("\r%f" % loss, end='')
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, description='Iteration', max=152, style=ProgressStyle(description_width='i…

0.422613

Epoch:  33%|███▎      | 1/3 [01:11<02:23, 71.68s/it]

0.358368


HBox(children=(IntProgress(value=0, description='Iteration', max=152, style=ProgressStyle(description_width='i…

0.332276

Epoch:  67%|██████▋   | 2/3 [02:23<01:11, 71.66s/it]

0.284048


HBox(children=(IntProgress(value=0, description='Iteration', max=152, style=ProgressStyle(description_width='i…

0.011183



0.058729

Epoch: 100%|██████████| 3/3 [03:34<00:00, 71.64s/it]

0.004416





In [21]:
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(OUTPUT_DIR)

'../outputs/bias_classification/vocab.txt'