# TODO

* use https://colab.research.google.com/drive/1iDHCYIrWswIKp-n-pOg69xLoZO09MEgf#scrollTo=zjzTkJGl1J0l this model and arch instead

In [1]:
!pip install pandas
!pip install tqdm
!pip install torch
!pip install pytorch_pretrained_bert



## Download data

In [4]:
from pathlib import Path
import os

DATA_DIR = Path('./data').resolve()
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

In [5]:
import requests
import logging
import shutil
import tarfile
import urllib
from tqdm import tqdm_notebook as tqdm

logger = logging.getLogger()


def download_url(url, dest:str=None):
    tmp = Path('/tmp') 
    tmp.mkdir(exist_ok=True)
    tmp_file = str(tmp/os.path.basename(url))
    
    site = urllib.request.urlopen(url)
    url_size = site.info()['Content-Length']

    if not os.path.exists(tmp_file) or (os.path.exists(tmp_file) and url_size != os.stat(tmp_file).st_size):
        response = requests.get(url, stream=True)
        with open(tmp_file, "wb") as handle:
            for data in tqdm(response.iter_content(), desc=f"downloading {url}", total=url_size, leave=False):
                handle.write(data)
    
    with tarfile.open(tmp_file) as tf:
        tf.extractall(path=dest)
    
    os.remove(tmp_file)
    return str(dest)

In [7]:
download_url(url, dest=DATA_DIR)

HBox(children=(IntProgress(value=1, bar_style='info', description='downloading https://ai.stanford.edu/~amaas/…



'/workspace/data'

In [10]:
!ls -lh $DATA_DIR

total 81M
drwxr-xr-x. 4 7297 1000 105 Jun 26  2011 aclImdb
-rw-r--r--. 1 root root 81M Jul  2 10:39 aclImdb_v1.tar.gz


## Read, transform and save to BERT format 

In [11]:
import pandas as pd

def read_imdb(imdb_dir: str):
    imdb_dir = Path(imdb_dir)
    data = {}
    for t in ['train', 'test']:
        texts, labels = [], []
        for p in ['pos', 'neg']:
            for file in tqdm((imdb_dir/'train'/p).glob("*.txt"), desc=f'reading {t}/{p}'):
                with open(file, 'r') as fin:
                    texts +=  [fin.readlines()[0].replace(r'\n', ' ').strip()]
                    labels += [0 if p=='neg' else 1]
        df = pd.DataFrame(
        {'label': labels, 'text': texts})
        data[t] = df.sample(frac=1)
        
    return tuple(data.values())



def save_bertify(df: pd.DataFrame, fname: str):
    # https://medium.com/swlh/a-simple-guide-on-using-bert-for-text-classification-bbf041ac8d04
    fname = str(fname)
    assert fname.endswith('.tsv'), "fname has to be a tsv file!"
    
    df_bert = pd.DataFrame({
        'id': range(len(df)),
        'label': df['label'],
        'alpha': ['a'] * len(df),
        'text': df['text']})
    df_bert.to_csv(fname, sep='\t', index=False, header=False)
    print(f"saved {len(df_bert)} bertified samples to {fname}")

### Read data

In [12]:
IMDB = DATA_DIR/'aclImdb'

df_trn, df_tst = read_imdb(IMDB)

HBox(children=(IntProgress(value=1, bar_style='info', description='reading train/pos', max=1, style=ProgressSt…




HBox(children=(IntProgress(value=1, bar_style='info', description='reading train/neg', max=1, style=ProgressSt…




HBox(children=(IntProgress(value=1, bar_style='info', description='reading test/pos', max=1, style=ProgressSty…




HBox(children=(IntProgress(value=1, bar_style='info', description='reading test/neg', max=1, style=ProgressSty…




In [13]:
df_trn.head()

Unnamed: 0,label,text
24607,0,Take:<br /><br />1. a famous play<br /><br />2...
2789,1,This was one of those wonderful rare moments i...
5223,1,This very strange movie is unlike anything mad...
8373,1,Maria Braun got married right in the middle of...
19407,0,"The idea of In the Name of the People is good,..."


### Save in Bert format

In [14]:
save_bertify(df_trn, DATA_DIR/'train.tsv')
save_bertify(df_tst, DATA_DIR/'dev.tsv')

saved 25000 bertified samples to /workspace/data/train.tsv
saved 25000 bertified samples to /workspace/data/dev.tsv


## Utils: Data to Features

In [15]:
%%writefile processor.py

from __future__ import absolute_import, division, print_function

import csv
import os
import sys

csv.field_size_limit(2147483647) # Increase CSV reader's field limit incase we have long text.


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines


class BinaryClassificationProcessor(DataProcessor):
    """Processor for binary classification dataset."""

    def get_train_examples(self, data_dir, max_num=None):
        """See base class."""
        examples = self._read_tsv(os.path.join(data_dir, "train.tsv"))
        if max_num and max_num < len(examples): 
            examples = examples[:max_num]
        return self._create_examples(examples, "train")

    def get_dev_examples(self, data_dir, max_num=None):
        """See base class."""
        examples = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
        if max_num and max_num < len(examples): 
            examples = examples[:max_num]
        return self._create_examples(examples, "dev")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

Writing processor.py


In [16]:
%%writefile features.py

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def convert_example_to_feature(example_row):
    # return example_row
    example, label_map, max_seq_length, tokenizer, output_mode = example_row

    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]

    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    if tokens_b:
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    if output_mode == "classification":
        label_id = label_map[example.label]
    elif output_mode == "regression":
        label_id = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                         input_mask=input_mask,
                         segment_ids=segment_ids,
                         label_id=label_id)


Writing features.py


## Lets prepare features for BERT

In [17]:
import os
import pickle

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.nn import CrossEntropyLoss, MSELoss

from tqdm import tqdm_notebook as tqdm
from tqdm import tnrange
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from multiprocessing import Pool, cpu_count

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

%load_ext autoreload
%autoreload 2

# local imports 
from processor import BinaryClassificationProcessor
from features import convert_example_to_feature, InputFeatures

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
MAX_TRAIN_SAMPLES = 5000
MAX_DEV_SAMPLES = 10000


BERT_MODEL = 'bert-base-cased'

TASK_NAME = 'imdb'

OUTPUT_DIR = f'outputs/{TASK_NAME}/'
REPORTS_DIR = f'reports/{TASK_NAME}_eval_report/'

# BERT pretrained params are cached here
CACHE_DIR = 'cache/'

MAX_SEQ_LEN = 128

TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 2e-5

GRADIENT_ACCUMULATION_STEPS = 4
NUM_EPOCHS = 1

RANDOM_SEED = 42
WARMUP_PCT = 0.1
OUTPUT_MODE = 'classification'

CONFIG_NAME = "bert_config.json"
WEIGHTS_NAME = "pytorch_model.bin"

In [19]:
if os.path.exists(REPORTS_DIR) and os.listdir(REPORTS_DIR):
        REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
        os.makedirs(REPORTS_DIR)
if not os.path.exists(REPORTS_DIR):
    os.makedirs(REPORTS_DIR)
    REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
    os.makedirs(REPORTS_DIR)

In [21]:
if os.path.exists(OUTPUT_DIR) and os.listdir(OUTPUT_DIR):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(OUTPUT_DIR))
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

## Load train examples

In [25]:
processor = BinaryClassificationProcessor()

train_examples = processor.get_train_examples(DATA_DIR, max_num=MAX_TRAIN_SAMPLES)
num_train_examples = len(train_examples)
print(f"loaded {num_train_examples} train samples")

label_list = processor.get_labels()
num_labels = len(label_list)

num_opt_steps = int(num_train_examples / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS) * NUM_EPOCHS

loaded 5000 train samples


In [26]:
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=False)

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /root/.pytorch_pretrained_bert/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1


In [None]:
# all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
# all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
# all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

# if OUTPUT_MODE == "classification":
#     all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
# elif OUTPUT_MODE == "regression":
#     all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

In [27]:
num_processes = cpu_count() if cpu_count() <=8 else 8
    
def create_features(examples, tokenizer, label_list, output_mode='classification', max_seq_len=128):
    
    label_to_id = {label: i for i, label in enumerate(label_list)}
    examples_for_processing = [(example, label_to_id, max_seq_len, tokenizer, output_mode) for example in examples]

    num_examples = len(examples)
    print(f'Preparing to convert {num_examples} examples..')
    print(f'Spawning {num_processes} processes..')
    with Pool(num_processes) as p:
        train_features = list(tqdm(p.imap(convert_example_to_feature, examples_for_processing), total=num_examples))
    
    with open(DATA_DIR/"train_features.pkl", "wb") as f:
        pickle.dump(train_features, f)
    
    return train_features

def features_to_bert_input(features, output_mode='classification'):
    input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    input_masks = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)

    if output_mode == "classification":
        label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
    
    return {"input_ids": input_ids, "input_masks": input_masks, 
            "segment_ids": segment_ids, "label_ids": label_ids}

In [28]:
train_features = create_features(train_examples, tokenizer, label_list, 
                                 output_mode=OUTPUT_MODE, max_seq_len=MAX_SEQ_LEN)

Preparing to convert 5000 examples..
Spawning 8 processes..


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [29]:
bert_input = features_to_bert_input(train_features, output_mode=OUTPUT_MODE)

## Fine tuning BERT

In [30]:
# Load pre-trained model (weights)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, cache_dir=CACHE_DIR, num_labels=num_labels)
# model = BertForSequenceClassification.from_pretrained(CACHE_DIR + 'cased_base_bert_pytorch.tar.gz', cache_dir=CACHE_DIR, num_labels=num_labels)


INFO:pytorch_pretrained_bert.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz not found in cache, downloading to /tmp/tmpkr_ntjnt
100%|██████████| 404400730/404400730 [00:25<00:00, 16053368.01B/s]
INFO:pytorch_pretrained_bert.file_utils:copying /tmp/tmpkr_ntjnt to cache at cache/a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d160550d9c
INFO:pytorch_pretrained_bert.file_utils:creating metadata file for cache/a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d160550d9c
INFO:pytorch_pretrained_bert.file_utils:removing temp file /tmp/tmpkr_ntjnt
INFO:pytorch_pretrained_bert.modeling:loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz from cache at cache/a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d

In [36]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediat

In [37]:
def get_num_params(model):
    import numpy as np
    mp = filter(lambda p: p.requires_grad, model.parameters())
    return sum(np.prod(p.size()) for p in mp)

In [38]:
get_num_params(model)

108311810

## Create optimizer

In [39]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=LEARNING_RATE,
                     warmup=WARMUP_PCT,
                     t_total=num_opt_steps)


In [41]:
global_step = 0
nb_tr_steps = 0
tr_loss = 0

import logging
logger = logging.getLogger()

logger.info("***** Running training *****")
logger.info("  Num examples = %d", num_train_examples)
logger.info("  Batch size = %d", TRAIN_BATCH_SIZE)
logger.info("  Num steps = %d", num_opt_steps)

INFO:root:***** Running training *****
INFO:root:  Num examples = 5000
INFO:root:  Batch size = 8
INFO:root:  Num steps = 156


## Create dataloaders

In [42]:
train_ds = TensorDataset(*bert_input.values())

train_sampler = RandomSampler(train_ds)
train_dl = DataLoader(train_ds, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)


In [43]:
model.train()

PRINT_EVERY = 10
NUM_EPOCHS = 2

train_losses = []

for epoch in tnrange(int(NUM_EPOCHS), desc="epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(tqdm(train_dl, desc="iter")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        logits = model(input_ids, segment_ids, input_mask, labels=None)

        if OUTPUT_MODE == "classification":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        elif OUTPUT_MODE == "regression":
            loss_fct = MSELoss()
            loss = loss_fct(logits.view(-1), label_ids.view(-1))
        train_losses += [loss]
        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS
        
        loss.backward()
        if step % PRINT_EVERY == 0:
            print(f"\r epoch {epoch+1}/{NUM_EPOCHS}, step {step}/{num_opt_steps} | loss: {round(loss.item(),3)}", end='')
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

HBox(children=(IntProgress(value=0, description='epoch', max=2, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='iter', max=625, style=ProgressStyle(description_width='initia…

 epoch 1/2, step 620/156 | loss: 0.064

HBox(children=(IntProgress(value=0, description='iter', max=625, style=ProgressStyle(description_width='initia…

 epoch 2/2, step 0/156 | loss: 0.071



 epoch 2/2, step 10/156 | loss: 0.057



 epoch 2/2, step 20/156 | loss: 0.144



 epoch 2/2, step 30/156 | loss: 0.059



 epoch 2/2, step 40/156 | loss: 0.029



 epoch 2/2, step 50/156 | loss: 0.111



 epoch 2/2, step 60/156 | loss: 0.056



 epoch 2/2, step 70/156 | loss: 0.056



 epoch 2/2, step 80/156 | loss: 0.016



 epoch 2/2, step 90/156 | loss: 0.041



 epoch 2/2, step 100/156 | loss: 0.033



 epoch 2/2, step 110/156 | loss: 0.052



 epoch 2/2, step 120/156 | loss: 0.14



 epoch 2/2, step 130/156 | loss: 0.066



 epoch 2/2, step 140/156 | loss: 0.056



 epoch 2/2, step 150/156 | loss: 0.048



 epoch 2/2, step 160/156 | loss: 0.04



 epoch 2/2, step 170/156 | loss: 0.142



 epoch 2/2, step 180/156 | loss: 0.039



 epoch 2/2, step 190/156 | loss: 0.049



 epoch 2/2, step 200/156 | loss: 0.063



 epoch 2/2, step 210/156 | loss: 0.055



 epoch 2/2, step 220/156 | loss: 0.106



 epoch 2/2, step 230/156 | loss: 0.043



 epoch 2/2, step 240/156 | loss: 0.078



 epoch 2/2, step 250/156 | loss: 0.054



 epoch 2/2, step 260/156 | loss: 0.139



 epoch 2/2, step 270/156 | loss: 0.031



 epoch 2/2, step 280/156 | loss: 0.032



 epoch 2/2, step 290/156 | loss: 0.036



 epoch 2/2, step 300/156 | loss: 0.081



 epoch 2/2, step 310/156 | loss: 0.037



 epoch 2/2, step 320/156 | loss: 0.114



 epoch 2/2, step 330/156 | loss: 0.081



 epoch 2/2, step 340/156 | loss: 0.043



 epoch 2/2, step 350/156 | loss: 0.024



 epoch 2/2, step 360/156 | loss: 0.04



 epoch 2/2, step 370/156 | loss: 0.049



 epoch 2/2, step 380/156 | loss: 0.021



 epoch 2/2, step 390/156 | loss: 0.056



 epoch 2/2, step 400/156 | loss: 0.058



 epoch 2/2, step 410/156 | loss: 0.148



 epoch 2/2, step 420/156 | loss: 0.038



 epoch 2/2, step 430/156 | loss: 0.027



 epoch 2/2, step 440/156 | loss: 0.035



 epoch 2/2, step 450/156 | loss: 0.056



 epoch 2/2, step 460/156 | loss: 0.018



 epoch 2/2, step 470/156 | loss: 0.047



 epoch 2/2, step 480/156 | loss: 0.04



 epoch 2/2, step 490/156 | loss: 0.077



 epoch 2/2, step 500/156 | loss: 0.066



 epoch 2/2, step 510/156 | loss: 0.088



 epoch 2/2, step 520/156 | loss: 0.05



 epoch 2/2, step 530/156 | loss: 0.083



 epoch 2/2, step 540/156 | loss: 0.083



 epoch 2/2, step 550/156 | loss: 0.053



 epoch 2/2, step 560/156 | loss: 0.019



 epoch 2/2, step 570/156 | loss: 0.017



 epoch 2/2, step 580/156 | loss: 0.121



 epoch 2/2, step 590/156 | loss: 0.05



 epoch 2/2, step 600/156 | loss: 0.076



 epoch 2/2, step 610/156 | loss: 0.103



 epoch 2/2, step 620/156 | loss: 0.059






## Save fine-tuned model and config file as well

In [44]:
# Only save the model it-self
model_to_save = model.module if hasattr(model, 'module') else model  

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(OUTPUT_DIR)

'outputs/imdb/vocab.txt'

In [45]:
!ls -l $OUTPUT_DIR

total 423360
-rw-r--r--. 1 root root       313 Jul  4 12:52 bert_config.json
-rw-r--r--. 1 root root 433297515 Jul  4 12:52 pytorch_model.bin
-rw-r--r--. 1 root root    213450 Jul  4 12:52 vocab.txt


## Lets archive config and model together to a .tar and then gzip them!

In [46]:
imdb_targz = "imdb.tar.gz"
config_file = OUTPUT_DIR+CONFIG_NAME
model_file = OUTPUT_DIR+WEIGHTS_NAME

In [47]:
imdb_targz, config_file, model_file

('imdb.tar.gz',
 'outputs/imdb/bert_config.json',
 'outputs/imdb/pytorch_model.bin')

In [48]:
if os.path.isfile(imdb_targz):
    os.remove(imdb_targz)

In [49]:
!cd $OUTPUT_DIR && tar -cvzf $imdb_targz $CONFIG_NAME $WEIGHTS_NAME

bert_config.json
pytorch_model.bin


In [50]:
output_tar = OUTPUT_DIR+imdb_targz
target_tar = CACHE_DIR+imdb_targz
!cp $output_tar $target_tar

In [52]:
!ls -lh $CACHE_DIR

total 769M
-rw-r--r--. 1 root root 386M Jul  4 12:47 a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d160550d9c
-rw-r--r--. 1 root root  136 Jul  4 12:47 a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d160550d9c.json
-rw-r--r--. 1 root root 384M Jul  4 12:52 imdb.tar.gz


## Evaluation

In [55]:
from sklearn.metrics import confusion_matrix, matthews_corrcoef, accuracy_score, f1_score

In [56]:
def get_eval_report(task_name, labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    acc, f1 = accuracy_score(labels, preds), f1_score(labels, preds)
    return {
        "task": task_name,
        "acc": acc,
        "f1": f1,
        "mcc": mcc,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }

def compute_metrics(task_name, labels, preds):
    assert len(preds) == len(labels)
    return get_eval_report(task_name, labels, preds)

In [57]:
tokenizer = BertTokenizer.from_pretrained(OUTPUT_DIR + 'vocab.txt', do_lower_case=False)

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file outputs/imdb/vocab.txt


In [60]:
processor = BinaryClassificationProcessor()
eval_examples = processor.get_dev_examples(DATA_DIR, max_num=MAX_DEV_SAMPLES)
label_list = processor.get_labels() # [0, 1] for binary classification
num_labels = len(label_list)
num_eval_samples = len(eval_examples)
print(f"Loaded {num_eval_samples} eval samples")

Loaded 10000 eval samples


In [61]:
dev_features = create_features(eval_examples, tokenizer, label_list, 
                               output_mode=OUTPUT_MODE, max_seq_len=MAX_SEQ_LEN)

dev_bert_input = features_to_bert_input(dev_features, output_mode=OUTPUT_MODE)

Preparing to convert 10000 examples..
Spawning 8 processes..


HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [63]:
model = BertForSequenceClassification.from_pretrained(CACHE_DIR + "imdb.tar.gz", 
                                                      cache_dir=CACHE_DIR, 
                                                      num_labels=num_labels).to(device)

INFO:pytorch_pretrained_bert.modeling:loading archive file cache/imdb.tar.gz
INFO:pytorch_pretrained_bert.modeling:extracting archive file cache/imdb.tar.gz to temp dir /tmp/tmpy8yrsoqw
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 28996
}



In [66]:
eval_ds = TensorDataset(*dev_bert_input.values())
eval_sampler = SequentialSampler(eval_ds)
eval_dl = DataLoader(eval_ds, sampler=eval_sampler, batch_size=EVAL_BATCH_SIZE)

In [67]:
import numpy as np

def evaluate(model, eval_dl, eval_ids):
    model.eval()
    eval_loss = 0
    nb_eval_steps = 0
    preds = []
    
    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dl, desc="evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask, labels=None)

        # create eval loss and other metric required by the task
        if OUTPUT_MODE == "classification":
            loss_fct = CrossEntropyLoss()
            tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        elif OUTPUT_MODE == "regression":
            loss_fct = MSELoss()
            tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        
        if len(preds) == 0:
            preds.append(logits.detach().cpu().numpy())
        else:
            preds[0] = np.append(
                preds[0], logits.detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds = preds[0]
    
    if OUTPUT_MODE == "classification":
        preds = np.argmax(preds, axis=1)
    
    elif OUTPUT_MODE == "regression":
        preds = np.squeeze(preds)

    result = compute_metrics(TASK_NAME, eval_ids, preds)
    result['eval_loss'] = eval_loss
    output_eval_file = os.path.join(REPORTS_DIR, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in (result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    return result

In [68]:
eval_label_ids = dev_bert_input['label_ids'].numpy()

evaluate(model, eval_dl, eval_label_ids)

HBox(children=(IntProgress(value=0, description='evaluating', max=1250, style=ProgressStyle(description_width=…

INFO:root:***** Eval results *****
INFO:root:  task = imdb
INFO:root:  acc = 0.8611
INFO:root:  f1 = 0.8604160385890866
INFO:root:  mcc = 0.7224211301143488
INFO:root:  tp = 4281
INFO:root:  tn = 4330
INFO:root:  fp = 637
INFO:root:  fn = 752
INFO:root:  eval_loss = 0.3187084569394589





{'task': 'imdb',
 'acc': 0.8611,
 'f1': 0.8604160385890866,
 'mcc': 0.7224211301143488,
 'tp': 4281,
 'tn': 4330,
 'fp': 637,
 'fn': 752,
 'eval_loss': 0.3187084569394589}