# TODO

* use model described in:
https://colab.research.google.com/drive/1iDHCYIrWswIKp-n-pOg69xLoZO09MEgf#scrollTo=zjzTkJGl1J0l - Done

* build flask app

In [1]:
!pip install pandas tqdm
!pip install torch pytorch_pretrained_bert pytorch-ignite



In [1]:
import requests
import logging
import tarfile
import urllib
from tqdm import tqdm as tqdm

logger = logging.getLogger()

import sys
import os
from pathlib import Path

In [2]:
DATA_DIR = Path('./data').resolve()
IMDB = DATA_DIR/'aclImdb'
TEXT_COL = "text"
LABEL_COL = "label"

url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

def download_url(url:str, dest:str, overwrite:bool=True, show_progress=True, 
                 chunk_size=1024*1024, timeout=4, retries=5)->None:
    "Download `url` to `dest` unless it exists and not `overwrite`."
    dest = Path(dest)/os.path.basename(url)
    if os.path.exists(dest) and not overwrite: 
        print("File already existing")
        return

    s = requests.Session()
    s.mount('http://',requests.adapters.HTTPAdapter(max_retries=retries))
    u = s.get(url, stream=True, timeout=timeout)
    try: file_size = int(u.headers["Content-Length"])
    except: show_progress = False
    print(f"Downloading {url}")
    with open(dest, 'wb') as f:
        nbytes = 0
        if show_progress: 
            pbar = tqdm(range(file_size), leave=False)
        try:
            for chunk in u.iter_content(chunk_size=chunk_size):
                nbytes += len(chunk)
                if show_progress: pbar.update(nbytes)
                f.write(chunk)
        except requests.exceptions.ConnectionError as e:
            print(f"Download failed after {retries} retries.")
            import sys;sys.exit(1)
        finally:
            return str(dest)
        
def untar(file_path, dest:str):
    print(f"Untar {os.path.basename(file_path)} to {dest}")
    with tarfile.open(file_path) as tf:
        tf.extractall(path=str(dest))
    os.remove(file_path)
    return str(dest)

## Download imdb data

In [12]:
file_path = download_url(url, '/tmp', overwrite=True)
untar(file_path, DATA_DIR)

Downloading https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


HBox(children=(IntProgress(value=0, max=84125825), HTML(value='')))

Untar aclImdb_v1.tar.gz to /workspace/data


'/workspace/data'

In [3]:
!ls -lh $DATA_DIR

total 63M
drwxr-xr-x. 4 root root 105 Jun 26  2011 aclImdb
-rw-r--r--. 1 root root 32M Jul  4 14:43 dev.tsv
-rw-r--r--. 1 root root 32M Jul  4 14:43 train.tsv


## Read imdb data

In [4]:
import pandas as pd
import numpy as np
import re

def clean_html(raw):
    cleanr = re.compile('<.*?>')
    clean = re.sub(cleanr, '  ', raw)
    return re.sub(' +', ' ', clean)


def read_imdb(imdb_dir: str, text_col='text', label_col='label'):

    "Read imdb data to {'label', 'text'} format"
    imdb_dir = Path(imdb_dir)
    datasets = {}
    for t in ['train', 'test']:        
        texts, labels = [], []
        for p in ['pos', 'neg']:
            for file in tqdm((imdb_dir/'train'/p).glob("*.txt"), desc=f'reading {t}/{p}'):
                with open(file, 'r') as fin:
                    text = fin.readlines()[0].replace(r'\n', ' ')
                    text = clean_html(text).strip()
                    texts +=  [text]
                    labels += [p]
                    
        df = pd.DataFrame(
            {label_col: labels, text_col: texts})
        datasets[t] = df.sample(frac=1)

    return datasets

In [6]:
datasets = read_imdb(IMDB)

labels = list(set(datasets['train']['label'].tolist()))
label2int = {label: i for i, label in enumerate(labels)}

reading train/pos: 12500it [00:02, 4832.21it/s]
reading train/neg: 12500it [00:02, 5095.42it/s]
reading test/pos: 12500it [00:01, 9276.68it/s]
reading test/neg: 12500it [00:01, 9406.30it/s]


## Configs - args: base model parameters, adapt_args: fine-tuning parameters

In [7]:
from collections import namedtuple
import torch

LOG_DIR = "./logs/"
CACHE_DIR = "./cache/"

NUM_MAX_POSITIONS = 256
BATCH_SIZE = 32

device = "cuda" if torch.cuda.is_available() else "cpu"

AdaptationConfig = namedtuple('AdaptationConfig',
  field_names="num_classes, dropout, initializer_range, batch_size, lr, max_norm, n_epochs,"
              "n_warmup, valid_pct, gradient_accumulation_steps, device,"
              "log_dir, dataset_cache")
adapt_args = AdaptationConfig(
               2          , 0.1    , 0.02             , 32        , 6.5e-5, 1.0   , 3,
               10      , 0.1           , 1, device,
               LOG_DIR, CACHE_DIR+'dataset_cache.bin')

## Load OpenAI GPT

In [8]:
from pytorch_pretrained_bert import (OpenAIGPTTokenizer, OpenAIGPTModel,
                                     OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)

from pytorch_pretrained_bert.modeling_openai import OpenAIGPTPreTrainedModel, OpenAIGPTMultipleChoiceHead


In [77]:
import torch
import torch.nn as nn

class GPTFromPretrainedWithClfHead(nn.Module):
    
    def __init__(self,  base_model, fine_tuning_config, output_attentions=False, keep_multihead_output=False):
        super().__init__()
        self.transformer = OpenAIGPTModel.from_pretrained(base_model)
        embed_dim = self.transformer.tokens_embed.weight.shape[-1]
        
        self.transformer.output_attentions = output_attentions
        self.transformer.keep_multihead_output = keep_multihead_output
        
        self.config = fine_tuning_config
        self.classification_head = nn.Linear(embed_dim, self.config.num_classes)
        
        # init only clf head
        self.init_weights(self.classification_head)
        
    def init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding, nn.LayerNorm)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if isinstance(module, (nn.Linear, nn.LayerNorm)) and module.bias is not None:
            module.bias.data.zero_()
    
    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
        
        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
        self.transformer.set_num_special_tokens(num_special_tokens)
    
    def forward(self, input_ids, clf_labels=None, position_ids=None, token_type_ids=None):
        """
            - input_ids: [batch_size, seq_len], word embeddings of sequence
            - position_ids: same shape, position ids in range [0, config.n_positions -1[, position embeddings
            - token_type_ids: same shape, optional for third type of embedding to input tokens in sequences
        
        output: 
         - hidden_states: list of all encoded hidden states in the model (length: num layers+1 for the output of embeddings),
                          [batch_size, seq_len, hidden_size]
        """
        hidden_states = self.transformer(input_ids, position_ids=position_ids, 
                                         token_type_ids=token_type_ids)
        
        # hidden_states [B, S, H] - pick last along 2nd dimension
        hidden_states = hidden_states[:, -1, :]
        logits = self.classification_head(hidden_states)
        
        if clf_labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, logits.size(-1)), clf_labels.view(-1))
            return logits, loss
        return logits
        

## DataProcessor

In [88]:
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader
import numpy as np
import warnings
from tqdm import tqdm_notebook as tqdm


class DataProcessor:
    
    CLS = '[CLS]'
    PAD = '[PAD]'
    
    def __init__(self, tokenizer, label2id, num_max_positions=512):
        self.tokenizer=tokenizer
        self.label2id = label2id
        self.num_labels = len(label2id)
        self.num_max_positions = num_max_positions
        
    
    def process_example(self, example):
        assert len(example) == 2
        label, text = example[0], example[1]
        assert isinstance(text, str)
        
#         with warnings.catch_warnings():
#             warnings.simplefilter("ignore")
        tokens = self.tokenizer.tokenize(text)
        if len(tokens) >= self.num_max_positions:
            tokens = tokens[:self.num_max_positions-1] 
            ids =  self.tokenizer.convert_tokens_to_ids(tokens) + [self.tokenizer.vocab[self.CLS]]
        else:
            pad = [self.tokenizer.vocab[self.PAD]] * (self.num_max_positions-len(tokens)-1)
            ids =  self.tokenizer.convert_tokens_to_ids(tokens) + [self.tokenizer.vocab[self.CLS]] + pad
        
#         ids =  self.tokenizer.convert_tokens_to_ids(tokens)         
#         if len(ids) >= self.num_max_positions:
#             ids = ids[:self.num_max_positions-1] + [self.tokenizer.vocab[self.CLS]]
#         else:
#             pad = [self.tokenizer.vocab[self.PAD]] * (self.num_max_positions-len(ids)-1)
#             ids = ids + [self.tokenizer.vocab[self.CLS]] + pad
        return ids, self.label2id[label]
    

def create_dataset(df, processor, batch_size=32, shuffle=False, valid_pct=None, 
                   text_col="text", label_col="label"):
    "Process rows in `df` with `processor` and return a DataLoader"
    
    features, labels = [], []
    
    for i, row in tqdm(df.iterrows(), total=len(df)):
        ids, lbl = processor.process_example([row.label, row.text])
        features += [ids]
        labels += [lbl]
    
    dataset = TensorDataset(
                    torch.tensor(features, dtype=torch.long), 
                    torch.tensor(labels, dtype=torch.long))
    
    if valid_pct is not None:
        
        valid_size = int(valid_pct * len(df))
        train_size = len(df) - valid_size
        valid_dataset, train_dataset = random_split(dataset, [valid_size, train_size])
        
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)    
        return train_loader, valid_loader

    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return data_loader
    

In [89]:
from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
processor = DataProcessor(tokenizer, label2int, num_max_positions=NUM_MAX_POSITIONS)

In [90]:
df_train = datasets["train"].sample(500)
df_test = datasets["test"].sample(500)

In [91]:
train_dl, valid_dl = create_dataset(df_train, processor, batch_size=adapt_args.batch_size, 
                                    valid_pct=adapt_args.valid_pct)

test_dl = create_dataset(df_test, processor, batch_size=adapt_args.batch_size, valid_pct=None)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




## Init model

In [93]:
base_model = "openai-gpt"

tokenizer = OpenAIGPTTokenizer.from_pretrained(base_model)

FineTuningConfig = namedtuple('FineTuningConfig', 
                              field_names="base_model, num_classes, initializer_range")

fine_tuning_config = FineTuningConfig(base_model, 2, 0.02)


smodel = GPTFromPretrainedWithClfHead(fine_tuning_config.base_model, fine_tuning_config)

In [94]:
def get_num_params(model):
    import numpy as np
    mp = filter(lambda p: p.requires_grad, model.parameters())
    return sum(np.prod(p.size()) for p in mp)


In [61]:
adaptation_model.classification_head

Linear(in_features=410, out_features=2, bias=True)

In [66]:
input = input.to(device)
adaptation_model(input, clf_tokens_mask = (input==101))

> <ipython-input-57-a45d39264aef>(78)forward()
-> if clf_labels is not None:
(Pdb) clf_logits.shape
torch.Size([8, 2])
(Pdb) exit


BdbQuit: 

## Prepare fine-tuning loop

In [87]:
from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage, Accuracy 
from ignite.handlers import ModelCheckpoint
from ignite.contrib.handlers import CosineAnnealingScheduler, PiecewiseLinear, create_lr_scheduler_with_warmup, ProgressBar

optimizer = torch.optim.Adam(adaptation_model.parameters(), lr=adapt_args.lr)

def update(engine, batch):
    
    adaptation_model.train()
    inputs, labels = (t.to(adapt_args.device) for t in batch)
    
    inputs = inputs.transpose(0, 1).contiguous() # [S, B]
    _, loss = adaptation_model(inputs, 
                               clf_tokens_mask = (inputs == tokenizer.vocab[processor.CLS]), 
                               clf_labels=labels)
    loss = loss / adapt_args.gradient_accumulation_steps
    loss.backward()
    
    torch.nn.utils.clip_grad_norm_(adaptation_model.parameters(), adapt_args.max_norm)
    if engine.state.iteration % adapt_args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()

def inference(engine, batch):
    adaptation_model.eval()
    with torch.no_grad():
        batch, labels = (t.to(adapt_args.device) for t in batch)
        inputs = batch.transpose(0, 1).contiguous()
        logits = adaptation_model(inputs,
                                  clf_tokens_mask = (inputs == tokenizer.vocab[processor.CLS]),
                                  padding_mask = (batch == tokenizer.vocab[processor.PAD]))
    return logits, labels
                              
trainer = Engine(update)
evaluator = Engine(inference)

# Attache metric to evaluator & evaluation to trainer: evaluate on valid set after each epoch
Accuracy().attach(evaluator, "accuracy")

@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    evaluator.run(valid_dl)
    print(f"validation epoch: {engine.state.epoch} acc: {100*evaluator.state.metrics['accuracy']}")
          
# Learning rate schedule: linearly warm-up to lr and then to zero
scheduler = PiecewiseLinear(optimizer, 'lr', [(0, 0.0), (adapt_args.n_warmup, adapt_args.lr),
                                              (len(train_dl)*adapt_args.n_epochs, 0.0)])
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)


# Add progressbar with loss
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
ProgressBar(persist=True).attach(trainer, metric_names=['loss'])

# Save checkpoints and finetuning config
checkpoint_handler = ModelCheckpoint(adapt_args.log_dir, 'finetuning_checkpoint', 
                                     save_interval=1, require_empty=False)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': adaptation_model})
torch.save(args, os.path.join(adapt_args.log_dir, 'fine_tuning_args.bin'))          

## Lets fine-tune on imdb!

In [88]:
%%time 

trainer.run(train_dl, max_epochs=1)

HBox(children=(IntProgress(value=0, max=141), HTML(value='')))

validation epoch: 1 acc: 84.39999999999999
CPU times: user 23.2 s, sys: 41.7 s, total: 1min 4s
Wall time: 1min 16s


<ignite.engine.engine.State at 0x7f7f004e0fd0>

In [89]:
evaluator.run(test_dl)
print(f"test results - acc: {100*evaluator.state.metrics['accuracy']:.3f}")

test results - acc: 86.860


In [162]:
import torch.nn.functional as F


def predict(model, tokenizer, int2label, input="test"):
    tok = tokenizer.tokenize(input)
    ids = tokenizer.convert_tokens_to_ids(tok) + [tokenizer.vocab['[CLS]']]
    tensor = torch.tensor(ids, dtype=torch.long)
    tensor = tensor.to(device)
    tensor = tensor.reshape(1, -1)
    tensor_in = tensor.transpose(0, 1).contiguous() # [S, 1]
    logits = adaptation_model(tensor_in,
                              clf_tokens_mask = (tensor_in == tokenizer.vocab['[CLS]']),
                              padding_mask = (tensor == tokenizer.vocab['[PAD]']))
    val, _ = torch.max(logits, 0)
    val = F.softmax(val, dim=0).detach().cpu().numpy()    
    return {int2label[val.argmax()]: val.max(),
            int2label[val.argmin()]: val.min()}

In [163]:
int2label = {i:label for label,i in label2int.items()}

predict(adaptation_model, tokenizer, int2label, input = "This movie is poorly directed")

{'neg': 0.58218247, 'pos': 0.41781756}

In [164]:
predict(adaptation_model, tokenizer, int2label, input = "I just love how the actors are playing")

{'pos': 0.98019135, 'neg': 0.019808643}

## Build flask app

In [None]:
!wget https://bottlepy.org/bottle.py