In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import torch.nn.functional as F

In [3]:
import torch
import os

In [4]:
from tqdm import trange
from transformers import (GPT2Tokenizer, GPT2LMHeadModel)

I0107 06:35:24.310554 139899932444416 file_utils.py:32] TensorFlow version 2.0.0-rc1 available.
I0107 06:35:24.311983 139899932444416 file_utils.py:39] PyTorch version 1.1.0 available.


In [5]:
# Check library version
!pip list | grep -E 'transformers|torch'

torch                1.1.0                 
transformers         2.2.0                 


This notebook was tested with the following environment:

- torch                1.1.0                 
- transformers         2.2.0      

# Introduction

This notebook shows how to generate text with a GPT-2 model, including:
- Load model
- Set generate parameters
- Set context and make it into embedding
- Generate text
    

Tips:
- Update PyTorch to 1.1.0

**This notebook also comes with a post: [Text Generation with GPT-2 in Action](https://medium.com/@yingbiao/text-generation-with-gpt-2-in-action-174e0335e1f6)<br>
Feel free to check it out; I hope it helps.**

 **Set GPU env first**

In [6]:
# Count the visible CUDA devices, then run on the GPU when CUDA is usable and on the CPU otherwise.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Load Model

Load the model and the tokenizer.

In [7]:
# Local folder that holds the GPT-2 large model and tokenizer files.
# Recommended: download the model and tokenizer files into this folder first.
model_address = 'models/gpt2-large/'

In [8]:
# Download pytorch_model.bin : https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin
# Download config.json : https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json
model = GPT2LMHeadModel.from_pretrained(model_address)

I0107 06:35:39.164489 139899932444416 configuration_utils.py:149] loading configuration file models/gpt2-large/config.json
I0107 06:35:39.179713 139899932444416 configuration_utils.py:169] Model config {
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "initializer_range": 0.02,
  "is_decoder": false,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 1280,
  "n_head": 20,
  "n_layer": 36,
  "n_positions": 1024,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torchscript": false,
  "use_bfloat16": false,
  "vocab_size": 50257
}

I0107 06:35:39.184469 139899932444416 modeling_utils.py:384] loading weights file models/gpt2-large/pytorch_model.bin


In [9]:
# Download vocab file : https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json
# Download merges_file : https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt
tokenizer =  GPT2Tokenizer.from_pretrained(model_address)

I0107 06:36:44.005394 139899932444416 tokenization_utils.py:307] Model name 'models/gpt2-large/' not found in model shortcut name list (gpt2-large, gpt2-xl, gpt2-medium, gpt2, distilgpt2). Assuming 'models/gpt2-large/' is a path or url to a directory containing tokenizer files.
I0107 06:36:44.010035 139899932444416 tokenization_utils.py:336] Didn't find file models/gpt2-large/special_tokens_map.json. We won't load it.
I0107 06:36:44.012279 139899932444416 tokenization_utils.py:336] Didn't find file models/gpt2-large/added_tokens.json. We won't load it.
I0107 06:36:44.014819 139899932444416 tokenization_utils.py:336] Didn't find file models/gpt2-large/tokenizer_config.json. We won't load it.
I0107 06:36:44.016863 139899932444416 tokenization_utils.py:372] loading file None
I0107 06:36:44.018291 139899932444416 tokenization_utils.py:372] loading file None
I0107 06:36:44.019231 139899932444416 tokenization_utils.py:372] loading file models/gpt2-large/vocab.json
I0107 06:36:44.020401 13989

In [10]:
# Move the model to the selected device (the trailing `;` hides the cell output)
model.to(device);

In [11]:
# Add multi-GPU support: wrap the model in DataParallel when more than one GPU is visible
if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [12]:
# Set the model to evaluation mode (the trailing `;` hides the cell output)
model.eval();

## Set generate parameters

Before generating text with the model, we need to set some parameters for the process.

About the parameters:<br>
- Seed: integer used to seed the random number generators, so that a run can be reproduced
- Temperature: positive number that divides the logits before sampling; values above 1 make the output more random, values below 1 make it more predictable
- Max_len: integer, the number of new tokens the model will generate
- Top_k: integer; at each prediction step only the K most likely next tokens are kept as candidates
- Top_p: float between 0 and 1 (nucleus sampling); at each prediction step only the smallest set of most likely tokens whose cumulative probability exceeds this value is kept as candidates

### Set random seed and temperature

The random seed and the temperature control the randomness of the generation process: the seed makes a run repeatable, and the temperature scales how random the sampling is.

In [13]:
seed = 4

In [14]:
# Seed NumPy and PyTorch with the same value so the sampling below is repeatable
np.random.seed(seed)
torch.manual_seed(seed)
# Also seed every CUDA device when at least one GPU was detected
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

In [15]:
temperature = 1.0

### Set other parameters

In [16]:
max_len = 30

In [17]:
top_k=100

In [18]:
top_p=0.8

## Set context and make it into embedding

**Disclaimer: the demo text comes from [wikipedia](https://en.wikipedia.org/wiki/Apple_Inc.)**

In [19]:
# Demo prompt: the generated text will continue from this sentence
raw_text = "Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976."

### Text to embedding

Tokenize the text, then map each token to its id.

In [20]:
context_tokens = tokenizer.encode(raw_text)

W0107 06:37:11.859341 139899932444416 tokenization_utils.py:938] This tokenizer does not make use of special tokens. Input is returned with no modification.
W0107 06:37:11.860861 139899932444416 tokenization_utils.py:938] This tokenizer does not make use of special tokens. Input is returned with no modification.
W0107 06:37:11.863465 139899932444416 tokenization_utils.py:925] This tokenizer does not make use of special tokens.


In [21]:
context_tokens;

## Generate text

The functions below come from [huggingface's demo showcase](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py)

In [22]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

        Filtering is applied independently along the last dimension, so both a single
        distribution of shape (vocabulary size,) and a batch of shape
        (..., vocabulary size) are accepted.

        Args:
            logits: logits distribution, shape (vocabulary size,) or (..., vocabulary size).
                The tensor is modified IN PLACE and is also the return value; pass a
                clone if the unfiltered values are still needed.
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
                Values larger than the vocabulary size are clamped; 0 disables the filter.
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                The most likely token is always kept; 0.0 disables the filter.
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            filter_value: value written over every filtered-out logit (default -inf, which
                becomes probability 0 after a softmax).
        Returns:
            The same `logits` tensor, with the filtered-out entries set to `filter_value`.
        Raises:
            ValueError: if `logits` has no dimensions (there is no vocabulary axis to filter).
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    if logits.dim() < 1:
        raise ValueError("logits must have at least one dimension (the vocabulary axis)")

    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a logit lower than the k-th best one of the same row
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits.masked_fill_(logits < kth_best, filter_value)

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the mask to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # Apply the mask while still in sorted order, then write every value back to its
        # original position. `sorted_indices` is a permutation of each row, so kept logits
        # are restored unchanged and this works for any number of leading batch dimensions.
        sorted_logits = sorted_logits.masked_fill(sorted_indices_to_remove, filter_value)
        logits.scatter_(-1, sorted_indices, sorted_logits)
    return logits

In [23]:
def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, device='cpu'):
    '''Generate `length` new tokens after `context` with GPT-2.

    Args:
        model: callable language model; it is called as `model(input_ids=...)` and the first
            element of its output is indexed as (batch, sequence, vocabulary) logits.
        length: number of tokens to generate for each sample.
        context: sequence of token ids used as the prompt.
        num_samples: number of independent continuations to generate from the same prompt.
        temperature: the next-token logits are divided by this value before sampling.
            A value of 0 switches to greedy decoding (the most likely token is always
            picked) instead of dividing by zero.
        top_k: passed to `top_k_top_p_filtering` (0 disables top-k filtering).
        top_p: passed to `top_k_top_p_filtering` (0.0 disables nucleus filtering).
        device: device on which the token tensor is created.

    Returns:
        A long tensor of shape (num_samples, len(context) + length): each row holds the
        prompt followed by the generated token ids.
    '''
    context = torch.tensor(context, dtype=torch.long, device=device)
    # One row per requested sample, every row starting from the same prompt.
    generated = context.unsqueeze(0).repeat(num_samples, 1)
    greedy = temperature == 0

    with torch.no_grad():
        for _ in trange(length):
            outputs = model(input_ids=generated)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
            last_logits = outputs[0][:, -1, :]  # logits of the last position, for every sample
            if greedy:
                # Filtering never removes the best token, so argmax on the raw logits is enough.
                next_tokens = torch.argmax(last_logits, dim=-1).unsqueeze(-1)
            else:
                last_logits = last_logits / temperature
                picks = []
                # Filter and sample row by row: the filter is fed one 1-D distribution at a
                # time, and with num_samples=1 the random draws match the single-sample case.
                for row_logits in last_logits:
                    filtered_logits = top_k_top_p_filtering(row_logits, top_k=top_k, top_p=top_p)
                    picks.append(torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1))
                next_tokens = torch.stack(picks, dim=0)  # shape (num_samples, 1)
            generated = torch.cat((generated, next_tokens), dim=1)
    return generated

In [24]:
# Generate 
out = sample_sequence(model=model,length=max_len,context=context_tokens,num_samples=1,temperature=temperature,top_k=top_k,top_p= top_p,device=device)

100%|██████████| 30/30 [00:03<00:00,  9.83it/s]


In [25]:
# Parse the result: keep only the newly generated token ids (drop the prompt) and decode them.
# A separate name is used so that `out` keeps the full tensor and this cell can be re-run safely.
generated_tokens = out[0, len(context_tokens):].tolist()
text = tokenizer.decode(generated_tokens, clean_up_tokenization_spaces=True)

**Text generated by computer**

In [26]:
text

' The company was founded to create "a portable computer with an LCD display that could be operated by the hand, using a simple two-button mouse."'

**The context we wrote earlier**

In [27]:
raw_text

'Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976.'

**Full text: the context combined with the generated text**

In [28]:
# GPT-2 tokens carry their own leading whitespace, so the decoded continuation is appended
# directly; inserting an extra ' ' here produced a double space after the prompt.
full_text = raw_text + text

In [29]:
full_text

'Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976.  The company was founded to create "a portable computer with an LCD display that could be operated by the hand, using a simple two-button mouse."'