In [1]:
# Reload the autoreload extension and re-import edited modules before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [49]:
import numpy as np
import torch.nn.functional as F

In [2]:
import torch
import os

In [3]:
from tqdm import trange
from pytorch_transformers import (GPT2Tokenizer, GPT2LMHeadModel)

# Introduction

This notebook shows how to generate text with the GPT-2 model, including:
- Load the model
- Set the generation parameters
- Set the context and encode it into token ids
- Generate text
    

Tips:
- Update PyTorch to 1.1.0
- Update pytorch_transformers to 1.0.0

**This notebook also comes with a post: [Text Generation with GPT-2 in Action](https://medium.com/@yingbiao/text-generation-with-gpt-2-in-action-174e0335e1f6)<br>
Feel free to check it out; I hope it helps.**

 **Set GPU env first**

In [5]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Number of CUDA devices reported by PyTorch; used later for seeding and DataParallel.
n_gpu = torch.cuda.device_count()

## Load Model

Load the model and the tokenizer.

In [6]:
# This notebook uses the GPT-2 large checkpoint.
# Recommended: download the model and tokenizer files into a local folder first.
model_address = 'models/gpt2-large/'

In [7]:
# Download pytorch_model.bin: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin
# Download config.json: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json
# Load the language model (with LM head) from the local folder.
model = GPT2LMHeadModel.from_pretrained(model_address)

In [8]:
# Download vocab file: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json
# Download merges file: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt
# Load the tokenizer from the same local folder as the model.
tokenizer =  GPT2Tokenizer.from_pretrained(model_address)

In [9]:
# Move the model to the selected device (the trailing `;` suppresses the cell output).
model.to(device);

In [10]:
# Add multi-GPU support.
# The isinstance guard keeps this cell idempotent: re-running it does not wrap
# an already wrapped model in a second DataParallel layer.
if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
    model = torch.nn.DataParallel(model)

In [11]:
# Switch the model to evaluation mode (the trailing `;` suppresses the cell output).
model.eval();

## Set generation parameters

Before generating text with the model, we need to set some parameters for the process.

About the parameters:<br>
- Seed: integer used to seed the NumPy and PyTorch random number generators, so that a run can be repeated
- Temperature: float that divides the next-token logits before sampling; values above 1 flatten the distribution (more random output), values below 1 sharpen it (more predictable output)
- Max_len: integer, the number of tokens the model will generate
- Top_k: integer; at each prediction step only the K most likely candidate tokens are kept for sampling
- Top_p: float; at each prediction step only the smallest set of most likely tokens whose cumulative probability exceeds this value is kept for sampling (nucleus sampling)

### Set random seed and temperature

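The random seed and the temperature together control the randomness of the generation process: the seed makes a run repeatable, and the temperature controls how random each sampling step is.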

In [15]:
# Seed used for the NumPy and PyTorch random number generators.
seed = 4

In [16]:
# Seed every random number generator in play: PyTorch (CPU), NumPy,
# and all CUDA devices when at least one GPU is present.
torch.manual_seed(seed)
np.random.seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

In [17]:
# Divisor applied to the next-token logits in sample_sequence; 1.0 leaves them unchanged.
temperature = 1.0

### Set other parameters

In [40]:
# Number of tokens to generate after the context (passed as `length` to sample_sequence).
max_len = 30

In [19]:
# Top-k filtering: keep only the 100 highest-scoring candidate tokens at each step.
top_k=100

In [20]:
# Nucleus (top-p) filtering: cumulative-probability threshold applied at each step.
top_p=0.8

## Set context and encode it into token ids

**Disclaimer: the demo text came from [wikipedia](https://en.wikipedia.org/wiki/Apple_Inc.)**

In [50]:
# Demo prompt: the generated text will continue from this sentence.
raw_text = "Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976."

### Text to token ids

Tokenize the text then map token to id

In [33]:
# Encode the prompt into the token ids that are later passed to sample_sequence as `context`.
context_tokens = tokenizer.encode(raw_text)

In [34]:
# The trailing `;` suppresses the display of the ids; remove it to inspect them.
context_tokens;

## Generate text

The functions below came from [huggingface's demo showcase](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py)

In [35]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering

        Accepts a single distribution of shape (vocabulary size) as well as a batch
        of shape (..., vocabulary size); filtering is applied independently along
        the last dimension.

        NOTE: `logits` is modified in place and the same tensor is returned. Pass
        `logits.clone()` if the unfiltered values are still needed afterwards.

        Args:
            logits: tensor of logits whose last dimension indexes the vocabulary.
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
                Tokens tied with the k-th highest logit are kept as well.
            top_p > 0.0: keep the smallest set of most likely tokens whose cumulative
                probability exceeds top_p (nucleus filtering). The most likely token
                is always kept.
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            filter_value: value written over every removed logit.
        Returns:
            The input tensor, with every removed entry set to `filter_value`.
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a logit lower than the k-th highest logit of their row
        kth_highest = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_highest] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the mask to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # Map the mask from sorted order back to vocabulary order. Scattering along
        # the last dimension works for one distribution and for a batch alike.
        indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits

In [36]:
def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0,  device='cpu'):
    '''Generate text with GPT-2 by sampling one token at a time.

    Args:
        model: language model called as ``model(input_ids=...)``; the first element
            of its output is read as the logits, indexed as
            (sample, position, vocabulary).
        length: number of tokens to append to the context.
        context: sequence of token ids used as the prompt.
        num_samples: number of continuations sampled in parallel from the same context.
        temperature: divisor applied to the next-token logits before filtering.
        top_k: forwarded to top_k_top_p_filtering.
        top_p: forwarded to top_k_top_p_filtering.
        device: device on which the context tensor is created.

    Returns:
        Long tensor of shape (num_samples, len(context) + length): the context
        followed by the sampled tokens, one row per sample.
    '''
    context = torch.tensor(context, dtype=torch.long, device=device)
    generated = context.unsqueeze(0).repeat(num_samples, 1)

    with torch.no_grad():
        for _ in trange(length):
            # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
            outputs = model(input_ids=generated)
            # Take the last-position logits of EVERY sample; reading only row 0
            # would break generation as soon as num_samples > 1.
            next_token_logits = outputs[0][:, -1, :] / temperature

            # Filter and sample row by row, so each sample gets its own token.
            next_tokens = []
            for sample_logits in next_token_logits:
                filtered_logits = top_k_top_p_filtering(sample_logits, top_k=top_k, top_p=top_p)
                next_tokens.append(torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1))

            # (num_samples, 1): one new token per row, appended along the sequence axis.
            next_token = torch.stack(next_tokens, dim=0)
            generated = torch.cat((generated, next_token), dim=1)
    return generated

In [42]:
# Sample a continuation of the encoded prompt with the parameters set above.
out = sample_sequence(
    model=model,
    length=max_len,
    context=context_tokens,
    num_samples=1,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
    device=device,
)

100%|██████████| 30/30 [00:02<00:00, 10.16it/s]


In [43]:
# Parse the result: drop the prompt tokens and keep only the newly generated ids.
# A separate name is used instead of rebinding `out`, so this cell can be re-run
# without failing on an `out` that has already been turned into a list.
generated_tokens = out[0, len(context_tokens):].tolist()
text = tokenizer.decode(generated_tokens, clean_up_tokenization_spaces=True)

**Text generated by computer**

In [44]:
# Display the decoded continuation.
text

' Its stock began trading on the New York Stock Exchange on June 1, 1978.\n\nIt was the first computer company to be listed on the NAS'

**The context we wrote earlier**

In [46]:
# Display the original prompt.
raw_text

'Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976.'

**Full text: the context combined with the generated text**

In [47]:
# The decoded continuation already carries its own leading whitespace (GPT-2
# tokens include the preceding space), so concatenate directly; inserting an
# extra ' ' produced a double space after the prompt.
full_text = raw_text + text

In [52]:
# Display the prompt followed by the generated continuation.
full_text

'Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976.  Its stock began trading on the New York Stock Exchange on June 1, 1978.\n\nIt was the first computer company to be listed on the NAS'