In [1]:
from fastai import *

In [2]:
import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer#, OpenAIGPTModel, OpenAIGPTLMHeadModel
from gpt import OpenAIGPTModel, OpenAIGPTLMHeadModel

# Load pre-trained model tokenizer (vocabulary)
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

# Tokenized input

text = "Who was Jim Henson ? Jim Henson was a puppeteer"
tokenized_text = tokenizer.tokenize(text)

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens]).cuda()

In [3]:
tokenized_text

['who</w>',
 'was</w>',
 'jim</w>',
 'hen',
 'son</w>',
 '?</w>',
 'jim</w>',
 'hen',
 'son</w>',
 'was</w>',
 'a</w>',
 'pupp',
 'ete',
 'er</w>']

In [4]:
indexed_tokens

[763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]

In [5]:
tokenizer.convert_ids_to_tokens(indexed_tokens)

['who</w>',
 'was</w>',
 'jim</w>',
 'hen',
 'son</w>',
 '?</w>',
 'jim</w>',
 'hen',
 'son</w>',
 'was</w>',
 'a</w>',
 'pupp',
 'ete',
 'er</w>']

In [6]:
# Load pre-trained model (weights)
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt').cuda()
model.eval()

# Predict all tokens
predictions = model(tokens_tensor)

In [7]:
model.config

{
  "afn": "gelu",
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_special": 0,
  "resid_pdrop": 0.1,
  "type_vocab_size": 2,
  "vocab_size": 40478
}

In [8]:
model.transformer.embed

Embedding(40990, 768)

In [9]:
model.lm_head

OpenAIGPTLMHead(
  (decoder): Linear(in_features=768, out_features=40479, bias=False)
)

In [9]:
predictions.shape

torch.Size([1, 14, 40479])

In [8]:
tokens_tensor.shape

torch.Size([1, 14])

In [10]:
predictions

tensor([[[-9.3879e+00, -7.1672e+00, -1.5829e+01,  ..., -1.3558e+01,
          -7.2946e-01,  1.0565e+00],
         [-7.6233e+00, -6.4463e+00, -1.5546e+01,  ..., -1.2833e+01,
          -3.8460e+00,  2.0839e+00],
         [-5.9411e+00, -5.6336e+00, -1.6422e+01,  ..., -1.3572e+01,
          -1.6187e+00,  8.3399e-01],
         ...,
         [-1.7412e+00,  1.9140e+00, -1.8032e+01,  ..., -1.5236e+01,
          -1.8405e+00,  1.4918e+00],
         [-2.0943e+00, -1.1625e+00, -2.1236e+01,  ..., -1.5415e+01,
          -2.2250e+00,  2.9565e+00],
         [-9.5570e+00, -6.0555e+00, -1.9492e+01,  ..., -1.3988e+01,
          -3.0488e+00,  1.3846e+00]]], device='cuda:0',
       grad_fn=<UnsafeViewBackward>)

In [11]:
torch.argmax(predictions, dim=-1).shape

torch.Size([1, 14])

In [12]:
predicted_index = torch.argmax(predictions, dim=-1)

In [13]:
predicted_index.cpu().tolist()

[[509, 538, 239, 826, 239, 244, 2298, 945, 509, 246, 762, 39041, 483, 239]]

In [14]:
tokenizer.convert_ids_to_tokens(predicted_index.cpu().tolist()[0])

['was</w>',
 "n't</w>",
 '.</w>',
 'ning</w>',
 '.</w>',
 '"</w>',
 'hen',
 'son</w>',
 'was</w>',
 'a</w>',
 'man</w>',
 'ete',
 'er</w>',
 '.</w>']

In [3]:
tokenized_text

['who</w>',
 'was</w>',
 'jim</w>',
 'hen',
 'son</w>',
 '?</w>',
 'jim</w>',
 'hen',
 'son</w>',
 'was</w>',
 'a</w>',
 'pupp',
 'ete',
 'er</w>']

In [20]:
# get the predicted last token
# predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_index = torch.argmax(predictions, dim=-1)
predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)[0]

KeyError: tensor([  509, 40484,   239,   826,   239,   244,  2298,   945,   509,   246,
          762, 39041,   483,   239], device='cuda:0')

In [11]:
predicted_index

239

In [10]:
predicted_token

'.</w>'

In [7]:
predicted_index

492363