In [4]:
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2LMHeadModel

# Load the pretrained GPT-2 byte-pair tokenizer and language-model head.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').cuda()
# This notebook only runs inference. pytorch_pretrained_bert's from_pretrained
# is not expected to switch the module to eval mode, so do it explicitly to
# disable dropout; otherwise repeated calls on the same prompt give different
# scores.
model.eval()

In [433]:
import torch
import torch.nn.functional as F
import random

def predict_next(text, include_only=None, topk=10):
    """Return the most likely next tokens for `text`, best candidate first.

    Args:
        text: Prompt string; encoded with the module-level `tokenizer`.
        include_only: Optional iterable of strings. When given and non-empty,
            candidates are restricted to the first token of each string's
            encoding, and at most that many results are returned.
        topk: Maximum number of candidates to return.

    Returns:
        A list of decoded token strings ordered from highest to lowest score.

    Raises:
        ValueError: If `text` encodes to no tokens.
    """
    prompt_ids = tokenizer.encode(text)
    if not prompt_ids:
        raise ValueError('text must encode to at least one token')

    # Follow whatever device the model lives on instead of hard-coding .cuda().
    device = next(model.parameters()).device
    toks = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        scores = model(toks)[0][0, -1].view(-1)

    k = min(topk, scores.numel())
    if include_only:
        allowed = sorted({tokenizer.encode(x)[0] for x in include_only})
        include_ids = torch.tensor(allowed, dtype=torch.long, device=device)
        # `scores` is 1-D, so index it with a single index tensor. Everything
        # outside the allowed set is masked to -inf on a fresh tensor rather
        # than overwriting the model output in place.
        masked = torch.full_like(scores, float('-inf'))
        masked[include_ids] = scores[include_ids]
        scores = masked
        # Never pad the result with excluded tokens.
        k = min(k, len(allowed))

    indices = torch.topk(scores, k)[1].tolist()
    return [tokenizer.decode([idx]) for idx in indices]

def eval_specific(text, raw_toks):
    """Score specific candidate continuations of `text`.

    Only the first token of each candidate's encoding is scored. The score is
    the model's raw output value for that token at the last position of the
    prompt; no softmax is applied here, so the values are not probabilities.

    Args:
        text: Prompt string; encoded with the module-level `tokenizer`.
        raw_toks: Iterable of candidate strings (e.g. ' writer', ' artist').

    Returns:
        A list of `(candidate, score)` tuples sorted from highest to lowest
        score.
    """
    # Materialise once so a generator argument is not exhausted before pairing.
    candidates = list(raw_toks)
    first_ids = [tokenizer.encode(candidate)[0] for candidate in candidates]

    # Follow whatever device the model lives on instead of hard-coding .cuda().
    device = next(model.parameters()).device
    toks = torch.tensor(tokenizer.encode(text), device=device).unsqueeze(0)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        model_out = model(toks)[0][0, -1].view(-1)

    scored = [(candidate, model_out[tok_id].item())
              for candidate, tok_id in zip(candidates, first_ids)]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

def greedy_predict(text, n=10, sample=False, topk=10):
    """Extend `text` by `n` predicted tokens and return the resulting string.

    Each step asks `predict_next` for the `topk` best candidates given the
    text generated so far. With `sample=False` the best candidate is appended;
    with `sample=True` one of the `topk` candidates is picked uniformly at
    random. The loop always runs `n` times; it does not stop early on any
    special token.
    """
    generated = text
    for _ in range(n):
        candidates = predict_next(generated, topk=topk)
        if sample:
            piece = random.choice(candidates)
        else:
            piece = candidates[0]
        generated = generated + piece
    return generated

def beam_predict(text, n=10, w=4, active_h=1000):
    """Unfinished beam-search stub.

    NOTE(review): only the initial candidate list is created. `n`, `w` and
    `active_h` are never read, and the function implicitly returns None.
    Presumably `n` is the number of generation steps and `w` the beam width;
    the meaning of `active_h` cannot be determined from this file. Confirm
    before implementing.
    """
    open_set = [text]  # initial candidate list containing only the prompt
    

In [564]:
# Scratch cell: alternative experiments are kept commented out; only the
# sampled generation is executed.
# NOTE(review): sample=True picks candidates with random.choice and no seed is
# set in the visible cells, so the printed text will differ between runs.
# predict_next('http://youtube.com/user/')
print(greedy_predict('The movie was bad', n=256, topk=5, sample=True))#, include_only='abcdefghijklmnopqrstuvwxyz')
# eval_specific('Bill Gates created M', [' writer', ' artist', ' scientist', ' pianist', ' failure', ' politician', 
#                                    ' president', ' singer', ' director', ' success', ' genius', ' magnate'])

The movie was bad for me, it's bad enough for you. You can see how I'm feeling right now."<|endoftext|> "And then it just hit home, because the way it was set and the fact that you can watch it for hours at an time was a big plus because there were no subtitles."

,
.<|endoftext|>

, <</endofchapter>><|endoftext|> <</p> "I don. Well."</<p>"You know what? I don't think so." <span class=">I think I'm going through the whole 'what the f-- is happening to you, and you don't understand' and then I don`T know."> <p> "So you just don�T care, you know what? I'm not a fan and it�m really hard for me."<span id='favorit-2'>
.
 (end) "But you're a good guy!"" <p class=="fan-link-1\" />

. ( end )
. (end of text)<|endoftext|> "You can. But I think. You just aren`tee good enough."
 (p class='comment'/> <</span class> <\/comment></html> <</span


In [468]:
# Inspect how the tokenizer splits a sentence that ends in a misspelled name.
text = 'year 1905 served to be importantly albert einsteiner'
# Look each token id up in tokenizer.decoder to see the raw vocabulary string
# behind it instead of the decoded text.
[tokenizer.decoder[x] for x in tokenizer.encode(text)]

['year',
 'Ġ1905',
 'Ġserved',
 'Ġto',
 'Ġbe',
 'Ġimportantly',
 'Ġal',
 'bert',
 'Ġe',
 'instein',
 'er']

In [129]:
# Look up the raw vocabulary string stored for token id 1096.
tokenizer.decoder[1096]

'ize'

In [137]:
# Materialise every submodule yielded by model.modules() so the list can be
# filtered and indexed in the following cells.
modules = list(model.modules())

In [151]:
import torch.nn as nn
from pytorch_pretrained_bert.modeling_gpt2 import Conv1D
# list(map(type, modules))
# Keep only the Conv1D layers. NOTE(review): despite the name, this selects
# every Conv1D in the model, not just attention ones -- presumably the MLP
# projections are included as well; confirm before relying on the name.
attn_mods = [mod for mod in modules if isinstance(mod, Conv1D)]

In [157]:
# Total number of parameter elements across all collected Conv1D modules.
sum(param.numel() for mod in attn_mods for param in mod.parameters())

85017600

In [170]:
# Weight shape of the fourth collected Conv1D module.
attn_mods[3].weight.size()

torch.Size([3072, 768])

In [163]:
# Number of Conv1D modules found in the model.
len(attn_mods)

48

In [182]:
# Check which token ids alternating newlines and single spaces are encoded to.
tokenizer.encode('\n \n \n \n \n \n')

[198, 220, 198, 220, 198, 220, 198, 220, 198, 220, 198]

In [196]:
# Raw vocabulary string stored for token id 220, the id that appears between
# the newline ids when encoding the newline/space string elsewhere in this
# notebook.
tokenizer.decoder[220]

'Ġ'