In [1]:
import torch
import numpy as np
from text_utils import TextEncoder

In [2]:
from model_pytorch import TransformerModel, load_openai_pretrained_model, DEFAULT_CONFIG, LMModel

In [3]:
text_encoder = TextEncoder('./model/encoder_bpe_40000.json', './model/vocab_40000.bpe')
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

In [4]:
args = DEFAULT_CONFIG
n_special = 0
n_vocab = len(text_encoder.encoder)
n_ctx = 512
vocab = n_ctx + n_vocab
gen_len = 20
topk = 4

In [5]:
lm_model = LMModel(args, vocab, n_ctx, return_probs=True)

In [6]:
load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)

Loading weights...


In [7]:
print(lm_model.transformer.embed)

Embedding(40990, 768)


In [7]:
lm_model.to("cpu")
lm_model.eval()

LMModel(
  (transformer): TransformerModel(
    (embed): Embedding(40990, 768)
    (drop): Dropout(p=0.1)
    (h): ModuleList(
      (0): Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1)
          (resid_dropout): Dropout(p=0.1)
        )
        (ln_1): LayerNorm()
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1)
        )
        (ln_2): LayerNorm()
      )
      (1): Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1)
          (resid_dropout): Dropout(p=0.1)
        )
        (ln_1): LayerNorm()
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1)
        )
        (ln_2): LayerNorm()
      )
      (2): Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
      

In [8]:
def append_batch(X, next_idx):
    next_pos = X[:, -1:, 1] + 1
    next_x = torch.cat((next_idx, next_pos), -1).unsqueeze(1)
    return torch.cat((X, next_x), 1)

In [18]:
def make_batch(X):
    X = np.array(X)
    assert X.ndim in [1, 2]
    if X.ndim == 1:
        X = np.expand_dims(X, axis=0)
    pos_enc = np.arange(n_vocab + n_special, n_vocab + n_special + X.shape[-1])
#     print(pos_enc)
    pos_enc = np.expand_dims(pos_enc, axis=0)
#     print(pos_enc.shape)
    batch = np.stack([X, pos_enc], axis=-1)
#     print(batch, batch.shape)
    batch = torch.tensor(batch, dtype=torch.long).to(device)
    return batch

In [None]:
text = input('Input some beginning words:')
while text != 'q':
    X = text_encoder.encode([text,])
#     print(X)
    XMB = make_batch(X)

    for _ in range(gen_len):
        lm_probs = lm_model(XMB)
        print(lm_model.transformer)
#         print(lm_probs.shape)
        if topk == 0:
            next_idx = torch.multinomial(lm_probs[:, -1, :], 1)
        else:
            values, indices = lm_probs[:, -1, :].topk(topk)
            print(values[0, 0])
            next_idx = indices.gather(-1, torch.multinomial(values, 1))
        next_token = text_encoder.decoder[next_idx.item()].replace('</w>', '')
        print(next_token, end=' ')
        XMB = append_batch(XMB, next_idx)

    print()
    text = input('Input some beginning words:')

Input some beginning words:ss dd


                                                                                

TransformerModel(
  (embed): Embedding(40990, 768)
  (drop): Dropout(p=0.1)
  (h): ModuleList(
    (0): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (1): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (2): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): Layer

" TransformerModel(
  (embed): Embedding(40990, 768)
  (drop): Dropout(p=0.1)
  (h): ModuleList(
    (0): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (1): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (2): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): Lay

n't TransformerModel(
  (embed): Embedding(40990, 768)
  (drop): Dropout(p=0.1)
  (h): ModuleList(
    (0): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (1): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (2): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): L

hear TransformerModel(
  (embed): Embedding(40990, 768)
  (drop): Dropout(p=0.1)
  (h): ModuleList(
    (0): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (1): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (2): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): 

" TransformerModel(
  (embed): Embedding(40990, 768)
  (drop): Dropout(p=0.1)
  (h): ModuleList(
    (0): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (1): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (2): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): Lay

tensor(0.1244, grad_fn=<SelectBackward>)
my TransformerModel(
  (embed): Embedding(40990, 768)
  (drop): Dropout(p=0.1)
  (h): ModuleList(
    (0): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (1): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): LayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): LayerNorm()
    )
    (2): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout)