In [43]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [44]:
# Load the pretrained 'gpt2' checkpoint: first the tokenizer that maps text to
# token ids, then the language-model head that scores the next token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading: 100%|██████████| 548M/548M [00:40<00:00, 13.7MB/s]


In [45]:
# Turn the prompt into a PyTorch tensor of token ids. No special tokens are
# added, so the ids correspond to the prompt text only.
prompt_text = "a robot must obey the orders given"
encoded_prompt = tokenizer.encode(
    prompt_text,
    return_tensors="pt",
    add_special_tokens=False,
)
encoded_prompt

tensor([[   64,  9379,  1276, 22389,   262,  6266,  1813]])

In [46]:
# Run the model once over the prompt; gradients are not needed for inference.
with torch.no_grad():
    outputs = model(encoded_prompt)

# Index the result instead of tuple-unpacking it. Positional indexing works
# whether the library returns a plain tuple or a ModelOutput object, whereas
# unpacking a ModelOutput iterates over its field names rather than tensors.
# This also matches how the BERT cells in this notebook read their outputs.
prediction_scores = outputs[0]  # next-token scores, indexed [batch, position, vocab] below
past = outputs[1]               # second output element (presumably the key/value cache - confirm)

In [47]:
# Keep only the scores at the last prompt position of the first (only) batch
# row: one score per vocabulary entry for the token that would come next.
next_token_logits = prediction_scores[0, -1]
# Id of the highest-scoring candidate token.
torch.argmax(next_token_logits)

tensor(284)

In [48]:
# List the ten highest-scoring next tokens as text. The tokenizer's tokens use
# 'Ġ' where a space belongs, so it is swapped back for display.
[
    token.replace('Ġ', ' ')
    for token in tokenizer.convert_ids_to_tokens(
        next_token_logits.topk(k=10).indices
    )
]

[' to', ' by', ' it', ' him', ' them', ' in', '.', ',', ' the', ' and']

In [49]:
# Normalize the logits into probabilities with a softmax over their single
# dimension (dim 0), then read the probability assigned to token id 284.
torch.softmax(next_token_logits, dim=0)[284]

tensor(0.5773)

In [None]:
#============================================!
# Code from here on is for BERT multilingual
# Taken from my BERT practice file
#============================================!

In [51]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

In [52]:
# Load the tokenizer and masked-language-model weights for the
# 'bert-base-multilingual-cased' checkpoint.
# NOTE: this rebinds `tokenizer` and `model`; any objects previously bound to
# those names (the GPT-2 ones earlier in this notebook) are replaced.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')

In [9]:
# Example Spanish sentence, written with the [CLS]/[SEP] markers already in place.
init_text = "[CLS] es un dia hermoso. [SEP]"
# Same sentence with the word "dia" replaced by a single [MASK] token.
token_masked_text = "[CLS] es un [MASK] hermoso. [SEP]"
# Variant that contains only [MASK] tokens between [CLS] and [SEP].
full_masked_text = "[CLS] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [SEP]"

In [10]:
# Split the masked sentence into tokens and map them to vocabulary ids.
tokenized_text = tokenizer.tokenize(token_masked_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Single-sentence input: every token belongs to segment 0. The list is sized
# from the tokens rather than hard-coded, so it always matches the input
# length even if the sentence is edited.
segments_ids = [0] * len(indexed_tokens)
# Wrap both lists in an outer list to add the batch dimension.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensor = torch.tensor([segments_ids])

In [11]:
# Inference only, so skip gradient tracking for the forward pass.
with torch.no_grad():
    outputs = model(input_ids=tokens_tensor, token_type_ids=segments_tensor)
# The first output element holds the per-position vocabulary scores.
predictions = outputs[0]

In [12]:
# Find the [MASK] position from the ids that were fed to the model instead of
# hard-coding index 3, so this cell stays correct if the sentence changes.
mask_position = tokens_tensor[0].tolist().index(tokenizer.mask_token_id)
# Take the 100 highest-scoring vocabulary ids at that position and convert
# them to tokens in a single call.
top_ids = predictions[0, mask_position].topk(100).indices.tolist()
predicted_outputs = tokenizer.convert_ids_to_tokens(top_ids)
print(predicted_outputs)

['árbol', 'bien', 'flor', 'color', 'material', 'tall', 'compuesto', 'mineral', 'dato', '.', 'ser', 'fruto', 'Sol', 'órgano', 'suelo', 'pez', 'sistema', 'solo', 'buen', ':', 'elemento', 'género', 'un', 'pequeño', 'valor', 'herb', 'sol', ',', 'agua', 'sin', 'poco', 'sal', 'ar', 'es', 'cuerpo', 'pie', 'sabor', 'simple', '-', 'porte', 'tubo', '##ce', 'polen', 'carácter', 'bin', 'crecimiento', 'h', 'tamaño', 'canto', 'animal', 'timbre', 'alto', 'alimento', 'vigor', 'ácido', 'flores', 'niño', 'vas', 'lo', 'clima', 'árboles', 'tronco', 'componente', 'sencillo', 'todo', 'terreno', 'sello', 'muy', 'campo', 'principio', 'conjunto', 'cultivo', 'aceite', '##bust', 'municipio', 'origen', '##o', 'tipo', 'Bien', 'uso', 'al', 'vino', 'líquido', 'fuego', 'producto', 'este', 'escudo', 'del', 'no', 'tax', 'vuelo', 'viento', 'planta', 'tal', '##te', 'ave', 'frutos', 'verde', 'coche', 'oso']


In [17]:
# Encode a Korean prompt into a PyTorch tensor of token ids, without adding
# special tokens. This rebinds `prompt_text` and `encoded_prompt`.
prompt_text = "오늘도 저랑 만날래요?"
encoded_prompt = tokenizer.encode(
    prompt_text,
    return_tensors="pt",
    add_special_tokens=False,
)
encoded_prompt

tensor([[  9580, 118762,  12092,   9663,  62200,   9248,  41919,  37388,  48549,
            136]])

In [None]:
# The masked-LM model's output has no second "past" element (unlike the GPT-2
# cell above), so unpacking it into two names raises
# "ValueError: not enough values to unpack". Read the scores by index instead.
# Call the module itself rather than `.forward`, and skip gradient tracking
# since this is inference only.
with torch.no_grad():
    outputs = model(encoded_prompt)
prediction_scores = outputs[0]

In [13]:
# text_to_translate = "[CLS] I have a banana in Spanish: [MASK]"

In [14]:
# segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]
# for i in range(0, 10):
#     print(text_to_translate)
#     tokenized_text = tokenizer.tokenize(text_to_translate)
#     print(tokenized_text)

#     indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
#     tokens_tensor = torch.tensor([indexed_tokens])
#     segments_tensor = torch.tensor([segments_ids])

#     predicted_outputs = [tokenizer.convert_ids_to_tokens([index.item()])[0] for index in predictions[0, -1].topk(10).indices]
#     print(predicted_outputs)

#     text_to_translate = text_to_translate.replace("[MASK]", predicted_outputs[0])
#     text_to_translate += " [MASK]"

In [53]:

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def prepareInputs(init_text):
    """Add BERT's [CLS]/[SEP] markers to raw text.

    "[CLS] " is prepended, and " [SEP]" is inserted after every sentence
    terminator ('.', '?' or '!'). A run of consecutive terminators (e.g.
    "!!" or "...") counts as one sentence boundary and gets a single
    " [SEP]" after its last character.

    Args:
        init_text: The raw input string; may contain several sentences.

    Returns:
        The marked-up string, e.g. "a. b." -> "[CLS] a. [SEP] b. [SEP]".
    """
    # Characters that end a segment.
    punc_list = (".", "?", "!")
    prompt_text = "[CLS] " + init_text

    # Build the result in one pass over the original characters. Inserting
    # into the string while looping over its original length would stop
    # short once it grows, leaving later sentences without a [SEP].
    pieces = []
    last = len(prompt_text) - 1
    for i, char in enumerate(prompt_text):
        pieces.append(char)
        if char not in punc_list:
            continue
        # Only close the segment at the end of a run of terminators.
        run_continues = i < last and prompt_text[i + 1] in punc_list
        if not run_continues:
            pieces.append(" [SEP]")

    return "".join(pieces)

def createSegIDs(tokenized_text):
    """Assign a segment id to every token.

    Ids start at 0 and increase by one after each "[SEP]" token; the "[SEP]"
    itself keeps the id of the segment it closes.

    Args:
        tokenized_text: Sequence of token strings.

    Returns:
        A list of ints, one per token, in the same order.
    """
    seg_ids = []
    separators_seen = 0
    for tok in tokenized_text:
        seg_ids.append(separators_seen)
        # Tokens after a separator belong to the next segment.
        separators_seen += 1 if tok == "[SEP]" else 0
    return seg_ids

def addMask(tokenized_text, mask_word):
    """Replace every token equal to ``mask_word`` with "[MASK]".

    The token list is printed as received, then modified in place.

    Args:
        tokenized_text: Mutable list of token strings; edited in place.
        mask_word: Token to mask. Only tokens that equal it exactly match.

    Returns:
        A tuple ``(tokenized_text, mask_indices)``: the same list object that
        was passed in, and the positions that were masked.
    """
    print(tokenized_text)
    # Find all matching positions first, then overwrite them.
    mask_indices = [pos for pos, tok in enumerate(tokenized_text) if tok == mask_word]
    for pos in mask_indices:
        tokenized_text[pos] = "[MASK]"

    return (tokenized_text, mask_indices)

In [60]:
text = "es un árbol hermoso."
word_to_mask = "árbol"

# Add the [CLS]/[SEP] markers, tokenize, and derive segment ids.
prepped_text = prepareInputs(text)
tokenized_text = tokenizer.tokenize(prepped_text)
segment_ids = createSegIDs(tokenized_text)
# addMask edits `tokenized_text` in place, so `masked_text` is the same list.
masked_text, mask_indices = addMask(tokenized_text, word_to_mask)
# Stop here if nothing was masked (e.g. the word was split into several
# sub-word tokens). Otherwise no predictions would be computed below and a
# stale `predicted_outputs` from an earlier cell could be shown instead.
if not mask_indices:
    raise ValueError(
        f"{word_to_mask!r} does not appear as a single token in {tokenized_text}"
    )

print(prepped_text)
print(masked_text)

indexed_tokens = tokenizer.convert_tokens_to_ids(masked_text)
# Vocabulary id of the hidden word, used to look up its probability later.
masked_token_id = tokenizer.convert_tokens_to_ids(word_to_mask)
print(masked_token_id)
tokens_tensor = torch.tensor([indexed_tokens])
segment_tensor = torch.tensor([segment_ids])

with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segment_tensor)
    prediction_scores = outputs[0]

# Keep the top-10 predicted tokens for every masked position. Overwriting one
# variable on each loop pass would keep only the last mask's predictions.
predictions_by_position = {
    i: tokenizer.convert_ids_to_tokens(prediction_scores[0, i].topk(10).indices.tolist())
    for i in mask_indices
}
# Same value as before this change: the predictions for the last masked position.
predicted_outputs = predictions_by_position[mask_indices[-1]]

predictions_by_position

['[CLS]', 'es', 'un', 'árbol', 'her', '##mos', '##o', '.', '[SEP]']
[CLS] es un árbol hermoso. [SEP]
['[CLS]', 'es', 'un', '[MASK]', 'her', '##mos', '##o', '.', '[SEP]']
55220


ValueError: not enough values to unpack (expected 2, got 1)

In [58]:
# Vocabulary scores at the first masked position of the first batch row.
next_token_logits = prediction_scores[0, mask_indices[0]]

# Softmax over the vocabulary dimension, then the probability the model
# assigns to the word that was masked out.
torch.softmax(next_token_logits, dim=0)[masked_token_id]

tensor(0.4303)

In [61]:
import wordfreq

ModuleNotFoundError: No module named 'wordfreq'