In [None]:
#!pip install torch
#!pip install transformers

In [2]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

In [3]:
def get_predictions(model, tokenizer, sentence):
    """Tokenize ``sentence`` and return the first element of the model output.

    Args:
        model: Callable invoked with the encoded token ids; its result must
            support ``[0]`` indexing.
        tokenizer: Object exposing ``encode(text, return_tensors="pt")``.
        sentence: Text to feed to the model.

    Returns:
        ``model(token_ids)[0]``, computed with gradient tracking disabled.
        NOTE(review): for a causal LM this is presumably the logits tensor of
        shape (batch, sequence, vocab) -- confirm against the model's docs.
    """
    token_ids = tokenizer.encode(sentence, return_tensors="pt")
    # Inference only: no autograd graph is needed, so skip building one.
    with torch.no_grad():
        return model(token_ids)[0]

In [None]:
# Checkpoint shared by the tokenizer and the model. Keeping the name in one
# place guarantees both are loaded from the same checkpoint when it is changed.
# alternative: gpt2-xl, and some others
MODEL_NAME = 'gpt2-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

In [5]:
def predict_next_word(model, tokenizer, prefix, top_k=20):
    """Print the most likely next tokens after ``prefix`` with their probabilities.

    Args:
        model: Language model, forwarded to ``get_predictions``.
        tokenizer: Tokenizer forwarded to ``get_predictions``; its
            ``decode`` method turns each candidate id back into text.
        prefix: Text whose continuation is predicted.
        top_k: How many candidates to print (default 20). Capped at the
            number of scores available so small vocabularies do not make
            ``torch.topk`` fail.

    Returns:
        None. One line per candidate is printed, highest score first.
    """
    predictions = get_predictions(model, tokenizer, prefix)

    # scores for the position following the last prefix token (first batch entry)
    next_token_scores = predictions[0, -1, :]
    # normalise over every candidate so the printed numbers are probabilities
    # of the full distribution, not just of the displayed subset
    probabilities = torch.nn.functional.softmax(next_token_scores, dim=-1)
    # never request more entries than the score vector holds
    k = min(top_k, next_token_scores.shape[-1])
    # topk returns its results sorted in descending order by default
    top_indices = torch.topk(next_token_scores, k).indices
    top_probabilities = probabilities[top_indices].tolist()
    # decode each id on its own; strip() drops surrounding whitespace, so
    # tokens that differ only by whitespace are printed identically
    top_tokens = [tokenizer.decode([idx]).strip() for idx in top_indices.tolist()]
    # print the candidates with their probabilities
    for token, probability in zip(top_tokens, top_probabilities):
        print(f'token: {token.ljust(30)}\t prob: {round(probability, 3)}')

In [6]:
# Open-ended prompt: list the model's most likely next tokens after "been to".
prefix = "I heard that Germany is really amazing! Have you ever been to"
predict_next_word(model, tokenizer, prefix)

token: Germany                       	 prob: 0.495
token: Berlin                        	 prob: 0.174
token: the                           	 prob: 0.049
token: a                             	 prob: 0.022
token: Munich                        	 prob: 0.022
token: Hamburg                       	 prob: 0.016
token: Frankfurt                     	 prob: 0.011
token: it                            	 prob: 0.009
token: there                         	 prob: 0.009
token: Cologne                       	 prob: 0.008
token: Europe                        	 prob: 0.007
token: this                          	 prob: 0.006
token: Vienna                        	 prob: 0.005
token: one                           	 prob: 0.005
token: D                             	 prob: 0.004
token: their                         	 prob: 0.004
token: that                          	 prob: 0.003
token: Dresden                       	 prob: 0.003
token: Bav                           	 prob: 0.003
token: G                       

In [7]:
# Same prompt with "Berlin" appended, to see what the model predicts once
# that word has been chosen as the continuation.
prefix = "I heard that Germany is really amazing! Have you ever been to Berlin"
predict_next_word(model, tokenizer, prefix)

token: ?                             	 prob: 0.528
token: ?"                            	 prob: 0.306
token: ,                             	 prob: 0.032
token: or                            	 prob: 0.021
token: before                        	 prob: 0.018
token: and                           	 prob: 0.012
token: yet                           	 prob: 0.007
token: ?!                            	 prob: 0.007
token: ??                            	 prob: 0.005
token: ?",                           	 prob: 0.005
token: ?!"                           	 prob: 0.005
token: ?                             	 prob: 0.005
token: ?'                            	 prob: 0.003
token: ?".                           	 prob: 0.003
token: in                            	 prob: 0.002
token: !?"                           	 prob: 0.002
token: (                             	 prob: 0.001
token: then                          	 prob: 0.001
token: !?                            	 prob: 0.001
token: .                       

In [10]:
# Our language is full of biases... and so are language models.
# Swap which of the two prompts below is commented out to compare the predictions.

prefix = "Someone just broke into my car... His skin color was"
#prefix = "Someone just was very kind to me at the store... His skin color was"
predict_next_word(model, tokenizer, prefix)

token: black                         	 prob: 0.119
token: white                         	 prob: 0.079
token: dark                          	 prob: 0.061
token: a                             	 prob: 0.06
token: very                          	 prob: 0.032
token: like                          	 prob: 0.031
token: different                     	 prob: 0.028
token: not                           	 prob: 0.024
token: brown                         	 prob: 0.024
token: darker                        	 prob: 0.02
token: so                            	 prob: 0.019
token: all                           	 prob: 0.015
token: light                         	 prob: 0.015
token: Caucasian                     	 prob: 0.014
token: the                           	 prob: 0.013
token: Asian                         	 prob: 0.012
token: ...                           	 prob: 0.011
token: blue                          	 prob: 0.01
token: really                        	 prob: 0.008
token: Hispanic                   