In [1]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS so the notebook's `.container` element uses 95% of the page width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
from transformers import GPT2Tokenizer, TFAutoModelForCausalLM
import tensorflow as tf
import pandas as pd
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
import numpy as np

# Reference paper: "De-Anonymizing Text by Fingerprinting Language Generation" (https://arxiv.org/abs/2006.09615)

In [3]:
# Load the tokenizer and the TF causal-LM model from the same pretrained checkpoint.
CHECKPOINT_NAME = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(CHECKPOINT_NAME)
model = TFAutoModelForCausalLM.from_pretrained(CHECKPOINT_NAME)
# The model is only queried for logits in this notebook, so mark it non-trainable.
model.trainable = False

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [80]:
# Partly adapted from https://github.com/huggingface/transformers/blob/master/src/transformers/generation_tf_utils.py

def return_nucleus_size(logits, top_p=0.8, print_greedy=False):
    """Return the size of the top-p nucleus of a next-token distribution.

    The logits are sorted in descending order, turned into probabilities with
    a softmax, and accumulated. The nucleus size is the 1-based position of
    the first token whose cumulative probability is strictly greater than
    ``top_p``.

    Args:
        logits: Tensor of logits for ONE position. Size-1 dimensions are
            squeezed away, so the result must be 1-D (one entry per token).
        top_p: Cumulative-probability threshold of the nucleus.
        print_greedy: If true, print the decoded highest-logit token. This
            uses the module-level ``tokenizer``.

    Returns:
        The nucleus size as a NumPy integer scalar. If the cumulative
        probability never exceeds ``top_p`` (e.g. ``top_p >= 1.0``, or the
        float sum ends marginally below the threshold), the whole vocabulary
        is the nucleus and its size is returned.

    Raises:
        ValueError: If ``logits`` does not squeeze down to a 1-D tensor.
    """
    logits = tf.squeeze(logits)
    if logits.shape.rank != 1:
        # The gather below indexes axis 0 with per-token ranks, which is only
        # meaningful for a single 1-D vector of logits.
        raise ValueError(
            "return_nucleus_size expects logits for a single position; "
            f"got shape {tuple(logits.shape)} after squeezing"
        )

    indices_sorted = tf.argsort(logits, direction="DESCENDING")
    logits_sorted = tf.gather(logits, indices_sorted)
    probs_sorted = tf.nn.softmax(logits_sorted, axis=-1)
    cumulative_probs_sorted = tf.math.cumsum(probs_sorted, axis=-1)

    if print_greedy:
        print(tokenizer.decode(indices_sorted[0]))

    exceeds_top_p = cumulative_probs_sorted > top_p
    # argmax over a boolean mask yields the first True index, but it also
    # yields 0 when the mask is all False -- that case must not be reported
    # as a nucleus of size 1, so fall back to the full vocabulary size.
    first_exceeding = tf.argmax(exceeds_top_p)
    vocab_size = tf.size(cumulative_probs_sorted, out_type=tf.int64)
    nucleus_size = tf.where(
        tf.reduce_any(exceeds_top_p),
        first_exceeding + 1,
        vocab_size,
    )
    return nucleus_size.numpy()

In [90]:
%%time
text = "I once saw a man so tall I could not look into his eyes"
tokenized = tokenizer(text)
input_ids = tokenized['input_ids']
# for input_id in input_ids:
#     print(tokenizer.prepare_for_model([input_id]))

past = None
NSS = []
for input_id in input_ids:
    inputs = tokenizer.prepare_for_model([input_id], return_tensors='tf')['input_ids']
    output = model({'input_ids': inputs, 'past':past})
    logits = output.logits
    past = output.past_key_values
    NSS.append(return_nucleus_size(logits, top_p=0.9, print_greedy=True))
NSS    

.
 had
 a
 man
 who
 beautiful
 that
 could
 barely
 see
 him
 his
 eyes
.
CPU times: user 899 ms, sys: 24.3 ms, total: 923 ms
Wall time: 906 ms


[10855, 287, 781, 2755, 924, 1251, 10, 73, 137, 118, 26, 4, 7, 23]

In [91]:
# Show the tokenizer's `model_max_length` attribute as the cell output.
tokenizer.model_max_length

1024

In [36]:
# IPython help lookup (trailing `?`) for `tokenizer.prepare_for_model`.
# NOTE(review): interactive leftover -- consider removing before sharing the notebook.
tokenizer.prepare_for_model?