In [None]:
# Upgrade TensorFlow to the newest release pip can find (-U = --upgrade).
!pip install -U tensorflow

Requirement already up-to-date: tensorflow in /usr/local/lib/python3.6/dist-packages (2.6.0)
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
from transformers import pipeline, BartTokenizer, BartForConditionalGeneration, BartConfig

In [None]:
# Load the DistilBART CNN checkpoint (tokenizer and weights) and wrap both in
# a summarization pipeline exposed as `nlp`.
tokenizer = BartTokenizer.from_pretrained('sshleifer/distilbart-cnn-12-6')
model = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-12-6')
nlp = pipeline(
    'summarization',
    tokenizer=tokenizer,
    model=model,
)

In [None]:
# Sample free-form review text. The spelling mistakes are part of the data and
# are intentionally left untouched.
text = '''
We order two different types of jewelry from this compnay the other jewelry we order is perfect.
However with this jewelery I have a few things I don't link. The little Stone comes out of these 
customers are complaining and bring them bac and we are having to put new jewelry in their holes.
You cannot sterilize these in an autoclave as well because it heats up too much and the glue
does not hold up so the second group of thes that we used I did not sterilize them that way
and the stones still came out. When I use a dermal clamp to put the top on the stones come out
immediately. Do not waste your money on this particular product buy the three mm. that has the
claws that hold the jewelry in those are perfect. So now I'm stuck with jewelry that I can't sell
not good for business.
'''

In [None]:
# nlp(text)  # disabled: fails with a long division error

In [None]:
# Install wget non-interactively (-y answers yes to prompts).
!apt-get install -y wget

Reading package lists... Done
Building dependency tree       
Reading state information... Done
wget is already the newest version (1.19.4-1ubuntu2.2).
0 upgraded, 0 newly installed, 0 to remove and 27 not upgraded.


In [None]:
!wget https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/gutenberg/austen-emma.txt

--2021-10-17 22:40:29--  https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/gutenberg/austen-emma.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 887071 (866K) [text/plain]
Saving to: 'austen-emma.txt.4'


2021-10-17 22:40:29 (22.6 MB/s) - 'austen-emma.txt.4' saved [887071/887071]



In [None]:
from tokenizers.models import BPE
from tokenizers import Tokenizer 
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import Sequence, Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

In [None]:
# Custom tokenizer: a BPE model whose input is lowercased by the normalizer,
# with byte-level pre-tokenization and a matching byte-level decoder.
tokenizer = Tokenizer(BPE())
tokenizer.decoder = ByteLevelDecoder()
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.normalizer = Sequence([Lowercase()])

In [None]:
# Train the tokenizer on the Emma corpus with a desired maximum vocabulary of
# 50k entries. The byte-level alphabet is supplied as the initial alphabet and
# five special tokens are passed to the trainer.
trainer = BpeTrainer(
    vocab_size=50000,
    initial_alphabet=ByteLevel.alphabet(),
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
tokenizer.train(["austen-emma.txt"], trainer)

In [None]:
!mkdir tokenizer_gpt

mkdir: cannot create directory 'tokenizer_gpt': File exists


In [None]:
# Write the trained tokenizer to a JSON file inside the tokenizer_gpt directory.
tokenizer.save("tokenizer_gpt/tokenizer.json")

In [None]:
# GPT 2
from transformers import GPT2TokenizerFast, GPT2Config, TFGPT2LMHeadModel

In [None]:
# Load the files in tokenizer_gpt/ as a GPT-2 fast tokenizer and register the
# special tokens under their roles.
tokenizer_gpt = GPT2TokenizerFast.from_pretrained("tokenizer_gpt")
tokenizer_gpt.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "pad_token": "<pad>",
    "unk_token": "<unk>",
    # Spelled "<mask>" so it matches the special token given to the BPE
    # trainer; the earlier value "<mask" was missing the closing ">".
    "mask_token": "<mask>"
})

# Show the id assigned to the end-of-sequence token as a quick sanity check.
tokenizer_gpt.eos_token_id

file tokenizer_gpt/config.json not found
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


2

In [None]:
# Encode a short string wrapped in the BOS/EOS markers and display the ids.
tokenizer_gpt.encode("<s> this is </s>")

[0, 469, 361, 225, 2]

In [None]:
# Build a GPT-2 configuration sized for the custom tokenizer, then create a
# model from that configuration (no pretrained weights are loaded here).
config = GPT2Config(
    # len(tokenizer) counts tokens added after training as well, whereas
    # .vocab_size reports only the base vocabulary. Sizing the model with
    # len() keeps every id the tokenizer can emit inside the embedding table.
    vocab_size=len(tokenizer_gpt),
    bos_token_id=tokenizer_gpt.bos_token_id,
    eos_token_id=tokenizer_gpt.eos_token_id
)

model = TFGPT2LMHeadModel(config)

In [None]:
# Display the configuration that was just built.
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.11.3",
  "use_cache": true,
  "vocab_size": 11954
}

In [None]:
# Read the corpus into a list with one entry per line (line endings kept).
with open("austen-emma.txt", mode="r", encoding='utf-8') as corpus_file:
    content = list(corpus_file)

In [None]:
# Flatten the corpus into a single string:
#   * keep only lines longer than 10 characters, so the model learns from the
#     long sequences we want it to generate,
#   * strip surrounding whitespace (this also removes the trailing "\n"),
#   * join the remaining lines with single spaces,
#   * terminate the text with the tokenizer's EOS token.
content_p = " ".join(
    line.strip() for line in content if len(line) > 10
) + tokenizer_gpt.eos_token

In [None]:
# Encode the whole flattened corpus into one list of token ids.
tokenized_content = tokenizer_gpt.encode(content_p)

In [None]:
# Make the samples for training: every window of exactly `sample_len`
# consecutive token ids, sliding forward one token at a time.
sample_len = 100
# Stop at the last start index that still yields a full window. Iterating up
# to len(tokenized_content) also produced progressively shorter tail samples
# (down to a single token), i.e. ragged rows that cannot be stacked into a
# tensor. A corpus shorter than sample_len yields no samples at all.
examples = [
    tokenized_content[start:start + sample_len]
    for start in range(len(tokenized_content) - sample_len + 1)
]

In [None]:
# Next-token prediction pairs: the input is each sample without its last
# token, the label is the same sample shifted left by one position.
train_data = [sample[:-1] for sample in examples]
labels = [sample[1:] for sample in examples]

In [None]:
# Inputs and labels must have the same number of samples.
len(train_data), len(labels)

(195221, 195221)

In [None]:
import tensorflow as tf

# Shuffle-buffer size and mini-batch size for the input pipeline.
buff = 1000
batch_size = 12

# Only the first 1000 (input, label) pairs are used. They are shuffled and
# grouped into fixed-size batches; an incomplete final batch is dropped.
dataset = (
    tf.data.Dataset.from_tensor_slices((train_data[:1000], labels[:1000]))
    .shuffle(buff)
    .batch(batch_size, drop_remainder=True)
)

In [None]:
# Adam optimizer with a small learning rate and gradient-norm clipping at 1.0.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5,
    epsilon=1e-08,
    clipnorm=1.0,
)


In [None]:
# Sparse categorical cross-entropy computed on raw logits (from_logits=True).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)



In [None]:
# Epoch count setting and the accuracy metric passed to Keras.
epochs = 10
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')


In [None]:
# The loss list has one real loss followed by one None per transformer layer
# (n_layer entries), so only the first model output is scored.
# NOTE(review): presumably the None entries line up with per-layer outputs
# that should not contribute to the loss; confirm against the model outputs.
model.compile(
    optimizer=optimizer,
    loss=[loss] + [None] * model.config.n_layer,
    metrics=[metric],
)

In [None]:
# Train on the batched dataset and keep the returned history object.
# NOTE(review): this runs a single epoch; the `epochs` variable assigned
# earlier in the notebook is not used here. Confirm which value is intended.
history = model.fit(dataset, epochs=1)



In [None]:
def generate(start, model):
    """Generate text from the prompt ``start`` and return it as a string.

    Args:
        start: Prompt string; it is encoded with the module-level
            ``tokenizer_gpt``.
        model: Object exposing a ``generate`` method that accepts the encoded
            prompt plus the beam-search keyword arguments used below.

    Returns:
        The decoded text of the first (and only requested) generated sequence.
    """
    input_token_ids = tokenizer_gpt.encode(start, return_tensors='tf')
    output = model.generate(
        input_token_ids,
        max_length=20,
        num_beams=5,
        temperature=0.7,
        no_repeat_ngram_size=2,
        num_return_sequences=1
    )
    # Decode what the model actually produced. The previous version decoded
    # the constant id list [0], so every call returned '<s>' and the
    # generated `output` was silently discarded.
    return tokenizer_gpt.decode(output[0])

In [None]:
# Try generation from a prompt that is just a single space.
generate(" ", model)

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


'<s>'

In [None]:
# Try generation from a short text prompt.
generate("wetson was very good", model)

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


'<s>'

In [None]:
# Save the model into the my_gpt-2/ directory.
model.save_pretrained("my_gpt-2/")

In [None]:
# Load the model back from the directory written by save_pretrained above.
model_reloaded = TFGPT2LMHeadModel.from_pretrained("my_gpt-2/")

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at my_gpt-2/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
from transformers import WEIGHTS_NAME, CONFIG_NAME, TF2_WEIGHTS_NAME

In [None]:
# Save the GPT-2 tokenizer files into tokenizer_gpt_auto/; the call returns
# the paths of the files it wrote.
tokenizer_gpt.save_pretrained("tokenizer_gpt_auto/")

('tokenizer_gpt_auto/tokenizer_config.json',
 'tokenizer_gpt_auto/special_tokens_map.json',
 'tokenizer_gpt_auto/vocab.json',
 'tokenizer_gpt_auto/merges.txt',
 'tokenizer_gpt_auto/added_tokens.json',
 'tokenizer_gpt_auto/tokenizer.json')