In [3]:
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModel
import numpy as np

In [4]:
# Report how many CPUs this machine exposes; useful when choosing the
# `num_proc` value for the parallel `map` calls later in the notebook.
import multiprocessing
multiprocessing.cpu_count()

16

In [5]:
# Load the `train` split of the Yelp polarity review dataset via Hugging Face
# `datasets`; each row carries a `text` and a `label` feature.
dataset = load_dataset('yelp_polarity', split='train')
# Display the dataset summary (feature names and row count).
dataset

Downloading readme: 100%|██████████| 8.66k/8.66k [00:00<00:00, 1.44MB/s]
Found cached dataset yelp_polarity (/Users/alexanderkell/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/14f90415c754f47cf9087eadac25823a395fef4400c7903c5897f55cfaaa6f61)


Dataset({
    features: ['text', 'label'],
    num_rows: 560000
})

In [6]:
# Hold out 20% of the reviews for evaluation. The fixed seed makes the shuffle
# behind the split deterministic, so the same rows end up in train/test on
# every Restart & Run All.
dataset_split = dataset.train_test_split(test_size=0.2, seed=42)

In [7]:
# Peek at the first training example to check the record layout.
dataset_split['train'][0]

{'text': "Don't go out of your way to go here.  They are open 24 hours and that is the best I can say about them.  One night I was so tired that by the time my food came I was too tired to eat. I had three bites, tossed some money on the table and went upstairs to crash.  What's crazy is that there were plenty of servers but there must have only been one cook.",
 'label': 0}

In [8]:
# Flatten any nested feature columns into top-level columns.
# NOTE(review): the dataset only lists the features `text` and `label`, and the
# first example prints identically before and after this call, so this looks
# like a no-op for this dataset — confirm before removing.
dataset_split = dataset_split.flatten()
dataset_split['train'][0]

{'text': "Don't go out of your way to go here.  They are open 24 hours and that is the best I can say about them.  One night I was so tired that by the time my food came I was too tired to eat. I had three bites, tossed some money on the table and went upstairs to crash.  What's crazy is that there were plenty of servers but there must have only been one cook.",
 'label': 0}

In [10]:
# Load the tokenizer that ships with the `distilgpt2` checkpoint. GPT-2 tokenizers use a
# byte-level Byte Pair Encoding (BPE) algorithm to break input text into a sequence of subword units.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Load the bare `distilgpt2` model (reported as TFGPT2Model in the log output).
# NOTE(review): despite the name `bert`, this is a GPT-2 model, and nothing in the visible
# cells uses it — the causal-LM models are loaded separately. Confirm it can be dropped.
bert = TFAutoModel.from_pretrained("distilgpt2")

def preprocess_function(examples):
    """Tokenize a batch of reviews.

    Args:
        examples: A batch as passed by ``map(..., batched=True)``;
            ``examples["text"]`` is the list of review strings in the batch.

    Returns:
        The tokenizer's encoding for the whole batch, with each review
        truncated (``truncation=True``).
    """
    # Each entry is already a single string, so the list goes to the tokenizer
    # as-is (re-joining a string character by character would be a no-op).
    return tokenizer(examples["text"], truncation=True)

# Tokenize every split of the Yelp data with `preprocess_function`.
# - The original columns are dropped from the result, leaving only what the
#   tokenizer returns.
# - `batched=True` hands the function whole batches of examples at a time.
# - `num_proc=8` spreads the work over 8 processes.
columns_to_drop = dataset_split["train"].column_names
tokenized_yelp = dataset_split.map(
    preprocess_function,
    remove_columns=columns_to_drop,
    num_proc=8,
    batched=True,
)


All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.

[A


[A[A[A

#0:   0%|          | 0/56 [00:00<?, ?ba/s]



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A



[A[A[A[A

[A[A
[A


[A[A[A




[A[A[A[A[A





#0:   2%|▏         | 1/56 [00:01<01:13,  1.34s/ba]
[A



[A[A[A[A


[A[A[A




[A[A[A[A[A

#0:   4%|▎         | 2/56 [00:02<01:02,  1.17s/ba]





[A[A[A[A[A[A
[A


[A[A[A



[A[A[A[A




[A[A[A[A[A

[A[A





#0:   5%|▌         | 3/56 [00:03<00:59,  1.13s/ba]
[A


[A[A[A




#0:   7%|▋         | 4/56 [00:04<00:53,  1.03s/ba]



[A[A[A[A

[A[A





[A[A[A[A[A[A
[A




[A[A[A[A[A


#0:   9%|▉         | 5/56 [00:05<00:50,  1.01ba/s]



[A[A[

In [11]:
# Number of tokens in each block produced by `group_texts`
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (it must include ``"input_ids"``) to
            a list of per-example lists, as passed by ``map(..., batched=True)``.

    Returns:
        A dict with the same keys, where every value is a list of blocks of
        exactly ``block_size`` items, plus a ``"labels"`` entry that is a
        shallow copy of the ``"input_ids"`` blocks. Items left over after the
        last full block are dropped.
    """
    # Imported locally so the function stays self-contained when it is
    # shipped to worker processes.
    from itertools import chain

    # Flatten each column into one long list. `chain.from_iterable` runs in
    # linear time, whereas `sum(lists, [])` re-copies the growing list at
    # every step.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }

    # All columns are assumed to be the same length, so the first one is
    # used to measure the batch.
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Round down to a whole number of blocks; the remainder is discarded
    total_length = (total_length // block_size) * block_size

    # Slice every column into consecutive blocks of `block_size` items
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # Labels start as a copy of the input ids
    result["labels"] = result["input_ids"].copy()

    return result


In [12]:
# Regroup the tokenized splits into fixed-length blocks with `group_texts`,
# processing batches across 8 worker processes.
lm_dataset = tokenized_yelp.map(group_texts, batched=True, num_proc=8)

#0:   0%|          | 0/56 [00:00<?, ?ba/s]

[A[A
[A



[A[A[A[A


[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A
#0:   2%|▏         | 1/56 [00:04<03:51,  4.20s/ba]





[A[A[A[A[A[A

[A[A




#0:   4%|▎         | 2/56 [00:08<03:34,  3.97s/ba]


[A[A[A



[A[A[A[A
[A





[A[A[A[A[A[A

[A[A




[A[A[A[A[A



[A[A[A[A


[A[A[A





[A[A[A[A[A[A
#0:   5%|▌         | 3/56 [00:11<03:27,  3.92s/ba]

#0:   7%|▋         | 4/56 [00:15<03:18,  3.83s/ba]




[A[A[A[A[A



[A[A[A[A
[A


[A[A[A





[A[A[A[A[A[A

[A[A




#0:   9%|▉         | 5/56 [00:18<03:06,  3.65s/ba]
[A



[A[A[A[A


[A[A[A





[A[A[A[A[A[A

[A[A




#0:  11%|█         | 6/56 [00:22<03:02,  3.64s/ba]
[A



[A[A[A[A





[A[A[A[A[A[A


[A[A[A

#0:  12%|█▎        | 7/56 [00:26<03:00,  3.69s/ba]




[A[A[A[A[A
[A


[A[A[A





[A[A[A[A[A[A



[A[A[A[

In [13]:
# Causal-LM model class and the language-modeling batch collator from Transformers
from transformers import DataCollatorForLanguageModeling, TFAutoModelForCausalLM

# distilgpt2 with its language-modeling head; this is the model that gets fine-tuned
model = TFAutoModelForCausalLM.from_pretrained("distilgpt2")

# Collator that assembles training batches with the tokenizer loaded earlier:
# `mlm=False` selects causal rather than masked language modeling, and
# `return_tensors="tf"` makes it emit TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(
    return_tensors="tf",
    mlm=False,
    tokenizer=tokenizer,
)


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [14]:
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Options shared by the training and evaluation pipelines:
# 16 examples per batch, assembled by the language-modeling collator.
tf_dataset_options = {"batch_size": 16, "collate_fn": data_collator}

# Training data as a TensorFlow dataset, shuffled
tf_train_set = model.prepare_tf_dataset(
    lm_dataset["train"],
    shuffle=True,
    **tf_dataset_options,
)

# Test data as a TensorFlow dataset, kept in its original order
tf_test_set = model.prepare_tf_dataset(
    lm_dataset["test"],
    shuffle=False,
    **tf_dataset_options,
)


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [15]:
# Restrict the training set to its first batch. Evaluating this expression only
# displays the dataset's element spec (batches of 16 sequences x 128 token ids),
# not the token values themselves.
tf_train_set.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 128), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, 128), dtype=tf.int64, name=None)}, TensorSpec(shape=(16, 128), dtype=tf.int64, name=None))>

In [17]:
# Load a fresh copy of the pre-trained distilgpt2 causal language model to serve
# as the un-fine-tuned baseline
not_tuned_model = TFAutoModelForCausalLM.from_pretrained("distilgpt2")

# Compile with no arguments; per the log output, the model's internal loss
# computation is then used as the loss
not_tuned_model.compile()

# Evaluate the baseline on the first 100 batches of the test set only
# NOTE(review): the fine-tuned model is evaluated on the full `tf_test_set` in a
# later cell, so the two losses are computed over different data — confirm whether
# both should use the same subset.
not_tuned_model.evaluate(tf_test_set.take(100))


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.




4.288843631744385

In [22]:
from transformers import AdamWeightDecay

# Adam optimizer with weight decay
optimizer = AdamWeightDecay(
    weight_decay_rate=0.01,
    learning_rate=2e-5,
)

# No loss is passed, so the model's internal loss computation is used
model.compile(optimizer=optimizer)

# Fine-tune for a single epoch, validating against the test set
model.fit(
    tf_train_set,
    epochs=1,
    validation_data=tf_test_set,
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


    4/37978 [..............................] - ETA: 60:24:25 - loss: 6.6792

KeyboardInterrupt: 

In [None]:
# Evaluate the fine-tuned model on the full test set.
model.evaluate(tf_test_set)

In [24]:
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a text-generation pipeline
generator = pipeline("text-generation", tokenizer=tokenizer, model=model)

# Generate a continuation for a short review-style prompt
generator("I love this restaurant")

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


[{'generated_text': 'I love this restaurant.  The restaurant has a little dim lighting; this was our first stop.  The menu is diverse with vegetarian dishes, so we all enjoyed everything we ordered.  From the appetizers, we had the crab, scall'}]

In [29]:
import os

# Publish the fine-tuned model to the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable rather than
# being written into the notebook (a missing variable raises KeyError).
# NOTE: the token that was previously hardcoded here is exposed to anyone who
# saw the notebook and should be revoked in the Hugging Face account settings.
model.push_to_hub("yelp-review-generator", use_auth_token=os.environ["HF_TOKEN"])