In [6]:
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModel
import numpy as np

In [2]:
# load yelp review dataset
dataset = load_dataset('yelp_polarity', split='train')
dataset

Found cached dataset yelp_polarity (/Users/alexander.kell/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/14f90415c754f47cf9087eadac25823a395fef4400c7903c5897f55cfaaa6f61)


Dataset({
    features: ['text', 'label'],
    num_rows: 560000
})

In [8]:
dataset_split = dataset.train_test_split(test_size=0.2)

In [9]:
dataset_split['train'][0]

{'text': "I feel like I might be swimming against the current on this one, with so many great reviews of this place. I came he with my usual lunch crowd (we all work together) thus right off the bat we asked for separate checks and was told no problem. The special was volcano chicken so three of us ordered that and one ordered the yellow curry and the last ordered Chicken Pad Thai. Well the Volcano chicken was atrocious, it was sooooo overcooked I couldn't taste any chicken, I swear when I say it tasted like glazed pork rinds. My jaw hurt from crunching so much. Everyone that got the Volcano Chicken had the same issue. My friends Pad Thai came out pink?? And smelled and tasted like fish.. he took two bites and pushed it aside. When the check came they forgot to give us separate checks and a overcharge and refund later we finally got out of there.\\n\\nNot sure if there was a new cook or the what, but this is one place that will not get a second chance from me, and I love Asian food.",


In [11]:
dataset_split = dataset_split.flatten()
dataset_split['train'][0]

{'text': "I feel like I might be swimming against the current on this one, with so many great reviews of this place. I came he with my usual lunch crowd (we all work together) thus right off the bat we asked for separate checks and was told no problem. The special was volcano chicken so three of us ordered that and one ordered the yellow curry and the last ordered Chicken Pad Thai. Well the Volcano chicken was atrocious, it was sooooo overcooked I couldn't taste any chicken, I swear when I say it tasted like glazed pork rinds. My jaw hurt from crunching so much. Everyone that got the Volcano Chicken had the same issue. My friends Pad Thai came out pink?? And smelled and tasted like fish.. he took two bites and pushed it aside. When the check came they forgot to give us separate checks and a overcharge and refund later we finally got out of there.\\n\\nNot sure if there was a new cook or the what, but this is one place that will not get a second chance from me, and I love Asian food.",


In [12]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
bert = TFAutoModel.from_pretrained("distilgpt2")

def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["text"]], truncation=True)

tokenized_yelp = dataset_split.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset_split["train"].column_names,
)

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


#0:   0%|          | 0/112 [00:00<?, ?ba/s]
#0:   1%|          | 1/112 [00:02<04:28,  2.41s/ba]
[A

[A[A
#0:   2%|▏         | 2/112 [00:05<04:54,  2.68s/ba]

[A[A
#0:   3%|▎         | 3/112 [00:08<04:58,  2.74s/ba]

[A[A
[A

#0:   4%|▎         | 4/112 [00:11<05:16,  2.93s/ba]

[A[A
#0:   4%|▍         | 5/112 [00:14<05:23,  3.03s/ba]
[A

#0:   5%|▌         | 6/112 [00:17<05:09,  2.92s/ba]
[A

#0:   6%|▋         | 7/112 [00:20<05:05,  2.91s/ba]

[A[A
#0:   7%|▋         | 8/112 [00:22<05:01,  2.90s/ba]
[A

#0:   8%|▊         | 9/112 [00:26<05:03,  2.94s/ba]

[A[A
#0:  10%|▉         | 11/112 [00:32<05:01,  2.98s/ba]

[A[A
#0:  11%|█         | 12/112 [00:35<04:58,

In [15]:
# show first 5 tokenized reviews
tokenized_yelp['train'][:1]

{'input_ids': [[40,
   220,
   220,
   277,
   304,
   304,
   300,
   220,
   220,
   300,
   1312,
   479,
   304,
   220,
   220,
   314,
   220,
   220,
   285,
   1312,
   308,
   289,
   256,
   220,
   220,
   275,
   304,
   220,
   220,
   264,
   266,
   1312,
   285,
   285,
   1312,
   299,
   308,
   220,
   220,
   257,
   308,
   257,
   1312,
   299,
   264,
   256,
   220,
   220,
   256,
   289,
   304,
   220,
   220,
   269,
   334,
   374,
   374,
   304,
   299,
   256,
   220,
   220,
   267,
   299,
   220,
   220,
   256,
   289,
   1312,
   264,
   220,
   220,
   267,
   299,
   304,
   837,
   220,
   220,
   266,
   1312,
   256,
   289,
   220,
   220,
   264,
   267,
   220,
   220,
   285,
   257,
   299,
   331,
   220,
   220,
   308,
   374,
   304,
   257,
   256,
   220,
   220,
   374,
   304,
   410,
   1312,
   304,
   266,
   264,
   220,
   220,
   267,
   277,
   220,
   220,
   256,
   289,
   1312,
   264,
   220,
   220,
   279,
   300,
   

In [14]:
block_size = 128

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [16]:
lm_dataset = tokenized_yelp.map(group_texts, batched=True, num_proc=5)

#0:   0%|          | 0/90 [00:00<?, ?ba/s]


[A[A[A
[A

#0:   1%|          | 1/90 [00:11<17:02, 11.49s/ba]
[A


[A[A[A

#0:   2%|▏         | 2/90 [00:23<17:30, 11.94s/ba]

[A[A
[A


[A[A[A

[A[A


#0:   3%|▎         | 3/90 [00:35<17:18, 11.94s/ba]
[A

[A[A


#0:   4%|▍         | 4/90 [00:47<16:57, 11.83s/ba]
[A

[A[A


[A[A[A
#0:   6%|▌         | 5/90 [00:59<16:45, 11.83s/ba]

#0:   7%|▋         | 6/90 [01:10<16:26, 11.74s/ba]
[A


[A[A[A

#0:   8%|▊         | 7/90 [01:22<16:02, 11.59s/ba]
[A


[A[A[A

#0:   9%|▉         | 8/90 [01:33<15:46, 11.55s/ba]


[A[A[A
[A

[A[A


#0:  10%|█         | 9/90 [01:44<15:31, 11.50s/ba]
[A

#0:  11%|█         | 10/90 [01:57<15:42, 11.78s/ba]


[A[A[A
[A

[A[A


#0:  12%|█▏        | 11/90 [02:09<15:40, 11.91s/ba]
[A

#0:  13%|█▎        | 12/90 [02:22<15:50, 12.19s/ba]


[A[A[A
[A

#0:  14%|█▍        | 13/90 [02:34<15:38, 12.19s/ba]


[A[A[A
[A

#0:  16%|█▌        | 14/90 [02:47<15:50, 12.50s/ba]



In [21]:
from transformers import TFAutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling

model = TFAutoModelForCausalLM.from_pretrained("distilgpt2")

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf")

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [23]:
tokenizer.pad_token = tokenizer.eos_token

tf_train_set = model.prepare_tf_dataset(
    lm_dataset["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    lm_dataset["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [26]:
# see first 5 tokenized reviews
tf_train_set.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 128), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, 128), dtype=tf.int64, name=None)}, TensorSpec(shape=(16, 128), dtype=tf.int64, name=None))>

In [29]:
from transformers import create_optimizer, AdamWeightDecay

optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

model.compile()
model.fit(x=tf_train_set.take(50), validation_data=tf_test_set.take(50), epochs=3)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Epoch 1/3