In [58]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
dataset["train"]



  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['label', 'text'],
    num_rows: 650000
})

In [59]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", max_length=200, truncation=True)

# map function work just like apply in pandas
tokenized_datasets = dataset.map(tokenize_function, batched=True)



  0%|          | 0/50 [00:00<?, ?ba/s]

In [62]:
# resample using tensorflow methodology
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(500))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))

# dataset is the pandas library (class) from huggingface
print(small_train_dataset[0])



{'label': 4, 'text': "I stalk this truck.  I've been to industrial parks where I pretend to be a tech worker standing in line, strip mall parking lots, and of course the farmer's market.  The bowls are so so absolutely divine.  The owner is super friendly and he makes each bowl by hand with an incredible amount of pride.  You gotta eat here guys!!!", 'input_ids': [101, 146, 27438, 1142, 4202, 119, 146, 112, 1396, 1151, 1106, 3924, 8412, 1187, 146, 9981, 1106, 1129, 170, 13395, 7589, 2288, 1107, 1413, 117, 6322, 8796, 5030, 7424, 117, 1105, 1104, 1736, 1103, 9230, 112, 188, 2319, 119, 1109, 20400, 1132, 1177, 1177, 7284, 10455, 119, 1109, 3172, 1110, 7688, 4931, 1105, 1119, 2228, 1296, 7329, 1118, 1289, 1114, 1126, 10965, 2971, 1104, 8188, 119, 1192, 13224, 3940, 1303, 3713, 106, 106, 106, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [63]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

# we must use "labels" in label_cols because the class DefaultDataCollator overwrite the label name.
# For more detail check here: 
# https://github.com/huggingface/transformers/blob/d0acc9537829e7d067edbb791473bbceb2ecf056/src/transformers/data/data_collator.py#L165

tf_train_dataset = small_train_dataset.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = small_eval_dataset.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

print(tf_train_dataset)

<PrefetchDataset shapes: ({input_ids: (None, 200), token_type_ids: (None, 200), attention_mask: (None, 200)}, (None,)), types: ({input_ids: tf.int64, token_type_ids: tf.int64, attention_mask: tf.int64}, tf.int64)>


In [64]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

# 3 epochs and 63 batches (500/8)
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3c2d877790>