# Summarization (TensorFlow)

Test GPU

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Report the library versions and the GPUs TensorFlow can see
for description, value in (
    ("TensorFlow version:", tf.__version__),
    ("Keras version:", tf.keras.__version__),
    ("GPUs detected:", tf.config.list_physical_devices('GPU')),
):
    print(description, value)

# Tiny throwaway classifier: building and compiling it confirms Keras works
model = models.Sequential()
model.add(layers.Input(shape=(100,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
print("Model built successfully.")

TensorFlow version: 2.18.0
Keras version: 3.9.0
GPUs detected: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Model built successfully.


2025-03-09 18:31:01.994591: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-03-09 18:31:01.994630: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-03-09 18:31:01.994636: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
I0000 00:00:1741559461.994653 13097393 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1741559461.994674 13097393 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [2]:
# Standalone re-check of GPU visibility (repeats the query made in the first cell)
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


Verify GPU usage: When training, TensorFlow should automatically use the Metal GPU. To be sure, you can enable device placement logging:

In [None]:
# Turn on device placement logging so GPU usage can be confirmed from the logs
tf.debugging.set_log_device_placement(True)

Enable Mixed Precision Training (float16)

In [None]:
from tensorflow.keras import mixed_precision
# Set the global Keras dtype policy to 'mixed_float16'.
# NOTE(review): the training output further down shows `loss: nan`. T5-family
# models are widely reported to be numerically unstable in float16 — confirm
# whether this policy reaches the model and, if so, whether it is the cause.
mixed_precision.set_global_policy('mixed_float16')

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

You will also need to be logged in to the Hugging Face Hub. If you are not already logged in, uncomment and execute the following cell, then enter your credentials.

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

In [None]:
import os
import warnings

# Silence FutureWarning messages (raised, among others, by huggingface_hub)
warnings.simplefilter("ignore", category=FutureWarning)

# Switch off tokenizers parallelism, which also silences its warnings
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [None]:
from datasets import load_dataset

# Load the Spanish ("es") and English ("en") configurations of the
# mteb/amazon_reviews_multi dataset
spanish_dataset = load_dataset("mteb/amazon_reviews_multi", "es")
english_dataset = load_dataset("mteb/amazon_reviews_multi", "en")
# Display the splits, features and row counts of the English dataset
english_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 5000
    })
})

In [None]:
def split_review(example):
    """Break the raw ``text`` field of a review into a title and a body.

    The text is cut at the first double newline: everything before it is the
    title, everything after it is the body. When the text contains no double
    newline, the whole text becomes the title and the body is empty.

    Args:
        example: mapping with a string under the ``"text"`` key.

    Returns:
        dict with the keys ``"review_title"`` and ``"review_body"``.
    """
    # partition() yields (text, "", "") when the separator is absent, which
    # gives exactly the "title only, empty body" fallback.
    title, _separator, body = example["text"].partition("\n\n")
    return {"review_title": title, "review_body": body}

# Add the `review_title` / `review_body` columns produced by split_review to
# every split of both datasets
english_dataset = english_dataset.map(split_review)
spanish_dataset = spanish_dataset.map(split_review)

In [None]:
def show_samples(dataset, num_samples=3, seed=42):
    """Print the title and body of a few shuffled training reviews.

    Args:
        dataset: mapping whose ``"train"`` entry supports ``shuffle(seed=...)``
            and ``select(...)`` and yields rows with ``review_title`` and
            ``review_body`` keys.
        num_samples: how many reviews to print.
        seed: seed passed to the shuffle so the selection is repeatable.
    """
    shuffled_train = dataset["train"].shuffle(seed=seed)
    picked_rows = shuffled_train.select(range(num_samples))
    for row in picked_rows:
        title, body = row["review_title"], row["review_body"]
        print(f"\n'>> Title: {title}'")
        print(f"'>> Review: {body}'")

# Print a few shuffled English training reviews
show_samples(english_dataset)


'>> Title: Worked in front position, not rear'
'>> Review: 3 stars because these are not rear brakes as stated in the item description. At least the mount adapter only worked on the front fork of the bike that I got it for.'

'>> Title: meh'
'>> Review: Does it’s job and it’s gorgeous but mine is falling apart, I had to basically put it together again with hot glue'

'>> Title: Can't beat these for the money'
'>> Review: Bought this for handling miscellaneous aircraft parts and hanger "stuff" that I needed to organize; it really fit the bill. The unit arrived quickly, was well packaged and arrived intact (always a good sign). There are five wall mounts-- three on the top and two on the bottom. I wanted to mount it on the wall, so all I had to do was to remove the top two layers of plastic drawers, as well as the bottom corner drawers, place it when I wanted and mark it; I then used some of the new plastic screw in wall anchors (the 50 pound variety) and it easily mounted to the wall. 

In [None]:
# Copy the training split into a DataFrame. `to_pandas()` leaves the output
# format of `english_dataset` untouched, so later cells keep receiving regular
# Python objects regardless of which cells were run in between.
english_df = english_dataset["train"].to_pandas()
# Number of reviews per star-rating label (at most the 20 most frequent values)
english_df["label"].value_counts().head(20)

label
0    40000
1    40000
2    40000
3    40000
4    40000
Name: count, dtype: int64

In [None]:
def filter_types(example):
    """Return True only for examples whose ``label`` equals 4."""
    wanted_label = 4
    return example["label"] == wanted_label

In [None]:
# Restore the default output format of the dataset (no effect if no custom
# format is currently active)
english_dataset.reset_format()

In [None]:
# Keep only the reviews accepted by filter_types, in both languages
spanish_type_4 = spanish_dataset.filter(filter_types)
english_type_4 = english_dataset.filter(filter_types)

# Spot-check the result: Spanish samples first, then English
for filtered_reviews in (spanish_type_4, english_type_4):
    show_samples(filtered_reviews)


'>> Title: Todo correcto rápido y según la descripción.'
'>> Review: Todo correcto rápido y según la descripción'

'>> Title: Buen producto'
'>> Review: Por el precio no esperaba demasiado de ellos y estoy muy gratamente sorprendido.'

'>> Title: Buena función'
'>> Review: Me va genial para mi problema y dolor de coxis, alivia mucho el dolor y aguanta muy bien el peso.'

'>> Title: High quality easy to attach band'
'>> Review: Beautiful band that attaches easily to my Versa. The colors all me to wear with both pink and red but are not garish at all. Stylish and comfortable addition to my Fitbit Versa'

'>> Title: Got it for work.'
'>> Review: I work in a bar and we have to open cans right away all the time. This is precisely the tool for that, just punch a hole and next.'

'>> Title: So comfortable. I ordered my son a pair'
'>> Review: My husband loves these shoes. So comfortable. I ordered my son a pair.'


In [None]:
from datasets import concatenate_datasets, DatasetDict

# For every split, stack the filtered English and Spanish reviews and shuffle
# the result with a fixed seed so the two languages end up interleaved.
books_dataset = DatasetDict(
    {
        split_name: concatenate_datasets(
            [english_type_4[split_name], spanish_type_4[split_name]]
        ).shuffle(seed=13213)
        for split_name in english_type_4.keys()
    }
)

# Peek at a few examples
show_samples(books_dataset)


'>> Title: Like the back up battery'
'>> Review: Our neighborhood has many power surges which affect the electrical gadgets. No more re-setting the clock with this the back up battery!'

'>> Title: muy contenta de momento'
'>> Review: ha sido un regalo para una compañera de trabajo, y llego muy bien todo muy ordenadito. todavía no ha nacido el niño pero imagino que le ira muy bien'

'>> Title: Darling and quality material'
'>> Review: Darling and quality material but was too small for my frenchie'


In [None]:
def _has_multiword_title(example):
    """True when the review title contains more than two whitespace-separated words."""
    return len(example["review_title"].split()) > 2


# Discard reviews whose titles are one or two words long
books_dataset = books_dataset.filter(_has_multiword_title)

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded later
model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [None]:
# Tokenize a sample sentence and display the resulting encoding
inputs = tokenizer("I loved reading the Hunger Games!")
inputs

{'input_ids': [336, 259, 28387, 11807, 287, 62893, 295, 12507, 309, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Map the ids back to token strings to see how the sentence was split
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['▁I', '▁', 'loved', '▁reading', '▁the', '▁Hung', 'er', '▁Games', '!', '</s>']

In [None]:
max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    """Tokenize review bodies as model inputs and review titles as labels.

    Args:
        examples: batch with ``review_body`` and ``review_title`` entries.

    Returns:
        The tokenizer output for the bodies (truncated to ``max_input_length``)
        with an extra ``"labels"`` entry holding the ``input_ids`` of the
        titles (truncated to ``max_target_length``).
    """
    encoded_bodies = tokenizer(
        examples["review_body"], truncation=True, max_length=max_input_length
    )
    encoded_titles = tokenizer(
        examples["review_title"], truncation=True, max_length=max_target_length
    )
    encoded_bodies["labels"] = encoded_titles["input_ids"]
    return encoded_bodies

In [None]:
# Run preprocess_function over every split, passing examples in batches
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)

In [None]:
# Toy prediction/reference pair used to try out the ROUGE metric
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

In [None]:
import evaluate

# Load the ROUGE metric; it is reused for the baseline and for the model
rouge_score = evaluate.load("rouge")

In [None]:
# Score the toy pair; the metric takes lists of predictions and references
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores

{'rouge1': 0.923076923076923,
 'rouge2': 0.7272727272727272,
 'rougeL': 0.923076923076923,
 'rougeLsum': 0.923076923076923}

In [None]:
# Look at a single entry of the scores returned above
scores["rouge1"]

0.923076923076923

In [None]:
import nltk

# Download the "punkt_tab" resource — presumably the sentence-splitting data
# that sent_tokenize relies on in the following cells (confirm)
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/benito/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    """Return the first three sentences of ``text``, one per line.

    Texts with fewer than three sentences are returned in full.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)


# Try the lead-3 baseline on the body of one training review
print(three_sentence_summary(books_dataset["train"][1]["review_body"]))

Gave as a gift for one of my top performing team members on my team!


In [None]:
def evaluate_baseline(dataset, metric):
    """Score the three-sentence baseline against the review titles.

    Args:
        dataset: object whose ``"review_body"`` and ``"review_title"`` entries
            are sequences of strings.
        metric: object with a ``compute(predictions=..., references=...)``
            method.

    Returns:
        Whatever ``metric.compute`` returns for the baseline summaries.
    """
    baseline_summaries = list(map(three_sentence_summary, dataset["review_body"]))
    return metric.compute(
        predictions=baseline_summaries, references=dataset["review_title"]
    )

In [None]:
import pandas as pd

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# ROUGE of the three-sentence baseline on the validation split
score = evaluate_baseline(books_dataset["validation"], rouge_score)
# Express each score as a percentage rounded to two decimals
rouge_dict = {name: round(score[name] * 100, 2) for name in rouge_names}
rouge_dict

{'rouge1': 17.47, 'rouge2': 9.36, 'rougeL': 16.22, 'rougeLsum': 16.62}

In [None]:
from transformers import TFAutoModelForSeq2SeqLM

# Load the seq2seq model from the same checkpoint as the tokenizer. This
# rebinds `model`, which until now referred to the small Keras test model
# built in the first cell.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMT5ForConditionalGeneration.

All the layers of TFMT5ForConditionalGeneration were initialized from the model checkpoint at google/mt5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMT5ForConditionalGeneration for predictions without further training.


In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that turns a list of tokenized examples into one batch of
# TensorFlow tensors (see the sample output in the cell after next)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# Drop the original dataset columns so that only the tokenizer outputs remain.
# Only columns that are still present are removed, which keeps this cell safe
# to re-run: asking `remove_columns` for an already-removed name raises.
present_columns = set(tokenized_datasets["train"].column_names)
columns_to_drop = [
    name for name in books_dataset["train"].column_names if name in present_columns
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

In [None]:
# Collate the first two training examples to inspect the batch the collator builds
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

{'input_ids': <tf.Tensor: shape=(2, 20), dtype=int32, numpy=
array([[  9981,   1559,    259,  45389,    263,    259,    276,    259,
        121112,    289,    283,  53204,  14293,    260,    259,  89227,
         17560,      1,      0,      0],
       [   259, 215612,    527,    259,    262,  19483,    332,   1371,
           304,   1037,   2672,  17312,    347,   4644,  14939,    351,
          1037,   4644,    309,      1]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 20), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
      dtype=int32)>, 'labels': <tf.Tensor: shape=(2, 7), dtype=int64, numpy=
array([[   259, 104293,    259,    276,   4331,  89254,      1],
       [   259, 102162,   3869,    263,   7047,    309,      1]])>, 'decoder_input_ids': <tf.Tensor: shape=(2, 7), dtype=int64, numpy=
array([[     0,    259, 104293,    259,    276,   4331,  89254],
       

In [None]:
# Settings shared by the training and the evaluation pipelines
batching_options = {"collate_fn": data_collator, "batch_size": 8}

# Training batches are shuffled; evaluation batches keep the dataset order
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"], shuffle=True, **batching_options
)
tf_eval_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"], shuffle=False, **batching_options
)

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# Total number of optimizer steps = batches per epoch * number of epochs.
# tf_train_dataset is a batched tf.data.Dataset (not the original Hugging Face
# Dataset), so len() already counts batches rather than individual samples.
num_train_epochs = 8
num_train_steps = len(tf_train_dataset) * num_train_epochs
# Last path component of the checkpoint name, e.g. "mt5-small"; used to name
# the output directory in the training cell
model_name = model_checkpoint.split("/")[-1]

# Optimizer plus its learning-rate schedule: no warmup steps, weight decay
# rate 0.01, schedule sized to num_train_steps
optimizer, schedule = create_optimizer(
    init_lr=5e-6,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# No loss is passed to compile() here
model.compile(optimizer=optimizer)

# Mixed-precision float16 training is left disabled in this cell:
# tf.keras.mixed_precision.set_global_policy("mixed_float16")

: 

In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Callback that uploads the model and the given tokenizer to the Hugging Face
# Hub, using `output_dir` as the local working copy. Requires valid Hub
# credentials on this machine.
callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned-amazon-en-es", tokenizer=tokenizer
)

# Train for `num_train_epochs` rather than a second hard-coded 8: the
# optimizer's schedule was sized from that same variable, so the epoch count
# and the schedule length must stay in sync when either is changed.
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
    epochs=num_train_epochs,
)

/Users/benito/Library/CloudStorage/OneDrive-Personal/Documentos/Academy/AFIT/EENG645A Practical Machine Learning/Project/summarization/MachineLearning/course/en/chapter7/mt5-small-finetuned-amazon-en-es is already a clone of https://huggingface.co/benitoals/mt5-small-finetuned-amazon-en-es. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch 1/8


  return np.array(values, copy=copy, order=order).astype(dtype)


 121/4854 [..............................] - ETA: 15:07:40 - loss: nan

In [None]:
from tqdm import tqdm
import numpy as np

# Separate collator for generation that pads every batch to a multiple of 320
# tokens — presumably so the XLA-compiled function below sees few distinct
# input shapes (confirm).
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=320
)

# Validation batches for generation; drop_remainder=True discards a final
# batch that has fewer than 8 examples.
tf_generate_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=generation_data_collator,
    shuffle=False,
    batch_size=8,
    drop_remainder=True,
)


@tf.function(jit_compile=True)
def generate_with_xla(batch):
    """Generate up to 32 new tokens for each example of ``batch``.

    ``batch`` must provide ``"input_ids"`` and ``"attention_mask"`` entries.
    """
    return model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=32,
    )


# Decoded predictions and references accumulated over all batches
all_preds = []
all_labels = []
for batch, labels in tqdm(tf_generate_dataset):
    predictions = generate_with_xla(batch)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = labels.numpy()
    # Replace the -100 entries in the labels with the pad token id so the
    # tokenizer can decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Put each sentence on its own line before collecting the texts
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    all_preds.extend(decoded_preds)
    all_labels.extend(decoded_labels)

In [None]:
# Score every prediction gathered by the generation loop. `decoded_preds` and
# `decoded_labels` only hold the final batch, so scoring them would ignore
# almost all of the validation set.
result = rouge_score.compute(
    predictions=all_preds, references=all_labels, use_stemmer=True
)
# The metric returns plain floats here (see the earlier ROUGE outputs in this
# notebook); older `evaluate` releases returned aggregate objects exposing
# `.mid.fmeasure`. Accept both and convert to percentages.
result = {
    key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
    for key, value in result.items()
}
{k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import pipeline

# NOTE(review): this loads a checkpoint from the "huggingface-course"
# namespace, not the output directory written by the training cell — confirm
# which model is meant to be demonstrated here.
hub_model_id = "huggingface-course/mt5-small-finetuned-amazon-en-es"
summarizer = pipeline("summarization", model=hub_model_id)

In [None]:
def print_summary(idx):
    """Print the review, its title and the generated summary for one test example.

    Args:
        idx: position of the example in ``books_dataset["test"]``.
    """
    example = books_dataset["test"][idx]
    review, title = example["review_body"], example["review_title"]
    summary = summarizer(review)[0]["summary_text"]
    for line in (
        f"'>>> Review: {review}'",
        f"\n'>>> Title: {title}'",
        f"\n'>>> Summary: {summary}'",
    ):
        print(line)

In [None]:
# Compare review, title and generated summary for test example 100
print_summary(100)

In [None]:
# Same comparison for the first test example
print_summary(0)