# Training an Article Title Generation Model with T5

## Install libraries and download the dataset

Load kaggle.json file.

In [1]:
!pip install datasets transformers rouge-score nltk

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/547.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [3

In [2]:
!kaggle

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.10/dist-packages/kaggle/__init__.py", line 7, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 398, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [3]:
!cp kaggle.json ~/.kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory


In [4]:
!kaggle datasets download -d fabiochiusano/medium-articles

Dataset URL: https://www.kaggle.com/datasets/fabiochiusano/medium-articles
License(s): CC0-1.0
Downloading medium-articles.zip to /content
 98% 361M/369M [00:02<00:00, 175MB/s]
100% 369M/369M [00:03<00:00, 129MB/s]


In [5]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

## Load the dataset

In [1]:
import transformers
from datasets import load_dataset, load_metric, Dataset

In [2]:
medium_datasets = load_dataset("csv", data_files="medium-articles.zip")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
medium_datasets

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 192368
    })
})

## Dataset train/validation/test split

In [3]:
!git clone https://github.com/ryanzhumich/AESLC.git

Cloning into 'AESLC'...
remote: Enumerating objects: 17469, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 17469 (delta 1), reused 0 (delta 0), pack-reused 17461[K
Receiving objects: 100% (17469/17469), 7.36 MiB | 18.47 MiB/s, done.
Resolving deltas: 100% (48/48), done.


In [4]:
import os
import json
import pandas as pd

# Define the folder containing the text files
folder_path = '/content/AESLC/enron_subject_line/train'

# Initialize lists to store the data
data = []
i = 0
for filename in os.listdir(folder_path):
    if filename.endswith(".subject"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            content = file.read()
            # Split the content into body and subject
            if '@subject' in content:
                body_text, subject_text = content.split('@subject')
                data.append({
                    'id': i,
                    'text': body_text.strip(),
                    'title': subject_text.strip()
                })
                i = i+1
train_dataset: Dataset = Dataset.from_list(data[:5000])

folder_path = '/content/AESLC/enron_subject_line/test'

# Initialize lists to store the data
data = []
i = 0
for filename in os.listdir(folder_path):
    if filename.endswith(".subject"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            content = file.read()
            # Split the content into body and subject
            if '@subject' in content:
                body_text, subject_text = content.split('@subject')
                data.append({
                    'id': i,
                    'text': body_text.strip(),
                    'title': subject_text.strip()
                })
                i = i+1
test_dataset: Dataset = Dataset.from_list(data[:200])

folder_path = '/content/AESLC/enron_subject_line/dev'
data = []
i = 0
for filename in os.listdir(folder_path):
    if filename.endswith(".subject"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            content = file.read()
            # Split the content into body and subject
            if '@subject' in content:
                body_text, subject_text = content.split('@subject')
                data.append({
                    'id': i,
                    'text': body_text.strip(),
                    'title': subject_text.strip()
                })
                i = i+1
validation_dataset: Dataset = Dataset.from_list(data[:100])

In [5]:
# keep only a subsample of the datasets
medium_datasets["train"] = train_dataset
medium_datasets["validation"] = validation_dataset
medium_datasets["test"] = test_dataset


In [6]:
medium_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'title'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['id', 'text', 'title'],
        num_rows: 100
    })
    test: Dataset({
        features: ['id', 'text', 'title'],
        num_rows: 200
    })
})

## Data preprocessing

In [7]:
import nltk
nltk.download('punkt')
import string
from transformers import AutoTokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
model_checkpoint = "t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [9]:
prefix = "provide email subject: "

max_input_length = 512
max_target_length = 64

def clean_text(text):
  sentences = nltk.sent_tokenize(text.strip())
  sentences_cleaned = [s for sent in sentences for s in sent.split("\n")]
  sentences_cleaned_no_titles = [sent for sent in sentences_cleaned
                                 if len(sent) > 0 and
                                 sent[-1] in string.punctuation]
  text_cleaned = "\n".join(sentences_cleaned_no_titles)
  return text_cleaned

def preprocess_data(examples):
  texts_cleaned = [clean_text(text) for text in examples["text"]]
  inputs = [prefix + text for text in texts_cleaned]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

  # Setup the tokenizer for targets
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(examples["title"], max_length=max_target_length,
                       truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [10]:
medium_datasets_cleaned = medium_datasets
tokenized_datasets = medium_datasets_cleaned.map(preprocess_data, batched=True)
tokenized_datasets

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['id', 'text', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['id', 'text', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

## Fine-tune T5

In [11]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
!rm -r {model_dir}

In [13]:
batch_size = 8
model_name = "t5-base-medium-email-subject-generation-v2"
model_dir = f"/content/{model_name}"
args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=200,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [15]:
import numpy as np

metric = load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

  metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [16]:
# Function that returns an untrained model to be trained
def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [18]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
200,3.2592,3.744923,13.5367,6.9273,13.4712,13.4893,5.58
400,3.2057,3.775923,13.6945,6.547,13.625,13.7892,5.65
600,3.1994,3.761946,15.5924,8.354,15.4952,15.658,6.17


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=625, training_loss=3.274387854003906, metrics={'train_runtime': 370.0419, 'train_samples_per_second': 13.512, 'train_steps_per_second': 1.689, 'total_flos': 2164483732193280.0, 'train_loss': 3.274387854003906, 'epoch': 1.0})

In [19]:
trainer.save_model()

## Load the model from GDrive

In [20]:
model_name = "t5-base-medium-email-subject-generation-v2"
model_dir = f"/content/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

max_input_length = 512

In [22]:
text = """
Phillip,   Could you please do me a favor?
I would like  to read your current title policy to see what it says about easements.
You  should have received a copy during your closing.
I don't know how many  pages it will be but let me know how you want to handle getting a copy  made.
I'll be happy to make the copy, or whatever makes it easy for  you.
Thanks,:
"""

inputs = ["provide email subject: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=1, max_length=10)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_title)
# Session State and Callbacks in Streamlit

Title Policy


In [23]:
text = """
The following reports have been waiting for your approval for more than 4 days.
Please review.
Owner: James W Reitmeyer Report Name: JReitmeyer 10/24/01 Days In Mgr.
"""

inputs = ["provide email subject: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=1, max_length=10)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_title)
# Conversational AI: The Future of Customer Service

Reports Waiting For Your Approval


## Upload the model to the Hugging Space Hub

https://huggingface.co/docs/transformers/model_sharing

In [24]:
!pip install --upgrade jax jaxlib # CPU-only
!pip install flax

Collecting jax
  Downloading jax-0.4.30-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
Collecting jaxlib
  Downloading jaxlib-0.4.30-cp310-cp310-manylinux2014_x86_64.whl (79.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.6/79.6 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jaxlib, jax
  Attempting uninstall: jaxlib
    Found existing installation: jaxlib 0.4.26+cuda12.cudnn89
    Uninstalling jaxlib-0.4.26+cuda12.cudnn89:
      Successfully uninstalled jaxlib-0.4.26+cuda12.cudnn89
  Attempting uninstall: jax
    Found existing installation: jax 0.4.26
    Uninstalling jax-0.4.26:
      Successfully uninstalled jax-0.4.26
Successfully installed jax-0.4.30 jaxlib-0.4.30




In [25]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
from transformers import T5ForConditionalGeneration, TFT5ForConditionalGeneration, FlaxT5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained(model_dir)
pt_model = T5ForConditionalGeneration.from_pretrained(model_dir)
tf_model = TFT5ForConditionalGeneration.from_pretrained(model_dir, from_pt=True)
flax_model = FlaxT5ForConditionalGeneration.from_pretrained(model_dir, from_pt=True)

tokenizer.push_to_hub(model_name)
pt_model.push_to_hub(model_name)
tf_model.push_to_hub(model_name)
flax_model.push_to_hub(model_name)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/920 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/anukvma/t5-base-medium-email-subject-generation-v2/commit/3ef9e2a862f81204735f1743ac7316ec3b14083c', commit_message='Upload FlaxT5ForConditionalGeneration', commit_description='', oid='3ef9e2a862f81204735f1743ac7316ec3b14083c', pr_url=None, pr_revision=None, pr_num=None)

## Load the model from the Hugging Face Hub

In [27]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
nltk.download('punkt')

tokenizer = AutoTokenizer.from_pretrained("anukvma/t5-base-medium-email-subject-generation-v2")
model = AutoModelForSeq2SeqLM.from_pretrained("anukvma/t5-base-medium-email-subject-generation-v2")

text = """
Harry - I got kicked out of the system, so I'm sending this from Tom's account.
He can fill you in on the potential deal with STEAG.
I left my resume on your chair.
I'll e-mail a copy when I have my home account running.
My contact info is:
"""

inputs = ["provide email subject: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=1, max_length=10)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_title)
# Conversational AI: The Future of Customer Service

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

  pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)
All Flax model weights were used when initializing T5ForConditionalGeneration.

Some weights of T5ForConditionalGeneration were not initialized from the Flax model and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

STEAG


## Evaluate the model on the test set

In [28]:
import torch

# get test split
test_tokenized_dataset = tokenized_datasets["test"]

# pad texts to the same length
def preprocess_test(examples):
  inputs = [prefix + text for text in examples["text"]]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True,
                           padding="max_length")
  return model_inputs

test_tokenized_dataset = test_tokenized_dataset.map(preprocess_test, batched=True)

# prepare dataloader
test_tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(test_tokenized_dataset, batch_size=32)

# generate text for each batch
all_predictions = []
for i,batch in enumerate(dataloader):
  predictions = model.generate(**batch)
  all_predictions.append(predictions)

# flatten predictions
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# tokenize and pad titles
all_titles = tokenizer(test_tokenized_dataset["title"], max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"]

# compute metrics
predictions_labels = [all_predictions_flattened, all_titles]
compute_metrics(predictions_labels)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]



{'rouge1': 14.4567,
 'rouge2': 7.0306,
 'rougeL': 14.0258,
 'rougeLsum': 14.1119,
 'gen_len': 5.245}