In [1]:
'''
pip install --user transformers
pip install --user datasets
pip install --user tensorboard
pip install --user sentencepiece
pip install --user accelerate
pip install --user rouge_score
pip install --user evaluate
'''  


'\npip install --user transformers\npip install --user datasets\npip install --user tensorboard\npip install --user sentencepiece\npip install --user accelerate\npip install --user rouge_score\npip install --user evaluate\n'

In [2]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
import torch
import torch
print(torch.cuda.is_available())  # This should print True if CUDA is available
print(torch.version.cuda)         # This should print the CUDA version
print(torch.cuda.get_device_name(0))  # This should print the name of your GPU

True
12.1
NVIDIA RTX A1000 6GB Laptop GPU


In [3]:
# pip install --upgrade --user datasets fsspec aiohttp
# pip install rouge_score
# pip install accelerate
# pip install tensorboard

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Imports

In [5]:
import pprint
import evaluate
import numpy as np

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

In [6]:
pp = pprint.PrettyPrinter()

In [7]:
torch.cuda.empty_cache()  # Clear unused memory
print(os.getenv('PYTORCH_CUDA_ALLOC_CONF'))


expandable_segments:True


## Prepare Dataset

In [8]:
# Test with a standard, commonly used dataset
#dataset = load_dataset('cnn_dailymail', '3.0.0', split='train')
#print(dataset)

In [9]:
dataset = load_dataset('gopalkalpande/bbc-news-summary', split='train')

In [10]:
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=True)

In [11]:
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

In [12]:
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 1779
})
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 445
})


## Dataset Analysis

In [13]:
def find_longest_length(dataset):
    """
    Find the longest article and summary in the entire training set.
    """
    max_length = 0
    counter_4k = 0
    counter_2k = 0
    counter_1k = 0
    counter_500 = 0
    for text in dataset:
        corpus = [
            word for word in text.split()
        ]
        if len(corpus) > 4000:
            counter_4k += 1
        if len(corpus) > 2000:
            counter_2k += 1
        if len(corpus) > 1000:
            counter_1k += 1
        if len(corpus) > 500:
            counter_500 += 1
        if len(corpus) > max_length:
            max_length = len(corpus)
    return max_length, counter_4k, counter_2k, counter_1k, counter_500

longest_article_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(dataset_train['Articles'])
print(f"Longest article length: {longest_article_length} words")
print(f"Artciles larger than 4000 words: {counter_4k}")
print(f"Artciles larger than 2000 words: {counter_2k}")
print(f"Artciles larger than 1000 words: {counter_1k}")
print(f"Artciles larger than 500 words: {counter_500}")
longest_summary_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(dataset_train['Summaries'])
print(f"Longest summary length: {longest_summary_length} words")
print(f"Summaries larger than 4000 words: {counter_4k}")
print(f"Summaries larger than 2000 words: {counter_2k}")
print(f"Summaries larger than 1000 words: {counter_1k}")
print(f"Summaries larger than 500 words: {counter_500}")

Longest article length: 4377 words
Artciles larger than 4000 words: 1
Artciles larger than 2000 words: 7
Artciles larger than 1000 words: 18
Artciles larger than 500 words: 348
Longest summary length: 2073 words
Summaries larger than 4000 words: 0
Summaries larger than 2000 words: 1
Summaries larger than 1000 words: 7
Summaries larger than 500 words: 15


In [14]:
def find_avg_sentence_length(dataset):
    """
    Find the average sentence in the entire training set.
    """
    sentence_lengths = []
    for text in dataset:
        corpus = [
            word for word in text.split()
        ]
        sentence_lengths.append(len(corpus))
    return sum(sentence_lengths)/len(sentence_lengths)

avg_article_length = find_avg_sentence_length(dataset_train['Articles'])
print(f"Average article length: {avg_article_length} words")
avg_summary_length = find_avg_sentence_length(dataset_train['Summaries'])
print(f"Averrage summary length: {avg_summary_length} words")

Average article length: 381.43226531759416 words
Averrage summary length: 166.10118043844858 words


## Configurations

In [15]:
MODEL = 't5-base'
BATCH_SIZE = 4
NUM_PROCS = 4
EPOCHS = 10
OUT_DIR = 'results_t5base'
MAX_LENGTH = 512 # Maximum context length to consider while preparing dataset.

## Tokenization

In [16]:
tokenizer = T5Tokenizer.from_pretrained(MODEL)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
# Function to convert text data into model inputs and targets
def preprocess_function(examples,tokenizer,MAX_LENGTH):
    inputs = [f"summarize: {article}" for article in examples['Articles']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )
 
    # Set up the tokenizer for targets
    targets = [summary for summary in examples['Summaries']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length'
        )
 
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

 
# Apply the function to the whole dataset
tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer, "MAX_LENGTH": MAX_LENGTH},
    num_proc=NUM_PROCS
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer, "MAX_LENGTH": MAX_LENGTH},
    num_proc=NUM_PROCS
)

Map (num_proc=4):   0%|          | 0/1779 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/445 [00:00<?, ? examples/s]

## Model

In [18]:
model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

222,903,552 total parameters.
222,903,552 training parameters.


  return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)


## ROUGE Metric

In [19]:
rouge = evaluate.load("rouge")

In [20]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds, 
        references=decoded_labels, 
        use_stemmer=True, 
        rouge_types=[
            'rouge1', 
            'rouge2', 
            'rougeL'
        ]
    )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [21]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak. 
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

## Training

In [None]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=10,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.01,
    dataloader_num_workers=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

history = trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
10,No log,0.385688,0.9002,0.8265,0.8806,229.7528
20,No log,0.381434,0.9019,0.8287,0.8824,229.7573
30,No log,0.375038,0.9003,0.8283,0.882,229.7506
40,No log,0.373853,0.8994,0.8284,0.882,229.7573
50,No log,0.396152,0.8988,0.8274,0.8809,229.7506
60,No log,0.398411,0.8948,0.8233,0.8774,229.7663
70,No log,0.406079,0.894,0.8232,0.8779,229.7483
80,No log,0.417289,0.8913,0.82,0.8737,229.7528
90,No log,0.422892,0.8898,0.8165,0.8723,229.7528
100,0.404600,0.44638,0.8902,0.8159,0.8717,229.7506


In [None]:
tokenizer.save_pretrained(OUT_DIR)

In [None]:
!zip -r {OUT_DIR} {OUT_DIR}

## Inference

In [None]:
# Download data.
!wget "https://www.dropbox.com/scl/fi/561r8pfhem4lu70hf438q/inference_data.zip?rlkey=aedt2saqmmp3a67qc4o34k04y&dl=1" -O inference_data.zip

In [None]:
!unzip inference_data.zip

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

import glob

In [None]:
model_path = f"{OUT_DIR}/checkpoint-4450"  # the path where you saved your model
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(OUT_DIR)

In [None]:
def summarize_text(text, model, tokenizer, max_length=512, num_beams=5):
    # Preprocess the text
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    )

    # Generate the summary
    summary_ids = model.generate(
        inputs,
        max_length=50,
        num_beams=num_beams,
        # early_stopping=True,
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
for file_path in glob.glob('inference_data/*.txt'):
    file = open(file_path)
    text = file.read()
    summary = summarize_text(text, model, tokenizer)
    pp.pprint(summary)
    print('-'*75) 