# T5- Architectured Transformer Summary Model

## Necessary Imports

In [1]:
!pip install datasets
!pip install transformers
!pip install torch
!pip install rouge-score

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Ensuring all the necessary libraries are imported

In [2]:
import sys
print(sys.executable)
print(sys.path)

try:
    # Import necessary libraries
    import os
    import zipfile
    import pandas as pd
    from datasets import Dataset, load_metric
    from transformers import T5Config, T5ForConditionalGeneration, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq

    print("All packages imported successfully.")
except ImportError as e:
    print("Error importing packages:", e)

/usr/bin/python3
['/content', '/env/python', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.10/dist-packages/IPython/extensions', '/usr/local/lib/python3.10/dist-packages/setuptools/_vendor', '/root/.ipython']
All packages imported successfully.


## Extracting dataset and converting to Huggingface compatability

In [3]:
def extract_and_load_csv(zip_filename, csv_filename):
    with zipfile.ZipFile(zip_filename, 'r') as z:
        z.extract(csv_filename)
    return pd.read_csv(csv_filename)

# Extract and load the datasets
train_data = extract_and_load_csv('train.csv.zip', 'train.csv')
validation_data = extract_and_load_csv('validation.csv.zip', 'validation.csv')
test_data = extract_and_load_csv('test.csv.zip', 'test.csv')

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
validation_dataset = Dataset.from_pandas(validation_data)
test_dataset = Dataset.from_pandas(test_data)

# Sample the dataset to avoid overfitting
train_dataset = train_dataset.shuffle(seed=42).select(range(1000))
validation_dataset = validation_dataset.shuffle(seed=42).select(range(500))
test_dataset = test_dataset.shuffle(seed=42).select(range(200))

# # Save the datasets locally
train_dataset.save_to_disk('train_dataset')
validation_dataset.save_to_disk('validation_dataset')
test_dataset.save_to_disk('test_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

In [4]:
from datasets import load_from_disk

# Load the datasets from disk
train_dataset = load_from_disk('train_dataset')
validation_dataset = load_from_disk('validation_dataset')
test_dataset = load_from_disk('test_dataset')

## Importing the T5 (small) architecute transformer model with tokenizer with ranomized weights (no previous leanrings applied)

In [5]:
# Initialize a tokenizer
tokenizer = AutoTokenizer.from_pretrained('t5-small')

# Create a new T5 configuration and model from scratch
config = T5Config.from_pretrained('t5-small')
model = T5ForConditionalGeneration(config)  # Model is initialized from scratch

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

## Preprocessing the text for training

In [6]:
# Define the preprocessing function
def preprocess_function(examples):
    inputs = examples['article']
    targets = examples['highlights']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# # Preprocess datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
validation_dataset = validation_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [7]:
# Initialize the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Load the ROUGE metric for evaluation
rouge_metric = load_metric("rouge")

  rouge_metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


## Setting up customized callback function to save checkpoints based on time

In [8]:
from transformers import TrainerCallback
import time
import os

class TimeBasedCheckpointCallback(TrainerCallback):
    def __init__(self, save_interval_minutes=30):
        self.save_interval_seconds = save_interval_minutes * 60
        self.last_save_time = None

    def on_train_begin(self, args, state, control, **kwargs):
        self.last_save_time = time.time()  # Initialize the last save time at the start of training

    def on_step_end(self, args, state, control, **kwargs):
        current_time = time.time()
        elapsed_time = current_time - self.last_save_time

        if elapsed_time >= self.save_interval_seconds:
            # It's time to save a checkpoint
            epoch = int(state.epoch)
            output_dir = os.path.join(args.output_dir, f"checkpoint-epoch{epoch}-step{state.global_step}")
            os.makedirs(output_dir, exist_ok=True)
            print(f"Saving checkpoint to {output_dir}")
            kwargs['model'].save_pretrained(output_dir)  # Save the model
            kwargs['tokenizer'].save_pretrained(output_dir)  # Save the tokenizer
            state.save_to_json(os.path.join(output_dir, "trainer_state.json"))  # Save the trainer state
            self.last_save_time = current_time  # Reset the last save time

## Metrics initialization

In [9]:
from transformers import EvalPrediction
import numpy as np
import torch

# Function to compute metrics
def compute_metrics(eval_pred: EvalPrediction):
    predictions, labels = eval_pred

    # Check if predictions are a tuple (as some models return tuples)
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Ensure predictions are handled as a NumPy array or PyTorch tensor
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.argmax(dim=-1).tolist()
    elif isinstance(predictions, np.ndarray):
        predictions = np.argmax(predictions, axis=-1).tolist()
    else:
        # Ensure predictions are converted to a list of lists
        predictions = [np.argmax(p, axis=-1) for p in predictions]

    # Convert labels to lists
    if isinstance(labels, torch.Tensor):
        labels = labels.tolist()
    elif isinstance(labels, np.ndarray):
        labels = labels.tolist()

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Compute accuracy
    result["accuracy"] = sum(1 for pred, ref in zip(decoded_preds, decoded_labels) if pred == ref) / len(decoded_preds)
    return result

## Initializing the training paramets, implementing Adam optimizer with weight decay and learning rate scheduler and Trainer

In [10]:
from transformers import TrainingArguments, Trainer, get_linear_schedule_with_warmup,AdamW

# Set training arguments with increased epochs and adjusted evaluation steps
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=100,  # Evaluate less frequently
    learning_rate=3e-5,
    per_device_train_batch_size=2,  # Adjust based on memory
    num_train_epochs=20,  # Increase epochs to 20 for more training
    weight_decay=0.01,
    logging_dir='./results/logs',
    logging_steps=100,  # Log less frequently
    save_total_limit=4,
    save_steps=1000,  # Adjust as needed for your workflow
    load_best_model_at_end=True,
    gradient_accumulation_steps=16,
    fp16=False,
    warmup_steps=50,
    max_grad_norm=1.0,
    report_to="none"  # Disable all reporting, including MLflow
)

# Initialize the AdamW optimizer with weight decay
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

# Set up a learning rate scheduler
num_training_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=num_training_steps
)

# Initialize the Trainer with custom optimizer and scheduler
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),  # Pass the optimizer and scheduler here
    callbacks=[TimeBasedCheckpointCallback(save_interval_minutes=3)]  # Custom callback for checkpoints
)


def get_latest_checkpoint():
    if os.path.isdir('./results'):
        checkpoints = [d for d in os.listdir('./results') if d.startswith('checkpoint') and '-' in d]
        if checkpoints:
            # Extract the step number from the directory name and convert it to an integer
            checkpoints = [d for d in checkpoints if 'step' in d.split('-')[-1] and os.path.isfile(os.path.join('./results', d, 'trainer_state.json'))]
            if checkpoints:
                latest_checkpoint = max(checkpoints, key=lambda d: int(d.split('-')[-1].replace('step', '')))
                return os.path.join('./results', latest_checkpoint)
    return None



## Training the model

In [11]:
import os
# Train the model, resume from the latest checkpoint if available
latest_checkpoint = get_latest_checkpoint()
if latest_checkpoint:
    print(f"Resuming from checkpoint: {latest_checkpoint}")
    trainer.train(resume_from_checkpoint=latest_checkpoint)
else:
    print("Starting training from scratch.")
    trainer.train()

Starting training from scratch.


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 6.25 GiB. GPU 

## Saving the best model

In [None]:
# Ensure the output directory exists
output_dir = './results/best_model'
os.makedirs(output_dir, exist_ok=True)

# Save the final model at the end of training
trainer.save_model(output_dir)

# Save the model and tokenizer manually
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./results/best_model\\tokenizer_config.json',
 './results/best_model\\special_tokens_map.json',
 './results/best_model\\tokenizer.json')

In [3]:
from transformers import T5ForConditionalGeneration, AutoTokenizer, Trainer

# Load the best model and tokenizer from the saved directory
best_model_dir = './results/best_model'
model = T5ForConditionalGeneration.from_pretrained(best_model_dir)
tokenizer = AutoTokenizer.from_pretrained(best_model_dir)

OSError: Incorrect path_or_model_id: './results/best_model'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
# Make predictions on the test set
test_predictions = trainer.predict(test_dataset)

# Compute the metrics for the test set
test_metrics = compute_metrics(test_predictions)

# Print the results
print("Test Metrics:")
for key, value in test_metrics.items():
    print(f"{key}: {value:.2f}")