In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Loading the Necessary Libraries**

In [1]:
!pip install datasets
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

# **Loading the Model**
## Model and Tokenizer Setup

This section describes how to set up the **T5 (Text-To-Text Transfer Transformer)** model and tokenizer to generate text-based outputs for tasks such as **Question Answering** and **Summarization**.

### Code Overview

1. **Loading the Model**:
   - The `T5ForConditionalGeneration` model is loaded from the pre-trained model specified by the `model_name`. Here, `"t5-small"` is used, a lightweight version of the T5 model suitable for tasks that don’t require extensive model capacity.
   
2. **Loading the Tokenizer**:
   - The `T5Tokenizer` is initialized from the same `model_name` to ensure compatibility with the model.
   - Setting `legacy=False` ensures the tokenizer works in a modern mode, avoiding deprecated behavior.

3. **Using the Model and Tokenizer**:
   - The loaded `model` and `tokenizer` can be used to generate outputs for tasks based on the tokenized input data.
   - This setup prepares the environment for further fine-tuning or direct inference.

In [2]:
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name, legacy = False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [3]:
task = "qa"

if task == "qa":
    dataset = load_dataset("squad")
elif task == "summarization":
    dataset = load_dataset("cnn_dailymail", "3.0.0")

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

# Dataset Preprocessing for NLP Tasks

This project provides a simple function to preprocess text data for two different NLP tasks: **Question Answering (QA)** and **Summarization**. The preprocessing function is designed to format the data into input and target pairs suitable for training models on these tasks.

## Code Overview

The `preprocess_function` formats text data based on the task type:
- For **QA**, it combines the `question` and `context` fields into a single input text and assigns the first answer as the target text.
- For **Summarization**, it takes the article as the input and the highlights as the target text.

The function is then applied to a dataset to create a processed dataset that models can be trained on.

## Usage

1. Set the `task` variable to `"qa"` or `"summarization"`.
2. Run the `preprocess_function` on each example in your dataset to format it for the specified task.
3. The dataset will be mapped and processed to include `input_text` and `target_text` fields, with other =dataset["train"].column_names)


In [4]:
def preprocess_function(example):
    if task == "qa":
        question = example['question']
        context = example['context']
        input_text = f"question: {question} context: {context}"
        target_text = example['answers']['text'][0]
    elif task == "summarization":
        input_text = f"summarize: {example['article']}"
        target_text = example['highlights']
    return {"input_text": input_text, "target_text": target_text}

processed_dataset = dataset.map(preprocess_function, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

## Tokenizing Processed Dataset for Model Training

This section provides a function to tokenize a processed dataset, which is necessary for training transformer-based models on NLP tasks like **Question Answering** and **Summarization**.

### Code Overview

The `tokenize_function` uses a tokenizer (usually from the Hugging Face Transformers library) to convert the `input_text` and `target_text` fields from text into token IDs, preparing them for model input. Here's how it works:

1. **Tokenizing Inputs and Targets**:
   - The `input_text` and `target_text` fields from each example are tokenized separately.
   - The maximum length for `input_text` is set to 512 tokens, and for `target_text` (often shorter in summarization and QA tasks), it’s set to 150 tokens.
   - `truncation=True` ensures text exceeding these limits is truncated, and `padding="max_length"` pads shorter text to the maximum length for consistent input sizes.

2. **Creating Labels for Training**:
   - The tokenized `target_text` tokens (IDs) are assigned to the `labels` field. This allows the model to learn to map `input_text` to `target_text` during training.

3. **Applying Tokenization**:
   - The `tokenize_function` is applied to the entire `processed_dataset` in batches, generating a tokenized dataset ready for training.



In [5]:
def tokenize_function(example):
    inputs = tokenizer(example["input_text"], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(example["target_text"], max_length=150, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = processed_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

## Training Configuration and Model Training

This section explains the setup of training arguments and the process of training the T5 model using the Hugging Face `Trainer` API, which streamlines training, evaluation, and fine-tuning for transformer models.

### Code Overview

1. **Setting Training Arguments**:
   - `TrainingArguments` is configured to manage key aspects of the training process:
     - `output_dir="./results"`: Directory where the model checkpoints and outputs will be saved.
     - `eval_strategy="epoch"`: Model evaluation happens at the end of every epoch.
     - `fp16=True`: Enables mixed-precision training, which speeds up training and reduces memory usage on compatible GPUs.
     - `gradient_accumulation_steps=4`: Accumulates gradients over 4 steps, effectively increasing the batch size.
     - `learning_rate=3e-5`: Sets the learning rate for the optimizer, a critical hyperparameter for model training.
     - `per_device_train_batch_size=4` and `per_device_eval_batch_size=4`: Specifies batch sizes per GPU for training and evaluation.
     - `num_train_epochs=1`: Sets the total number of training epochs.
     - `weight_decay=0.01`: Adds weight decay regularization to reduce overfitting.

2. **Creating the Trainer Instance**:
   - The `Trainer` class manages the training loop, evaluation, and model updates.
   - It requires:
     - `model`: The pre-loaded T5 model.
     - `args`: Training arguments to control the training process.
     - `train_dataset` and `eval_dataset`: Tokenized datasets for training and evaluation.

3. **Running the Training Loop**:
   - After setting up the `Trainer`, simply call `trainer.train()` to begin training.


In [6]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    fp16=True,
    gradient_accumulation_steps=4,
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

## Model Training with Weights & Biases Tracking

This section describes how to set up training with **Weights & Biases (W&B)** for tracking experiment metrics and model performance over time. W&B integration allows you to monitor and log metrics, helping you visualize model improvements and compare different training runs.

### Code Overview

1. **Setting Up W&B API Key**:
   - Weights & Biases requires an API key to log data to your W&B account. Set the API key as an environment variable using `os.environ["WANDB_API_KEY"]`.
   - **Note**: Replace `"f21a85f92c718c6edad2b43ee7c2717ce5b83380"` with your actual W&B API key.

2. **Training the Model**:
   - After setting the API key, call `trainer.train()` to begin training. With W&B integration, metrics such as loss, accuracy, and evaluation scores are logged automatically to your W&B dashboard.
   - This makes it easy to visualize your training progress and evaluate model performance over epochs.


In [7]:
import os
os.environ["WANDB_API_KEY"] = "f21a85f92c718c6edad2b43ee7c2717ce5b83380"
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjvibhor74[0m ([33mjvibhor74-g-h-raisoni-college-of-engineering[0m). Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.0157,0.015671


TrainOutput(global_step=5475, training_loss=0.16755540699719293, metrics={'train_runtime': 2730.6448, 'train_samples_per_second': 32.08, 'train_steps_per_second': 2.005, 'total_flos': 1.1855806467145728e+16, 'train_loss': 0.16755540699719293, 'epoch': 1.0})

## Testing Model Output with the `generate_output` Function

This section shows how to test the T5 model’s summarization capabilities using sample input articles. The `generate_output` function processes the input through the model, returning concise summaries for each article.

### Code Overview

1. **Preparing the Test Articles**:
   - Define sample articles (`test_article_1`, `test_article_2`, `test_article_3`) to check the model’s summarization ability.
   - Each article provides information on a unique topic, allowing the model to generate diverse summaries.

2. **Formatting Input for Summarization**:
   - The T5 model requires inputs to specify the task explicitly, so we prefix each article with `summarize:` to indicate a summarization task.
   - Each formatted article is assigned to `input_text`.

3. **Generating and Displaying Summaries**:
   - The `generate_output` function takes each formatted `input_text` and generates a summary. The function outputs the summarized content for easy evaluation of the model’s performance.

4. **Example Output**:
   - Sample output includes summaries for various topics such as the history of the internet, space exploration, and renewable energy.


In [9]:
import torch
def generate_output(input_text, max_length=50):
    # Move model to the appropriate device (GPU if available, else CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Encode the input text and move it to the same device as the model
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate the model output
    outputs = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)

    # Decode and return the output text
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Test cases
test_article_1 = """The history of the internet dates back to the 1960s when ARPANET, a research network funded by the U.S. Department of Defense, was created. Over time, the technology evolved, and in 1991, the World Wide Web became publicly accessible, revolutionizing how people communicate and access information."""
input_text_1 = f"summarize: {test_article_1}"
print("Summary 1:", generate_output(input_text_1))

test_article_2 = """Space exploration has always fascinated humanity. With advancements in technology, companies like SpaceX and NASA are working on missions to send humans to Mars, aiming to make space travel more accessible and affordable for future generations."""
input_text_2 = f"summarize: {test_article_2}"
print("Summary 2:", generate_output(input_text_2))

test_article_3 = """Renewable energy sources, such as solar, wind, and hydroelectric power, are becoming increasingly important in combating climate change. These sources are sustainable and help reduce our dependence on fossil fuels, which contribute to greenhouse gas emissions."""
input_text_3 = f"summarize: {test_article_3}"
print("Summary 3:", generate_output(input_text_3))


Summary 1: the history of the internet dates back to the 1960s when ARPANET, a research network funded by the u.s. department of defense, was created. over time, the technology evolved, and in 1991, the world
Summary 2: space exploration has always fascinated humanity. companies like SpaceX and NASA are working on missions to send humans to Mars.
Summary 3: renewable energy sources, such as solar, wind, and hydroelectric power, are becoming increasingly important in combating climate change.


## Testing Model Output with the `generate_output` Function for Question Answering (QA)

This section demonstrates how to test the QA capabilities of a model using the `generate_output` function. The function processes input questions along with context, returning concise answers.

### Code Overview

1. **Preparing the Test Questions and Contexts**:
   - Define sample questions (`test_question_1`, `test_question_2`, `test_question_3`) alongside relevant contexts (`test_context_1`, `test_context_2`, `test_context_3`) for each example.
   - Each example provides a unique question-context pair, allowing the model to generate diverse answers.

2. **Formatting Input for Question Answering**:
   - The model requires input to specify the task and provide necessary context, so each question is prefixed with `question:` and followed by `context:` along with the background information.
   - Each formatted question-context pair is assigned to `input_text`.

3. **Generating and Displaying Answers**:
   - The `generate_output` function processes each `input_text` and produces an answer, which is then printed. This enables easy evaluation of the model’s response accuracy.

4. **Example Output**:
   - Sample output includes answers for questions on topics like:
     - The developer of the theory of relativity.
     - The tallest mountain in the world.
     - The main ingredient in guacamole.

This setup provides a simple framework for testing and evaluating QA models with different question-context pairs.


In [10]:
# Test output with generate_output function for QA
# Example 1
test_question_1 = "Who developed the theory of relativity?"
test_context_1 = "Albert Einstein developed the theory of relativity, which revolutionized theoretical physics."
input_text_1 = f"question: {test_question_1} context: {test_context_1}"
print("Answer 1:", generate_output(input_text_1))

# Example 2
test_question_2 = "What is the tallest mountain in the world?"
test_context_2 = "Mount Everest is the tallest mountain in the world, standing at 8,848 meters above sea level."
input_text_2 = f"question: {test_question_2} context: {test_context_2}"
print("Answer 2:", generate_output(input_text_2))

# Example 3
test_question_3 = "What is the main ingredient in guacamole?"
test_context_3 = "The main ingredient in guacamole is avocado, which gives it its creamy texture and rich flavor."
input_text_3 = f"question: {test_question_3} context: {test_context_3}"
print("Answer 3:", generate_output(input_text_3))

Answer 1: Albert Einstein
Answer 2: Mount Everest
Answer 3: avocado


# Saving the Model and the tokenized files


In [None]:
trainer.save_model("./models")
tokenizer.save_pretrained("./models")

('./models/tokenizer_config.json',
 './models/special_tokens_map.json',
 './models/spiece.model',
 './models/added_tokens.json')

In [None]:
import pandas as pd
import torch

# Assuming `model` and `tokenizer` are already loaded and set up
# Example input and label samples
sample_inputs = [
    "Artificial Intelligence is transforming industries by automating processes, improving decision-making, and enhancing user experiences. From healthcare to finance, AI-powered systems are enabling new levels of efficiency and insights.",
    "The Great Barrier Reef, located off the coast of Queensland, Australia, is the largest coral reef system in the world. It faces numerous threats, including climate change and coral bleaching, which endanger the reef's biodiversity.",
    "Electric vehicles (EVs) are becoming increasingly popular as they offer an eco-friendly alternative to traditional gasoline-powered cars. With advancements in battery technology, EVs now provide longer ranges and faster charging times."
]
sample_labels = [
    "AI is revolutionizing various industries by automating processes and enhancing decision-making.",
    "The Great Barrier Reef in Australia is the world’s largest coral reef, facing threats from climate change and coral bleaching.",
    "Electric vehicles are gaining popularity due to eco-friendly benefits and advances in battery technology."
]

# Lists to store data
input_texts = []
label_texts = []
generated_texts = []

# Generate and store data
for i in range(3):
    # Encode the input text with a prompt for English output
    input_text = f"summarize in English: {sample_inputs[i]}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Get the label text for reference
    label_text = sample_labels[i]

    # Generate output
    output_ids = model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Append to lists
    input_texts.append(sample_inputs[i])  # Original input text
    label_texts.append(label_text)        # Reference label
    generated_texts.append(output_text)   # Generated summary or output

# Create DataFrame
df = pd.DataFrame({
    "Input": input_texts,
    "Label": label_texts,
    "Generated": generated_texts
})

# Display the DataFrame
print(df)


                                               Input  \
0  Artificial Intelligence is transforming indust...   
1  The Great Barrier Reef, located off the coast ...   
2  Electric vehicles (EVs) are becoming increasin...   

                                               Label  \
0  AI is revolutionizing various industries by au...   
1  The Great Barrier Reef in Australia is the wor...   
2  Electric vehicles are gaining popularity due t...   

                                           Generated  
0  artificial intelligence is transforming indust...  
1  the Great Barrier Reef, located off the coast ...  
2  electric vehicles (EVs) are becoming increasin...  


# Saving the result in a .csv


In [None]:
df.to_csv("/content/generated_texts_table.csv", index=False)

print("DataFrame saved as 'generated_texts_table.csv'.")

DataFrame saved as 'generated_texts_table.csv'.
