<a href="https://colab.research.google.com/github/abhishekseth0023-ship-it/Text-Summarization/blob/main/Automatic_Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datasets import load_dataset

# Load the CNN/DailyMail dataset (config "3.0.0"). The result is indexed by
# split name in later cells, e.g. dataset["train"] and dataset["test"].
dataset = load_dataset("cnn_dailymail", "3.0.0")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the pretrained "t5-base" tokenizer and sequence-to-sequence model.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Prefer the GPU, but fall back to the CPU so this cell does not crash on a
# runtime without CUDA (e.g. a Colab session started without an accelerator).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print("Model loaded on:", device)

Model loaded on: cuda


In [None]:
# Token budget for the tokenized "summarize: ..." inputs (passed as max_length
# in tokenize_function).
MAX_INPUT_LEN = 512
# Token budget for the tokenized target summaries (passed as max_length in
# tokenize_function).
MAX_TARGET_LEN = 128

In [None]:
def preprocess_function(batch):
    """Build prompt/target text columns from a batch of raw examples.

    Every entry of ``batch["article"]`` is prefixed with ``"summarize: "``;
    ``batch["highlights"]`` is passed through unchanged as the targets.

    Returns:
        A dict with the keys ``"inputs"`` and ``"targets"``.
    """
    task_prefix = "summarize: "
    prompts = []
    for article in batch["article"]:
        prompts.append(task_prefix + article)

    return {"inputs": prompts, "targets": batch["highlights"]}

In [None]:
def tokenize_function(batch):
    """Tokenize a batch of prompt/target strings for seq2seq training.

    Args:
        batch: Mapping with ``"inputs"`` (prompt strings) and ``"targets"``
            (reference summaries), as produced by ``preprocess_function``.

    Returns:
        The tokenizer output for the inputs, with an added ``"labels"`` entry
        holding the target token ids. Padding positions in the labels are
        replaced by -100 so that they do not contribute to the loss.
    """
    model_inputs = tokenizer(
        batch["inputs"],
        max_length=MAX_INPUT_LEN,
        truncation=True,
        padding="max_length",
    )

    # `text_target=` is the replacement for the deprecated
    # `as_target_tokenizer()` context manager when tokenizing labels.
    labels = tokenizer(
        text_target=batch["targets"],
        max_length=MAX_TARGET_LEN,
        truncation=True,
        padding="max_length",
    )

    # -100 is the index ignored by the model's cross-entropy loss. Without
    # this masking the loss is also computed on the padding tokens that
    # padding="max_length" appends to every target.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [None]:

# Build the "inputs"/"targets" text columns with preprocess_function; the
# original columns of the train split are dropped via remove_columns.
processed_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

# Tokenize every split in batches with tokenize_function.
# NOTE(review): all splits are tokenized in full here, while the training cells
# below only use the first 5000 train / 1000 validation rows — selecting the
# subsets before mapping would likely save time; confirm before changing.
tokenized_dataset = processed_dataset.map(
    tokenize_function,
    batched=True
)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]



Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
# Work on small fixed subsets: the first 5000 training examples and the first
# 1000 validation examples.
small_train = tokenized_dataset["train"].select(range(5000))
small_val = tokenized_dataset["validation"].select(range(1000))

In [None]:
from torch.utils.data import DataLoader

# Number of examples per batch for the train and validation DataLoaders.
BATCH_SIZE = 4

def collate_fn(batch):
    """Stack per-example token lists into batched tensors.

    Each item of ``batch`` must provide ``"input_ids"``, ``"attention_mask"``
    and ``"labels"``. The result maps each of those keys to one tensor built
    with ``torch.tensor`` from the items, in batch order.
    """
    fields = ("input_ids", "attention_mask", "labels")
    return {
        field: torch.tensor([example[field] for example in batch])
        for field in fields
    }

# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(
    small_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn
)

# Validation batches keep the dataset order (shuffle=False).
val_loader = DataLoader(
    small_val,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn
)

In [None]:
import torch
from torch.optim import AdamW

# Fine-tune the model with AdamW for `num_epochs` passes over train_loader.
optimizer = AdamW(model.parameters(), lr=3e-5)
num_epochs = 1

model.train()

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    for step, batch in enumerate(train_loader):
        # Move the batch to the same `device` the model was moved to, instead
        # of calling .cuda() unconditionally (which fails without a GPU).
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing `labels` makes the model return the training loss.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss

        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()

        # Report the current batch loss every 200 steps.
        if step % 200 == 0:
            print(f"Step {step} - Loss: {loss.item():.4f}")


Epoch 1/1
Step 0 - Loss: 0.5982
Step 200 - Loss: 0.6569
Step 400 - Loss: 0.8983
Step 600 - Loss: 1.0585
Step 800 - Loss: 0.6668
Step 1000 - Loss: 0.8903
Step 1200 - Loss: 0.7987


In [None]:
def summarize(text, max_length=80, min_length=20, num_beams=6, max_input_length=512):
    """Generate a summary of ``text`` with the notebook's T5 model.

    Args:
        text: Text to summarize.
        max_length: ``max_length`` passed to ``model.generate``.
        min_length: ``min_length`` passed to ``model.generate``.
        num_beams: ``num_beams`` passed to ``model.generate``.
        max_input_length: The prompt is truncated to this many tokens.

    Returns:
        The decoded summary with special tokens removed, or ``""`` when
        ``text`` is ``None``, empty or whitespace-only.
    """
    # Nothing to summarize: skip generation instead of failing on None or
    # letting the model generate from the bare "summarize: " prompt.
    if not text or not text.strip():
        return ""

    model.eval()

    input_text = "summarize: " + text

    inputs = tokenizer.encode(
        input_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            length_penalty=1.5,
            no_repeat_ngram_size=3,
            early_stopping=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Try the model on the first article of the test split. Only the first 500
# characters of the article are printed; the full text is summarized.
sample_text = dataset["test"][0]["article"]
print("Original Article:\n", sample_text[:500], "...")

print("\nSummary:\n", summarize(sample_text))

Original Article:
 (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, includin ...

Summary:
 The Palestinian Authority officially becomes the 123rd member of the ICC . The accession was marked by a ceremony at The Hague, where the court is based . As members, Palestinians may be subject to counter-charges as well .


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the model can be saved there.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Save the model and the tokenizer into the same Drive folder, so both can be
# reloaded later from that single path.
model.save_pretrained("/content/drive/MyDrive/t5_finetuned_cnn")
tokenizer.save_pretrained("/content/drive/MyDrive/t5_finetuned_cnn")

('/content/drive/MyDrive/t5_finetuned_cnn/tokenizer_config.json',
 '/content/drive/MyDrive/t5_finetuned_cnn/special_tokens_map.json',
 '/content/drive/MyDrive/t5_finetuned_cnn/spiece.model',
 '/content/drive/MyDrive/t5_finetuned_cnn/added_tokens.json')

In [None]:
# NOTE(review): the previous cell already saves the model to this same path;
# this cell repeats that save and looks redundant — confirm and remove.
model.save_pretrained("/content/drive/MyDrive/t5_finetuned_cnn")

After restarting the runtime, re-mount Google Drive and reload the saved model and tokenizer by running the cells below.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the saved model folder is readable.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Folder on Google Drive that holds the saved fine-tuned model and tokenizer.
model_path = "/content/drive/MyDrive/t5_finetuned_cnn"

tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Prefer the GPU, but fall back to the CPU so the reload also works on a
# runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Loading weights:   0%|          | 0/257 [00:00<?, ?it/s]

In [None]:
def summarize(text, max_length=80, min_length=20, num_beams=6, max_input_length=512):
    """Generate a summary of ``text`` with the notebook's T5 model.

    Args:
        text: Text to summarize.
        max_length: ``max_length`` passed to ``model.generate``.
        min_length: ``min_length`` passed to ``model.generate``.
        num_beams: ``num_beams`` passed to ``model.generate``.
        max_input_length: The prompt is truncated to this many tokens.

    Returns:
        The decoded summary with special tokens removed, or ``""`` when
        ``text`` is ``None``, empty or whitespace-only.
    """
    # Nothing to summarize: skip generation instead of failing on None or
    # letting the model generate from the bare "summarize: " prompt.
    if not text or not text.strip():
        return ""

    model.eval()

    input_text = "summarize: " + text

    inputs = tokenizer.encode(
        input_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            length_penalty=1.5,
            no_repeat_ngram_size=3,
            early_stopping=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Summarize a short passage that is not taken from the CNN/DailyMail dataset.
text="""
Gears are manufactured through processes like casting, forging, and powder metallurgy, but the most common method is machining, which involves either cutting the gear teeth directly using a formed cutter or generating them through the relative motion between the gear blank and a cutting tool. Popular generating methods include hobbing and shaping, with hobbing being widely used for high-volume production of external gears like spur and helical gears.
"""
summary=summarize(text)
print(summary)

machining involves either cutting the gear teeth directly using a formed cutter or generating them through the relative motion between the gear blank and a cutting tool . Popular generating methods include hobbing and shaping .


In [None]:
import gradio as gr

def summarize_interface(text):
    """Gradio callback: forward the textbox content to ``summarize``."""
    summary = summarize(text)
    return summary

# Simple web UI: a multi-line textbox in, the generated summary out.
ui = gr.Interface(
    fn=summarize_interface,
    inputs=gr.Textbox(lines=10, label="Enter text to summarize"),
    outputs=gr.Textbox(lines=5, label="Summary"),
    title="Text Summarizer"
)

# Start the app. Per the recorded output, on Colab Gradio enables share=True
# automatically and serves the UI on a temporary public URL.
ui.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4cd546b69b60dc8c46.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


