In [1]:
import pandas as pd
import re
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from time import time
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

In [2]:
!kaggle datasets download gowrishankarp/newspaper-text-summarization-cnn-dailymail

Dataset URL: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
License(s): CC0-1.0
Downloading newspaper-text-summarization-cnn-dailymail.zip to /content
 99% 500M/503M [00:23<00:00, 22.7MB/s]
100% 503M/503M [00:23<00:00, 22.1MB/s]


In [4]:
!unzip newspaper-text-summarization-cnn-dailymail.zip -d data

Archive:  newspaper-text-summarization-cnn-dailymail.zip
  inflating: data/cnn_dailymail/test.csv  
  inflating: data/cnn_dailymail/train.csv  
  inflating: data/cnn_dailymail/validation.csv  


In [2]:
path="/content/data/cnn_dailymail"
file_labels = ["test","trian", "valid"]
file_path=[os.path.join(path,filename) for filename in os.listdir(path) if filename.endswith(".csv")]
frames = [pd.read_csv(filepath, sep=',').assign(split=label) for filepath,label in zip(file_path, file_labels) ]
df=pd.concat(frames)

In [None]:
duplicate_rows = df.duplicated().sum()
duplicate_rows

In [None]:
train_data = df[df["split"]=="trian"]
test_data = df[df["split"]=="test"]
val_data =  df[df["split"]=="valid"]

In [11]:
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)
print("Validation data shape:", val_data.shape)

Train data shape: (287113, 3)
Test data shape: (11490, 3)
Validation data shape: (13368, 3)


In [12]:
train_data.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [None]:
# # Reduce dataset size for initial training
# train_data = train_data.sample(500)  # Use 500 samples for training
# val_data = val_data.sample(100)      # Use 100 samples for validation
# test_data = test_data.sample(500)

In [13]:
# افتراض أن الملفات تحتوي على أعمدة 'article' و 'highlights'
train_texts = train_data['article']
train_summaries = train_data['highlights']

test_texts = test_data['article']
test_summaries = test_data['highlights']

val_texts = val_data['article']
val_summaries = val_data['highlights']


In [None]:
# Assume datasets have 'article' and 'highlights' columns
def clean_english_text(text):
    """Clean English text by removing unnecessary characters."""
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply cleaning function
for dataset in [train_data, test_data, val_data]:
    dataset['article'] = dataset['article'].apply(clean_english_text)
    dataset['highlights'] = dataset['highlights'].apply(clean_english_text)

In [None]:
train_data.head()
test_data.head()

                                              id  \
217919  a62347d0d9195494b613208df56669bd60ed4b51   
174023  6d33592a279d6fb17b12fedbdbaa5bfc51e3dbad   
130671  34fe21371c27c91bdbe241fb09a2dd7210635ab6   
150955  4f2c6de9a49e7717fce72ca6096981a69ccd715b   
215468  a2e755c67f777694ac6f031acecb901bebc97562   

                                                  article  \
217919  by daily mail reporters published 1054 est 11 ...   
174023  with wayne rooney set to make his 100th englan...   
130671  by damien gayle published 0613 est 23 january ...   
150955  by louise boyle a wealthy restaurateur has rev...   
215468  the worlds most enviable bodies arrived in lon...   

                                               highlights  
217919  reality tv star 39 was diagnosed with stage 3 ...  
174023  wayne rooney will be handed his 100th england ...  
130671  latest version of the microsoft tablet runs fu...  
150955  athanasios konidaris claims that his wife shel...  
215468  the worlds m

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# تحميل الـ Tokenizer والنموذج باستخدام المجلد الذي يحتوي على الملفات
tokenizer = T5Tokenizer.from_pretrained('E:/t5_small')
model = T5ForConditionalGeneration.from_pretrained('E:/t5_small')


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Define a custom Dataset class
class TextSummaryDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_input_length=512, max_output_length=150):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        summary = self.summaries.iloc[idx]

        input_encodings = self.tokenizer(
            text, truncation=True, padding='max_length', max_length=self.max_input_length, return_tensors="pt"
        )
        target_encodings = self.tokenizer(
            summary, truncation=True, padding='max_length', max_length=self.max_output_length, return_tensors="pt"
        )

        return {
            'input_ids': input_encodings['input_ids'].squeeze(0),
            'attention_mask': input_encodings['attention_mask'].squeeze(0),
            'labels': target_encodings['input_ids'].squeeze(0)
        }


In [None]:
# Create DataLoader instances
train_dataset = TextSummaryDataset(train_data['article'], train_data['highlights'], tokenizer)
test_dataset = TextSummaryDataset(test_data['article'], test_data['highlights'], tokenizer)
val_dataset = TextSummaryDataset(val_data['article'], val_data['highlights'], tokenizer)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4)
val_loader = DataLoader(val_dataset, batch_size=4)


In [None]:
print(f"Number of samples in train dataset: {len(train_dataset)}")
print(f"Number of batches in train loader: {len(train_loader)}")

print(f"Number of samples in test dataset: {len(test_dataset)}")



Number of samples in train dataset: 500
Number of batches in train loader: 125
Number of samples in test dataset: 500


In [None]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
for batch_idx, batch in enumerate(train_loader):
    print(f"Processing batch {batch_idx + 1}/{len(train_loader)}")


Processing batch 1/125
Processing batch 2/125
Processing batch 3/125
Processing batch 4/125
Processing batch 5/125
Processing batch 6/125
Processing batch 7/125
Processing batch 8/125
Processing batch 9/125
Processing batch 10/125
Processing batch 11/125
Processing batch 12/125
Processing batch 13/125
Processing batch 14/125
Processing batch 15/125
Processing batch 16/125
Processing batch 17/125
Processing batch 18/125
Processing batch 19/125
Processing batch 20/125
Processing batch 21/125
Processing batch 22/125
Processing batch 23/125
Processing batch 24/125
Processing batch 25/125
Processing batch 26/125
Processing batch 27/125
Processing batch 28/125
Processing batch 29/125
Processing batch 30/125
Processing batch 31/125
Processing batch 32/125
Processing batch 33/125
Processing batch 34/125
Processing batch 35/125
Processing batch 36/125
Processing batch 37/125
Processing batch 38/125
Processing batch 39/125
Processing batch 40/125
Processing batch 41/125
Processing batch 42/125
P

In [None]:
for batch in train_loader:
    print(batch)
    break


{'input_ids': tensor([[  57,   46,  189,  ..., 2738,   15,    1],
        [  57,  528,    7,  ..., 1436,   40,    1],
        [  57,    3,   23,  ...,  221,   26,    1],
        [ 126,   42,  109,  ...,   29, 1131,    1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[  563,  2237,    57,     3, 27341,     3,    60,    15,    26, 17376,
          3986,    13, 23332,     7,    13,   853,     3,     9,  2672,   764,
           139,     3,   115, 10694,    77,    45,     8,  3134,   760,  6347,
           147,   628,    13,    44,   709,   507,   767,     3, 21217,     3,
          8715,     3,  5840,  1152,    11,   662,   724,    13,     3,  3810,
         10195,   336,   215,  2095,   435, 29309,    28,    46,  5861,  2815,
           701,    13,   627,   770,    16, 15941, 25233,  2095,   243,    79,
           857,     3,  3810,    47,   492,   944,   770,

In [None]:
# Define training loop
def train_model(model, train_loader, val_loader, optimizer, epochs=3, device="cuda" if torch.cuda.is_available() else "cpu"):
    model.to(device)
    model.train()

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        epoch_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_loader)}")

    print("Training complete!")


In [None]:
# Train the model
train_model(model, train_loader, val_loader, optimizer, epochs=3)


Epoch 1/3
Epoch 1 Loss: 1.3431157188415528
Epoch 2/3
Epoch 2 Loss: 1.2234419770240783
Epoch 3/3


In [None]:
def evaluate_model(model, val_loader, device="cuda" if torch.cuda.is_available() else "cpu"):
    model.eval()
    model.to(device)
    total_loss = 0

    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            print(f"Validation Batch {batch_idx + 1}/{len(val_loader)} Loss: {loss.item()}")

    avg_loss = total_loss / len(val_loader)
    print(f"Average Validation Loss: {avg_loss}")


In [None]:
evaluate_model(model, val_loader)


Validation Batch 1/25 Loss: 1.1243853569030762
Validation Batch 2/25 Loss: 1.1185128688812256
Validation Batch 3/25 Loss: 1.1406519412994385
Validation Batch 4/25 Loss: 1.0383745431900024
Validation Batch 5/25 Loss: 0.8452817797660828
Validation Batch 6/25 Loss: 1.2136354446411133
Validation Batch 7/25 Loss: 0.7708368897438049
Validation Batch 8/25 Loss: 0.9686002731323242
Validation Batch 9/25 Loss: 1.0825281143188477
Validation Batch 10/25 Loss: 1.338932752609253
Validation Batch 11/25 Loss: 1.4474706649780273
Validation Batch 12/25 Loss: 1.4606175422668457
Validation Batch 13/25 Loss: 1.6689664125442505
Validation Batch 14/25 Loss: 1.1426756381988525
Validation Batch 15/25 Loss: 1.3984994888305664
Validation Batch 16/25 Loss: 0.868691623210907
Validation Batch 17/25 Loss: 0.9127963185310364
Validation Batch 18/25 Loss: 1.1636126041412354
Validation Batch 19/25 Loss: 1.2219665050506592
Validation Batch 20/25 Loss: 0.9548355340957642
Validation Batch 21/25 Loss: 1.7052701711654663
Val

In [None]:
# Save the cleaned datasets (Optional)
train_data.to_csv('cleaned_train.csv', index=False)
test_data.to_csv('cleaned_test.csv', index=False)
val_data.to_csv('cleaned_validation.csv', index=False)

In [None]:
# Define a function for summarization
def summarize_text(text, model, tokenizer, max_length=150):
    """Generate summary for a given text."""
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# Example usage
print("\nGenerating summary for a sample text...")
example_text = train_data['article'].iloc[1]
generated_summary = summarize_text(example_text, model, tokenizer)
print(f"Original Text: {example_text}")
print(f"Generated Summary: {generated_summary}")



Generating summary for a sample text...
Original Text: cnn manfred zbrzezny wants to bulk up his arsenal for the past few years the founder of the liberiabased fyrkuna metalworks and his team of skilful craftsmen have been collecting weapons scrap relics of the west african countrys vicious civil conflict for their arms into art project from their opensided workshop near monrovia the capital of liberia the artisans expertly cut melt and weld scrap metals to fashion them into intricate sculptures in their hands decommissioned ak47s rpg launchers and bazookas are transformed into elaborate candle stands whimsical lamps and even lifesize metal trees what was an instrument of suffering can become something beautiful and useful says zbrzezny a germanitalian artistic blacksmith whos been calling liberia home since the mid2000s read this artist gives trash a second chance i want to do something serene out of all these things that are violent and nasty he adds i prefer doing my pieces a littl

In [None]:
from tqdm import tqdm
from torch.utils.data import Subset

# اختيار أول 100 عينة فقط من بيانات الاختبار
test_subset = Subset(test_dataset, list(range(100)))  # أول 100 عينة
test_loader = DataLoader(test_subset, batch_size=8)



def evaluate_model(model, test_loader, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu"):
    """
    Evaluate the trained model on the test dataset.

    Parameters:
        model: The trained T5 model.
        test_loader: DataLoader for the test dataset.
        tokenizer: T5 tokenizer.
        device: The device to run the evaluation on (default: GPU if available).

    Returns:
        A list of tuples with the original text, true summary, and generated summary.
    """
    model.to(device)
    model.eval()
    results = []


    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            true_summaries = batch['labels']

            # Generate summaries
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=150,
                min_length=30,
                length_penalty=2.0,
                num_beams=2,
                early_stopping=True
            )

            # Decode the generated summaries and the true summaries
            generated_summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids]
            true_summaries = [tokenizer.decode(t, skip_special_tokens=True) for t in true_summaries]

            # Store results
            for i in range(len(generated_summaries)):
                results.append({
                    "original_text": tokenizer.decode(input_ids[i], skip_special_tokens=True),
                    "true_summary": true_summaries[i],
                    "generated_summary": generated_summaries[i],
                })

    return results
from torch.utils.data import Subset


# Run evaluation on the test dataset
test_results = evaluate_model(model, test_loader, tokenizer)

# Display a few results
for i, result in enumerate(test_results[:5]):  # Display first 5 results
    print(f"\nExample {i + 1}")
    print(f"Original Text: {result['original_text']}")
    print(f"True Summary: {result['true_summary']}")
    print(f"Generated Summary: {result['generated_summary']}")


NameError: name 'test_dataset' is not defined

In [None]:
for text in test_data['article']:
    summary = summarize_text(text, model, tokenizer)
    print(f"Original: {text}")
    print(f"Summary: {summary}")


Original: a gel made from patients own blood could help heal burns more quickly and stop wounds getting worse the gel contains plateletrich plasma prp a concentrated mix of substances in the blood that play a role in healing it is made by taking a small amount of blood from a patients arm amounts vary depending on wound size but typically no more than a couple of tablespoons and then processing it in a machine that spins the blood at high speed until it separates into its various components blood is largely made up of a clear yellowish liquid called plasma but it also contains small solid compounds red cells white cells and platelets the platelets are important for clotting but they also contain hundreds of proteins called growth factors that are important for healing us researchers are testing the new gel it as a dressing for burns spinning the blood leaves behind the plasma and the platelets but in higher concentration up to ten times greater than usual plateletrich plasma is sometim

KeyboardInterrupt: 