In [1]:
# Mount Google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Pip installations
!pip install transformers
!pip install torch
!pip install scikit-learn
!pip install pandas
!pip install pytorch-lightning
!pip install datasets rouge-score
!pip install evaluate

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.4.2-py3-none-any.whl.metadata (19 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.7-py3-none-any.whl.metadata (5.2 kB)
Downloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.7-py3-none-any.whl (26 kB)
Downloading torchmetrics-1.4.2-py3-none-any.whl (869 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m869.2/869.2 kB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.11.7 pytorch-lightning-2.4.0 torchmetrics-1.4.2
Collecting datasets
  

In [3]:
# Import necessary libraries and packages
import numpy as np
import pandas as pd

import torch
import evaluate
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader

from nltk.translate.bleu_score import sentence_bleu
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BartTokenizer

In [4]:
# Load the data
subset_path= '/content/drive/My Drive/BART_Abstract_Text_Summarization/curated_data_subset.csv'
full_data_path= '/content/drive/My Drive/BART_Abstract_Text_Summarization/curated_data.csv'

# Read the CSV files
df_subset= pd.read_csv(subset_path)
df_full= pd.read_csv(full_data_path)

# Display the first few rows to explore the structure
print('Subset Data:')
print(df_subset.head())

print('\nFull Data:')
print(df_full.head())

Subset Data:
   Unnamed: 0                                              title  \
0           0  Tencent gains approval to sell mutual funds to...   
1           1  India testing blockchains in education, health...   
2           2  Higher living wage risks robot takeover of low...   
3           3  Regus WeWork may seek stock-market flotation t...   
4           4  AMD poised to gain market share as Intel pound...   

                                             summary  \
0  Tencent has been granted a licence from the Ch...   
1  India is testing blockchain applications in ed...   
2  The UK Institute for Fiscal Studies has warned...   
3  Co-working start-up WeWork may go public this ...   
4  Intel has seen more than $11bn wiped off its m...   

                                                 url  \
0  http://www.scmp.com/business/companies/article...   
1  https://www.vccircle.com/niti-aayog-explores-b...   
2  https://news.sky.com/story/ifs-living-wage-inc...   
3  https://www.fo

In [5]:
# Load the tokenizer
tokenizer= BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Ensure all content is string type and handle missing values
df_full['article_content']= df_full['article_content'].fillna('').astype(str)
df_full['summary']= df_full['summary'].fillna('').astype(str)

# Now tokenize the article content and summary
df_full['article_content']= df_full['article_content'].apply(lambda x: tokenizer.batch_encode_plus(
    [x], max_length= 512, padding= 'max_length', truncation= True, return_tensors= 'pt'))

df_full['summary']= df_full['summary'].apply(lambda x: tokenizer.batch_encode_plus(
    [x], max_length= 150, padding= 'max_length', truncation= True, return_tensors= 'pt'))

# View tokenized data
print(df_full[['article_content', 'summary']].head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]



               article_content                      summary
0  [input_ids, attention_mask]  [input_ids, attention_mask]
1  [input_ids, attention_mask]  [input_ids, attention_mask]
2  [input_ids, attention_mask]  [input_ids, attention_mask]
3  [input_ids, attention_mask]  [input_ids, attention_mask]
4  [input_ids, attention_mask]  [input_ids, attention_mask]


In [6]:
# Perform train-test split (80% training, 20% validation)
train_df, val_df= train_test_split(df_full, test_size=0.2, random_state=42)

# Print the size of each set
print(f'Training set size: {len(train_df)}')
print(f'Validation set size: {len(val_df)}')

Training set size: 30213
Validation set size: 7554


In [7]:
# Define a class that helps load batches of tokenized data
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, source_len, target_len):
        self.tokenizer= tokenizer
        self.data= data
        self.source_len= source_len
        self.target_len= target_len
        self.source_text= data['article_content']
        self.target_text= data['summary']

    def __len__(self):

        # Return the number of examples
        return len(self.target_text)

    def __getitem__(self, index):

        # Get the source (article content) and target (summary) text
        source_text= str(self.source_text[index])
        target_text= str(self.target_text[index])

        # Tokenize the source and target text
        source= self.tokenizer.batch_encode_plus(
            [source_text],
            max_length= self.source_len,
            padding= 'max_length',
            truncation= True,
            return_tensors= 'pt'
        )

        target= self.tokenizer.batch_encode_plus(
            [target_text],
            max_length= self.target_len,
            padding= 'max_length',
            truncation= True,
            return_tensors= 'pt'
        )

        # Return the tokenized input and target
        return {
            'source_ids': source['input_ids'].squeeze(),
            'source_mask': source['attention_mask'].squeeze(),
            'target_ids': target['input_ids'].squeeze(),
            'target_mask': target['attention_mask'].squeeze()
        }

In [8]:
# Define a function to create data loaders for the training and validation sets
def create_data_loader(dataframe, tokenizer, source_len, target_len, batch_size):
    dataset= CustomDataset(
        data= dataframe,
        tokenizer= tokenizer,
        source_len= source_len,
        target_len= target_len
    )

    return DataLoader(dataset, batch_size= batch_size, shuffle= True, num_workers= 4)

# Define parameters for the DataLoader
BATCH_SIZE= 8
SOURCE_LEN= 512
TARGET_LEN= 150

# Reset the indices of both training and validation sets
train_df= train_df.reset_index(drop= True)
val_df= val_df.reset_index(drop= True)

# Create training and validation data loaders
train_data_loader= create_data_loader(train_df, tokenizer, SOURCE_LEN, TARGET_LEN, BATCH_SIZE)
val_data_loader= create_data_loader(val_df, tokenizer, SOURCE_LEN, TARGET_LEN, BATCH_SIZE)

# Check the DataLoader output
data= next(iter(train_data_loader))
print(data.keys())
print(f"Source_ids shape: {data['source_ids'].shape}")
print(f"Target_ids shape: {data['target_ids'].shape}")

dict_keys(['source_ids', 'source_mask', 'target_ids', 'target_mask'])
Source_ids shape: torch.Size([8, 512])
Target_ids shape: torch.Size([8, 150])


In [9]:
# Define the Abstractive Summarization Model using PyTorch Lightning
class AbstractiveSummarizationModel(pl.LightningModule):

    # Initialize the model and tokenizer
    def __init__(self, model, tokenizer):
        super(AbstractiveSummarizationModel, self).__init__()
        self.model= model  # BART model for summarization
        self.tokenizer= tokenizer  # Tokenizer used to preprocess text data

    # Forward pass - how the model processes input and generates output
    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask=None):
        outputs= self.model(
            input_ids= input_ids,  # Tokenized article content
            attention_mask= attention_mask,  # Mask to focus on real tokens (ignore padding)
            decoder_input_ids= decoder_input_ids,  # Tokenized summary (target text)
            decoder_attention_mask= decoder_attention_mask  # Mask to ignore padding in summary
        )
        return outputs.loss, outputs.logits  # Return loss and model output (logits)

    # Define one step of training - how the model learns from the data
    def training_step(self, batch, batch_idx):
        input_ids= batch['source_ids']  # Tokenized articles (input)
        attention_mask= batch['source_mask']  # Attention mask for the input
        decoder_input_ids= batch['target_ids']  # Tokenized summaries (target)
        decoder_attention_mask= batch['target_mask']  # Attention mask for the summaries

        # Perform forward pass for training
        loss, outputs= self(
            input_ids= input_ids,
            attention_mask= attention_mask,
            decoder_input_ids= decoder_input_ids,
            decoder_attention_mask= decoder_attention_mask
        )

        # Ensure that the loss is not None before logging
        if loss is not None:
            self.log("train_loss", loss)
        return loss

    # Define one step of validation - how the model evaluates its performance
    def validation_step(self, batch, batch_idx):
        input_ids= batch['source_ids']  # Tokenized articles (input)
        attention_mask= batch['source_mask']  # Attention mask for the input
        decoder_input_ids= batch['target_ids']  # Tokenized summaries (target)
        decoder_attention_mask= batch['target_mask']  # Attention mask for the summaries

        # Perform forward pass for validation
        loss, outputs= self(
            input_ids=input_ids,
            attention_mask= attention_mask,
            decoder_input_ids= decoder_input_ids,
            decoder_attention_mask= decoder_attention_mask
        )

        # Ensure that loss is not None before logging
        if loss is not None:
            self.log('val_loss', loss)
        return loss

    # Define the optimizer - AdamW optimizer with learning rate of 3e-5
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr= 3e-5)

In [10]:
# Load the pretrained BART model for conditional generation
bart_model= BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Instantiate the summarization model we defined earlier
model= AbstractiveSummarizationModel(model= bart_model, tokenizer= tokenizer)

# Determine whether a GPU is available
accelerator= 'gpu' if torch.cuda.is_available() else 'cpu'
devices= 1

# Initialize the PyTorch Lightning trainer
trainer= pl.Trainer(
    max_epochs= 3,
    devices= devices,
    accelerator= accelerator,
    enable_progress_bar= True
)

# Start the training process
trainer.fit(model, train_data_loader, val_data_loader)

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                         | Params | Mode
--------------------------------------------------------------
0 | model | BartForConditionalGeneration | 406 M  | eval
--------------------------------------------------------------
406 M     Trainable params
0         Non-trainable params
406 M     Total params
1,625.162 Total estimated model params size (MB)
0         Modules in train mode
350       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:475: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Training: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [11]:
# Define the summarization function based on ProjectPro
def summarize_article(article):

    # Load BART model and tokenizer
    model_name= 'facebook/bart-large-cnn'
    tokenizer= BartTokenizer.from_pretrained(model_name)
    model= BartForConditionalGeneration.from_pretrained(model_name)

    # Tokenize and encode the article
    inputs= tokenizer.encode(article, return_tensors= 'pt', max_length= 1024, truncation= True)

    # Generate summary
    summary_ids= model.generate(inputs, num_beams= 4, max_length= 250, early_stopping= True)

    # Decode the summary with explicit clean_up_tokenization_spaces set
    summary= tokenizer.decode(summary_ids[0], skip_special_tokens= True, clean_up_tokenization_spaces= True)

    return summary

# Example usage
article= '''
My friends are cool but they eat too many carbs.
'''

summary= summarize_article(article)
print('Summary:')
print(summary)



Summary:
My friends are cool but they eat too many carbs. That's what this is all about. I don't want you to think I'm a bad person. I'm not. I just don't like to be around people who eat too much carbs. This is my way of telling you that.


In [12]:
# Define reference summary for comparison (human-written summary)
reference_summary= 'My friends are nice, but they consume too many carbohydrates.'

# Generate model summary
generated_summary= summarize_article(article)

print('Generated Summary:')
print(generated_summary)

# Load the ROUGE metric using the evaluate library
rouge= evaluate.load('rouge')

# Calculate the ROUGE score
scores= rouge.compute(predictions=[generated_summary], references=[reference_summary])

# Print ROUGE scores
print('\nROUGE scores:')
print(scores)

Generated Summary:
My friends are cool but they eat too many carbs. That's what this is all about. I don't want you to think I'm a bad person. I'm not. I just don't like to be around people who eat too much carbs. This is my way of telling you that.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]


ROUGE scores:
{'rouge1': 0.21875, 'rouge2': 0.12903225806451613, 'rougeL': 0.21875, 'rougeLsum': 0.21875}


In [13]:
# Improve model summary generation
def summarize_article_with_tweaks(article):

    # Load BART model and tokenizer
    model_name= 'facebook/bart-large-cnn'
    tokenizer= BartTokenizer.from_pretrained(model_name)
    model= BartForConditionalGeneration.from_pretrained(model_name)

    # Tokenize and encode the article
    inputs= tokenizer.encode(article, return_tensors= 'pt', max_length= 1024, truncation= True)

    # Generate summary with tweaked parameters
    summary_ids= model.generate(
        inputs,
        num_beams= 6,               # Increased beams for better quality
        max_length= 150,
        repetition_penalty= 1.2,    # Add repetition penalty to avoid repeated phrases
        early_stopping= True
    )

    # Decode the summary
    summary= tokenizer.decode(summary_ids[0], skip_special_tokens= True, clean_up_tokenization_spaces= True)

    return summary

# Example usage
article= '''
My friends are cool but they eat too many carbs.
'''

# Generate model summary with the new function
generated_summary= summarize_article_with_tweaks(article)

print('Generated Summary:')
print(generated_summary)

Generated Summary:
My friends are cool but they eat too many carbs. That's what this is all about. I don't want you to think I'm a bad person. I'm not. I just don't like to be around people who eat too much carbs. This is my way of dealing with it.


In [14]:
# Define reference summary for comparison (human-written summary)
reference_summary= 'My friends are nice, but they consume too many carbohydrates.'

# Load the ROUGE metric using the evaluate library
rouge = evaluate.load('rouge')

# Calculate the ROUGE score
scores= rouge.compute(predictions=[generated_summary], references=[reference_summary])

# Print ROUGE scores
print('\nROUGE scores:')
print(scores)


ROUGE scores:
{'rouge1': 0.21875, 'rouge2': 0.12903225806451613, 'rougeL': 0.21875, 'rougeLsum': 0.21875}
