In [None]:
#======== Install packages =========#

# pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
# pip install transformers
# pip install datasets
# pip install pytorch-lightning
# pip install rouge_score


#======== Dataset =========#
# Download data from Kaggle (https://www.kaggle.com/datasets/sunnysai12345/news-summary)
# The dataset consists of 4515 examples and contains Author_name, Headlines, Url of Article, Short text, Complete Article. I gathered the summarized news from Inshorts and only scraped the news articles from Hindu, Indian Times and Guardian. The time period ranges from February to August 2017.


# reference
# https://huggingface.co/course/chapter7/5?fw=tf
# https://huggingface.co/docs/transformers/training

In [1]:
!pip install rouge_score

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [26]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
from tqdm import tqdm, tqdm_notebook
import pandas as pd
import numpy as np
import gc

from datasets import load_metric
import torch
from pathlib import Path 
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger


from sklearn.model_selection import train_test_split
# from termcolor import colored
import textwrap 

import transformers
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)

In [4]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

%matplolib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize'] = 16, 10

In [6]:
# Report the CUDA allocator state. The summary is CUDA-specific, so skip it
# (with a message) instead of failing when no GPU is available.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("CUDA is not available; no GPU memory summary to report.")

In [7]:
# Fix the global random seed through PyTorch Lightning so runs are repeatable.
pl.seed_everything(42)

In [8]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
# The device-name lookup is CUDA-only; calling it after falling back to the
# CPU would raise, so only query it when a CUDA device was actually selected.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
print()

In [9]:
# Data
# Names of the two CSV columns kept from the raw frame.
# NOTE(review): the column selected through TEXT_COLUMN is later renamed to
# 'summary' and the one selected through SUMMARY_COLUMN to 'text' (see the
# `df.columns = ['summary', 'text']` assignment), so these two constant names
# are inverted relative to how the columns are used downstream.
TEXT_COLUMN = 'text'
SUMMARY_COLUMN = 'ctext' 

# Model
# MODEL_NAME is passed to `from_pretrained` for both tokenizer and model,
# N_EPOCHS to the Trainer as `max_epochs`, BATCH_SIZE to the data module.
MODEL_NAME = "t5-base"
N_EPOCHS = 3
BATCH_SIZE = 8

In [10]:
#### Load data
df = pd.read_csv("/kaggle/input/news-summary/news_summary.csv",  encoding='iso-8859-1')
df.head()

In [11]:
df = df[[TEXT_COLUMN, SUMMARY_COLUMN]]
df.head()

In [12]:
df.columns = ['summary', 'text']
df = df.dropna()
df.head()

In [13]:
# Hold out 20% of the rows for evaluation. An explicit random_state makes the
# split identical on every run, independent of global RNG state or of the
# order in which cells were executed.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.shape, test_df.shape

### Data Preprocessing

In [14]:
class NewsSummaryDataset(Dataset):
    """Map-style dataset that tokenizes one (article, summary) pair per item.

    Args:
        data: Frame with a "text" column (model input) and a "summary"
            column (target). Rows are accessed by position.
        tokenizer: Callable tokenizer that also exposes ``pad_token_id``.
        text_max_token_len: Length every article is padded/truncated to.
        summary_max_token_len: Length every summary is padded/truncated to.
    """

    def __init__(self,
                 data: pd.DataFrame,
                 tokenizer: "T5Tokenizer",
                 text_max_token_len: int = 512,
                 summary_max_token_len: int = 128
                ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def _encode(self, text: str, max_length: int):
        """Tokenize ``text`` to exactly ``max_length`` tokens as "pt" tensors."""
        return self.tokenizer(text,
                              max_length=max_length,
                              padding="max_length",
                              truncation=True,
                              return_attention_mask=True,
                              add_special_tokens=True,
                              return_tensors="pt"
                              )

    def __getitem__(self, index: int):
        """Return the raw strings plus flattened encoder inputs and labels.

        The returned dict has the keys ``text``, ``summary``,
        ``text_input_ids``, ``text_attention_mask``, ``labels`` and
        ``labels_attention_mask``.
        """
        data_row = self.data.iloc[index]
        text = data_row["text"]
        summary = data_row["summary"]

        text_encoding = self._encode(text, self.text_max_token_len)
        summary_encoding = self._encode(summary, self.summary_max_token_len)

        # Padding positions in the target are replaced by -100, the label
        # value Hugging Face models exclude from the loss. The pad id comes
        # from the tokenizer instead of being hard-coded to 0, so the dataset
        # keeps working with tokenizers whose pad id differs.
        labels = summary_encoding["input_ids"]
        labels[labels == self.tokenizer.pad_token_id] = -100

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten() drops it so the DataLoader can stack items itself.
        return dict(text=text,
                    summary=summary,
                    text_input_ids=text_encoding["input_ids"].flatten(),
                    text_attention_mask=text_encoding["attention_mask"].flatten(),
                    labels=labels.flatten(),
                    labels_attention_mask=summary_encoding["attention_mask"].flatten()
                    )

In [15]:
class NewsSummaryDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train and held-out frames.

    Args:
        train_df: Frame used to build the training dataset.
        test_df: Frame used to build the dataset served by ``val_dataloader``.
        tokenizer: Tokenizer handed to each ``NewsSummaryDataset``.
        batch_size: Batch size for both loaders.
        text_max_token_len: Forwarded to ``NewsSummaryDataset``.
        summary_max_token_len: Forwarded to ``NewsSummaryDataset``.
        num_workers: Worker processes per DataLoader. Defaults to 4 (the
            previously hard-coded value); pass 0 on Windows.
    """

    def __init__(self,
                 train_df: pd.DataFrame,
                 test_df: pd.DataFrame,
                 tokenizer: T5Tokenizer,
                 batch_size: int=8,
                 text_max_token_len: int = 512,
                 summary_max_token_len: int = 128,
                 num_workers: int = 4
                ):
        # call initialization of Base Class
        super().__init__()

        self.train_df = train_df
        self.test_df = test_df

        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
        self.num_workers = num_workers

    def _make_dataset(self, frame: pd.DataFrame):
        """Build a NewsSummaryDataset over ``frame`` with this module's settings."""
        return NewsSummaryDataset(frame,
                                  self.tokenizer,
                                  self.text_max_token_len,
                                  self.summary_max_token_len)

    def _make_loader(self, dataset, shuffle: bool):
        """Build a DataLoader sharing this module's batch size and worker count."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          shuffle=shuffle,
                          num_workers=self.num_workers
                         )

    def setup(self, stage=None):
        """Create the train and held-out datasets (``stage`` is not used)."""
        self.train_dataset = self._make_dataset(self.train_df)
        self.test_dataset = self._make_dataset(self.test_df)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Ordered loader over the held-out dataset (built from ``test_df``)."""
        return self._make_loader(self.test_dataset, shuffle=False)

In [16]:
# create tokenizer object
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

#### EDA

In [17]:
# Token length (size of the tokenizer.encode output) of every training
# article and of every training summary, in row order.
text_token_counts = [
    len(tokenizer.encode(article)) for article in train_df["text"]
]
summary_token_counts = [
    len(tokenizer.encode(abstract)) for abstract in train_df["summary"]
]

In [18]:
# Side-by-side histograms of the article and summary token counts.
fig, (ax1, ax2) = plt.subplots(1, 2)

sns.histplot(text_token_counts, ax=ax1)
ax1.set_title("full text token counts")
ax1.set_xlabel("tokens per article")

sns.histplot(summary_token_counts, ax=ax2)
ax2.set_title("full summary token counts")
ax2.set_xlabel("tokens per summary")

# Show the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

In [19]:
data_module = NewsSummaryDataModule(train_df, test_df, tokenizer, batch_size=BATCH_SIZE)

#### Model

In [20]:
class NewsSummaryModel(pl.LightningModule):
    """LightningModule that fine-tunes a pretrained T5 model for summarization."""

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        Args:
            input_ids: Encoder input token ids.
            attention_mask: Encoder attention mask.
            decoder_attention_mask: Attention mask for the target sequence.
            labels: Target token ids, forwarded to the model as ``labels``.
        """
        output = self.model(input_ids,
                            attention_mask=attention_mask,
                            labels=labels,
                            decoder_attention_mask=decoder_attention_mask
                           )

        return output.loss, output.logits

    def _shared_step(self, batch, log_name):
        """Compute the loss for ``batch`` and log it under ``log_name``.

        Training and validation run the same forward pass and differ only in
        the name the loss is logged under.
        """
        loss, _ = self(
            input_ids=batch["text_input_ids"],
            attention_mask=batch["text_attention_mask"],
            decoder_attention_mask=batch["labels_attention_mask"],
            labels=batch["labels"]
        )

        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch (logged as ``train_loss``)."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for one batch (logged as ``val_loss``)."""
        return self._shared_step(batch, "val_loss")

    def configure_optimizers(self):
        """Return the optimizer over all parameters.

        Uses ``torch.optim.AdamW`` instead of the deprecated AdamW shipped
        with ``transformers``. ``eps`` and ``weight_decay`` are set explicitly
        to that implementation's defaults so the hyperparameters stay the same.
        """
        return torch.optim.AdamW(self.parameters(), lr=0.0001, eps=1e-6, weight_decay=0.0)

In [21]:
model = NewsSummaryModel()

In [22]:
# Model Configuration

In [23]:
# Keep only the single checkpoint with the lowest logged "val_loss".
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

logger = TensorBoardLogger("lightning_logs", name="news-summary")

# `gpus=1` is deprecated in newer PyTorch Lightning releases in favor of the
# accelerator/devices pair, and it fails outright on a CPU-only runtime.
# Select the GPU when one is available and otherwise train on the CPU.
trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1
)

In [24]:
# Report the CUDA allocator state before training. The summary is
# CUDA-specific, so skip it (with a message) when no GPU is available.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("CUDA is not available; no GPU memory summary to report.")

In [27]:
# fit a model on the given data
trainer.fit(model, data_module)

In [28]:
trainer.checkpoint_callbacks[0].best_model_path

In [29]:
trained_model = NewsSummaryModel.load_from_checkpoint(
    trainer.checkpoint_callbacks[0].best_model_path
)

In [30]:
trained_model.freeze()

#### Make Predictions

**ROUGE (Recall-Oriented Understudy for Gisting Evaluation) measures the number of overlapping textual units (n-grams, word sequences) between the generated summary and a set of gold reference summaries.**

In [31]:
def summarize(text, max_input_length=512, max_summary_length=150, num_beams=2):
    """Generate a summary of ``text`` with the fine-tuned model.

    Args:
        text: Article text to summarize.
        max_input_length: Token length the input is padded/truncated to.
        max_summary_length: ``max_length`` passed to ``generate``.
        num_beams: Beam width passed to ``generate``.

    Returns:
        The decoded summary as a single string.
    """
    text_encoding = tokenizer(text,
                              max_length=max_input_length,
                              padding="max_length",
                              truncation=True,
                              return_attention_mask=True,
                              add_special_tokens=True,
                              return_tensors="pt"
                             )

    # The encoded inputs must live on the same device as the model weights,
    # otherwise generation fails once the model has been moved to the GPU.
    model_device = trained_model.device

    generated_ids = trained_model.model.generate(
        input_ids=text_encoding["input_ids"].to(model_device),
        attention_mask=text_encoding["attention_mask"].to(model_device),
        max_length=max_summary_length,
        num_beams=num_beams,
        length_penalty=1.0,
        early_stopping=True
    )

    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]

    # One decoded string per returned sequence, concatenated into the result.
    return "".join(preds)

In [32]:
sample_row = test_df.iloc[0]
text = sample_row["text"]
model_summary = summarize(text)

In [33]:
text

In [34]:
sample_row["summary"]

In [35]:
model_summary

In [36]:
test_df = test_df.reset_index(drop=True)

In [37]:
test_df.head()

In [38]:
# Evaluate performance on (at most) the first 100 test samples.
# - The column is selected by name rather than by position, so the loop does
#   not depend on the column order of test_df.
# - head(100) tolerates a test set smaller than 100 rows.
# - Plain `tqdm` replaces the deprecated `tqdm_notebook` alias.
preds = []
for text in tqdm(test_df["text"].head(100)):
    preds.append(summarize(text))

#### Evaluate Model performance using ROUGE metric

In [39]:
preds[99]

In [40]:
test_df["text"][99]

In [41]:
test_df.reset_index()["summary"][99]

In [42]:
# Load the ROUGE metric object used below to score the generated summaries.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favor of the separate `evaluate` package — confirm against the
# installed version.
metric = load_metric("rouge")

In [46]:
# Score each generated summary against the reference at the same position.
# The reference slice follows len(preds) instead of a hard-coded 100, so the
# two lists stay aligned whatever number of predictions was produced.
result = metric.compute(predictions=preds, references=test_df["summary"].iloc[:len(preds)].tolist())

In [47]:
result

In [48]:
# For every metric in `result`, take the F-measure of its `mid` entry and
# express it as a percentage rounded to one decimal place.
final_result = {key: round(value.mid.fmeasure * 100, 1) for key, value in result.items()}

In [49]:
final_result

#### Download files from Kaggle to local

In [50]:
os.chdir(r'/kaggle/working')

In [59]:
!zip -r file.zip /kaggle/working

In [60]:
os.listdir()

In [61]:
from IPython.display import FileLink
FileLink(r'file.zip')

In [69]:
def get_size(path):
    """Return the size of the file at ``path`` as a human-readable string.

    Sizes below 1024 bytes are reported as ``"<n> bytes"``; larger sizes are
    divided by the matching power of 1024, rounded to two decimals and
    suffixed with KB, MB, GB or TB. Anything of 1024**4 bytes or more is
    reported in TB, so the function always returns a string (the previous
    version fell through and returned ``None`` for such sizes).

    Args:
        path: Path handed to ``os.path.getsize``.

    Returns:
        The formatted size, e.g. ``"2.0 KB"``.
    """
    size = os.path.getsize(path)
    if size < 1024:
        return f"{size} bytes"

    units = ("KB", "MB", "GB", "TB")
    for power, unit in enumerate(units, start=1):
        # Use this unit if the size fits below the next power of 1024, or if
        # it is the largest unit available.
        if size < 1024 ** (power + 1) or unit == units[-1]:
            return f"{round(size / 1024 ** power, 2)} {unit}"

In [70]:
get_size('file.zip')