## About the dataset 

The dataset for this task is built using articles and headline pairs from several leading newspapers of the country. 

The task is to generate a meaningful fixed length summary for each article. 

In [1]:
# Alternative dataset locations, currently disabled:
# eng_train = "../input/indian-language-summarization/Eng_train.csv"
# eng_test = "../input/indian-language-summarization/Eng_val_article.csv"
# hindi_train = "../input/indian-language-summarization/Hin_train.csv"
# hindi_test = "../input/indian-language-summarization/Hin_val_article.csv"
# Active dataset paths.
# NOTE(review): despite the guj_ prefix, both paths point at files under
# /kaggle/input/hindidataset/ (hindi_train.csv / HindiNews_test.csv).
guj_train = '/kaggle/input/hindidataset/hindi_train.csv'
guj_test = '/kaggle/input/hindidataset/HindiNews_test.csv'

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader
from transformers import(
    AutoTokenizer as myTokenizer,
    AutoModelForSeq2SeqLM
)

import pytorch_lightning as pl

In [3]:
# Choose which train/test CSV paths the following cells read.
train, test = guj_train, guj_test

In [4]:
# Load the selected training and test CSV files into DataFrames.
df = pd.read_csv(filepath_or_buffer=train)
dftest = pd.read_csv(filepath_or_buffer=test)

In [5]:
# Keep only the two columns the summarization pipeline reads.
df = df.loc[:, ["Summary", "Article"]]

In [6]:
# Preview the first three rows of the training frame.
df.head(3)

Unnamed: 0,Summary,Article
0,Kerala Minor Girl Rape Case - केरल के एर्नाकुल...,केरल के एर्नाकुलम जिले में 5 साल की बच्ची से र...
1,इस साल मानसून सीजन में कई राज्यों में भारी तबा...,मानसून सीजन में हुई भारी बारिश ने कई राज्यों म...
2,चुनावी साल में राजस्थान सरकार किसानों को लुभान...,चुनावी साल में राजस्थान सरकार किसानों को लुभान...


In [7]:
# Preview the first three rows of the test frame.
dftest.head(3)

Unnamed: 0,id,Article,Heading
0,HindiNews_test_0,‘वारिस पंजाब दे’ संगठन के चीफ अमृतपाल सिंह को ...,भिंडरांवाले के गांव में वारिस पंजाब दे का चीफ ...
1,HindiNews_test_1,कांग्रेस नेता राहुल गांधी जल्द ही साउथ दिल्ली ...,सांसदी जाने के बाद अप्रैल में सरकारी बंगला छोड़...
2,HindiNews_test_2,संसद परिसर में मंगलवार को आम आदमी पार्टी (AAP)...,"BJP का तंज- झूठ बोले कौआ काटे, राघव का पलटवार-..."


In [8]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the split repeatable.
df_train, df_valid = train_test_split(df, test_size=0.1, random_state=42)
(df_train.shape, df_valid.shape)

((19102, 2), (2123, 2))

## Create a Custom Dataset Object
In PyTorch, this is done by subclassing `torch.utils.data.Dataset` and implementing `__len__` and `__getitem__`.

For parameters to pass to T5 tokenizer, refer: 

https://huggingface.co/docs/transformers/v4.22.2/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast

T5 tokenizer inherits from PreTrainedTokenizerFast which contains most of the main methods. 

For pytorch:

https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

In [9]:
import re
def WHITESPACE_HANDLER(k):
    """Return ``k`` stripped, with every run of whitespace collapsed to one space.

    Newlines, tabs and repeated spaces all count as whitespace, so a single
    ``\\s+`` substitution covers what previously took two passes.

    Args:
        k: The text to normalize (must be a ``str``).

    Returns:
        The normalized string.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', k.strip())

In [10]:
class NewsSummaryDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one (article, summary) row per access.

    Each item is read from ``data`` by position, whitespace-normalized with
    ``WHITESPACE_HANDLER``, tokenized, truncated and padded to a fixed length.

    Args:
        data: Frame with an ``"Article"`` and a ``"Summary"`` column.
        tokenizer: Tokenizer used for both the article and the summary.
        text_max_tokens_len: Fixed token length of the encoded article.
        summary_max_tokens_len: Fixed token length of the encoded summary.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: myTokenizer,
        text_max_tokens_len: int = 512,
        summary_max_tokens_len: int = 75
    ):
        self.data = data
        self.tokenizer = tokenizer
        self.text_max_tokens_len = text_max_tokens_len
        self.summary_max_tokens_len = summary_max_tokens_len

    def _encode(self, text, max_length):
        """Tokenize ``text`` into PyTorch tensors padded/truncated to ``max_length``."""
        return self.tokenizer(
            WHITESPACE_HANDLER(text),
            max_length=max_length,
            truncation=True,
            padding="max_length",  # every sample is padded to max_length
            return_tensors="pt"  # PyTorch tensors
        )

    def __getitem__(self, index):
        """Return the raw texts plus encoded tensors for the row at ``index``.

        Returns:
            dict with keys ``text``, ``summary``, ``text_input_ids``,
            ``text_attention_mask``, ``labels`` and ``labels_attention_mask``.
            The tensor values are flattened to one dimension.
        """
        data_row = self.data.iloc[index]
        article, summary = data_row["Article"], data_row["Summary"]

        text_encoding = self._encode(article, self.text_max_tokens_len)
        summary_encoding = self._encode(summary, self.summary_max_tokens_len)

        # The label ids include padding; replace pad positions with -100.
        # Use the tokenizer's own pad id instead of assuming it is 0.
        labels = summary_encoding["input_ids"]
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "text": article,
            "summary": summary,
            "text_input_ids": text_encoding["input_ids"].flatten(),
            # attention mask: 1 for real tokens, 0 for padding
            "text_attention_mask": text_encoding["attention_mask"].flatten(),
            "labels": labels.flatten(),
            "labels_attention_mask": summary_encoding["attention_mask"].flatten(),
        }

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

In [11]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")
# model = AutoModelForSeq2SeqLM.from_pretrained("saliq7/GUJU_MT5")

## Data Module for PyTorch Lightning

In [12]:
# model

### Importance of a dataloader:
The Dataset retrieves our dataset’s features and labels one sample at a time. While training a model, we typically want to pass samples in “minibatches”, reshuffle the data at every epoch to reduce model overfitting, and use Python’s multiprocessing to speed up data retrieval.


DataLoader is an iterable that abstracts this complexity for us in an easy API.

In PyTorch: `from torch.utils.data import DataLoader`


Ref: [pytorch_documentation](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html#preparing-your-data-for-training-with-dataloaders)

[pytorch_lightning_data_module](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningDataModule.html#pytorch_lightning.core.LightningDataModule)
```
class MyDataModule(LightningDataModule):
    def __init__(self):
        super().__init__()
    def prepare_data(self):
        # download, split, etc...
        # only called on 1 GPU/TPU in distributed
    def setup(self, stage):
        # make assignments here (val/train/test split)
        # called on every process in DDP
    def train_dataloader(self):
        train_split = Dataset(...)
        return DataLoader(train_split)
    def val_dataloader(self):
        val_split = Dataset(...)
        return DataLoader(val_split)
    def test_dataloader(self):
        test_split = Dataset(...)
        return DataLoader(test_split)
    def teardown(self):
        # clean up after fit or test
        # called on every process in DDP
 ```

In [13]:
class NewsSummaryDatasetModule(pl.LightningDataModule):
    """Lightning data module serving train/validation/test loaders for summarization.

    This is a data module (it only builds datasets and dataloaders), so it
    derives from ``pl.LightningDataModule`` rather than ``pl.LightningModule``.

    Args:
        train_df: Frame used for the training dataset.
        test_df: Frame used for the held-out dataset. The same dataset backs
            both ``val_dataloader`` and ``test_dataloader``.
        tokenizer: Tokenizer handed to ``NewsSummaryDataset``.
        batch_size: Batch size for every dataloader.
        text_max_tokens_len: Fixed token length of encoded articles.
        summary_max_tokens_len: Fixed token length of encoded summaries.
        num_workers: Worker processes per dataloader (default 7).
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: myTokenizer,
        batch_size: int = 8,
        text_max_tokens_len: int = 512,
        summary_max_tokens_len: int = 75,
        num_workers: int = 7
    ):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df

        self.bs = batch_size
        self.tokenizer = tokenizer
        self.text_max_tokens_len = text_max_tokens_len
        self.summary_max_tokens_len = summary_max_tokens_len
        self.num_workers = num_workers
        # Build the datasets eagerly so the *_dataloader methods work right
        # after construction, without waiting for an explicit setup() call.
        self.setup()

    def setup(self, stage=None):
        """(Re)create the train and held-out datasets from the stored frames."""
        self.train_dataset = self._make_dataset(self.train_df)
        self.test_dataset = self._make_dataset(self.test_df)

    def _make_dataset(self, frame):
        """Wrap ``frame`` in a ``NewsSummaryDataset`` using this module's settings."""
        return NewsSummaryDataset(
            frame,
            self.tokenizer,
            self.text_max_tokens_len,
            self.summary_max_tokens_len
        )

    def _make_loader(self, dataset, shuffle):
        """Build a ``DataLoader`` over ``dataset`` with the shared batch/worker settings."""
        return DataLoader(
            dataset,
            batch_size=self.bs,
            shuffle=shuffle,
            num_workers=self.num_workers
        )

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the held-out dataset."""
        return self._make_loader(self.test_dataset, shuffle=False)

    def test_dataloader(self):
        """Unshuffled loader over the held-out dataset (same data as validation)."""
        return self._make_loader(self.test_dataset, shuffle=False)
    

In [14]:
# # Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline("text2text-generation", model="ai4bharat/IndicBART")

In [15]:
# df['Article'] = df['Article'].apply(lambda x: x[:1024])

In [16]:
# pipe(df['Article'][0], max_length=75), df['Summary'][0]

In [17]:
# from rouge import Rouge
# scorer = Rouge()


In [18]:
# One hub identifier supplies both the model weights and the tokenizer.
MODEL_NAME = TOKENIZER_NAME = "csebuetnlp/mT5_m2o_hindi_crossSum"
tokenizer = myTokenizer.from_pretrained(TOKENIZER_NAME)

tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Initialize the data module

In [19]:
# Training hyperparameters.
# BATCH_SIZE is passed to the data module as its batch_size.
BATCH_SIZE = 4
# EPOCHS is passed to the Trainer as max_epochs.
# NOTE(review): with 0 here the Trainer presumably performs no training epochs, so
# later evaluation would reflect the pretrained weights — confirm this is intended.
EPOCHS = 0

In [20]:
# Build the data module; df_valid fills the module's held-out (test_df) slot.
datamodule = NewsSummaryDatasetModule(
    train_df=df_train,
    test_df=df_valid,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
)

## MODEL 


In [21]:
class NewsSummaryModel(pl.LightningModule):
    """Lightning wrapper around the pretrained seq2seq model named by ``MODEL_NAME``.

    The train/validation/test steps each run one forward pass on a batch
    produced by ``NewsSummaryDataset`` and log the resulting loss.
    """

    def __init__(self):
        super().__init__()
        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, return_dict = True)

    def forward(
        self,
        input_ids,
        attention_mask,
        decoder_attention_mask,
        labels
    ):
        """Run the wrapped model and return ``(loss, logits)``.

        Args:
            input_ids: Encoded article token ids.
            attention_mask: Attention mask for ``input_ids``.
            decoder_attention_mask: Attention mask for the target sequence.
            labels: Target token ids used to compute the loss.
        """
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )

        return output.loss, output.logits

    def _shared_step(self, batch, log_name):
        """Compute the loss for ``batch``, log it under ``log_name`` and return it."""
        # Pass everything by keyword: forward() declares decoder_attention_mask
        # BEFORE labels, so passing (labels, labels_attention_mask) positionally
        # would feed the labels in as the mask and the mask in as the labels.
        loss, _logits = self(
            input_ids=batch["text_input_ids"],
            attention_mask=batch["text_attention_mask"],
            decoder_attention_mask=batch["labels_attention_mask"],
            labels=batch["labels"],
        )
        # https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#log
        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx ):
        """Return the training loss for ``batch`` (logged as ``training_loss``)."""
        return self._shared_step(batch, "training_loss")

    def validation_step(self, batch, batch_idx ):
        """Return the validation loss for ``batch`` (logged as ``validation_loss``)."""
        return self._shared_step(batch, "validation_loss")

    def test_step(self, batch, batch_idx ):
        """Return the test loss for ``batch`` (logged as ``test_loss``)."""
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters with lr=0.0001."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=0.0001)
        return optimizer

Initialize model

In [22]:
# Instantiate the Lightning wrapper; its __init__ loads the pretrained weights named by MODEL_NAME.
model = NewsSummaryModel()

pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [23]:
# Load the TensorBoard notebook extension
# %load_ext tensorboard
# %tensorboard --logdir ./lightning_logs

## Checkpointing
- For basic checkpointing,
    - Saving: save training model at every epoch: trainer = Trainer(default_root_dir="some/path/")
    - Loading: checkpoint = torch.load(CKPT_PATH)
    - Disable: trainer = Trainer(enable_checkpointing=False)
- For fine-grained control over checkpointing behavior, use the ModelCheckpoint object
    - Save the model periodically by monitoring a quantity. Every metric logged with log() or log_dict() in LightningModule is a candidate for the monitor key
    - https://pytorch-lightning.readthedocs.io/en/stable/common/checkpointing_intermediate.html
   

In [24]:
# Keep a single checkpoint: the one with the lowest logged "validation_loss".
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor="validation_loss",
    mode="min",
    save_top_k=1,
    dirpath="output/checkpoints",
    filename="best_checkpoint",
    verbose=True,
)

## Logging
- Log to local file system in TensorBoard format.
- Logs are saved to os.path.join(save_dir, name, version)

https://pytorch-lightning.readthedocs.io/en/stable/extensions/generated/pytorch_lightning.loggers.TensorBoardLogger.html

In [25]:
# logger = pl.loggers.TensorBoardLogger("lightning_logs", name="lang_summarization")

## TRAINING
Lightning offers various trainer modes, configured by setting flags.
https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#trainer-class-api

In [26]:
# Single-device trainer: runs for EPOCHS epochs, skips the sanity-check
# validation pass, and uses the checkpoint callback defined earlier.
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    devices=1,
    num_sanity_val_steps=0,
    callbacks=[checkpoint_callback],
    # Disabled options:
    #     logger = logger,
    #     accelerator='gpu',
)

> Trainer.fit(model, train_dataloaders=None, val_dataloaders=None, datamodule=None, ckpt_path=None)

We don't need to specify train/val dataloaders; instead, we can pass the LightningDataModule directly, since it already provides all the dataloaders.

In [27]:
# Fit the model; the train/validation dataloaders are taken from the data module.
trainer.fit(model, datamodule = datamodule)

2024-04-25 09:48:49.692795: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-25 09:48:49.692931: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-25 09:48:49.870609: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  self.pid = os.fork()
  self.pid = os.fork()


In [28]:
# trainer.validate(model, dataloaders=datamodule.val_dataloader())
# checking purposes only

### Load from checkpoint

In [30]:
# Manually write a checkpoint to a path relative to the current working directory.
# NOTE(review): this save is separate from checkpoint_callback, which is configured to
# write under "output/checkpoints" — confirm which file later cells are meant to load.
trainer.save_checkpoint('checkpoints/best_checkpoint.ckpt')

In [34]:
# Load from the same relative path the checkpoint was saved to, instead of a
# hard-coded absolute Kaggle path that only matches when the working directory
# happens to be /kaggle/working.
trained_model = NewsSummaryModel.load_from_checkpoint('checkpoints/best_checkpoint.ckpt')

In [35]:
# Freeze the reloaded model before it is used for inference in the cells below.
trained_model.freeze() # speeds up inference

## INFERENCE

In [36]:
def summarizer(text, max_input_tokens=512, max_summary_tokens=75, num_beams=2):
    """Generate a summary for ``text`` with the reloaded model.

    The input is whitespace-normalized with ``WHITESPACE_HANDLER`` (the same
    preprocessing ``NewsSummaryDataset`` applies), tokenized, and moved to the
    model's device before beam-search generation.

    Args:
        text: One article string, or a list of article strings.
        max_input_tokens: Token length the input is truncated/padded to.
        max_summary_tokens: Maximum length of the generated sequence.
        num_beams: Number of beams used for generation.

    Returns:
        list[str]: One decoded summary per input text.
    """
    if isinstance(text, str):
        cleaned = WHITESPACE_HANDLER(text)
    else:
        cleaned = [WHITESPACE_HANDLER(item) for item in text]

    text_encoding = tokenizer(
        cleaned,
        max_length=max_input_tokens,
        return_attention_mask=True,
        truncation=True,
        add_special_tokens=True,
        padding="max_length",  # every input is padded to max_length
        return_tensors="pt",  # PyTorch tensors
    )

    # trained_model is the NewsSummaryModel wrapper; the seq2seq model that
    # implements generate() lives on its .model attribute.
    seq2seq = trained_model.model
    # The tokenizer returns CPU tensors; generation needs them on the model's device.
    device = seq2seq.device
    generated_ids = seq2seq.generate(
        input_ids=text_encoding["input_ids"].to(device),
        attention_mask=text_encoding["attention_mask"].to(device),
        max_length=max_summary_tokens,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [39]:
# Pick validation row 10 as a spot-check sample; the article is cut to its first 4096 characters.
text = df_valid["Article"].iloc[10][:4096]
summary = df_valid["Summary"].iloc[10]
(text, summary)

('नई दिल्ली। कांग्रेस ने शनिवार (29 अगस्त) को एक बार फिर नरेंद्र मोदी सरकार पर फेसबुक के स्वामित्व वाले व्हाट्सएप को लेकर जमकर हमला बोला। कांग्रेस ने कहा कि कथित रूप से भारतीय जनता पार्टी (भाजपा) द्वारा अप्रत्यक्ष तरीके से नियंत्रित किया जा रहा है। कांग्रेस ने इस मुद्दे पर संयुक्त संसदीय समिति (जेपीसी) की जांच की मांग भी की है। कांग्रेस ने यह भी मांग की कि जांच पूरी न होने तक व्हाट्सएप को अपनी भुगतान (पेमेंट) सेवाएं शुरू करने की अनुमति नहीं दी जानी चाहिए।BSF ने बॉर्डर के पास पकड़ी 20 फीट लंबी सुरंग, पाकिस्तान में बने बोरियों से ढका था मुंहकांग्रेस प्रवक्ता पवन खेड़ा ने यहां एक संवाददाता सम्मेलन को संबोधित करते हुए कहा, "सोशल मीडिया दिग्गज फेसबुक के भारतीय संचालन और नरेंद्र मोदी के नेतृत्व वाली भाजपा सरकार के बीच अपवित्र घनिष्ठता अब तेजी से उभर रही है और अब ये केवल भारत का नहीं, बल्कि एक वैश्विक मुद्दा है।" एक अंतर्राष्ट्रीय प्रकाशन (पब्लिकेशन) में प्रकाशित एक मीडिया रिपोर्ट का हवाला देते हुए कांग्रेस नेता ने कहा, "एक प्रतिष्ठित वैश्विक प्रकाशन ने खुलासा किया है कि व्हाट्सएप, जिसका उपयो

In [40]:
# Generate a summary for the sample article selected in the previous cell.
summarizer(text)

['नई दिल्ली में सोशल मीडिया कंपनी फ़ेसबुक के भारतीय संचालक अंखी दास के शिवनाथ ठुकराने की घटना को लेकर कांग्रेस ने नाराज़गी जताई है.']

In [None]:
from rouge_score import rouge_scorer
class _WhitespaceTokenizer:
    """Tokenizer for RougeScorer that splits on whitespace only.

    rouge_score's default tokenizer keeps only lowercase ASCII letters and
    digits, which would leave Devanagari text with no tokens at all.
    """

    def tokenize(self, text):
        """Return the whitespace-separated tokens of ``text``."""
        return text.split()


# The stemmer is English-only, so it is turned off for this Hindi text.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rougeL'],
    use_stemmer=False,
    tokenizer=_WhitespaceTokenizer(),
)

In [None]:
# Generate the prediction here: gen_sum is otherwise only defined by a later
# cell, so this cell would raise NameError on a fresh top-to-bottom run.
gen_sum = summarizer(text)[0]
scores = scorer.score(summary, gen_sum)

In [None]:
# Display the ROUGE scores computed in the previous cell.
scores

In [42]:
# Collect reference summaries and model outputs for the first 100 validation rows.
actual, generated = [], []
for i in range(100):
    print("current iter" + str(i))
    text = df_valid["Article"].iloc[i]
    summary = df_valid["Summary"].iloc[i]
    actual.append(summary)
    # summarizer returns a list of decoded strings; take the first one.
    gen_sum = summarizer(text)[0]
    generated.append(gen_sum)

current iter0
current iter1
current iter2
current iter3
current iter4
current iter5
current iter6
current iter7
current iter8
current iter9
current iter10
current iter11
current iter12
current iter13
current iter14
current iter15
current iter16
current iter17
current iter18
current iter19
current iter20
current iter21
current iter22
current iter23
current iter24
current iter25
current iter26
current iter27
current iter28
current iter29
current iter30
current iter31
current iter32
current iter33
current iter34
current iter35
current iter36
current iter37
current iter38
current iter39
current iter40
current iter41
current iter42
current iter43
current iter44
current iter45
current iter46
current iter47
current iter48
current iter49
current iter50
current iter51
current iter52
current iter53
current iter54
current iter55
current iter56
current iter57
current iter58
current iter59
current iter60
current iter61
current iter62
current iter63
current iter64
current iter65
current iter66
curre

In [44]:
!pip install rouge_score

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=bdf2c36569ffd12787b2c21efe00345544db5891255ae101c3bf03712e7e0c14
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [47]:
!pip install evaluate

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [48]:
import evaluate
# Corpus-level ROUGE over the collected pairs, tokenizing on whitespace.
rouge = evaluate.load('rouge')
score = rouge.compute(
    references=actual,
    predictions=generated,
    tokenizer=str.split,
)
score

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.22220910058162924,
 'rouge2': 0.07818724987794881,
 'rougeL': 0.17609349078022352,
 'rougeLsum': 0.17563009436759525}

In [49]:
import shutil
# Zip the contents of /kaggle/working/ into an archive with base name "output".
# NOTE(review): the base name is relative, and the recorded output shows the zip landing
# at /kaggle/working/output.zip — i.e. inside the directory being archived. Confirm intended.
shutil.make_archive('output', 'zip', '/kaggle/working/')

'/kaggle/working/output.zip'