In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root_dir, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pegasus-007-1/config.json
/kaggle/input/pegasus-007-1/spiece.model
/kaggle/input/pegasus-007-1/training_args.bin
/kaggle/input/pegasus-007-1/tokenizer_config.json
/kaggle/input/pegasus-007-1/pytorch_model.bin
/kaggle/input/pegasus-007-1/special_tokens_map.json
/kaggle/input/pegasus-007-1/generation_config.json
/kaggle/input/pegasus-007/config.json
/kaggle/input/pegasus-007/spiece.model
/kaggle/input/pegasus-007/training_args.bin
/kaggle/input/pegasus-007/tokenizer_config.json
/kaggle/input/pegasus-007/pytorch_model.bin
/kaggle/input/pegasus-007/special_tokens_map.json
/kaggle/input/pegasus-007/generation_config.json
/kaggle/input/nlptweets/NLPData_chatGPT.csv


In [2]:
from tqdm import tqdm
import torch
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24954 sha256=c36ffb830620b47ab1cd6b4334acc4f97bee184fe4ca40916dfc35cf407040e8
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
[0m

In [3]:
import pandas as pd
# Load the dataset; later cells read its 'x' and 'y' columns.
csv_path = "/kaggle/input/nlptweets/NLPData_chatGPT.csv"
df = pd.read_csv(csv_path)

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows as the test set; the fixed random_state keeps the
# split identical across runs.
split_frames = train_test_split(df, test_size=0.1, random_state=1)
train, test = split_frames



### Downloading pegasus pre-trained model

In [5]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# Identifier of the pre-trained Pegasus summarizer checkpoint to start from.
model_name = 'tuner007/pegasus_summarizer'

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

# Load the tokenizer and the model, then move the model to the chosen device.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)


Downloading spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

### Function to get summary from model

In [6]:
def get_response(input_text):
    """Summarise one text with the currently loaded Pegasus model.

    The module-level ``tokenizer``, ``model`` and ``torch_device`` are read at
    call time, so the function uses whichever model is bound to ``model`` when
    it is called.

    Args:
        input_text: The source text (a single string) to summarise.

    Returns:
        A list with one decoded summary string (a single input is passed and
        ``num_return_sequences=1``).
    """
    # The tokenizer is given a one-element batch; inputs are truncated to at
    # most 1024 tokens and returned as PyTorch tensors on the model's device.
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=1024,
        return_tensors="pt",
    ).to(torch_device)
    # Beam search without sampling. The former `temperature=1.5` argument was
    # removed: temperature only rescales logits when `do_sample=True`, so it
    # had no effect here and can trigger a warning on recent transformers.
    gen_out = model.generate(
        **batch,
        max_length=150,
        num_beams=5,
        num_return_sequences=1,
    )
    output_text = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
    return output_text

### Sample summary before FineTuning

In [7]:
# Summarise the training row labelled 100 and print the result.
sample_text = train['x'][100]
sample_summary = get_response(sample_text)
print(sample_summary)

['BSF jawans at Betai border post in West Bengal\'s Nadia district rescued 135 rare species of birds from wildlife smugglers who were trying to smuggle them out of India to Bangladesh by hiding them in iron cages. BSF againstWildlifeCrime tweeted, "We need more reasons to stay invested in this series, but here\'s another anyway starstruck."']


## ROUGE Score without Fine-tuning

In [21]:
# Generate one summary per test document. get_response returns a
# single-element list, so take element [0]: `pred` must be a flat list of
# strings, otherwise the ROUGE metric scores the stringified list
# ("['...']") instead of the summary text itself.
pred = [get_response(text)[0] for text in tqdm(test['x'], total=len(test))]

100%|██████████| 116/116 [02:58<00:00,  1.54s/it]


In [13]:
# Reference summaries for the test set, in the same row order as test['x'].
ref_text = [reference for reference in test['y']]

In [23]:
from datasets import load_dataset, load_metric
# Score the generated summaries against the references with ROUGE.
rouge_metric = load_metric('rouge')
rouge_metric.add_batch(predictions=pred, references=ref_text)
score = rouge_metric.compute()

# Keep the F-measure of the `mid` aggregate for each ROUGE variant.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

# One-row table labelled 'pegasus'; as the last expression it is displayed.
pd.DataFrame(rouge_dict, index=['pegasus'])

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.31015,0.133737,0.214979,0.214567


### Rouge1 = 0.31 (Before Finetuning)

# Fine Tuning

### Converting pandas DataFrames to Hugging Face Datasets

In [7]:
from datasets import Dataset
# Wrap both pandas splits as Dataset objects so they can be tokenized with
# .map() in the next cell.
trainds, testds = (Dataset.from_pandas(frame) for frame in (train, test))

In [8]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of source texts and their reference summaries.

    Uses the module-level ``tokenizer``.

    Args:
        example_batch: A batch passed by ``Dataset.map(..., batched=True)``
            with an ``'x'`` column (source texts) and a ``'y'`` column
            (reference summaries).

    Returns:
        A dict with ``input_ids`` and ``attention_mask`` for the source texts
        and ``labels`` holding the token ids of the reference summaries.
    """
    # Source texts are truncated to at most 1024 tokens.
    input_encodings = tokenizer(example_batch['x'], max_length=1024, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding labels; targets are truncated to 256 tokens.
    target_encodings = tokenizer(text_target=example_batch['y'], max_length=256, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }
    
# Tokenize both splits batch-wise with convert_examples_to_features.
train_pt, test_pt = (
    split.map(convert_examples_to_features, batched=True)
    for split in (trainds, testds)
)

  0%|          | 0/2 [00:00<?, ?ba/s]



  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
from transformers import DataCollatorForSeq2Seq

# Batch collator handed to the Trainer below; it is built from the current
# tokenizer and model.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

### 20 epochs

In [10]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the fine-tuning run.
trainer_args = TrainingArguments(
    output_dir='pegasus',
    # Optimisation schedule.
    num_train_epochs=20,
    warmup_steps=500,
    weight_decay=0.01,
    # One example per device per step, with gradients accumulated over 16 steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    # Logging, evaluation and checkpoint cadence, all measured in steps.
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1e6,
)

In [11]:
# Assemble the Trainer: the model and its arguments, the tokenized train/test
# splits, and the collator and tokenizer created above.
trainer = Trainer(
    model=model,
    args=trainer_args,
    train_dataset=train_pt,
    eval_dataset=test_pt,
    data_collator=seq2seq_data_collator,
    tokenizer=tokenizer,
)

In [12]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
train_output = trainer.train()
train_output

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
500,1.1354,1.181082
1000,0.8227,1.153088


TrainOutput(global_step=1300, training_loss=1.20391640993265, metrics={'train_runtime': 4359.8086, 'train_samples_per_second': 4.775, 'train_steps_per_second': 0.298, 'total_flos': 2.246719139200205e+16, 'train_loss': 1.20391640993265, 'epoch': 19.98})

In [None]:
# Presumably a keep-alive loop (TODO confirm intent). It is disabled by
# default because an unbounded `while True` here stops "Run All" from ever
# reaching the cells below, including the one that saves the model. Set the
# flag to True only when an endless loop is deliberately wanted; the sleep
# keeps it from pinning a CPU core.
KEEP_SESSION_ALIVE = False

i=0
if KEEP_SESSION_ALIVE:
    import time
    while True:
        i=i+1
        time.sleep(1)

### Saving the Fine Tuned Model

In [14]:
# Save the fine-tuned model into the Kaggle working directory.
save_dir = "/kaggle/working/"
trainer.save_model(save_dir)

# Inference and Testing

### Getting the Fine Tuned model

In [8]:
# Directory of the saved fine-tuned checkpoint that the next cell loads.
# NOTE(review): this path is not among the files printed by the first cell's
# /kaggle/input listing (only pegasus-007 and pegasus-007-1 appear) - confirm
# that the intended dataset is attached.
model_ckpt = "/kaggle/input/pegasus-007-20epcs"


In [9]:
# Rebind the global tokenizer and model to the fine-tuned checkpoint, so that
# get_response uses the fine-tuned weights from here on.
tokenizer = PegasusTokenizer.from_pretrained(model_ckpt)
finetuned_model = PegasusForConditionalGeneration.from_pretrained(model_ckpt)
model = finetuned_model.to(torch_device)

In [17]:
# Generate one summary per test document with the fine-tuned model.
# get_response returns a single-element list, so take element [0]: `pred`
# must be a flat list of strings, otherwise the ROUGE metric scores the
# stringified list ("['...']") instead of the summary text itself.
pred = [get_response(text)[0] for text in tqdm(test['x'], total=len(test))]

100%|██████████| 116/116 [03:56<00:00,  2.04s/it]


In [18]:
# Reference summaries for the test set, in the same row order as test['x'].
ref_text = [reference for reference in test['y']]

## Rouge Score after Fine Tuning

In [19]:
from datasets import load_dataset, load_metric
# Score the fine-tuned model's summaries against the references with ROUGE.
rouge_metric = load_metric('rouge')
rouge_metric.add_batch(predictions=pred, references=ref_text)
score = rouge_metric.compute()

# Keep the F-measure of the `mid` aggregate for each ROUGE variant.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

# One-row table labelled 'pegasus'; as the last expression it is displayed.
pd.DataFrame(rouge_dict, index=['pegasus'])

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.514073,0.307685,0.388148,0.388527


# ROUGE-1 score improved from 0.31 to 0.51 after fine-tuning

### Sample Summary after Fine Tuning

In [11]:
# Show the source text of the training row labelled 100 (label-based lookup).
train.loc[100, 'x']

'Summarise the given tweets in English: \nAustralia spinner MatthewKuhnemann took five wickets and NathanLyon three to skittle India for 109 on day one of the third Test in Indore on Wednesday.INDvAUS INDvsAUS AlertBSF jawans at Betai border post in West Bengal\'s Nadia district rescued 135 rare species of birds from wildlife smugglers who were trying to smuggle them out of India to Bangladesh by hiding them in iron cages. BSFagainstWildlifeCrime PetaIndiaTwitterDownINDvAUS fire No. 1 Test batter vs No. 1 Test bowler fire Not that we need more reasons to stay invested in this series, but here\'s another anyway starstruck Also, the No. 1 allrounder is Ravindra Jadeja popcorn INDvAUS The deck at Indore is substandard. I don\'t belong to those group of blaming pitches but that doesn\'t mean I would\'ve to endorse a substandard deck. Still, the kind of resilience Usman Khawaja showed on this pitch should remain as an example for others to emulate. INDvAUS Im all for home advantage, but thi

In [10]:
# Summarise the same training row (label 100) with the fine-tuned model.
sample_text = train['x'][100]
sample_summary = get_response(sample_text)
print(sample_summary)

["On the first day of the third Test between India and Australia in Indore, Australia spinner Matthew Kuhnemann took five wickets and Nathan Lyon three to skittle India for 109. BSF jawans at the Betai border post in West Bengal's Nadia district rescued 135 rare species of birds from wildlife smugglers who were trying to smuggle them out of India to Bangladesh by hiding them in iron cages. Ravindra Jadeja is the No. 1 allrounder and Usman Khawaja showed resilience on the pitch. The deck at Indore is substandard, but the kind of resilience Khawaja showed on this pitch should remain as an example for others to emulate. The track is crumbling in the first session. Parthiv9 reflects on Team India's bowling performance on C"]
