# Install important packages 


In [1]:
!pip install transformers[sentencepiece] datasets rouge_score -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m7.3 MB/s[0m et

In [2]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Download Model from Hugging Face and then initilaize tokenizer and model

In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

model_ckpt = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

# Importing Dataset


In [4]:
df = pd.read_csv("NLPData_set.csv")

In [5]:
df = df.rename(columns={'Unnamed: 0': 'id'})

In [6]:
df.columns

Index(['id', 'text', 'summary'], dtype='object')

# Split dataset into 3 parts Train, Test, Validatation

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset into a Pandas DataFrame

# Split the dataset into training (80%), validation (10%), and testing (10%) sets
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.125, random_state=42)

# Converting Data-Frame to dataset

In [8]:
from datasets import Dataset, DatasetDict
# convert the DataFrame to a Dataset object
test = Dataset.from_dict(test)
train = Dataset.from_dict(train)
val = Dataset.from_dict(val)
#dataset

In [9]:
tweets = {'train':train, 'test': test, 'validation': val}

In [10]:
tweets['test'][0]['text']

' Memorable Pic of the Day'

# Try the Model WithOut Fine-tuning And check rouge score

In [11]:
pipe = pipeline('summarization', model = model_ckpt )

pipe_out = pipe(tweets['test'][0]['text'] )

print(pipe_out)

Your max_length is set to 128, but you input_length is only 7. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


[{'summary_text': 'Memorable Pic of the Day .<n>What do you think? Share your thoughts in the comments below .<n>Sign up for our newsletter .<n>Follow us on Twitter .<n>Follow us on Facebook .'}]


In [12]:
print(pipe_out[0]['summary_text'].replace(" .<n>", ".\n"))

Memorable Pic of the Day.
What do you think? Share your thoughts in the comments below.
Sign up for our newsletter.
Follow us on Twitter.
Follow us on Facebook .


# Finding Rouge score to evaluate model

In [17]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]


In [18]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        
        inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                        padding="max_length", return_tensors="pt")
        
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device), 
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
        
        # Finally, we decode the generated texts, 
        # replace the  token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]      
        
        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
        
        
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [21]:
rouge_metric = load_metric('rouge')

score = calculate_metric_on_test_ds(tweets['test'], rouge_metric, model_pegasus, tokenizer, column_text = 'text', column_summary='summary', batch_size=4)

100%|██████████| 52/52 [04:29<00:00,  5.19s/it]


In [22]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = ['pegasus'])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.016886,0.00083,0.016767,0.016757


# Finding length of tokens

In [23]:
dialogue_token_len = len([tokenizer.encode(s) for s in tweets['train']['text']])

summary_token_len = len([tokenizer.encode(s) for s in tweets['train']['summary']])

print(dialogue_token_len ,summary_token_len)


726 726


# Gentrating the numeric value and map them with existing dataset

In [11]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['text'] , max_length = 726, truncation = True )
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length = 208, truncation = True )
        
    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }
    


## Map with train dataset

In [12]:
train = train.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/726 [00:00<?, ? examples/s]



## Map with Val dataset

In [13]:
val = val.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

## Checking val and train

In [14]:
val

Dataset({
    features: ['id', 'text', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 104
})

In [15]:
train

Dataset({
    features: ['id', 'text', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 726
})

In [16]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

# Defining Hyperparameters to train Model

In [17]:
from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(
    output_dir='pegasus-samsum', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16
) 

# Settting training arguments

In [18]:
trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=train, 
                  eval_dataset=val)

In [19]:
# !pip install numba

# from numba import cuda 
# device = cuda.get_current_device()
# device.reset()

In [20]:
trainer.train()


You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=45, training_loss=3.2846581353081596, metrics={'train_runtime': 119.4294, 'train_samples_per_second': 6.079, 'train_steps_per_second': 0.377, 'total_flos': 49741678411776.0, 'train_loss': 3.2846581353081596, 'epoch': 0.99})

# Finding Rouge score of Trained Model

In [None]:
rouge_metric = load_metric('rouge')
score = calculate_metric_on_test_ds(
    tweets['test'], rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = 'text', column_summary= 'summary'
)



In [None]:
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )

# SAVE Model 

In [22]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [23]:
!ls /content/gdrive/My Drive

ls: cannot access '/content/gdrive/My': No such file or directory
ls: cannot access 'Drive': No such file or directory


In [24]:
## Save model
model_save_name = 'pegasus'
path = F"/content/gdrive/My Drive/{model_save_name}" 
model_pegasus.save_pretrained("pegasus-samsum-model",path)

In [25]:
## Save tokenizer
model_save_name = 'tokenizer'
path = F"/content/gdrive/My Drive/{model_save_name}" 
tokenizer.save_pretrained("tokenizer",path)

('tokenizer/tokenizer_config.json', 'tokenizer/special_tokens_map.json')

# Test

In [None]:
print(tweets["test"][22]["text"])

In [None]:
sample_text = tweets["test"][19]["text"]

reference = tweets["test"][19]["summary"]

In [None]:
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}

pipe = pipeline("summarization", model=trainer.model,tokenizer=tokenizer)


In [None]:
print("text:")
print(sample_text)


print("\nReference Summary:")
print(reference)


print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])