# Install important packages 


In [52]:
!pip install transformers[sentencepiece] datasets rouge_score -q

In [53]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Download Model from Hugging Face and then initilaize tokenizer and model

In [54]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

model_ckpt = "facebook/bart-large-xsum"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_bart = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

# Importing Dataset


In [55]:
df = pd.read_csv("NLPData_set.csv")

In [56]:
df = df.rename(columns={'Unnamed: 0': 'id'})

In [57]:
df.columns

Index(['id', 'text', 'summary'], dtype='object')

# Split dataset into 3 parts Train, Test, Validatation

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset into a Pandas DataFrame

# Split the dataset into training (80%), validation (10%), and testing (10%) sets
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.125, random_state=42)

# Converting Data-Frame to dataset

In [59]:
from datasets import Dataset, DatasetDict
# convert the DataFrame to a Dataset object
test = Dataset.from_dict(test)
train = Dataset.from_dict(train)
val = Dataset.from_dict(val)
#dataset

In [60]:
tweets = {'train':train, 'test': test, 'validation': val}

In [61]:
tweets['test'][0]['text']

' Memorable Pic of the Day'

# Try the Model WithOut Fine-tuning And check rouge score

In [62]:
pipe = pipeline('summarization', model = model_ckpt )

pipe_out = pipe(tweets['test'][0]['text'] )

print(pipe_out)

Your max_length is set to 62, but you input_length is only 9. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


[{'summary_text': 'Images courtesy of AFP, EPA, Getty Images and Reuters.'}]


In [63]:
print(pipe_out[0]['summary_text'].replace(" .<n>", ".\n"))

Images courtesy of AFP, EPA, Getty Images and Reuters.


# Finding Rouge score to evaluate model

In [64]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]


In [65]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        
        inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                        padding="max_length", return_tensors="pt")
        
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device), 
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
        
        # Finally, we decode the generated texts, 
        # replace the  token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]      
        
        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
        
        
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [66]:
rouge_metric = load_metric('rouge')

score = calculate_metric_on_test_ds(tweets['test'], rouge_metric, model_bart, tokenizer, column_text = 'text', column_summary='summary', batch_size=4)

100%|██████████| 52/52 [02:37<00:00,  3.03s/it]


In [67]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = ['BART'])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
BART,0.027462,0.000179,0.026568,0.026572


# Finding length of tokens

In [68]:
dialogue_token_len = len([tokenizer.encode(s) for s in tweets['train']['text']])

summary_token_len = len([tokenizer.encode(s) for s in tweets['train']['summary']])

print(dialogue_token_len ,summary_token_len)


726 726


# Gentrating the numeric value and map them with existing dataset

In [69]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['text'] , max_length = 726, truncation = True )
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length = 208, truncation = True )
        
    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }
    


## Map with train dataset

In [70]:
train = train.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/726 [00:00<?, ? examples/s]



## Map with Val dataset

In [71]:
val = val.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

## Checking val and train

In [72]:
val

Dataset({
    features: ['id', 'text', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 104
})

In [73]:
train

Dataset({
    features: ['id', 'text', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 726
})

In [74]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_bart)

# Defining Hyperparameters to train Model

In [75]:
from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(
    output_dir='bart-large-xsum', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16
) 

# Settting training arguments

In [77]:
trainer = Trainer(model=model_bart, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=train, 
                  eval_dataset=val)

In [81]:
# !pip install numba

# from numba import cuda 
# device = cuda.get_current_device()
# device.reset()

In [78]:
trainer.train()


You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=45, training_loss=2.8522963841756184, metrics={'train_runtime': 94.2813, 'train_samples_per_second': 7.7, 'train_steps_per_second': 0.477, 'total_flos': 39699917217792.0, 'train_loss': 2.8522963841756184, 'epoch': 0.99})

# Finding Rouge score of Trained Model

In [79]:
rouge_metric = load_metric('rouge')
score = calculate_metric_on_test_ds(
    tweets['test'], rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = 'text', column_summary= 'summary'
)



100%|██████████| 104/104 [01:54<00:00,  1.10s/it]


In [80]:
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'BART'] )

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
BART,0.031862,0.001085,0.031257,0.03123


# SAVE Model 

In [82]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [83]:
!ls /content/gdrive/My Drive

ls: cannot access '/content/gdrive/My': No such file or directory
ls: cannot access 'Drive': No such file or directory


In [84]:
## Save model
model_save_name = 'Bart'
path = F"/content/gdrive/My Drive/{model_save_name}" 
model_bart.save_pretrained("Bart",path)

In [85]:
## Save tokenizer
model_save_name = 'tokenizer'
path = F"/content/gdrive/My Drive/{model_save_name}" 
tokenizer.save_pretrained("tokenizer",path)

('tokenizer/tokenizer_config.json', 'tokenizer/special_tokens_map.json')

# Test

In [86]:
print(tweets["test"][22]["text"])

 day totals +78230 , winning days in row , style


In [87]:
sample_text = tweets["test"][19]["text"]

reference = tweets["test"][19]["summary"]

In [88]:
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}

pipe = pipeline("summarization", model=trainer.model,tokenizer=tokenizer)


In [None]:
print("text:")
print(sample_text)


print("\nReference Summary:")
print(reference)


print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])