In [None]:
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install rich[jupyter]
!pip install -q sumeval==0.2.2

In [None]:
import gc
import random
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sumeval.metrics.rouge import RougeCalculator
import torch
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

# Report the installed PyTorch version for reproducibility.
print(f'Pytorch version: {torch.__version__}')

# Load Dataset

In [None]:
from datasets import load_dataset
# Load a 1% slice of the Multi-News train and test splits to keep the run small.
# NOTE(review): the name `train` is rebound later in this notebook by the
# `def train(...)` training function, so this cell and the DataFrame cell that
# reads `train` must be executed before that definition.
train= load_dataset('multi_news', split='train[:1%]')
test= load_dataset('multi_news', split='test[:1%]')
# Show how many examples each slice contains.
print(len(train),len(test))


In [None]:

# Pair every source document with its reference summary in a two-column frame.
data = dict(text=train['document'], summary=train['summary'])
df = pd.DataFrame(data)


In [None]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
# , RandomSampler, SequentialSampler
import os

from rich.table import Column, Table
from rich import box
from rich.console import Console

# define a rich console logger
# record=True keeps everything printed through this console so it can be
# exported later with console.save_text().
console=Console(record=True)

def display_df(df):
  """Display the first two columns of a dataframe as an ASCII table.

  Args:
    df: pandas DataFrame whose first column is shown as "source_text" and
      whose second column is shown as "target_text".

  The table is printed on a fresh, non-recording Console, so it does not end
  up in the log captured by the module-level `console`.
  """

  local_console=Console()
  table = Table(Column("source_text", justify="center" ), Column("target_text", justify="center"), title="Sample Data",pad_edge=False, box=box.ASCII)

  def _cell(value):
    # rich renders None as an empty cell but rejects other non-string scalars
    # (ints, floats, NaN), so everything except None is converted to text.
    return value if value is None else str(value)

  for row in df.values.tolist():
    table.add_row(_cell(row[0]), _cell(row[1]))

  local_console.print(table)

# Table that train() appends a row to every few steps and re-prints.
training_logger = Table(Column("Epoch", justify="center" ), 
                        Column("Steps", justify="center"),
                        Column("Loss", justify="center"), 
                        title="Training Status",pad_edge=False, box=box.ASCII)



In [None]:
# Setting up the device for GPU usage
# `device` is the string 'cuda' when a CUDA device is available and 'cpu'
# otherwise; the training and evaluation helpers move tensors/models to it.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
class TransformMultiNews(Dataset):
  """
  Torch dataset that tokenizes (source, target) text pairs taken from a dataframe.

  Each item is a dict of 1-D long tensors:
    'source_ids'   - token ids of the source text, padded/truncated to `sourcelen`
    'source_mask'  - attention mask matching 'source_ids'
    'target_ids'   - token ids of the target text, padded/truncated to `targetlen`
    'target_ids_y' - same values as 'target_ids' (kept for existing callers)

  Args:
    dataframe: pandas DataFrame holding the text columns.
    tokenizer: callable tokenizer (Hugging Face style) returning a mapping with
      'input_ids' and 'attention_mask' tensors of shape (1, max_length).
    sourcelen: maximum number of source tokens.
    targetlen: maximum number of target tokens.
    source_data: name of the source-text column.
    target_data: name of the target-text column.
  """

  def __init__(self, dataframe, tokenizer, sourcelen, targetlen, source_data, target_data):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.sourcelen = sourcelen
    self.summ_len = targetlen
    self.target_data = self.data[target_data]
    self.source_data = self.data[source_data]

  def __len__(self):
    """Number of (source, target) pairs."""
    return len(self.target_data)

  def _encode(self, text, max_length):
    """Normalise whitespace in `text` and tokenize it to exactly `max_length` tokens."""
    # str() guards against non-string cells; split/join collapses runs of whitespace.
    cleaned = ' '.join(str(text).split())
    # padding="max_length" already pads to max_length, so the deprecated
    # pad_to_max_length flag is not passed.
    return self.tokenizer([cleaned], max_length=max_length, truncation=True, padding="max_length", return_tensors='pt')

  def __getitem__(self, index):
    """Return the tokenized pair at position `index` (positional, not label-based)."""
    # .iloc makes the lookup positional, so the dataset also works for frames
    # whose index is not 0..n-1 (e.g. a sampled frame that was not reset).
    source = self._encode(self.source_data.iloc[index], self.sourcelen)
    target = self._encode(self.target_data.iloc[index], self.summ_len)

    # Drop only the leading batch dimension added by the tokenizer.
    sourceids = source['input_ids'].squeeze(0)
    sourcemask = source['attention_mask'].squeeze(0)
    targetids = target['input_ids'].squeeze(0)

    return {
        'source_ids': sourceids.to(dtype=torch.long), 
        'source_mask': sourcemask.to(dtype=torch.long), 
        'target_ids': targetids.to(dtype=torch.long),
        'target_ids_y': targetids.to(dtype=torch.long)
    }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
  """Run one training epoch over `loader`.

  Args:
    epoch: epoch number, used only for logging and stored in the checkpoint.
    tokenizer: tokenizer providing `pad_token_id`.
    model: sequence-to-sequence model whose forward pass returns the loss first.
    device: device the batch tensors are moved to.
    loader: iterable of batches with 'source_ids', 'source_mask' and 'target_ids'.
    optimizer: optimizer stepping the model parameters.

  Every 10 steps a row is added to `training_logger`, the table is printed and
  a checkpoint is written to './outputs/training_model'.
  """
  checkpoint_path = './outputs/training_model'

  model.train()
  for step, data in enumerate(loader, 0):
    y = data['target_ids'].to(device, dtype = torch.long)
    # Teacher forcing: the decoder sees tokens [0, n-1) and predicts tokens [1, n).
    y_ids = y[:, :-1].contiguous()
    labellm = y[:, 1:].clone().detach()
    # Padding positions get label -100 so the loss ignores them.
    labellm[y[:, 1:] == tokenizer.pad_token_id] = -100
    ids = data['source_ids'].to(device, dtype = torch.long)
    mask = data['source_mask'].to(device, dtype = torch.long)

    outputm = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=labellm)
    loss = outputm[0]

    if step % 10 == 0:
      # Log the scalar value rather than the tensor repr (device, grad_fn, ...).
      training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
      console.print(training_logger)
      # The checkpoint directory may not exist yet on a fresh kernel.
      os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
      torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # detached so the autograd graph is not dragged into the checkpoint
            'loss': loss.detach(),
            }, checkpoint_path)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [None]:
def validate(epoch, tokenizer, model, device, loader):
  """Generate summaries for every batch in `loader` and decode them.

  Args:
    epoch: accepted for symmetry with train(); not used here.
    tokenizer: tokenizer used to decode generated and reference ids.
    model: model exposing `generate`.
    device: device the batch tensors are moved to.
    loader: iterable of batches with 'source_ids', 'source_mask' and 'target_ids'.

  Returns:
    (predictions, references): two lists of decoded strings, in loader order.
  """

  def _decode_all(id_rows):
    # Decode each row of ids to text, dropping special tokens.
    return [tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True) for row in id_rows]

  model.eval()
  predictiondata, actualdata = [], []
  with torch.no_grad():
    for step, batch in enumerate(loader):
      references = batch['target_ids'].to(device, dtype=torch.long)
      ids = batch['source_ids'].to(device, dtype=torch.long)
      mask = batch['source_mask'].to(device, dtype=torch.long)

      generated_ids = model.generate(
          input_ids=ids,
          attention_mask=mask,
          max_length=150,
          repetition_penalty=2.5,
      )
      predicted = _decode_all(generated_ids)
      target = _decode_all(references)

      # Progress message every 10 batches.
      if step % 10 == 0:
        console.print(f'Completed {step}')

      predictiondata += predicted
      actualdata += target
  return predictiondata, actualdata

In [None]:
def BartTrainer(dataframe, source_data, target_data, model_params, output_dir="./outputs/" ):
  
  """
  Fine-tune a BART checkpoint on a dataframe of (source, target) text pairs.

  Args:
    dataframe: pandas DataFrame containing the text columns.
    source_data: name of the column holding the input documents.
    target_data: name of the column holding the reference summaries.
    model_params: dict with the keys "MODEL", "SEED", "BATCH_SIZE_TRAIN",
      "BATCH_SIZE_VALID", "TRAIN_EPOCHS", "VALID_EPOCHS", "LEARNING_RATE",
      "MAX_LENGTH_SOURCE_TEXT" and "MAX_LENGTH_TARGET_TEXT".
    output_dir: directory receiving `model_files/`, `predictions.csv` and `logs.txt`.

  The frame is split 80/20 into training and validation parts; the model is
  trained on the former and generation results on the latter are written to
  `predictions.csv`.
  """

  # Set random seeds and deterministic pytorch for reproducibility
  random.seed(model_params["SEED"]) # python random seed
  torch.manual_seed(model_params["SEED"]) # pytorch random seed
  np.random.seed(model_params["SEED"]) # numpy random seed
  torch.backends.cudnn.deterministic = True 

  # Make sure the output directory exists before anything is written into it.
  os.makedirs(output_dir, exist_ok=True)
  
  # tokenzier for encoding the text
  
  tokenizer = BartTokenizer.from_pretrained(model_params["MODEL"])
  model = BartForConditionalGeneration.from_pretrained(model_params["MODEL"])   
  model = model.to(device)
  
  # Importing the raw dataset
  dataframe = dataframe[[source_data,target_data]]
  
  # 80% of the rows are sampled for training; the remainder is the validation set.
  train_size = 0.8
  train_ds=dataframe.sample(frac=train_size,random_state = model_params["SEED"])
  valid_ds=dataframe.drop(train_ds.index).reset_index(drop=True)
  train_ds = train_ds.reset_index(drop=True)


  # Creating the Training and Validation dataset for further creation of Dataloader
  training_set = TransformMultiNews(train_ds, tokenizer, model_params["MAX_LENGTH_SOURCE_TEXT"], model_params["MAX_LENGTH_TARGET_TEXT"], source_data, target_data)
  valid_set = TransformMultiNews(valid_ds, tokenizer, model_params["MAX_LENGTH_SOURCE_TEXT"], model_params["MAX_LENGTH_TARGET_TEXT"], source_data, target_data)


  # Defining the parameters for creation of dataloaders
  train_params = {
      'batch_size': model_params["BATCH_SIZE_TRAIN"],
      'shuffle': True,
      'num_workers': 0
      }


  valid_params = {
      'batch_size': model_params["BATCH_SIZE_VALID"],
      'shuffle': False,
      'num_workers': 0
      }


  # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
  train_loader = DataLoader(training_set, **train_params)
  val_loader = DataLoader(valid_set, **valid_params)


  # Defining the optimizer that will be used to tune the weights of the network in the training session. 
  optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])


  # Training loop

  console.log('[Initiating Fine Tuning]...\n')

  for epoch1 in range(model_params["TRAIN_EPOCHS"]):
      train(epoch1, tokenizer, model, device, train_loader, optimizer)

  # Training loop ends

  console.log("[Saving Model]...\n")
  #Saving the model after training
  path = os.path.join(output_dir, "model_files")
  model.save_pretrained(path)
  tokenizer.save_pretrained(path)


  # evaluating test dataset
  console.log("[Initiating Validation]...\n")
  # Each pass regenerates the predictions and overwrites predictions.csv.
  for epoch in range(model_params["VALID_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv(os.path.join(output_dir,'predictions.csv'))
  
  console.log("[Validation Completed.]\n")
  console.print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
  console.print(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n""")
  console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

  # Export the recorded console output last so the closing status lines above
  # are part of logs.txt as well.
  console.save_text(os.path.join(output_dir,'logs.txt'))


In [None]:
# Hyper-parameters and settings read by BartTrainer and the inference helpers.
model_params={
    "MODEL":"facebook/bart-base",  # checkpoint to fine-tune; "sshleifer/distilbart-cnn-6-6" was the alternative tried here
    "BATCH_SIZE_TRAIN":8,          # training batch size
    "BATCH_SIZE_VALID":8,          # validation batch size
    "TRAIN_EPOCHS":1,              # number of training epochs
    "VALID_EPOCHS":2,                # number of passes BartTrainer makes over the validation split
    "LEARNING_RATE":2e-5,          # learning rate
    "MAX_LENGTH_SOURCE_TEXT":1024,  # max length of source text (passed to the tokenizer as max_length)
    "MAX_LENGTH_TARGET_TEXT":200,   # max length of target text (passed to the tokenizer as max_length)
    "SEED": 42                     # set seed for reproducibility 

}

# Start Training

In [None]:
# Fine-tune on `df` ("text" -> "summary"); the model files, predictions.csv and
# logs.txt are written under the "outputs" directory.
BartTrainer(dataframe=df ,source_data="text", target_data="summary", model_params=model_params, output_dir="outputs")


In [None]:
# Ensure the output directory exists. Unlike a bare `mkdir` shell line, this
# does not depend on IPython automagic and does not fail when it is already there.
os.makedirs("./outputs/", exist_ok=True)

# FOR ANALYSIS AND TESTING

Load Model

In [None]:
# Load the fine-tuned checkpoint that BartTrainer(output_dir="outputs") saved.
# The path is relative to the working directory instead of a hard-coded
# absolute Kaggle path, so the notebook also runs outside /kaggle/working.
MODEL_DIR = os.path.join("outputs", "model_files")
tokenizer = BartTokenizer.from_pretrained(MODEL_DIR)
model = BartForConditionalGeneration.from_pretrained(MODEL_DIR)
model.to(device)

In [None]:
from sumeval.metrics.rouge import RougeCalculator

# Shared ROUGE scorer for English text with stop-word handling switched on.
rouge = RougeCalculator(stopwords=True, lang="en")

def rouge_calc(preds, targets):
    """Score a generated summary against its reference(s).

    Args:
        preds: generated summary text.
        targets: reference summary (or references) to compare against.

    Returns:
        dict with the keys "Rouge_1", "Rouge_2" and "Rouge_L".
    """
    scores = {
        f"Rouge_{order}": rouge.rouge_n(summary=preds, references=targets, n=order)
        for order in (1, 2)
    }
    scores["Rouge_L"] = rouge.rouge_l(summary=preds, references=targets)
    return scores


Cross-attention

In [None]:
from IPython.display import display, HTML
import matplotlib as mpl
from matplotlib.colors import Normalize, rgb2hex
import pandas as pd
from IPython.display import HTML
import tensorflow as tf

def get_max_attn(c_atten, max_inputs=None):
    """Collapse per-step cross-attentions into one score per input position.

    For every generation step, the maximum attention over heads is taken in
    each decoder layer and those per-layer maxima are averaged over the layers.
    The number of layers, heads and input positions is read from the tensors,
    so the function is not tied to one model size.

    Args:
        c_atten: sequence over generation steps; each step is a sequence over
            decoder layers of tensors indexed as [batch, head, query, input].
            Only batch item 0 and query position 0 are used.
        max_inputs: optional cap on the number of input positions returned per
            step; None (default) keeps all of them.

    Returns:
        list (one entry per generation step) of lists of floats (one entry per
        input position).
    """
    per_step_scores = []
    for step_attn in c_atten:
        # (layers, heads, inputs) for batch item 0, first query position
        stacked = torch.stack([layer_attn[0, :, 0, :] for layer_attn in step_attn])
        if max_inputs is not None:
            stacked = stacked[..., :max_inputs]
        # attention weights are non-negative, so the plain max matches a
        # running maximum that starts at 0
        per_layer_max = stacked.max(dim=1).values
        # average in float64 to mirror plain-Python float arithmetic
        per_step_scores.append(per_layer_max.double().mean(dim=0).tolist())
    return per_step_scores

def predict(model,tokenizer,parameters,sent, device):
    """Generate a summary for `sent` and return its cross-attentions.

    Args:
        model: model exposing `eval()` and `generate(...)`.
        tokenizer: callable tokenizer returning 'input_ids' and 'attention_mask'.
        parameters: dict providing "MAX_LENGTH_SOURCE_TEXT".
        sent: input text; runs of whitespace are collapsed before tokenizing.
        device: device the input tensors are moved to.

    Returns:
        (c_atten, generated_ids, ids): the "cross_attentions" entry of the
        generation output, the full generation output, and the input ids tensor.
    """
    sent = " ".join(sent.split())

    # padding="max_length" already pads to max_length; the deprecated
    # pad_to_max_length flag is therefore not passed.
    source = tokenizer(
        [sent],
        max_length=parameters["MAX_LENGTH_SOURCE_TEXT"],
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    ids = source["input_ids"].to(device, dtype=torch.long)
    mask = source["attention_mask"].to(device, dtype=torch.long)

    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=ids,
            attention_mask=mask,
            max_length=150,
            repetition_penalty=2.5,
            # both flags are needed to get the attentions back from generate()
            output_attentions=True,
            return_dict_in_generate=True,
        )

    c_atten = generated_ids["cross_attentions"]

    return c_atten, generated_ids, ids

def predict_(model,tokenizer,parameters,sent, device):
    """Same as predict(), and additionally prints the generated tokens with their indices.

    Args:
        model, tokenizer, parameters, sent, device: forwarded unchanged to predict().

    Returns:
        (c_atten, generated_ids, ids), exactly as returned by predict().
    """
    # Reuse predict() instead of repeating the tokenize/generate code.
    c_atten, generated_ids, ids = predict(model, tokenizer, parameters, sent, device)

    enumerated_preds = tokenizer.convert_ids_to_tokens(generated_ids.sequences[0])
    print("enumerated predictions in token format: ")
    for i, token in enumerate(enumerated_preds):
        print(i,":",token)

    return c_atten, generated_ids, ids


def colorize(attrs, cmap='bwr'):
    """Map numeric scores to hex colour strings on a symmetric colour scale.

    Args:
        attrs: sequence of numbers.
        cmap: name of a Matplotlib colormap.

    Returns:
        list of hex colour strings, one per entry of `attrs`. The scale is
        centred on 0 and spans [-max|attrs|, +max|attrs|]. An empty input
        gives an empty list.
    """
    values = np.asarray(attrs, dtype=float)
    if values.size == 0:
        return []

    # NumPy is enough for the max-abs; no TensorFlow call is needed here.
    cmap_bound = float(np.max(np.abs(values)))

    norm = Normalize(vmin=-cmap_bound, vmax=cmap_bound)

    try:
        # colormap registry of current Matplotlib releases
        colormap = mpl.colormaps[cmap]
    except AttributeError:
        # older Matplotlib without the registry
        colormap = mpl.cm.get_cmap(cmap)

    return [rgb2hex(colormap(norm(value))) for value in values.tolist()]

def  hlstr(string, color='white'):
    """Wrap `string` (followed by one space) in a <mark> tag with the given background colour."""
    template = "<mark style=background-color:{}>{} </mark>"
    return template.format(color, string)



def color_(max_atten_per_ipword , input_tokens):
    """Display the input tokens as HTML, each highlighted by its attention score.

    Args:
        max_atten_per_ipword: one score per input position.
        input_tokens: token strings; they are paired with the scores in order,
            and pairing stops at the shorter of the two sequences.
    """
    import html

    # Only strip a leading word-start marker. Tokens without one (special
    # tokens, word continuations) would otherwise lose their first real character.
    word_start_markers = ("Ġ", "▁")
    cleaned_tokens = [
        token[1:] if token.startswith(word_start_markers) else token
        for token in input_tokens
    ]
    # Escape so tokens such as "<s>" are shown literally instead of being
    # interpreted as HTML tags.
    safe_tokens = [html.escape(token) for token in cleaned_tokens]

    colors = colorize(max_atten_per_ipword)
    display(HTML("".join(map(hlstr, safe_tokens, colors))))
   

def cross_atten(model,tokenizer,parameters,sent, device):
    """Highlight each input token by the strongest attention it received during generation.

    Args:
        model, tokenizer, parameters, sent, device: forwarded to predict().

    For every input position, the maximum over all generation steps of the
    score computed by get_max_attn() is used as the highlight intensity.
    """
    c_atten, generated_ids, input_ids = predict(model,tokenizer,parameters,sent, device)

    target_input_attn = get_max_attn(c_atten)

    # Column-wise maximum over the generation steps, floored at 0.0. The number
    # of input positions comes from the rows rather than a fixed constant.
    max_atten_per_ipword = [max(0.0, *column) for column in zip(*target_input_attn)]

    input_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # Drop padding using the tokenizer's own pad token.
    input_tokens = [token for token in input_tokens if token != tokenizer.pad_token]

    color_(max_atten_per_ipword , input_tokens)

def cross_atten_per_word(model,tokenizer,parameters,sent, device, target_index=None):
    """Highlight the input tokens by the attention of one generation step.

    Args:
        model, tokenizer, parameters, sent, device: forwarded to predict_().
        target_index: index of the generation step to visualise. When None
            (default) the user is prompted for it, as before; passing it
            explicitly makes the cell reproducible without interaction.

    Raises:
        ValueError: if the prompted value is not an integer.
        IndexError: if the index is outside the available generation steps.
    """
    c_atten, generated_ids, input_ids = predict_(model,tokenizer,parameters,sent, device)
    if target_index is None:
        target_index = int(input("Enter the input id of the target word to be analysed :"))
    target_input_attn = get_max_attn(c_atten)

    # NOTE(review): the indices printed by predict_() enumerate the generated
    # sequence, while this index selects a generation step; confirm the two
    # line up for this model's generate() output.
    step_count = len(target_input_attn)
    if not -step_count <= target_index < step_count:
        raise IndexError(
            f"target index {target_index} is out of range for {step_count} generation steps"
        )

    input_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    input_tokens = [token for token in input_tokens if token != '<pad>']

    color_(target_input_attn[target_index] , input_tokens)
    

In [None]:
# load the pre-trained best-checkpoint model
from datasets import load_dataset
def find_cross_atten(model, device, tokenizer, model_params,i):
    """Summarise test example `i`, print its ROUGE scores and show its cross-attention.

    Args:
        model: fine-tuned model exposing `generate`.
        device: device the input tensors are moved to.
        tokenizer: tokenizer matching `model`.
        model_params: dict providing "MAX_LENGTH_SOURCE_TEXT".
        i: index into the first 5% of the Multi-News test split.
    """
    test= load_dataset('multi_news', split='test[:5%]')
    example = test[i]

    # No task prefix is added: the fine-tuning dataset feeds the raw document,
    # and the attention visualisation below uses the raw document as well.
    sent = " ".join(example["document"].split())

    source = tokenizer(
            [sent],
            max_length=model_params["MAX_LENGTH_SOURCE_TEXT"],
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    ids = source["input_ids"].to(device, dtype = torch.long)
    mask = source["attention_mask"].to(device, dtype = torch.long)

    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask, 
              max_length=150, 
              repetition_penalty=2.5,
              )

    preds = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    # What is printed here is the reference summary, so it is labelled as such.
    print("Reference summary is: ", example["summary"])
    print("###############################")
    print("Output summary is: ", preds)
    c=rouge_calc(preds,example["summary"])
    print("###############################")
    cross_atten(model,tokenizer,model_params,example["document"], device)
    print(c)
    
    
    
# Pick the GPU when one is available, then run the demo on test example 7.
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

find_cross_atten(model, device, tokenizer, model_params, 7)

# Calculate ROUGE scores on the test data

In [None]:
# load the pre-trained best-checkpoint model
from datasets import load_dataset
def inference(model, device, tokenizer, model_params,test):
    """Summarise one test example and score it with ROUGE.

    Args:
        model: fine-tuned model exposing `generate`.
        device: device the input tensors are moved to.
        tokenizer: tokenizer matching `model`.
        model_params: dict providing "MAX_LENGTH_SOURCE_TEXT".
        test: a single example, a mapping with "document" and "summary" strings.

    Returns:
        dict of ROUGE scores as returned by rouge_calc().
    """
    # No task prefix is added: the fine-tuning dataset feeds the raw document,
    # so evaluation uses the same input format.
    sent = " ".join(test["document"].split())

    source = tokenizer(
            [sent],
            max_length=model_params["MAX_LENGTH_SOURCE_TEXT"],
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    ids = source["input_ids"].to(device, dtype = torch.long)
    mask = source["attention_mask"].to(device, dtype = torch.long)

    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask, 
              max_length=150, 
              repetition_penalty=2.5,
             )

    preds = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    return rouge_calc(preds,test["summary"])
    


In [None]:
# Mirror the test split as a two-column dataframe.
testdata = dict(text=test['document'], summary=test['summary'])
df_test = pd.DataFrame(testdata)

In [None]:
# Score every test example; each entry of `c` is the dict returned by inference().
c = [
    inference(model, device, tokenizer, model_params, test[row_idx])
    for row_idx in range(len(df_test))
]

In [None]:
# Collect each metric across all examples, then report the mean of each.
r1, r2, rl = [
    [scores.get(metric) for scores in c]
    for metric in ('Rouge_1', 'Rouge_2', 'Rouge_L')
]
print("mean values are Rouge_1:",np.mean(r1)," Rouge_2:",np.mean(r2)," Rouge_L:",np.mean(rl))