In [None]:
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Fine-tuning T5 for text summarization

In [None]:
import pandas as pd

# Read the raw examples; the two column labels are supplied explicitly via `names`.
df = pd.read_csv(
    '/content/Book3.csv',
    names=["text", "summary"],
)

In [None]:
df.head()

Unnamed: 0,text,summary
0,I can't wait to see what happens next! Click ...,bit.ly/2Vn3q3The author is excited to see what...
1,"""Seaside Shoreline,"" an x14"" Acrylic on Canva...","A painting titled ""Seaside Shoreline"" measurin..."
2,An ice-cold glass of guava lemonade is the pe...,is the perfect summer drink!Guava Lemonade is...
3,Thinking about summer? We sure are Check out ...,bit.ly/2XzKHJWe are already looking forward to...
4,A great design that you need to add to the li...,"Before travelling to Istanbul, it is recommend..."


In [None]:
# Prepend the T5 task prefix to every source text.
# Rows that already start with the prefix are left untouched, so re-running this
# cell does not stack "summarize: summarize: ..." onto the inputs.
_already_prefixed = df["text"].astype(str).str.startswith("summarize: ")
df["text"] = df["text"].where(_already_prefixed, "summarize: " + df["text"])

In [None]:
df.head()

Unnamed: 0,text,summary
0,summarize: I can't wait to see what happens n...,bit.ly/2Vn3q3The author is excited to see what...
1,"summarize: ""Seaside Shoreline,"" an x14"" Acryl...","A painting titled ""Seaside Shoreline"" measurin..."
2,summarize: An ice-cold glass of guava lemonad...,is the perfect summer drink!Guava Lemonade is...
3,summarize: Thinking about summer? We sure are...,bit.ly/2XzKHJWe are already looking forward to...
4,summarize: A great design that you need to ad...,"Before travelling to Istanbul, it is recommend..."


In [None]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

from rich.table import Column, Table 
#Rich library in Python module
# provides classes and functions for working with tables in console output.
from rich import box
from rich.console import Console
#The rich.console module provides a way to create a console
#object that can be used to display rich text and other output in the terminal.
# define a rich console logger
console=Console(record=True)

def display_df(df):
  """Print the first two columns of `df` as an ASCII table on a fresh console."""

  headers = ("source_text", "target_text")
  sample_table = Table(
      *(Column(header, justify="center") for header in headers),
      title="Sample Data",
      pad_edge=False,
      box=box.ASCII,
  )

  # Only the first two cells of each row are shown.
  for record in df.values.tolist():
    sample_table.add_row(record[0], record[1])

  Console().print(sample_table)

# Table that collects the (epoch, step, loss) rows logged during training.
_training_columns = [
    Column(label, justify="center") for label in ("Epoch", "Steps", "Loss")
]
training_logger = Table(
    *_training_columns,
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)


In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
from torch import cuda

if cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

**`Tokens and attention masks are generated for the dataset:`**

In [None]:
class YourDataSetClass(Dataset):
  """
  Map-style dataset that turns one (source, target) text pair of a dataframe
  into fixed-length token tensors for fine-tuning a seq2seq model.

  Args:
    dataframe: pandas DataFrame holding the raw text columns.
    tokenizer: object exposing `batch_encode_plus` that returns a mapping with
      'input_ids' and 'attention_mask' tensors (presumably a Hugging Face
      tokenizer).
    source_len: number of tokens the source text is padded/truncated to.
    target_len: number of tokens the target text is padded/truncated to.
    source_text: name of the column holding the model input.
    target_text: name of the column holding the expected output.
  """

  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    """Number of examples (rows) in the dataset."""
    return len(self.target_text)

  def _encode(self, text, max_length):
    """Tokenize one value into `max_length` token ids plus the matching attention mask."""
    # Force a string and collapse every run of whitespace into a single space.
    text = ' '.join(str(text).split())

    # `padding="max_length"` already requests fixed-length padding, so the
    # deprecated `pad_to_max_length` flag is not passed alongside it.
    encoded = self.tokenizer.batch_encode_plus(
        [text],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors='pt',
    )

    # The batch holds exactly one example: drop only that leading batch axis so
    # that a sequence of length 1 still comes back as a 1-D tensor.
    return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

  def __getitem__(self, index):
    """
    Return the tensors of example number `index`.

    `index` is positional (0 .. len-1), so the lookup does not depend on the
    index labels of the dataframe that was passed in.

    Returns:
      dict with 'source_ids', 'source_mask' and 'target_ids', all torch.long.
    """
    source_ids, source_mask = self._encode(self.source_text.iloc[index], self.source_len)
    # The target attention mask is not needed by the training loop.
    target_ids, _unused_mask = self._encode(self.target_text.iloc[index], self.summ_len)

    return {
        'source_ids': source_ids.to(dtype=torch.long),
        'source_mask': source_mask.to(dtype=torch.long),
        'target_ids': target_ids.to(dtype=torch.long),
    }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):

  """
  Run one pass over `loader`, updating `model` with `optimizer` after every batch.

  Args:
    epoch: epoch number; only used to label the logged rows.
    tokenizer: supplies `pad_token_id`, used to mask padding out of the loss.
    model: called with input_ids / attention_mask / decoder_input_ids / labels;
      its first output is taken as the loss.
    device: device the batch tensors are moved to.
    loader: iterable of batches, each a mapping with 'source_ids',
      'source_mask' and 'target_ids' tensors.
    optimizer: optimizer stepped once per batch.

  Every 10th step a row is appended to the module-level `training_logger`
  table and the whole table is printed through `console`.
  """

  # Put the model into training mode.
  model.train()
  for step, data in enumerate(loader, 0):

    y = data['target_ids'].to(device, dtype = torch.long)

    # Decoder input: every target column except the last one.
    # contiguous() gives the slice its own contiguous memory layout.
    y_ids = y[:, :-1].contiguous()

    # Labels: every target column except the first one, i.e. the token that
    # follows each decoder input position. clone().detach() makes this an
    # independent tensor, so the in-place masking below cannot touch `y`.
    lm_labels = y[:, 1:].clone().detach()
    # Padding positions get -100, the label value that the loss ignores.
    lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100

    # NOTE(review): the tokenized targets presumably carry no leading start
    # token, so with this manual shift the first target token is never used as
    # a label. Passing the full target as `labels` and letting the model build
    # the decoder inputs may be what is intended; confirm before changing.

    ids = data['source_ids'].to(device, dtype = torch.long)
    mask = data['source_mask'].to(device, dtype = torch.long)

    outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
    loss = outputs[0]

    if step % 10 == 0:
      # Log the plain scalar value rather than the tensor repr (which would
      # also carry the device and grad_fn).
      training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
      console.print(training_logger)

    # Clear the gradients of the previous step, backpropagate, then update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

**lm_labels:** The purpose of creating lm_labels is to set up the loss calculation for training the model. The transformer-based language model learns to predict the next token in the target sequence given the previous tokens. During training, the model is fed the input sequence and is trained to predict the target sequence tokens one by one. The model's predictions are compared against the actual target tokens to compute the loss, which is used to update the model parameters during backpropagation.

To set up the loss calculation, we need a tensor that holds the target token for each decoder position. We obtain it by dropping the first token of y (y[:, 1:]), i.e. by shifting the target sequence one position to the left relative to the decoder input y[:, :-1], so that every decoder input token is paired with the "actual" next token. We assign this shifted tensor to lm_labels and replace its padding positions with -100 so that they are ignored by the loss.

**outputs:** The outputs are obtained by passing the input sequence (ids) and the attention mask (mask) to the encoder, and the y_ids tensor (which contains the target sequence with the last token removed) to the decoder. The labels argument specifies the "actual" next tokens for each predicted token in the sequence (i.e., lm_labels). Because labels are supplied, the first element of the outputs (outputs[0]) is the loss.

**optimizer.zero_grad():** After the (periodic) logging of the current batch's loss, optimizer.zero_grad() is called to clear the gradients of all model parameters. This is necessary because PyTorch accumulates gradients across backward passes by default, so the gradients left over from the previous batch must be cleared before loss.backward() computes the new ones.

**loss.backward() and optimizer.step():** The loss.backward() statement computes the gradients of the loss with respect to all model parameters using automatic differentiation. This allows us to compute the gradients efficiently without having to manually derive and implement the backpropagation algorithm.

Finally, the optimizer.step() statement updates the model parameters using the computed gradients and the optimizer's update rule (e.g., SGD, Adam, etc.). This step is what actually causes the model to learn from the training data, by adjusting the parameters to minimize the loss.



In [None]:
def validate(epoch, tokenizer, model, device, loader):

  """
  Generate text for every batch of `loader` and decode it next to the references.

  Returns a pair of lists of strings: (generated texts, reference texts).
  """
  model.eval()

  # Decoding settings shared by every generate() call.
  generation_kwargs = dict(
      max_length=150,
      num_beams=2,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True,
  )

  def _to_text(token_ids):
      # Token ids -> plain string, without special tokens.
      return tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

  predictions, actuals = [], []
  with torch.no_grad():
      for step, batch in enumerate(loader):
          reference_ids = batch['target_ids'].to(device, dtype=torch.long)
          source_ids = batch['source_ids'].to(device, dtype=torch.long)
          source_mask = batch['source_mask'].to(device, dtype=torch.long)

          generated_ids = model.generate(
              input_ids=source_ids,
              attention_mask=source_mask,
              **generation_kwargs,
          )

          batch_predictions = [_to_text(sequence) for sequence in generated_ids]
          batch_references = [_to_text(sequence) for sequence in reference_ids]

          # Progress message on every 10th batch.
          if step % 10 == 0:
              console.print(f'Completed {step}')

          predictions += batch_predictions
          actuals += batch_references

  return predictions, actuals

**model.generate:**

**input_ids:** A tensor containing the token IDs of the input sequences.

**attention_mask:** A tensor indicating which tokens should be attended to (1) and which tokens should not be attended to (0).

**max_length:** An integer indicating the maximum length of the generated sequences.

**num_beams:** An integer indicating the number of beams to use in beam search decoding. A higher number of beams may result in better quality predictions, but will also increase the computation time.

**repetition_penalty:** A float value controlling the degree to which repeated tokens are penalized in the generation process. A higher value will result in fewer repeated tokens in the generated sequences.

**length_penalty:** A float value used with beam search as an exponent on the sequence length when the score of each hypothesis is normalized. Values greater than 0 favor longer sequences, while values less than 0 favor shorter sequences.

**early_stopping:** Controls when beam search stops. With True, generation stops as soon as num_beams complete candidates (hypotheses that have reached the EOS token) are available. This can be useful to avoid generating overly long sequences.

**What is beam search?**
In natural language processing, beam search is a decoding algorithm used in sequence-to-sequence models such as neural machine translation and text generation. Beam search generates multiple candidate output sequences, known as beams, and scores them using a scoring function. The scoring function takes into account the probability of each word in the sequence given the previous words generated so far, as well as other factors such as length normalization and repetition penalties.

During generation, beam search maintains a set of k partial hypotheses, or "beams", where k is the beam width or the number of beams. At each time step, the model generates the probability distribution over the next token given the input sequence and the previously generated tokens. The k most likely partial hypotheses are then expanded by appending each possible next token to each of them, resulting in k * vocabulary_size new partial hypotheses. These new hypotheses are then pruned based on their scores and only the k most likely ones are retained for the next time step.

In [None]:
def T5Trainer(dataframe, source_text, target_text, model_params, output_dir="./outputs/" ):
  
  """
  Fine-tune a T5 checkpoint on `dataframe`, save it, and write validation predictions.

  Steps performed:
    1. Seed torch and numpy with model_params["SEED"] and request
       deterministic cuDNN behaviour.
    2. Load the tokenizer and model named by model_params["MODEL"] and move
       the model to the module-level `device`.
    3. Keep only the `source_text` / `target_text` columns and split the rows
       80% / 20% into training and validation frames.
    4. Wrap both frames in YourDataSetClass and DataLoader objects.
    5. Call train() once per epoch for model_params["TRAIN_EPOCHS"] epochs.
    6. Save model and tokenizer to <output_dir>/model_files.
    7. Call validate() model_params["VAL_EPOCHS"] times, each time
       (over)writing <output_dir>/predictions.csv.

  Args:
    dataframe: pandas DataFrame containing at least the two text columns.
    source_text: name of the column holding the model input.
    target_text: name of the column holding the expected output.
    model_params: dict read for the keys MODEL, SEED, TRAIN_BATCH_SIZE,
      VALID_BATCH_SIZE, TRAIN_EPOCHS, VAL_EPOCHS, LEARNING_RATE,
      MAX_SOURCE_TEXT_LENGTH and MAX_TARGET_TEXT_LENGTH.
    output_dir: directory the model files and predictions.csv are written under.

  Returns:
    None; the results are the files written under `output_dir`.
  """

  # Seed the torch and numpy random number generators and ask cuDNN for
  # deterministic algorithms, so that repeated runs with the same SEED draw
  # the same train/validation split and are as repeatable as possible.
  torch.manual_seed(model_params["SEED"]) # pytorch random seed
  np.random.seed(model_params["SEED"]) # numpy random seed
  torch.backends.cudnn.deterministic = True

  # logging
  console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # tokenizer for encoding the text
  tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

  # Load the checkpoint named by model_params["MODEL"] as a conditional-generation
  # model, then move it to `device` (module-level: 'cuda' or 'cpu').
  model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
  model = model.to(device)
  
  # logging
  console.log(f"[Data]: Reading data...\n")

  # Keep only the two text columns and show the first two rows as a sanity check.
  # NOTE(review): rows with a missing source or target are not dropped here; the
  # dataset class stringifies every value, so a missing summary would presumably
  # be trained on as the literal text "nan" - confirm whether a dropna() is wanted.
  dataframe = dataframe[[source_text,target_text]]
  display_df(dataframe.head(2))

  
  # Creation of Dataset and Dataloader
  # 80% of the rows are sampled (seeded) for training; the remaining rows form the
  # validation set. Both frames get a fresh 0..n-1 index.
  train_size = 0.8
  train_dataset=dataframe.sample(frac=train_size,random_state = model_params["SEED"])
  val_dataset=dataframe.drop(train_dataset.index).reset_index(drop=True)
  train_dataset = train_dataset.reset_index(drop=True)

  console.print(f"FULL Dataset: {dataframe.shape}")
  console.print(f"TRAIN Dataset: {train_dataset.shape}")
  console.print(f"TEST Dataset: {val_dataset.shape}\n")


  # Creating the Training and Validation dataset for further creation of Dataloader
  training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
  val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)


  # Defining the parameters for creation of dataloaders:
  # training batches are reshuffled, validation batches keep the row order.
  train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      'shuffle': True,
      'num_workers': 0
      }


  val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }


  # Creation of Dataloaders for training and validation. These are used below in the training and validation stages.
  training_loader = DataLoader(training_set, **train_params)
  val_loader = DataLoader(val_set, **val_params)


  # Defining the optimizer that will be used to tune the weights of the network in the training session. 
  optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])


  # Training loop: one call to train() per epoch
  console.log(f'[Initiating Fine Tuning]...\n')

  for epoch in range(model_params["TRAIN_EPOCHS"]):
      train(epoch, tokenizer, model, device, training_loader, optimizer)
      
  
  console.log(f"[Saving Model]...\n")
  # Saving the model and the tokenizer side by side after training
  path = os.path.join(output_dir, "model_files")
  model.save_pretrained(path)
  tokenizer.save_pretrained(path)


  # Evaluating on the validation split. Every iteration rewrites the same
  # predictions.csv, so only the output of the last iteration is kept.
  # NOTE(review): with VAL_EPOCHS == 0 no predictions.csv is written, yet the
  # closing messages below are still printed.
  console.log(f"[Initiating Validation]...\n")
  for epoch in range(model_params["VAL_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv(os.path.join(output_dir,'predictions.csv'))
  

  
  console.log(f"[Validation Completed.]\n")
  console.print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
  console.print(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n""")
 



In [None]:
# Hyper-parameters and settings read by the trainer.
model_params = dict(
    MODEL="t5-base",              # checkpoint name: t5-base / t5-large
    TRAIN_BATCH_SIZE=8,           # examples per training batch
    VALID_BATCH_SIZE=8,           # examples per validation batch
    TRAIN_EPOCHS=3,               # passes over the training split
    VAL_EPOCHS=1,                 # passes over the validation split
    LEARNING_RATE=1e-4,           # optimizer learning rate
    MAX_SOURCE_TEXT_LENGTH=512,   # token length of the encoded source text
    MAX_TARGET_TEXT_LENGTH=50,    # token length of the encoded target text
    SEED=42,                      # random seed for reproducibility
)

In [None]:
# Fine-tune on the first 1000 rows: "text" is the model input, "summary" the target.
T5Trainer(
    dataframe=df[:1000],
    source_text="text",
    target_text="summary",
    model_params=model_params,
    output_dir="outputs",
)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# predictions.csv was written together with its row index, so read the first
# column back as the index instead of as a spurious "Unnamed: 0" data column.
Prediction_df = pd.read_csv('/content/outputs/predictions.csv', index_col=0)

In [None]:
Prediction_df 

Unnamed: 0.1,Unnamed: 0,Generated Text,Actual Text
0,0,"""Seaside Shoreline,"" an x14"" acrylic on canvas...","A painting titled ""Seaside Shoreline"" measurin..."
1,1,A great design that you need to add to the lis...,"Before travelling to Istanbul, it is recommend..."
2,2,Our photography competition is open! Categorie...,
3,3,Jasmin Bhasin looks glamorous in green and pin...,Jasmin Bhasin was spotted looking glamorous in...
4,4,the traditional system was always way too long...,This tweet suggests that traditional schooling...
...,...,...,...
195,195,Important levels for Apr.,-Jun quarterThe tweets are discussing the impo...
196,196,My realtor has sent me gift cards/nice notes s...,"pandemic.Since the start of the pandemic, my r..."
197,197,We use the worlds open monetary network to giv...,This tweet is about how the open monetary netw...
198,198,U.S. Employers added jobs in March: Live Updat...,"In March, US employers added jobs, according t..."


In [None]:
# Column holding the texts produced by the model.
Generated_text = Prediction_df.loc[:, "Generated Text"]

In [None]:
# Turn the column into a plain Python list.
Generated_text = Generated_text.tolist()

In [None]:
# Column holding the reference texts.
Actual_text = Prediction_df.loc[:, "Actual Text"]

In [None]:
# Turn the column into a plain Python list.
Actual_text = Actual_text.tolist()

In [None]:
# Keep the two columns as parallel lists of per-row strings.
# Converting each whole list with str() would yield one giant string (the list
# repr), so ROUGE would compare two blobs instead of scoring every prediction
# against its own reference.
# Pairs in which either side is missing (NaN) or has no alphanumeric content
# are dropped, because such texts cannot be scored.
def _is_scorable(text):
  """True for a string containing at least one letter or digit."""
  return isinstance(text, str) and any(ch.isalnum() for ch in text)

_scorable_pairs = [
    (generated, actual)
    for generated, actual in zip(Generated_text, Actual_text)
    if _is_scorable(generated) and _is_scorable(actual)
]
Generated_text = [pair[0] for pair in _scorable_pairs]
Actual_text = [pair[1] for pair in _scorable_pairs]

In [None]:
!pip install rouge
from rouge import Rouge

# predicted and actual texts for multiple rows
# predicted_texts = ['this is a predicted text', 'this is another predicted text']
# actual_texts = ['this is an actual text', 'this is another actual text']

# initialize the ROUGE scorer
rouge = Rouge()

# compute the ROUGE scores for each row
scores = rouge.get_scores(Generated_text, Actual_text, avg=True)

# print the average scores
print(scores)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
{'rouge-1': {'r': 0.4482081492390771, 'p': 0.5178672716959728, 'f': 0.48052631081546954}, 'rouge-2': {'r': 0.20324629498941427, 'p': 0.2638973732437385, 'f': 0.2296345465793009}, 'rouge-l': {'r': 0.41580756013745707, 'p': 0.48043108338060125, 'f': 0.44578946871020647}}
