In [None]:
# Install both packages into the kernel's own environment (%pip rather than !pip),
# quietly and in a single resolver run. sentencepiece backs T5Tokenizer, so it has
# to be present before the tokenizer is loaded further down.
# TODO: pin exact versions once a known-good combination is confirmed.
%pip install -q transformers sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [51]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import torch 
import torch.nn.functional as F 
from torch.utils.data import Dataset , DataLoader  , RandomSampler ,SequentialSampler 

from transformers import T5Tokenizer , T5ForConditionalGeneration 


# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'


In [None]:
!nvidia-smi

/bin/bash: nvidia-smi: command not found


# Dataset class for preparing the model inputs

In [None]:
# Load the data and tokenize it, returning the model inputs:
# [source_ids , source_mask , target_ids , target_mask]

class CustomData(Dataset):
  """Torch dataset that turns (article, headline) rows into fixed-length tensors.

  Args:
    data: DataFrame-like object exposing a ``Headline`` column (the target
      summary) and a ``Short`` column (the source article).
    tokenizer: Tokenizer that is called with a list of strings plus the
      ``max_length`` / ``padding`` / ``truncation`` / ``return_tensors``
      options and returns ``input_ids`` and ``attention_mask`` tensors.
    artical_len: Token length every source text is padded/truncated to.
    summ_len: Token length every target text is padded/truncated to.
  """

  def __init__(self , data  , tokenizer , artical_len ,summ_len):
    self.data        = data
    self.tokenizer   = tokenizer
    self.artical_len = artical_len
    self.summ_len    = summ_len
    self.text = data.Headline   # target text (summary)
    self.ctext= data.Short      # source text (article)

  def __len__(self):
    """Return the number of (article, headline) pairs."""
    return len(self.text)

  @staticmethod
  def _normalize(raw):
    """Return ``raw`` as a string with every whitespace run collapsed to one space."""
    return ' '.join(str(raw).split())

  def _encode(self, text, max_length):
    """Tokenize one string into tensors padded/truncated to ``max_length``."""
    return self.tokenizer([text],
                          max_length=max_length,
                          padding='max_length',
                          truncation=True,
                          return_tensors='pt')

  def __getitem__(self, index):
    """Return the tokenized source/target pair stored at position ``index``.

    Returns:
      dict with 'source_ids', 'source_mask', 'target_ids' and 'target_mask',
      each a 1-D ``torch.long`` tensor.
    """
    # Positional lookup: after a shuffled train/test split the index labels are
    # no longer 0..n-1, so label-based ``series[index]`` would raise KeyError.
    ctext = self._normalize(self.ctext.iloc[index])
    text  = self._normalize(self.text.iloc[index])

    source = self._encode(ctext, self.artical_len)
    target = self._encode(text, self.summ_len)

    # The tokenizer returns a batch of one; drop only that leading batch axis.
    source_ids = source['input_ids'].squeeze(0)
    source_mask= source['attention_mask'].squeeze(0)

    target_ids = target['input_ids'].squeeze(0)
    target_mask= target['attention_mask'].squeeze(0)

    return {
        'source_ids' : source_ids.to(dtype=torch.long),
        'source_mask': source_mask.to(dtype=torch.long),
        'target_ids' : target_ids.to(dtype=torch.long),
        'target_mask': target_mask.to(dtype=torch.long)
    }

In [None]:
## Training loop for a single epoch; it is called from the main function later.

def train(epoch , tokenizer , model , loader , optimizer):
  """Run one fine-tuning pass over every batch in ``loader``.

  Args:
    epoch: Epoch number; used only in the progress message.
    tokenizer: Tokenizer whose ``pad_token_id`` identifies padding in the targets.
    model: Model called with ``input_ids``, ``attention_mask``,
      ``decoder_input_ids`` and ``labels``; its first output must be the loss.
    loader: Iterable of batches holding 'source_ids', 'source_mask' and
      'target_ids' tensors.
    optimizer: Optimizer that updates the model parameters.
  """
  model.train()
  for step, data in enumerate(loader):
    y = data['target_ids'].to(device , dtype=torch.long)
    # Teacher forcing: the decoder is fed tokens [0 .. n-2] and scored on [1 .. n-1].
    # NOTE(review): T5 can also build decoder inputs itself from unshifted labels;
    # the manual shift is kept here to preserve the existing training behaviour.
    y_ids  = y[: , :-1].contiguous()
    lm_labels = y[: , 1:].clone().detach()
    # Label -100 marks positions the loss should ignore, so padding is not learned.
    lm_labels[lm_labels == tokenizer.pad_token_id] = -100
    ids =  data['source_ids'].to(device ,dtype=torch.long)
    mask = data['source_mask'].to(device , dtype=torch.long)

    # The keyword is ``labels``; the older ``lm_labels`` spelling is no longer accepted.
    output = model(input_ids = ids ,
                   attention_mask = mask ,
                   decoder_input_ids = y_ids ,
                   labels = lm_labels)
    loss = output[0]

    if step % 500 == 0:
      print(f'epoch: {epoch} , loss: {loss.item()}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()



In [None]:
## Evaluation helper: returns the generated summaries together with the reference ones.

def validation(model , loader , epoch , tokenizer):
  """Generate a summary for every example in ``loader``.

  Args:
    model: Model exposing ``eval()`` and ``generate(...)``.
    loader: Iterable of batches holding 'target_ids', 'source_ids' and
      'source_mask' tensors.
    epoch: Accepted for interface compatibility; not used in the body.
    tokenizer: Tokenizer whose ``decode`` turns token ids back into text.

  Returns:
    Tuple ``(predictions, actual)`` of two lists of decoded strings, in the
    order the batches were consumed.
  """
  # Beam-search settings are the same for every batch, so build them once.
  generation_options = dict(max_length = 150 ,
                            num_beams = 2 ,
                            repetition_penalty = 2.5 ,
                            length_penalty = 1.0 ,
                            early_stopping = True)

  def _to_text(token_rows):
    # Decode each row of token ids, dropping special tokens such as padding.
    return [tokenizer.decode(row ,
                             skip_special_tokens = True ,
                             clean_up_tokenization_spaces = True)
            for row in token_rows]

  predictions , actual = [] , []
  model.eval()
  with torch.no_grad():
    for step, batch in enumerate(loader):
      y    = batch['target_ids'].to(device , dtype = torch.long)
      ids  = batch['source_ids'].to(device , dtype = torch.long)
      mask = batch['source_mask'].to(device , dtype = torch.long)

      generated = model.generate(input_ids = ids ,
                                 attention_mask = mask ,
                                 **generation_options)

      predictions += _to_text(generated)
      actual      += _to_text(y)

      if step % 100 == 0:
        print(f'Completed {step}')
  return predictions , actual




## Main function: bring the dataset class and the train/validation functions together to fine-tune the model


In [None]:
def main():
  """Fine-tune ``t5-base`` on the Inshorts data and save generated summaries.

  Steps, in order:
    1. set the hyper-parameters and random seeds;
    2. load the tokenizer and the model;
    3. read the Excel file from Google Drive and keep 'Headline' / 'Short';
    4. prefix each article with 'summarize: ' and split 80% / 20%;
    5. fine-tune for ``train_epochs`` epochs;
    6. generate summaries for the held-out split and write them to a CSV file.
  """
  # Kept local: google.colab only exists on Colab, and these names are used
  # nowhere else in the notebook.
  from google.colab import drive
  from sklearn.model_selection import train_test_split

  parameters={ 'train_batch_size':2 ,
              'val_batch_size'   :2 ,

              'train_epochs':2,
              'val_epochs'  :2,

              'learning_rate':1e-4,
              'seed':42 ,

              'artical_len':512,
              'summ_len'   :150}

  np.random.seed(parameters['seed'])
  torch.manual_seed(parameters['seed'])
  torch.backends.cudnn.deterministic =True
  ############################################################
  # Tokenizer and model. Only the model is moved to the device: a tokenizer
  # is not a torch module and has no ``.to`` method.
  tokenizer = T5Tokenizer.from_pretrained("t5-base")
  model     = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)
  ############################################################
  # Import the data from Google Drive.
  drive.mount('/content/drive', force_remount=True)
  DATA_DIR = "/content/drive/My Drive/Colab Notebook/Text Summarization With Attention/Inshorts Cleaned Data.xlsx"
  data  =  pd.read_excel( DATA_DIR ,engine = 'openpyxl')
  # Work on a copy so the raw frame stays untouched; rows missing either
  # column cannot form a training pair and are dropped.
  df = data[['Headline' , 'Short']].dropna().copy()
  # T5 was trained with a task prefix, so mark every article as a summarization
  # input. Only the 'Short' column is rewritten; 'Headline' must survive
  # because the dataset class reads both columns.
  df['Short'] = 'summarize: ' + df['Short'].astype(str)
  print('samples of our data :\n')
  print(df.head())
  ## Split the data: 80% train, 20% test/validation.
  train_data, test_data = train_test_split(df , test_size=0.2, shuffle =True , random_state= parameters['seed'])
  print(f'shape of all data {df.shape}')
  print(f'shape of train data {train_data.shape}')
  print(f'shape of test data {test_data.shape }')
  ## Wrap both splits so they yield (ids, mask) tensors for inputs and targets.
  train_custom = CustomData(train_data , tokenizer , parameters['artical_len'] , parameters['summ_len'])
  test_custom  = CustomData(test_data  , tokenizer , parameters['artical_len'] , parameters['summ_len'])
  ## Data loaders. The evaluation loader is not shuffled so that the rows of the
  ## saved CSV follow the order of the test split on every run.
  train_loader = DataLoader(train_custom , batch_size = parameters['train_batch_size'] , shuffle=True , num_workers =0)
  test_loader  = DataLoader(test_custom  , batch_size = parameters['val_batch_size'] , shuffle =False , num_workers = 0)
  ############################################################
  ## Fine-tuning.
  optimizer = torch.optim.Adam(params=model.parameters()  , lr = parameters['learning_rate'])
  print(f'Intiating Fine-tunning for the model in our data \n')
  # The epoch counts are plain ints, so they have to be wrapped in range().
  for epoch in range(parameters['train_epochs']):
    train(epoch ,tokenizer , model , train_loader ,optimizer)

  print(f'Now we started created summray.....\n')
  store_dir = '/content/drive/My Drive/Colab Notebook/Text Summarization With Attention/generated.csv'
  for epoch in range(parameters['val_epochs']):
    # Evaluate on the held-out loader (not the raw DataFrame).
    pred , actual = validation(model , test_loader , epoch ,tokenizer)
    new_df = pd.DataFrame({'Generated':pred ,
                           'Actual':actual})
    # Each pass overwrites the same file, so the CSV holds the latest run.
    new_df.to_csv(store_dir)
    print(f'number of generate is {epoch+1}')
  





In [None]:
# Entry point: call main() when this module is the one being executed,
# i.e. when __name__ is '__main__'.
if __name__ == '__main__':
  main()

ImportError: ignored