In [1]:
import torch

import numpy as np
import pandas as pd
import torch.nn.functional as F

from torch import cuda
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [2]:
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
df = pd.read_excel('Inshorts Cleaned Data.xlsx')

In [4]:
df.head()

Unnamed: 0,Headline,Short,Source,Time,Publish Date
0,4 ex-bank officials booked for cheating bank o...,The CBI on Saturday booked four former officia...,The New Indian Express,09:25:00,2017-03-26
1,Supreme Court to go paperless in 6 months: CJI,Chief Justice JS Khehar has said the Supreme C...,Outlook,22:18:00,2017-03-25
2,"At least 3 killed, 30 injured in blast in Sylh...","At least three people were killed, including a...",Hindustan Times,23:39:00,2017-03-25
3,Why has Reliance been barred from trading in f...,Mukesh Ambani-led Reliance Industries (RIL) wa...,Livemint,23:08:00,2017-03-25
4,Was stopped from entering my own studio at Tim...,TV news anchor Arnab Goswami has said he was t...,YouTube,23:24:00,2017-03-25


In [5]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.Headline
        self.ctext = self.data.Short

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length = self.source_len, padding = 'max_length',\
                                                  return_tensors = 'pt')
        target = self.tokenizer.batch_encode_plus([text], max_length = self.summ_len, padding = 'max_length',\
                                                  return_tensors = 'pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype = torch.long), 
            'source_mask': source_mask.to(dtype = torch.long), 
            'target_ids': target_ids.to(dtype = torch.long),
            'target_ids_y': target_ids.to(dtype = torch.long)
        }

In [6]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _, data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous() # make y_ids contiguous 
        lm_labels = y[:, 1:].clone().detach() # make fast copy
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100 # replace pad tokens 
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids = y_ids, labels = lm_labels)
        loss = outputs[0]
        
        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [7]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length = 100, 
                num_beams = 2,
                repetition_penalty = 2.5, 
                length_penalty = 1.0, 
                early_stopping = True
                )
            preds = [tokenizer.decode(g, skip_special_tokens = True, clean_up_tokenization_spaces = True)\
                     for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens = True, clean_up_tokenization_spaces = True)\
                      for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [8]:
TRAIN_BATCH_SIZE = 2
VALID_BATCH_SIZE = 2 
TRAIN_EPOCHS = 10      
VAL_EPOCHS = 1 
LEARNING_RATE = 1e-4    
SEED = 42               
MAX_LEN = 512
SUMMARY_LEN = 100

torch.manual_seed(SEED) 
np.random.seed(SEED) 
torch.backends.cudnn.deterministic = True

tokenizer = T5Tokenizer.from_pretrained("t5-base")

df = df[['Headline','Short']]
df.Headline = 'summarize: ' + df.Headline

train_size = 0.8
train_dataset = df.sample(frac=train_size,random_state = SEED)
val_dataset = df.drop(train_dataset.index).reset_index(drop = True)
train_dataset = train_dataset.reset_index(drop = True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

optimizer = torch.optim.Adam(params =  model.parameters(), lr = LEARNING_RATE)

FULL Dataset: (55104, 2)
TRAIN Dataset: (44083, 2)
TEST Dataset: (11021, 2)


Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

Initiating Fine-Tuning for the model on our dataset
Epoch: 0, Loss:  5.8626837730407715
Epoch: 0, Loss:  1.535601019859314
Epoch: 0, Loss:  1.0182313919067383
Epoch: 0, Loss:  2.2324159145355225
Epoch: 0, Loss:  1.2453153133392334
Epoch: 0, Loss:  1.2783634662628174
Epoch: 0, Loss:  2.4592106342315674
Epoch: 0, Loss:  0.6765856742858887
Epoch: 0, Loss:  1.6162558794021606
Epoch: 0, Loss:  0.9466649293899536
Epoch: 0, Loss:  1.051295280456543
Epoch: 0, Loss:  0.5608026385307312
Epoch: 0, Loss:  1.1044048070907593
Epoch: 0, Loss:  0.6694570779800415
Epoch: 0, Loss:  0.8327499032020569
Epoch: 0, Loss:  0.9408615231513977
Epoch: 0, Loss:  0.5147339701652527
Epoch: 0, Loss:  0.9305832982063293
Epoch: 0, Loss:  1.420927882194519
Epoch: 0, Loss:  1.6219096183776855
Epoch: 0, Loss:  0.4562230110168457
Epoch: 0, Loss:  1.4659337997436523
Epoch: 0, Loss:  1.1376267671585083
Epoch: 0, Loss:  1.4021466970443726
Epoch: 0, Loss:  0.871911883354187
Epoch: 0, Loss:  0.48270684480667114
Epoch: 0, Loss:

Epoch: 5, Loss:  0.4171774089336395
Epoch: 5, Loss:  0.2690272331237793
Epoch: 5, Loss:  0.532575249671936
Epoch: 5, Loss:  0.7844458222389221
Epoch: 5, Loss:  0.3309836685657501
Epoch: 5, Loss:  0.3058425486087799
Epoch: 5, Loss:  0.6042974591255188
Epoch: 5, Loss:  0.38814160227775574
Epoch: 5, Loss:  0.5136160254478455
Epoch: 5, Loss:  0.8710018992424011
Epoch: 5, Loss:  0.5453805923461914
Epoch: 5, Loss:  0.30934610962867737
Epoch: 5, Loss:  0.7441872954368591
Epoch: 5, Loss:  0.2547917068004608
Epoch: 5, Loss:  0.4765646457672119
Epoch: 5, Loss:  0.46921685338020325
Epoch: 5, Loss:  0.31470802426338196
Epoch: 5, Loss:  0.33740732073783875
Epoch: 5, Loss:  0.482659250497818
Epoch: 5, Loss:  0.34207069873809814
Epoch: 5, Loss:  0.5052774548530579
Epoch: 5, Loss:  0.4857509732246399
Epoch: 5, Loss:  0.8999201655387878
Epoch: 5, Loss:  0.47042861580848694
Epoch: 5, Loss:  0.5252408385276794
Epoch: 5, Loss:  0.3099709749221802
Epoch: 5, Loss:  0.08584362268447876
Epoch: 5, Loss:  0.724

In [10]:
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})

Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0
Completed 100
Completed 200
Completed 300
Completed 400
Completed 500
Completed 600
Completed 700
Completed 800
Completed 900
Completed 1000
Completed 1100
Completed 1200
Completed 1300
Completed 1400
Completed 1500
Completed 1600
Completed 1700
Completed 1800
Completed 1900
Completed 2000
Completed 2100
Completed 2200
Completed 2300
Completed 2400
Completed 2500
Completed 2600
Completed 2700
Completed 2800
Completed 2900
Completed 3000
Completed 3100
Completed 3200
Completed 3300
Completed 3400
Completed 3500
Completed 3600
Completed 3700
Completed 3800
Completed 3900
Completed 4000
Completed 4100
Completed 4200
Completed 4300
Completed 4400
Completed 4500
Completed 4600
Completed 4700
Completed 4800
Completed 4900
Completed 5000
Completed 5100
Completed 5200
Completed 5300
Completed 5400
Completed 5500


In [11]:
final_df

Unnamed: 0,Generated Text,Actual Text
0,": 3 dead, 30 injured in two blasts in Bangladesh","summarize: At least 3 killed, 30 injured in bl..."
1,: 30 blasts at ordnance factory in MP,summarize: 30 blasts occur at ordnance factory...
2,: Centre rejects reports of larger Nagaland state,summarize: Govt dismisses reports of carving o...
3,: Thousands protest against Brexit in London,summarize: Thousands march in London to protes...
4,: Wankhede Stadium reserved for underprivilege...,summarize: Underprivileged kids to fill Wankhe...
...,...,...
11016,: Hanumantha Rao had also written to HRD in 2014,summarize: Congress MP wrote to HRD ministry i...
11017,: SPG moves SC against diesel vehicles ban,summarize: SPG seeks exception in SC ban on di...
11018,: Sonu Nigam surprises passengers with impromp...,summarize: Sonu Nigam sings for co-passengers ...
11019,: TRAI slams FB over &#39;Free Basics&#39; debate,summarize: TRAI slams FB for &#39;orchestrated...


In [12]:
model.save_pretrained('models/t5-inshorts/')

In [1]:
import transformers

In [2]:
tf_model = transformers.TFT5ForConditionalGeneration.from_pretrained('models/t5-inshorts/', from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFT5ForConditionalGeneration: ['lm_head.weight', 'decoder.embed_tokens.weight', 'encoder.embed_tokens.weight']
- This IS expected if you are initializing TFT5ForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFT5ForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [3]:
tf_model.save_pretrained('models/t5-inshorts/')

In [5]:
tokenizer = transformers.T5Tokenizer.from_pretrained("t5-base")

In [6]:
tokenizer.save_pretrained('./')

('./tokenizer_config.json',
 './special_tokens_map.json',
 './spiece.model',
 './added_tokens.json')