In [81]:
import pandas as pd 
from datasets import Dataset
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

In [82]:
class CustomDataset(Dataset):

    def __init__(self, data, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = data.text
        self.ctext = data.ctext

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [83]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        

        print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    inputs_text = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            input_text = [tokenizer.decode(id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for id in ids]
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')
            
            inputs_text.extend(input_text)
            predictions.extend(preds)
            actuals.extend(target)
    return inputs_text, predictions, actuals

In [84]:
config = {"MAX_LEN" : 512,
          "TRAIN_BATCH_SIZE" : 2,
          "VALID_BATCH_SIZE" : 2,
          "TRAIN_EPOCHS" : 2,
          "VAL_EPOCHS" : 1,
          "LEARNING_RATE" : 1e-4,
          "SEED" : 42,
          "MAX_LEN" : 512,
          "SUMMARY_LEN" :150        
         }
  
config["MAX_LEN"]

512

In [85]:
df = pd.read_csv('./cnn_dailymail/train.csv',encoding='latin-1') 
df = df[['article','highlights']]
df = df.rename({"article" : "ctext", "highlights" : "text"}, axis = 1)
df.ctext = 'summarize: ' + df.ctext
df = df[:10]
print(df.head())


                                               ctext  \
0  summarize: By . Associated Press . PUBLISHED: ...   
1  summarize: (CNN) -- Ralph Mata was an internal...   
2  summarize: A drunk driver who killed a young w...   
3  summarize: (CNN) -- With a breezy sweep of his...   
4  summarize: Fleetwood are the only team still t...   

                                                text  
0  Bishop John Folda, of North Dakota, is taking ...  
1  Criminal complaint: Cop used his role to help ...  
2  Craig Eccleston-Todd, 27, had drunk at least t...  
3  Nina dos Santos says Europe must be ready to a...  
4  Fleetwood top of League One after 2-0 win at S...  


In [86]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [87]:
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state = 42)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

In [88]:
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

FULL Dataset: (10, 2)
TRAIN Dataset: (8, 2)
TEST Dataset: (2, 2)


In [89]:
training_set = CustomDataset(data = train_dataset, tokenizer = tokenizer,source_len = config["MAX_LEN"], 
                             summ_len = config["SUMMARY_LEN"])
val_set = CustomDataset(data = val_dataset, tokenizer = tokenizer,source_len = config["MAX_LEN"], 
                             summ_len = config["SUMMARY_LEN"])

In [90]:
train_params = {
        'batch_size': config["TRAIN_BATCH_SIZE"],
        'shuffle': True,
        'num_workers': 0
}

val_params = {
        'batch_size': config["VALID_BATCH_SIZE"],
        'shuffle': False,
        'num_workers': 0
}

In [91]:
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

In [92]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [93]:
optimizer = torch.optim.Adam(params = model.parameters(), lr=config["LEARNING_RATE"])

In [94]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
for epoch in range(config["TRAIN_EPOCHS"]):
    train(epoch, tokenizer, model, device, training_loader, optimizer)


    # Validation loop and saving the resulting file with predictions and acutals in a dataframe.
    # Saving the dataframe as predictions.csv
    print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
    for epoch in range(config["VAL_EPOCHS"]):
        input_text, predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'Input Text ': input_text, 'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv('./predictions.csv')
        print('Output Files generated for review')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  7.836664199829102
Epoch: 0, Loss:  4.574958324432373
Epoch: 0, Loss:  4.176817417144775
Epoch: 0, Loss:  4.8595123291015625
Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0
Output Files generated for review
Epoch: 1, Loss:  3.9061052799224854
Epoch: 1, Loss:  2.2678771018981934
Epoch: 1, Loss:  3.5002641677856445
Epoch: 1, Loss:  2.4181504249572754
Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0
Output Files generated for review


In [96]:
sentence = "summarize: A NASA spacecraft will deliberately slam into an asteroid Monday, and it's all in the name of planetary protection. The DART mission, or the Double Asteroid Redirection Test, will crash into the space rock at 7:14 p.m. ET after launching 10 months ago.The spacecraft will attempt to affect the motion of an asteroid in space. A live stream of images captured by the spacecraft will be available on NASA's website beginning at 6 p.m. ET. The mission is heading for Dimorphos, a small moon orbiting the near-Earth asteroid Didymos. The asteroid system poses no threat to Earth, NASA officials have said, making it a perfect target to test out a kinetic impact -- which may be needed if an asteroid is ever on track to hit Earth. The event will be the agency's first full-scale demonstration of deflection technology that can protect the planet. 'For the first time ever, we will measurably change the orbit of a celestial body in the universe,' said Robert Braun, head of the Johns Hopkins University Applied Physics Laboratory's Space Exploration Sector in Laurel, Maryland."

In [99]:
tokenized_sentence = tokenizer.batch_encode_plus([sentence], max_length= 150, pad_to_max_length=True,return_tensors='pt')
tokenized_sentence

{'input_ids': tensor([[21603,    10,    71, 15971,   628,  6696,    56, 24067,     3,     7,
            40,   265,   139,    46,     3,  8323,  8184,  2089,     6,    11,
            34,    31,     7,    66,    16,     8,   564,    13,     3, 30351,
          1711,     5,    37,   309,  8241,  2253,     6,    42,     8,  8405,
            71,   849,  8184,  1624,  2060,  4985,  2300,     6,    56,  8420,
           139,     8,   628,  2480,    44,   489,    10,  2534,     3,   102,
             5,    51,     5, 10104,   227,     3, 14138,   335,   767,   977,
             5,   634,   628,  6696,    56,  3332,    12,  2603,     8,  4644,
            13,    46,     3,  8323,  8184,    16,   628,     5,    71,   619,
          6093,    13,  1383,  9534,    57,     8,   628,  6696,    56,    36,
           347,    30, 15971,    31,     7,   475,  1849,    44,   431,     3,
           102,     5,    51,     5, 10104,     5,    37,  2253,    19,  6904,
            21,  2043,  8886,    32,  

In [100]:
generated_ids = model.generate(
                input_ids = tokenized_sentence["input_ids"],
                attention_mask = tokenized_sentence["attention_mask"], 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )

In [101]:
preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

In [102]:
preds

['the spacecraft will crash into the space rock at 7:14 p.m. ET after launching 10 months ago. the mission is heading for Dimorphos, a small moon orbiting the near-Earth asteroid Didymos.']

In [103]:
torch.save(model, "model.pt")