In [1]:
!pip install transformers -q
!pip install sentencepiece

[K     |████████████████████████████████| 1.4MB 9.3MB/s 
[K     |████████████████████████████████| 890kB 29.8MB/s 
[K     |████████████████████████████████| 2.9MB 45.0MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.8MB 8.5MB/s 
[K     |████████████████████████████████| 102kB 15.0MB/s 
[K     |████████████████████████████████| 133kB 44.5MB/s 
[K     |████████████████████████████████| 102kB 14.3MB/s 
[K     |████████████████████████████████| 163kB 45.5MB/s 
[K     |████████████████████████████████| 71kB 11.3MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for watchdog (setup.py) ... [?25l[?25hdone
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 9.

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration


In [2]:
from torch import cuda

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# Instantiate T5ForConditionalGeneration from the pretrained "t5-base" checkpoint.
# The fine-tuned weights are loaded on top of it in a later cell via
# `model.load_state_dict(...)`.
# NOTE(review): the tokenizer is created from the same "t5-base" name in another
# cell; keep the two names in sync if this one changes.
model = T5ForConditionalGeneration.from_pretrained("t5-base")

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Load the tokenizer that belongs to the "t5-base" checkpoint; it is used both to
# encode the inputs in CustomDataset and to decode the generated ids in validate().
tokenizer = T5Tokenizer.from_pretrained("t5-base")

In [11]:
# Move the model onto the device selected earlier ('cuda' or 'cpu').
# NOTE(review): the saved execution counts around this cell are out of order
# (this cell is In [11], the following cells are In [10] and In [6]); confirm the
# notebook still works with Restart Kernel -> Run All.
model = model.to(device)

In [10]:
# CSV file to run inference on, relative to the notebook's working directory.
# When running on Colab with the file stored on Google Drive, mount the drive first:
#   from google.colab import drive
#   drive.mount('/content/drive')
test_path = "./new_test.csv"

In [6]:
# Load the fine-tuned weights into the model.
# Remember to change the file name to the checkpoint produced by your own training run.
# `map_location=device` remaps the stored tensors onto the device selected
# earlier, so a single call covers both the GPU and the CPU case (running on
# CUDA is still recommended for speed).
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
# The trailing `;` keeps the cell output empty, as it was before.
model.load_state_dict(torch.load('./t5_epoch5.pth', map_location=device));

In [7]:
def validate(epoch, tokenizer, model, device, loader,
             max_length=150, num_beams=2, repetition_penalty=2.5,
             early_stopping=True, log_every=100):
    """Generate text for every batch in ``loader`` and decode it next to the targets.

    Args:
        epoch: Not used by this function; kept so existing callers keep working.
        tokenizer: Object whose ``decode`` method turns a sequence of token ids
            into a string.
        model: Object providing ``eval()`` and ``generate(...)``.
        device: Device the batch tensors are moved to before generation.
        loader: Iterable of batches. Each batch is a mapping holding the tensors
            'source_ids', 'source_mask' and 'target_ids'.
        max_length: Forwarded to ``model.generate``.
        num_beams: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        early_stopping: Forwarded to ``model.generate``.
        log_every: Print a progress line every ``log_every`` batches; a falsy
            value disables the progress output.

    The defaults of the generation arguments are the values that used to be
    hard-coded here, so calls that do not pass them behave as before.

    Returns:
        A tuple ``(predictions, actuals)`` of two lists of decoded strings, in
        the order the loader produced the examples.
    """
    model.eval()
    predictions = []
    actuals = []

    def decode(token_ids):
        # Shared decoding settings for generated and reference sequences.
        return tokenizer.decode(
            token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for step, batch in enumerate(loader):
            target_ids = batch['target_ids'].to(device, dtype=torch.long)
            source_ids = batch['source_ids'].to(device, dtype=torch.long)
            source_mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=source_ids,
                attention_mask=source_mask,
                max_length=max_length,
                num_beams=num_beams,
                repetition_penalty=repetition_penalty,
                early_stopping=early_stopping,
            )

            batch_predictions = [decode(ids) for ids in generated_ids]
            batch_targets = [decode(ids) for ids in target_ids]

            if log_every and step % log_every == 0:
                print(f'Completed {step}')

            predictions.extend(batch_predictions)
            actuals.extend(batch_targets)
    return predictions, actuals

In [8]:
class CustomDataset(Dataset):
    """Dataset that tokenizes the ``content`` and ``subject`` columns of a dataframe.

    Args:
        dataframe: pandas DataFrame with a ``subject`` column (the target text)
            and a ``content`` column (the source text).
        tokenizer: Object providing ``batch_encode_plus`` that returns a mapping
            with 'input_ids' and 'attention_mask' tensors.
        source_len: Length the source text is padded/truncated to.
        summ_len: Length the target text is padded/truncated to.

    Each item is a dict with the ``torch.long`` tensors 'source_ids',
    'source_mask' and 'target_ids'.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.subject
        self.ctext = self.data.content

    def __len__(self):
        """Return the number of rows in the dataframe."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize the row at *position* ``index`` and return its tensors."""
        # Use positional access: the indices handed to a Dataset run from 0 to
        # len-1, whereas plain `series[index]` is a label lookup and raises
        # KeyError when the frame's index has gaps (e.g. after dropna()).
        ctext = str(self.ctext.iloc[index])
        ctext = ' '.join(ctext.split())  # collapse runs of whitespace

        text = str(self.text.iloc[index])
        text = ' '.join(text.split())

        # DON'T change this part; ignore the warnings it emits during training.
        source = self.tokenizer.batch_encode_plus([ctext], padding='max_length', truncation=True, max_length=self.source_len, return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], padding='max_length', truncation=True, max_length=self.summ_len, return_tensors='pt')

        # The encodings are batches of one; squeeze(0) drops only that batch
        # axis, so a sequence of length 1 keeps its own dimension.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
        }

In [12]:
# Build the test dataframe: read the CSV, drop every row that has a missing
# value, and renumber the remaining rows from 0.
df_test = (
    pd.read_csv(test_path, encoding='latin-1')
    .dropna()
    .reset_index(drop=True)
)

# Prepend the task prefix to every source text.
df_test['content'] = 'summarize: ' + df_test['content']

In [13]:
# Keyword arguments for the inference DataLoader: small batches, original row
# order, and loading in the main process.
val_params = dict(
    batch_size=2,
    shuffle=False,
    num_workers=0,
)

In [14]:
# Inputs are padded/truncated to 512 tokens and reference summaries to 15 tokens.
val_set = CustomDataset(
    dataframe=df_test,
    tokenizer=tokenizer,
    source_len=512,
    summ_len=15,
)
val_loader = DataLoader(dataset=val_set, **val_params)

In [15]:
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')

# Inference is a single pass over the data, so the epoch number is fixed at 0.
epoch = 0
predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)

# Put every generated summary next to its reference text.
final_df = pd.DataFrame({'Generated Text': predictions, 'Actual Text': actuals})

# Save the comparison as a CSV file; remember to change it to the name you want.
final_df.to_csv('./zh_predictions_t5_epoch5_basic.csv')
print('Output Files generated for review')

Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0


KeyboardInterrupt: 