In [1]:
!pip install transformers -q
#import wandab which is the software log our traning process
!pip install wandb -q
!pip install sentencepiece

[K     |████████████████████████████████| 1.4MB 22.3MB/s 
[K     |████████████████████████████████| 2.9MB 45.8MB/s 
[K     |████████████████████████████████| 890kB 55.9MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.8MB 22.1MB/s 
[K     |████████████████████████████████| 102kB 13.3MB/s 
[K     |████████████████████████████████| 133kB 55.5MB/s 
[K     |████████████████████████████████| 163kB 56.3MB/s 
[K     |████████████████████████████████| 102kB 14.3MB/s 
[K     |████████████████████████████████| 71kB 11.2MB/s 
[?25h  Building wheel for watchdog (setup.py) ... [?25l[?25hdone
  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# WandB – Import the wandb library
import wandb

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
#Return a sereis of ids and masks we have for each dataset
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.subject
        self.ctext = self.data.content

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())
        source = self.tokenizer.batch_encode_plus([ctext], padding='max_length', truncation=True,max_length= self.source_len,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text],  padding='max_length', truncation=True,max_length= self.summ_len,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long)
        }

In [6]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        #this is to figure out how the alignment would be
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        #input ids and attention mask for the input, others for the output
        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids,labels=lm_labels)
        loss = outputs[0]
        
        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [7]:
# from google.colab import drive
# drive.mount('/content/drive')
train_path="./train.csv"

Mounted at /content/drive


In [8]:
# WandB – Initialize a new run
wandb.init(project="cs269-t5-epoch5")

# WandB – Config is a variable that holds and saves hyperparameters and inputs
# Defining some key variables that will be used later on in the training  
config = wandb.config          # Initialize config
config.TRAIN_BATCH_SIZE = 2    # input batch size for training (default: 64)
config.VALID_BATCH_SIZE = 2    # input batch size for testing (default: 1000)
config.TRAIN_EPOCHS = 5        # number of epochs to train (default: 10)
config.VAL_EPOCHS = 1 
config.LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
config.SEED = 42               # random seed (default: 42)
config.MAX_LEN = 512            #set up max number of length
#set the summary length to 15 tokens
config.SUMMARY_LEN = 15

[34m[1mwandb[0m: Currently logged in as: [33mzhh083[0m (use `wandb login --relogin` to force relogin)


In [9]:

'''
THE WHOLE TRAINING PROCESS!!!!!
'''


# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(config.SEED) # pytorch random seed
np.random.seed(config.SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-base")
    

# Importing and Pre-Processing the domain data
# Selecting the needed columns only. 
# Adding the summarzie text in front of the text. This is to format the dataset similar to how T5 model was trained for summarization task. 
df = pd.read_csv(train_path,encoding='latin-1')
df=df.dropna()
df=df.reset_index(drop=True)
df.content = 'summarize: ' + df.content
print(df.head())

    
# Creation of Dataset and Dataloader
# Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
#train_size = 0.8
train_dataset=df


print("TRAIN Dataset: {}".format(train_dataset.shape))


# Creating the Training set for dataloader
training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

#removed print out the test set, we used to load it to check the size

# Defining the parameters for creation of dataloaders
train_params = {
    'batch_size': config.TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
}


# Creation of Dataloaders for training model.
training_loader = DataLoader(training_set, **train_params)


    
# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

# Defining the optimizer that will be used to tune the weights of the network in the training session. 
optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

# Log metrics with wandb
wandb.watch(model, log="all")
# Training loop
print('Initiating Fine-Tuning for the model on our dataset')

#saving during the loop
for epoch in range(config.TRAIN_EPOCHS):
    model_save_name = 'T5_model' + str(config.TRAIN_EPOCHS) +'.h5'
    wandb.save(model_save_name)
    train(epoch, tokenizer, model, device, training_loader, optimizer)
    model_save_name = 'T5_model' + str(config.TRAIN_EPOCHS) + '.h5'
    wandb.save(model_save_name)

print("NOW saving the model trained")
#save the whole model
torch.save(model.state_dict(),'./t5_epoch5.pth')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…


                                             content                                           subject
0  summarize: Attached please find the UPDATED re...  Global Contracts/Facilities new responsibilities
1  summarize: =                                  ...                                       Gas Indices
2  summarize: Does anyone know this person or his...                24th IAEE International Conference
3  summarize: I have attached the review.\nPlease...                                    Theresa review
4  summarize: Mark:  We figured this out...\nIn l...                              Hafslund - DMS 12596
TRAIN Dataset: (14436, 2)
TEST Dataset: (1906, 3)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initiating Fine-Tuning for the model on our dataset
Epoch: 0, Loss:  10.473517417907715
Epoch: 0, Loss:  2.704354763031006
Epoch: 0, Loss:  2.4902725219726562
Epoch: 0, Loss:  4.965333461761475
Epoch: 0, Loss:  1.616382360458374
Epoch: 0, Loss:  4.100566387176514
Epoch: 0, Loss:  2.8744378089904785
Epoch: 0, Loss:  2.7490859031677246
Epoch: 0, Loss:  3.3814127445220947
Epoch: 0, Loss:  2.571134090423584
Epoch: 0, Loss:  3.795753240585327
Epoch: 0, Loss:  3.243058681488037
Epoch: 0, Loss:  3.5792601108551025
Epoch: 0, Loss:  3.2578444480895996
Epoch: 0, Loss:  1.3446284532546997
Epoch: 1, Loss:  1.9511864185333252
Epoch: 1, Loss:  1.096887469291687
Epoch: 1, Loss:  2.140054941177368
Epoch: 1, Loss:  0.4459782540798187
Epoch: 1, Loss:  0.4884955883026123
Epoch: 1, Loss:  2.3803281784057617
Epoch: 1, Loss:  1.5634585618972778
Epoch: 1, Loss:  4.680304050445557
Epoch: 1, Loss:  1.293945550918579
Epoch: 1, Loss:  1.6252366304397583
Epoch: 1, Loss:  1.2229573726654053
Epoch: 1, Loss:  3.3474

In [None]:
#a test for successfully creating the model
model.load_state_dict(torch.load('./t5_epoch5.pth'))

<All keys matched successfully>