In [1]:
!pip install transformers -q
# Install wandb, the tool used to log our training process
!pip install wandb -q

[K     |████████████████████████████████| 1.4MB 23.3MB/s 
[K     |████████████████████████████████| 2.9MB 56.8MB/s 
[K     |████████████████████████████████| 890kB 47.1MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.8MB 17.1MB/s 
[K     |████████████████████████████████| 163kB 60.7MB/s 
[K     |████████████████████████████████| 133kB 63.2MB/s 
[K     |████████████████████████████████| 102kB 15.8MB/s 
[K     |████████████████████████████████| 102kB 13.7MB/s 
[K     |████████████████████████████████| 71kB 12.1MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for watchdog (setup.py) ... [?25l[?25hdone


In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Importing the bart modules from huggingface/transformers
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

# WandB – Import the wandb library
import wandb

In [3]:
# Pick the compute device: 'cuda' when a CUDA device is available, else 'cpu'.
from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
class CustomDataset(Dataset):
    """Dataset that tokenizes the ``content`` and ``subject`` columns of a DataFrame.

    Each item is a dict of 1-D ``torch.long`` tensors:
      * ``source_ids``  - token ids of the row's ``content`` (model input)
      * ``source_mask`` - attention mask matching ``source_ids``
      * ``target_ids``  - token ids of the row's ``subject`` (generation target)
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        """Store the frame, the tokenizer and the two length limits.

        Args:
            dataframe: pandas DataFrame exposing ``subject`` and ``content`` columns.
            tokenizer: object with a ``batch_encode_plus`` method that returns
                ``input_ids`` and ``attention_mask`` tensors.
            source_len: ``max_length`` handed to the tokenizer for ``content``.
            summ_len: ``max_length`` handed to the tokenizer for ``subject``.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.subject
        self.ctext = self.data.content

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.text)

    def _encode(self, raw, max_length):
        """Collapse whitespace in ``raw`` and tokenize it to exactly ``max_length`` ids."""
        cleaned = ' '.join(str(raw).split())
        return self.tokenizer.batch_encode_plus([cleaned], padding='max_length', truncation=True,max_length= max_length,return_tensors='pt')

    def __getitem__(self, index):
        """Return the tokenized tensors for the row at *position* ``index``.

        Rows are fetched with ``.iloc`` so the dataset works for any DataFrame
        index (e.g. after ``dropna`` or filtering without ``reset_index``);
        label-based lookup would raise ``KeyError`` or return a different row.
        """
        source = self._encode(self.ctext.iloc[index], self.source_len)
        target = self._encode(self.text.iloc[index], self.summ_len)

        # The tokenizer returns a batch of one; drop only that batch dimension.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long)
        }

In [5]:
def shift_tokens_right(input_ids, pad_token_id):
    """Shift every row one position to the right, wrapping one token to the front.

    The token placed in column 0 is the one at index
    ``(number of non-pad tokens in the row) - 1``; for right-padded rows that
    is the last non-pad token (usually <eos>). The final column of the input
    is dropped. ``input_ids`` itself is not modified.

    Args:
        input_ids: 2-D integer tensor of token ids (rows are sequences).
        pad_token_id: id that marks padding positions.

    Returns:
        A new tensor with the same shape as ``input_ids``.
    """
    # Column index of the token to wrap around, kept 2-D for gather().
    wrap_col = input_ids.ne(pad_token_id).sum(dim=1).sub(1).unsqueeze(-1)
    wrapped = input_ids.gather(1, wrap_col).squeeze()

    shifted = input_ids.clone()
    shifted[:, 1:] = input_ids[:, :-1]
    shifted[:, 0] = wrapped
    return shifted


def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one pass over ``loader`` and update ``model`` after every batch.

    Args:
        epoch: epoch number, used only in the printed progress line.
        tokenizer: supplies ``pad_token_id`` for shifting and label masking.
        model: called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; element 0 of its output is
            used as the loss.
        device: device the batch tensors are moved to.
        loader: iterable of dicts with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.
        optimizer: optimizer stepped once per batch.

    Side effects: logs the loss to wandb every 10 batches and prints it every
    500 batches.
    """
    model.train()
    pad_id = tokenizer.pad_token_id

    for step, batch in enumerate(loader):
        shifted = shift_tokens_right(batch['target_ids'], pad_id)
        shifted = shifted.to(device, dtype=torch.long)

        # Decoder sees positions [0, L-2]; it is trained to emit positions [1, L-1].
        # NOTE(review): shifted[:, 1:] equals target_ids[:, :-1], so the last
        # position of target_ids never appears in the labels - confirm intended.
        decoder_inputs = shifted[:, :-1].contiguous()
        labels = shifted[:, 1:].clone().detach()
        # -100 marks padding positions in the labels.
        labels[labels == pad_id] = -100

        source_ids = batch['source_ids'].to(device, dtype=torch.long)
        source_mask = batch['source_mask'].to(device, dtype=torch.long)

        outputs = model(
            input_ids=source_ids,
            attention_mask=source_mask,
            decoder_input_ids=decoder_inputs,
            labels=labels,
        )
        loss = outputs[0]

        if step % 10 == 0:
            wandb.log({"Training Loss": loss.item()})
        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [6]:
# On Colab, uncomment the two lines below to mount Google Drive; the
# checkpoint paths used later in this notebook live under /content/drive.
# from google.colab import drive
# drive.mount('/content/drive')

# CSV files read by the training cell.
train_path="./train.csv"
# The training cell also reads `test_path`, which was never assigned anywhere
# and raised NameError on a fresh kernel.
# NOTE(review): filename assumed by analogy with train_path - point it at the
# real held-out CSV.
test_path="./test.csv"

Mounted at /content/drive


In [7]:
# Start a wandb run; the project name records the intended epoch count and max length.
wandb.init(project="BART_generation_epoch=5_max_len=512")

# wandb.config holds the hyperparameters for this run. The values below are
# read by the dataset, dataloader, optimizer and training-loop code in later cells.
config = wandb.config          # handle to the run's config object
config.TRAIN_BATCH_SIZE = 2    # batch size of the training DataLoader
config.VALID_BATCH_SIZE = 2    # batch size of the validation DataLoader
config.TRAIN_EPOCHS = 5        # number of passes over the training data
config.VAL_EPOCHS = 1 
config.LEARNING_RATE = 1e-4    # learning rate passed to the Adam optimizer
config.SEED = 42               # seed for the torch and numpy RNGs
config.MAX_LEN = 512           # tokenizer max_length for the source (content) text; tune to see if results improve
# Tokenizer max_length for the target (subject) text. This is a length in
# tokens, not characters.
config.SUMMARY_LEN = 15

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [8]:

'''
THE WHOLE TRAINING PROCESS!!!!!
'''
# End-to-end fine-tuning driver: seed the RNGs, load the tokenizer and CSV
# data, build datasets/dataloaders, load the model and optimizer, run the
# training epochs, then save the weights.

# Seed torch and numpy and request deterministic cuDNN kernels for reproducibility
torch.manual_seed(config.SEED) # pytorch random seed
np.random.seed(config.SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# Tokenizer used to encode both the source and the target text
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')


# Load the train and test CSVs, drop rows with missing values, and rebuild a
# contiguous 0..n-1 index on both frames after the drop.
# `train_path` and `test_path` must already be defined by an earlier cell.
df = pd.read_csv(train_path,encoding='latin-1')
df_test=pd.read_csv(test_path,encoding='latin-1')
df=df.dropna()
df_test=df_test.dropna()
df=df.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)


# The cleaned frames are used directly as the training and validation data
train_dataset=df
val_dataset=df_test


print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))


# Wrap each frame in a CustomDataset; MAX_LEN bounds the source text and
# SUMMARY_LEN bounds the target text.
training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

# DataLoader keyword arguments: training batches are shuffled, validation
# batches keep the file order.
train_params = {
    'batch_size': config.TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
}

val_params = {
    'batch_size': config.VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
}

# Dataloaders for training and validation.
# NOTE(review): val_loader is built here but not consumed in this cell.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)



# Load BartForConditionalGeneration from the 'facebook/bart-base' checkpoint
# and move it to the selected device.
model =BartForConditionalGeneration.from_pretrained('facebook/bart-base')
model = model.to(device)

# Adam optimizer over all model parameters, using the configured learning rate
optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

# Register the model with wandb
wandb.watch(model, log="all")
# Training loop: one call to train() per epoch
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(config.TRAIN_EPOCHS):
  train(epoch, tokenizer, model, device, training_loader, optimizer)

# Save only the state dict (original author's note: this avoids a collision
# with wandb callbacks). The target path is on Google Drive, so the drive must
# be mounted before this line runs.
torch.save(model.state_dict(),'/content/drive/Shared drives/CS 269 project/BART_epoch5.pth')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…


TRAIN Dataset: (14436, 3)
TEST Dataset: (1906, 3)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1553.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=557941479.0, style=ProgressStyle(descri…


Initiating Fine-Tuning for the model on our dataset
Epoch: 0, Loss:  7.349852561950684
Epoch: 0, Loss:  8.388046264648438
Epoch: 0, Loss:  4.106700420379639
Epoch: 0, Loss:  3.2653019428253174
Epoch: 0, Loss:  2.100693941116333
Epoch: 0, Loss:  4.293817520141602
Epoch: 0, Loss:  4.416386127471924
Epoch: 0, Loss:  2.5949437618255615
Epoch: 0, Loss:  2.6787545680999756
Epoch: 0, Loss:  3.521331787109375
Epoch: 0, Loss:  4.710511684417725
Epoch: 0, Loss:  5.1894965171813965
Epoch: 0, Loss:  2.253654718399048
Epoch: 0, Loss:  1.7638822793960571
Epoch: 0, Loss:  4.400583267211914
Epoch: 1, Loss:  1.5291191339492798
Epoch: 1, Loss:  3.3334178924560547
Epoch: 1, Loss:  4.432965278625488
Epoch: 1, Loss:  1.5852587223052979
Epoch: 1, Loss:  3.4488611221313477
Epoch: 1, Loss:  2.9370715618133545
Epoch: 1, Loss:  2.2794997692108154
Epoch: 1, Loss:  4.023043632507324
Epoch: 1, Loss:  4.466510772705078
Epoch: 1, Loss:  2.5971808433532715
Epoch: 1, Loss:  0.9521037340164185
Epoch: 1, Loss:  4.51896

In [None]:
# Sanity check that the checkpoint was written: reload it into the live model.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved from a GPU run also loads on a CPU-only runtime.
model.load_state_dict(torch.load('/content/drive/Shared drives/CS 269 project/BART_epoch5.pth', map_location=device))

<All keys matched successfully>