In [1]:
!pip install transformers -q
!pip install torch -q
!pip install pandas -q
!pip install sacrebleu -q
!pip install wandb -q
!pip install sentencepiece -q

[K     |████████████████████████████████| 3.1 MB 8.2 MB/s 
[K     |████████████████████████████████| 596 kB 51.8 MB/s 
[K     |████████████████████████████████| 895 kB 28.1 MB/s 
[K     |████████████████████████████████| 3.3 MB 25.6 MB/s 
[K     |████████████████████████████████| 56 kB 4.7 MB/s 
[K     |████████████████████████████████| 90 kB 5.2 MB/s 
[K     |████████████████████████████████| 1.7 MB 8.3 MB/s 
[K     |████████████████████████████████| 139 kB 51.0 MB/s 
[K     |████████████████████████████████| 97 kB 7.4 MB/s 
[K     |████████████████████████████████| 180 kB 44.6 MB/s 
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.2 MB 7.1 MB/s 
[?25h

In [6]:
# Load required libraries
import sentencepiece
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader,RandomSampler, SequentialSampler
import pandas as pd
import sacrebleu
import wandb
import numpy as np

In [4]:
# Log into wandab
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
# Set up the device to run on GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [8]:
# Mount Google drive to upload datasets
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# The path to the data on my drive
D = '/content/drive/My Drive/W266_Project_Data/pmi_data'

In [10]:
# Load one of the Indian language training files and its English translation into a Pandas csv object
english_df = pd.read_csv(D+"/train/en-te/train.en", sep = "\t", header = None)
indic_df = pd.read_csv(D+"/train/en-te/train.te", sep = "\t", header = None)

# Combine these datsets into a training dataframe
train_df = pd.concat([english_df[0], indic_df[0]], axis=1, keys=['english', 'indic'])

In [11]:
# Repeat for the evaluation dataset from the dev data
english_eval_df = pd.read_csv(D+"/dev/dev.en", sep = "\t", header = None)
indic_eval_df = pd.read_csv(D+"/dev/dev.te", sep = "\t", header = None)

# Combine these datasets into a testing dataframe
eval_df = pd.concat([english_eval_df[0], indic_eval_df[0]], axis=1, keys=['english', 'indic'])

# Delete the monolingual datasets to save memory
del english_df, indic_df, english_eval_df, indic_eval_df

In [12]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

In [13]:
# We'll use the T5ForConditionalGeneration model, as it has a pre-trained language modeling head
model = T5ForConditionalGeneration.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [14]:
# Save the pre-trained tokenizer and model to a folder on my directory for faster loading next time
tokenizer.save_pretrained('/content/drive/My Drive/W266_Project_Data/models/T5Tokenizer')
model.save_pretrained('/content/drive/My Drive/W266_Project_Data/models/T5model')

In [15]:
# Temporarily delete the tokenizer and model objects to save memory
del tokenizer, model

In [22]:
# Create a custom class to process the English and Indian datasets
class CustomDataset(Dataset):
    
    def __init__(self, dataframe, tokenizer, source_len, trans_len):
    
        self.tokenizer = tokenizer
        self.data = dataframe
        
        # Cap the length for the source and target language lengths
        self.source_len = source_len
        self.trans_len = trans_len
        
        # Extract the English and Indic translations from the dataframe
        self.english = self.data.english
        self.indic = self.data.indic
        
    def __len__(self):
        """Find the number of English training examples"""
        return len(self.english)
    
    def __getitem__(self, index):
        """Create the ids and attention masks of the source and target language"""
        
        # Get the English and Indic texts for each row and convert each to a string
        english = str(self.english[index])
        indic = str(self.indic[index])

        english = " ".join(english.split())
        indic = " ".join(indic.split())
        
        # Get the PyTorch encodings of the source and target strings
        source = self.tokenizer.batch_encode_plus([english], max_length = self.source_len,
                                pad_to_max_length = True, truncation = True, 
                                padding = "max_length", return_tensors = 'pt')
        
        target = self.tokenizer.batch_encode_plus([indic], max_length = self.trans_len,
                                pad_to_max_length = True, truncation = True, 
                                padding = "max_length", return_tensors = 'pt')
        
        # Get the necessary encoding components to be passed to the seq2seq model
        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()
        
        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_mask': target_ids.to(dtype=torch.long),
            'taget_ids_y': target_ids.to(dtype=torch.long)
        }

In [27]:
# Define a training function for the T5 translation task
def train(epoch, tokenizer, model, device, loader, optimizer):
    # Start to train the model using GPUs
    model.train()
    # Use the batch size to decide how much training data to pass to the model
    for _, data in enumerate(loader, 0):
        # Pass the target ids to the GPU if available and specify that it is PyTorch format
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:,:-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:,1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)
        
        ouputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids = y_ids, 
                      lm_labels = lm_labels)
        
        # Get the model training data loss
        loss = outputs[0]
        
        # Log training loss every 10 steps to create a plot of training losss
        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})
        
        # Print the training loss to the console after every 500 steps
        if _%500 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')
            
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [28]:
# Define a function to validate the data on the dev set
def validate(epoch, tokenizer, model, device, loader):
    # Set the model to evaluate
    model.eval()
    # Create two empty lists to hold the predicted vs actual text
    predictions = []
    actuals = []
    
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype=torch.long)
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(devic, dtype=torch.long)
            
            # Take in the input ids, input attention mask
            # Use a beam search of 2 based upon reading assignment paper
            generated_ids = model.generate(input_ids = ids, attention_mask = mask,
                                           max_length = 150, 
                                           num_beams = 2,
                                           repetition_penalty = 2.5,
                                           length_penalty = 1.0, 
                                           early_stopping = True)
            
            # Get the predicted text and actual text and append to their respective lists
            preds = [tokenizer.decode(g, skip_special_tokens=True, 
                                     clean_up_tokenization=True) for g in generated_ids]
            
            # Decode the original text 
            target = [tokenizer.decode(t, skip_special_tokens=True, 
                                     clean_up_tokenization=True) for t in y]
            
            if _%100==0:
                print(f'Completed{_}')
            
            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [19]:
# Start a wandab project
wandb.init(project="t5_translation_en_te", entity="bdougall")

[34m[1mwandb[0m: Currently logged in as: [33mbdougall[0m (use `wandb login --relogin` to force relogin)


In [20]:
def collate_fn(data):
    img, bbox = data
    zipped = zip(img, bbox)
    return list(zipped)

In [29]:
# Define a function to perform the training and validation on the test set
def main():
    
    # Set the training parameters
    # Set the Wandab configuration
    config = wandb.config
    config.BATCH_SIZE = 2
    config.TRAIN_EPOCHS = 2
    config.VAL_EPOCHS = 1
    config.LEARNING_RATE = 1e-4
    config.SEED = 42
    config.MAX_LEN = 512
    config.TRANS_LEN = 128
    
    # Set random seeds and deterministic PyTorch for reproducibility
    torch.manual_seed(config.SEED) # Set PyTorch random seed
    np.random.seed(config.SEED) # Set Numpy random seed
    torch.backends.cudnn.deterministic = True
    
    # Load the tokenizer that I saved locally
    tokenizer = T5Tokenizer.from_pretrained('/content/drive/My Drive/W266_Project_Data/models/T5Tokenizer')
    
    # Load the training and eval data and append the T5 task to the translation target
    train_df.indic = 'translate: ' + train_df.indic
    eval_df.indic = 'translate: ' + eval_df.indic
    
    # Load and tokenize the training and test datasets
    train_set = CustomDataset(train_df, tokenizer, config.MAX_LEN, config.TRANS_LEN)
    val_set = CustomDataset(eval_df, tokenizer, config.MAX_LEN, config.TRANS_LEN)
    
    # Create the Dataloaders for testing and validation
    training_loader = DataLoader(train_set, batch_size=config.BATCH_SIZE, shuffle=True)#, collate_fn=collate_fn)

    val_loader = DataLoader(val_set, batch_size=config.BATCH_SIZE, shuffle=True)#, collate_fn=collate_fn)
    
    # We'll use the T5ForConditionalGeneration model, as it has a pre-trained language modeling head
    model = T5ForConditionalGeneration.from_pretrained('/content/drive/My Drive/W266_Project_Data/models/T5model')
    model = model.to(device)
    
    # Define the optimizer that will be used to tune the model weights
    optimizer = torch.optim.Adam(params=model.parameters(), lr=config.LEARNING_RATE)
    
    # Log metrics with wandb
    wandb.watch(model, log = "all")
    
    print('Fine-tuning the model using training data')
    for epoch in range(config.TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    print('Generating the translated text and saving to a dataframe')
    for epoch in range(config.VAL_EPOCHS):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        # Create a dataframe of the actual versus translated text and add this new dataframe to the final dataframe for each loop
        final_df = pd.DataFrame({'Actual Text': actuals, 'Generated Translation': predictions})

        #final_df = pd.DataFrame(np.concatenate([final_df.values, df.values]), columns=final_df.columns)

    final_df.to_csv(D+'/predicted_text/predictions.csv')
    print('Translated text saved')

if __name__ == '__main__':
    main()

Fine-tuning the model using training data


KeyError: ignored