# T5 Model Hyperparameter Tuning

## Load Train/Test Data

In [10]:
from google.cloud import storage
from io import StringIO
import pandas as pd

def get_df_from_gcs_blob(blob, bucket='recipe-data-bucket'):
    """Download a CSV object from Google Cloud Storage and load it as a DataFrame.

    Args:
        blob: Name of the object inside the bucket.
        bucket: Name of the bucket to read from.

    Returns:
        pandas.DataFrame parsed from the object's decoded contents.
    """
    client = storage.Client()
    gcs_bucket = client.get_bucket(bucket)

    # NOTE(review): newer google-cloud-storage releases appear to document
    # download_as_string as a deprecated alias of download_as_bytes - confirm
    # before upgrading the dependency.
    raw_bytes = gcs_bucket.blob(blob).download_as_string()

    # pandas wants a file-like object, so wrap the decoded text in a buffer.
    csv_buffer = StringIO(raw_bytes.decode())
    return pd.read_csv(csv_buffer)

train_df = get_df_from_gcs_blob('train_only_cal.csv')
test_df = get_df_from_gcs_blob('test_only_cal.csv')
# Evaluate on a small fixed subset of the test set. The sample is seeded so that
# every kernel restart (and every sweep run) scores the same rows, and the size
# is capped at the number of available rows so a short test file does not raise.
test_df = test_df.sample(n=min(100, len(test_df)), random_state=42).reset_index(drop=True)

To avoid CUDA out-of-memory (OOM) errors, let's train only on samples whose input text is shorter than 120 characters.

In [37]:
# Keep only the rows whose stringified input is shorter than 120 characters.
is_short_input = train_df['input'].map(lambda text: len(str(text))) < 120
train_df = train_df[is_short_input].reset_index(drop=True)

## Train

In [2]:
import torch

class T5Dataset:
    """Map-style dataset pairing source and target texts, tokenized on access.

    Items are tokenized lazily in ``__getitem__``; the tokenizer is asked to pad
    and truncate each text to the fixed length supplied at construction time.
    """

    def __init__(self, inps, outs, tokenizer, inp_max_len, out_max_len):
        """Store the raw sequences, the tokenizer and the two length caps.

        Args:
            inps: Indexable collection of source texts.
            outs: Indexable collection of target texts, aligned with ``inps``.
            tokenizer: Callable used to encode a single text.
            inp_max_len: ``max_length`` handed to the tokenizer for source texts.
            out_max_len: ``max_length`` handed to the tokenizer for target texts.
        """
        self.inps, self.outs = inps, outs
        self.tokenizer = tokenizer
        self.input_max_len = inp_max_len
        self.output_max_len = out_max_len

    def __len__(self):
        """Return the number of examples, i.e. the length of the input collection."""
        return len(self.inps)

    def _encode(self, text, max_len):
        """Tokenize ``text`` requesting padding/truncation to ``max_len`` as "pt" tensors."""
        return self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

    def __getitem__(self, item):
        """Return the raw texts and flattened token tensors for index ``item``.

        Returns:
            dict with keys ``input`` and ``target`` (the stringified texts),
            ``input_ids`` and ``attention_mask`` (from the source encoding) and
            ``target_ids`` (from the target encoding), all tensors cast to long.
        """
        source_text = str(self.inps[item])
        target_text = str(self.outs[item])

        source_enc = self._encode(source_text, self.input_max_len)
        target_enc = self._encode(target_text, self.output_max_len)

        # flatten() drops the leading dimension the tokenizer adds for a single text.
        return {
            'input': source_text,
            'target': target_text,
            'input_ids': source_enc["input_ids"].flatten().to(dtype=torch.long),
            'attention_mask': source_enc["attention_mask"].flatten().to(dtype=torch.long),
            'target_ids': target_enc['input_ids'].flatten().to(dtype=torch.long),
        }


In [7]:
import wandb

def train(tokenizer, model, device, loader, optimizer, fp16=True):
    """Run one optimisation pass over ``loader`` and return the per-batch losses.

    Args:
        tokenizer: Object exposing ``pad_token_id``; used to mask padded labels.
        model: Model called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; its first output is the loss.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches with ``input_ids``, ``attention_mask`` and
            ``target_ids`` entries.
        optimizer: Optimizer stepped once per batch.
        fp16: When true, the model is converted with ``model.half()`` first.

    Returns:
        list of float: loss of every batch, in iteration order.
    """
    if fp16:
        model.half()
    model.train()

    batch_losses = []
    for step, batch in enumerate(loader):
        source_ids = batch['input_ids'].to(device, dtype=torch.long)
        source_mask = batch['attention_mask'].to(device, dtype=torch.long)
        target_ids = batch['target_ids'].to(device, dtype=torch.long)

        # Decoder sees every target token but the last; labels are the same
        # sequence shifted left by one, with padding positions set to -100.
        # NOTE(review): with this manual shift the first target token is never
        # used as a label - confirm this matches how the checkpoint was trained.
        decoder_inputs = target_ids[:, :-1].contiguous()
        labels = target_ids[:, 1:].clone().detach()
        labels[labels == tokenizer.pad_token_id] = -100

        outputs = model(
            input_ids=source_ids,
            attention_mask=source_mask,
            decoder_input_ids=decoder_inputs,
            labels=labels,
        )
        loss = outputs[0]
        loss_value = loss.item()
        batch_losses.append(loss_value)

        # Only every tenth batch is reported to wandb.
        if step % 10 == 0:
            wandb.log({"Training Loss": loss_value})

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return batch_losses

def test(tokenizer, model, device, loader, fp16=True):
    losses = []
    if fp16: model.half()
    model.eval()
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            
            y = data['target_ids'].to(device, dtype = torch.long)
            y_ids = y[:, :-1].contiguous()
            lm_labels = y[:, 1:].clone().detach()
            lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)

            outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
            loss = outputs[0]
            losses.append(loss.item())
            
            if _%10 == 0:
                wandb.log({"Validation Loss": loss.item()})
    return losses

In [18]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader

def tune(config=None, MOD='./inp_cal_ingred_cal/final', DEVICE='cpu'):
    """Run one wandb trial: build loaders and model, then log the validation loss per epoch.

    Reads the module-level ``train_df`` and ``test_df`` frames (columns
    ``input`` and ``output``).

    Args:
        config: Optional initial configuration forwarded to ``wandb.init``. The
            hyperparameters actually used are read back from ``wandb.config``
            once the run has started.
        MOD: Name or path given to ``from_pretrained`` for both the tokenizer
            and the model.
        DEVICE: Device the model and batches are placed on. torch device
            strings are lower-case, so the default is ``'cpu'``.
    """
    with wandb.init(config=config):
        # Hyperparameters must be read from the run's config object: the
        # `config` argument may be None or a plain dict, neither of which
        # supports attribute access such as `config.lr`.
        config = wandb.config

        tokenizer = T5Tokenizer.from_pretrained(MOD)
        #tokenizer.add_special_tokens({'additional_special_tokens': ['<ingredients>', '<calories>', '<title>', '<directions>']})

        train_dataset = T5Dataset(train_df['input'].values, train_df['output'].values, tokenizer, config.inp_max_len, config.out_max_len)
        test_dataset = T5Dataset(test_df['input'].values, test_df['output'].values, tokenizer, config.inp_max_len, config.out_max_len)

        train_loader = DataLoader(train_dataset, batch_size=config.train_batch_size, num_workers=config.train_num_workers, shuffle=True)
        # The evaluation loader uses its own worker-count hyperparameter.
        test_loader = DataLoader(test_dataset, batch_size=config.test_batch_size, num_workers=config.test_num_workers)

        model = T5ForConditionalGeneration.from_pretrained(MOD).to(DEVICE)

        opt = torch.optim.Adam(params=model.parameters(), lr=config.lr)

        for epoch in range(config.epochs):
            # NOTE(review): the training step is disabled, so every epoch scores
            # the same unchanged weights and `lr`, `train_batch_size` and
            # `train_num_workers` have no effect on the logged loss. Re-enable
            # the next line before relying on sweep results.
            #train_losses = train(tokenizer, model, DEVICE, train_loader, opt, fp16=config.fp16)
            test_losses = test(tokenizer, model, DEVICE, test_loader, fp16=config.fp16)
            # Mean of the per-batch validation losses; this is the sweep metric.
            test_loss = sum(test_losses)/len(test_loader)
            wandb.log({'loss': test_loss, 'epoch': epoch})

In [24]:
import pprint

# Longest text per column, measured in characters. Cells are coerced with str()
# before len() so that a non-string value (e.g. NaN) cannot raise, matching how
# the training-set length filter and T5Dataset treat these columns.
train_inp_chars = int(train_df['input'].map(str).map(len).max())
test_inp_chars = int(test_df['input'].map(str).map(len).max())
train_out_chars = int(train_df['output'].map(str).map(len).max())
test_out_chars = int(test_df['output'].map(str).map(len).max())

# Random search minimising the 'loss' metric. The max-length candidates are the
# full and halved character maxima of each split.
sweep_config = {
    'method': 'random',
    'metric': {'name': 'loss', 'goal': 'minimize'},
    'parameters': {
        'lr': {'values': [1e-5, 1e-4, 1e-3, 1e-2, 1]},
        'fp16': {'values': [True, False]},
        'inp_max_len': {'values': [train_inp_chars, test_inp_chars, int(train_inp_chars / 2), int(test_inp_chars / 2)]},
        'out_max_len': {'values': [train_out_chars, test_out_chars, int(train_out_chars / 2), int(test_out_chars / 2)]},
        'train_batch_size': {'values': [1, 4, 8]},
        'test_batch_size': {'values': [1, 4, 8]},
        'train_num_workers': {'values': [1, 2, 4]},
        'test_num_workers': {'values': [1, 2, 4]},
        'epochs': {'value': 100}
    }
}
pprint.pprint(sweep_config)

{'method': 'random',
 'metric': {'goal': 'minimize', 'name': 'loss'},
 'parameters': {'epochs': {'value': 1},
                'fp16': {'values': [True, False]},
                'inp_max_len': {'values': [250, 230, 125, 115]},
                'lr': {'values': [1e-05, 0.0001, 0.001, 0.01, 1]},
                'out_max_len': {'values': [786, 739, 393, 369]},
                'test_batch_size': {'values': [1, 4, 8]},
                'test_num_workers': {'values': [1, 2, 4]},
                'train_batch_size': {'values': [1, 4, 8]},
                'train_num_workers': {'values': [1, 2, 4]}}}


In [26]:
sweep_id = wandb.sweep(sweep_config, project="recipe-t5")
# wandb.agent calls the supplied function with no arguments, so hand it `tune`
# itself: a one-parameter lambda raises TypeError on the first run, and the
# sweep definition dict is not a per-run configuration.
wandb.agent(sweep_id, function=tune, count=10)



Create sweep with ID: w27u6nmd
Sweep URL: https://wandb.ai/rl-final-project/recipe-t5/sweeps/w27u6nmd
<IPython.core.display.HTML object>


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
