# T5 Model Fine-Tuning

## Load Train/Test Data

In [39]:
from google.cloud import storage
from io import StringIO
import pandas as pd

def get_df_from_gcs_blob(blob, bucket='recipe-data-bucket'):
    """Download a CSV object from Google Cloud Storage into a DataFrame.

    Args:
        blob: Name (path inside the bucket) of the CSV object to fetch.
        bucket: Name of the bucket that holds the object.

    Returns:
        The object's contents parsed with ``pd.read_csv``.
    """
    storage_client = storage.Client()
    gcs_bucket = storage_client.get_bucket(bucket)

    # `download_as_string` is a deprecated alias of `download_as_bytes`;
    # both return the raw payload as bytes.
    raw_bytes = gcs_bucket.blob(blob).download_as_bytes()
    csv_text = raw_bytes.decode()  # transform bytes to str (UTF-8 by default)
    return pd.read_csv(StringIO(csv_text))

# Pull both splits from the default bucket; each call downloads one CSV.
train_df = get_df_from_gcs_blob(blob='train.csv')
test_df = get_df_from_gcs_blob(blob='test.csv')

In [2]:
# Preview the first rows of the training frame to check the column layout.
train_df.head()

Unnamed: 0,input,output
0,"<start-ingredients>salmon steaks, olive oil, S...",<start-title>Broiled Salmon Steaks<end-title><...
1,"<start-ingredients>rice, scallops, rice wine, ...",<start-title>Sticky Rice With Chinese Sausage ...
2,"<start-ingredients>escarole, Medjool dates, wa...","<start-title>Escarole With Bacon, Dates, And W..."
3,"<start-ingredients>skirt steak, garlic, olive ...",<start-title>Grilled Garlic-Marinated Skirt St...
4,"<start-ingredients>ricotta, honey, vanilla ext...",<start-title>Honeyed Ricotta<end-title><start-...


In [3]:
# Preview the first rows of the test frame to check the column layout.
test_df.head()

Unnamed: 0,input,output
0,"<start-ingredients>fennel bulb ani, onion, ted...",<start-title>Potato And Fennel Soup Hodge<end-...
1,"<start-ingredients>spinach soufflé, extra egg ...",<start-title>Spinach Noodle Casserole<end-titl...
2,"<start-ingredients>soy sauce, sugar, Asian ses...",<start-title>Korean Marinated Beef<end-title><...
3,"<start-ingredients>pecan halves, ted butter, s...",<start-title>Sea Salt-Roasted Pecans<end-title...
4,"<start-ingredients>garlic, olive oil, salt, bl...",<start-title>Garlic Baguette Crumbs<end-title>...


## Train

In [44]:
import torch

class T5Dataset:
    """Indexable dataset that tokenizes (input, output) text pairs on access.

    Nothing is tokenized up front: every ``__getitem__`` call encodes one
    pair, padding/truncating each side to the fixed length given at
    construction time.
    """

    def __init__(self, inps, outs, tokenizer, inp_max_len, out_max_len):
        """Keep references to the raw texts, the tokenizer and both lengths.

        Args:
            inps: Indexable collection of source texts.
            outs: Indexable collection of target texts, aligned with ``inps``.
            tokenizer: Callable invoked as ``tokenizer(text, **options)``
                whose result exposes ``"input_ids"`` and ``"attention_mask"``.
            inp_max_len: ``max_length`` passed when encoding a source text.
            out_max_len: ``max_length`` passed when encoding a target text.
        """
        self.inps = inps
        self.outs = outs
        self.tokenizer = tokenizer
        self.input_max_len = inp_max_len
        self.output_max_len = out_max_len

    def __len__(self):
        """Return the number of examples, i.e. ``len(inps)``."""
        return len(self.inps)

    def _encode(self, text, max_len):
        """Tokenize ``text`` to exactly ``max_len`` positions (pad/truncate)."""
        return self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

    def __getitem__(self, item):
        """Return the raw texts and the tokenized tensors for index ``item``.

        The result is a dict with the keys ``input``, ``target``,
        ``input_ids``, ``attention_mask`` and ``target_ids``; the three tensor
        entries are flattened and cast to ``torch.long``.
        """
        source_text = str(self.inps[item])
        target_text = str(self.outs[item])

        source_enc = self._encode(source_text, self.input_max_len)
        target_enc = self._encode(target_text, self.output_max_len)

        def as_long_vector(tensor):
            # Flatten the tokenizer's tensor and normalize the dtype.
            return tensor.flatten().to(dtype=torch.long)

        return {
            'input': source_text,
            'target': target_text,
            'input_ids': as_long_vector(source_enc["input_ids"]),
            'attention_mask': as_long_vector(source_enc["attention_mask"]),
            'target_ids': as_long_vector(target_enc['input_ids']),
        }


In [45]:
import wandb

def train(tokenizer, model, device, loader, optimizer):
    """Run one training pass over ``loader`` and return the per-batch losses.

    Args:
        tokenizer: Object whose ``pad_token_id`` identifies padding positions
            in the target ids.
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its loss is read from index 0 of the returned output.
        device: Device the batch tensors are moved to before the forward pass.
        loader: Iterable of batches, each a mapping holding ``input_ids``,
            ``attention_mask`` and ``target_ids`` tensors.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        List with the scalar loss of every batch, in iteration order.
    """
    losses = []
    model.train()
    for step, batch in enumerate(loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        target_ids = batch['target_ids'].to(device, dtype=torch.long)

        # Clone first: `.to()` can return the very same tensor, and the batch
        # must not be modified by the in-place masking below.
        labels = target_ids.clone()
        # -100 marks positions excluded from the loss, so padding does not
        # contribute to it.
        labels[labels == tokenizer.pad_token_id] = -100

        # Pass the full target as `labels` (the `lm_labels` keyword no longer
        # exists in current transformers releases). When only `labels` is
        # given, T5 builds the decoder inputs itself by shifting the labels
        # right and prepending the decoder start token, so the first target
        # token is learned too; slicing the targets by hand skipped it.
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]
        loss_value = loss.item()
        losses.append(loss_value)

        # Log every 10th batch to keep the wandb history small.
        if step % 10 == 0:
            wandb.log({"Training Loss": loss_value})

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return losses

In [46]:
from torch import cuda

# --- Data loading ---
TRAIN_BATCH_SIZE = 128
TEST_BATCH_SIZE = 64
TRAIN_NUM_WORKERS = 2
TEST_NUM_WORKERS = 2

# --- Sequence lengths ---
# Upper bound, in tokens, for any padded sequence. 512 is the length the
# t5-base tokenizer reports as its default limit; raise it deliberately if
# longer sequences are really needed.
MODEL_MAX_TOKENS = 512

# The lengths below are measured in characters while the tokenizer's
# `max_length` is measured in tokens. The character count is therefore only a
# loose upper bound, and it is capped so that every example is not padded out
# to the character length of the longest recipe.
# `.astype(str)` mirrors the `str(...)` coercion applied when items are
# tokenized, so a non-string cell (e.g. NaN) does not break `len`.
INP_MAX_LEN = min(
    MODEL_MAX_TOKENS,
    int(max(
        train_df['input'].astype(str).str.len().max(),
        test_df['input'].astype(str).str.len().max(),
    )),
)
OUT_MAX_LEN = min(
    MODEL_MAX_TOKENS,
    int(max(
        train_df['output'].astype(str).str.len().max(),
        test_df['output'].astype(str).str.len().max(),
    )),
)

# --- Optimization ---
EPOCHS = 10
LR = 1e-4
DEVICE = 'cuda' if cuda.is_available() else 'cpu'

In [48]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader

# Start a new wandb run to track this training session.
wandb.init(
    # wandb project where this run will be logged
    project="recipe-t5",

    # hyperparameters and run metadata recorded with the run
    config={
    "epochs": EPOCHS,
    "train_data_batch_size": TRAIN_BATCH_SIZE,
    "train_dataloader_num_workers": TRAIN_NUM_WORKERS,
    "test_data_batch_size": TEST_BATCH_SIZE,
    "test_dataloader_num_workers": TEST_NUM_WORKERS,
    "inp_max_len": INP_MAX_LEN,
    "out_max_len": OUT_MAX_LEN,
    "device": DEVICE,
    "lr": LR
    }
)

# Everything after `wandb.init` runs inside try/finally so the run is closed
# even when training fails or is interrupted from the keyboard.
try:
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    # NOTE(review): the data uses custom tags such as `<start-title>`; these
    # are presumably not part of the stock t5-base vocabulary. Confirm, and if
    # so register them with `tokenizer.add_tokens(...)` followed by
    # `model.resize_token_embeddings(len(tokenizer))`.

    train_dataset = T5Dataset(train_df['input'].values, train_df['output'].values, tokenizer, INP_MAX_LEN, OUT_MAX_LEN)
    test_dataset = T5Dataset(test_df['input'].values, test_df['output'].values, tokenizer, INP_MAX_LEN, OUT_MAX_LEN)

    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=TRAIN_NUM_WORKERS, shuffle=True)
    # The test loader is built here but not consumed by the loop below.
    test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, num_workers=TEST_NUM_WORKERS)

    model = T5ForConditionalGeneration.from_pretrained("t5-base").to(DEVICE)

    opt = torch.optim.Adam(params=model.parameters(), lr=LR)

    for epoch in range(EPOCHS):
        print(f"Beginning training in epoch {epoch}...")
        losses = train(tokenizer, model, DEVICE, train_loader, opt)
        # Sum (not mean) of the per-batch losses for this epoch.
        epoch_running_loss = sum(losses)
        wandb.log({"Epoch Running Training Loss": epoch_running_loss})
        print(f"Epoch {epoch} Running Loss: {epoch_running_loss}")
finally:
    wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011130161988936986, max=1.0…

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Beginning training in epoch 0...


KeyboardInterrupt: 