# T5 Model Fine-Tuning

## Load Train/Test Data

In [6]:
from google.cloud import storage
from io import StringIO
import pandas as pd

def get_df_from_gcs_blob(blob, bucket='recipe-data-bucket'):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket)

    blob = bucket.blob(blob)
    blob = blob.download_as_string()
    blob = blob.decode()
    blob = StringIO(blob)  #tranform bytes to string here
    df = pd.read_csv(blob)
    return df

train_df = get_df_from_gcs_blob('train.csv')
test_df = get_df_from_gcs_blob('test.csv')

In [7]:
train_df.head()

Unnamed: 0,input,output
0,"<start-ingredients>salmon steaks, olive oil, S...",<start-title>Broiled Salmon Steaks<end-title><...
1,"<start-ingredients>rice, scallops, rice wine, ...",<start-title>Sticky Rice With Chinese Sausage ...
2,"<start-ingredients>escarole, Medjool dates, wa...","<start-title>Escarole With Bacon, Dates, And W..."
3,"<start-ingredients>skirt steak, garlic, olive ...",<start-title>Grilled Garlic-Marinated Skirt St...
4,"<start-ingredients>ricotta, honey, vanilla ext...",<start-title>Honeyed Ricotta<end-title><start-...


In [8]:
test_df.head()

Unnamed: 0,input,output
0,"<start-ingredients>fennel bulb ani, onion, ted...",<start-title>Potato And Fennel Soup Hodge<end-...
1,"<start-ingredients>spinach soufflé, extra egg ...",<start-title>Spinach Noodle Casserole<end-titl...
2,"<start-ingredients>soy sauce, sugar, Asian ses...",<start-title>Korean Marinated Beef<end-title><...
3,"<start-ingredients>pecan halves, ted butter, s...",<start-title>Sea Salt-Roasted Pecans<end-title...
4,"<start-ingredients>garlic, olive oil, salt, bl...",<start-title>Garlic Baguette Crumbs<end-title>...


## Train

In [9]:
import torch

class T5Dataset:
  def __init__(self, inps, outs, tokenizer, inp_max_len, out_max_len):   
    self.inps = inps
    self.outs = outs
    self.tokenizer = tokenizer
    self.input_max_len = inp_max_len
    self.output_max_len = out_max_len
  
  def __len__(self):                      # This method retrives the number of item from the dataset
    return len(self.inps)

  def __getitem__(self, item):             # This method retrieves the item at the specified index item. 
    inp = str(self.inps[item])
    out = str(self.outs[item])

    input_tokenize = self.tokenizer(      
            inp,
            add_special_tokens=True,
            max_length=self.input_max_len,
            padding = 'max_length',
            truncation = True,
            return_attention_mask=True,
            return_tensors="pt"
        )
    output_tokenize = self.tokenizer(
            out,
            add_special_tokens=True,
            max_length=self.output_max_len,
            padding = 'max_length',
            truncation = True,
            return_attention_mask=True,
            return_tensors="pt"
            
        )
    

    input_ids = input_tokenize["input_ids"].flatten().to(dtype=torch.long)
    attention_mask = input_tokenize["attention_mask"].flatten().to(dtype=torch.long)
    output_ids = output_tokenize['input_ids'].flatten().to(dtype=torch.long)

    out = {
            'input': inp,      
            'target': out,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target_ids': output_ids
        }
        
    return out 


In [10]:
import wandb

def train(tokenizer, model, device, loader, optimizer, fp16=False):
    losses = []
    if fp16: model.half()
    model.train()
    for _, data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        losses.append(loss.item())
        
        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return losses

In [11]:
from torch import cuda

TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
TRAIN_NUM_WORKERS = 2
TEST_NUM_WORKERS = 2

INP_MAX_LEN = max(train_df['input'].map(len).max(), test_df['input'].map(len).max())
OUT_MAX_LEN = max(train_df['output'].map(len).max(), test_df['output'].map(len).max())
INP_MAX_LEN = 100
OUT_MAX_LEN = 100

MOD = 't5-small'
EPOCHS = 10
LR = 1e-4
DEVICE = 'cuda' if cuda.is_available() else 'cpu'

In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="recipe-t5",
    
    # track hyperparameters and run metadata
    config={
    "epochs": EPOCHS,
    "train_data_batch_size": TRAIN_BATCH_SIZE,
    "train_dataloader_num_workers": TRAIN_NUM_WORKERS,
    "test_data_batch_size": TEST_BATCH_SIZE,
    "test_dataloader_num_workers": TEST_NUM_WORKERS,
    "inp_max_len": INP_MAX_LEN,
    "out_max_len": OUT_MAX_LEN,
    "device": DEVICE,
    "lr": LR,
    "model": MOD
    }
)

tokenizer = T5Tokenizer.from_pretrained(MOD)

train_dataset = T5Dataset(train_df['input'].values, train_df['output'].values, tokenizer, INP_MAX_LEN, OUT_MAX_LEN)
test_dataset = T5Dataset(test_df['input'].values, test_df['output'].values, tokenizer, INP_MAX_LEN, OUT_MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=TRAIN_NUM_WORKERS, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, num_workers=TEST_NUM_WORKERS)

model = T5ForConditionalGeneration.from_pretrained(MOD).to(DEVICE)

opt = torch.optim.Adam(params =  model.parameters(), lr=LR)

for epoch in range(EPOCHS):
    print(f"Beginning training in epoch {epoch}...")
    losses = train(tokenizer, model, DEVICE, train_loader, opt)
    epoch_running_loss = sum(losses)
    wandb.log({"Epoch Running Training Loss": epoch_running_loss})
    print(f"Epoch {epoch} Running Loss: {epoch_running_loss}")

wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mamanichopra[0m. Use [1m`wandb login --relogin`[0m to force relogin


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Beginning training in epoch 0...


KeyboardInterrupt: 

In [27]:
torch.cuda.empty_cache()
print(torch.cuda.memory_summary(device=None, abbreviated=False))


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 7            |        cudaMalloc retries: 10        |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  15308 MiB |  15330 MiB |  57792 MiB |  42483 MiB |
|       from large pool |  15306 MiB |  15326 MiB |  57772 MiB |  42465 MiB |
|       from small pool |      1 MiB |      4 MiB |     20 MiB |     18 MiB |
|---------------------------------------------------------------------------|
| Active memory         |  15308 MiB |  15330 MiB |  57792 MiB |  42483 MiB |
|       from large pool |  15306 MiB |  15326 MiB |  57772 MiB |  42465 MiB |
|       from small pool |      1 MiB |      4 MiB |     20 MiB |     18 MiB |
|---------------------------------------------------------------

In [12]:
model.size()

AttributeError: 'T5ForConditionalGeneration' object has no attribute 'size'