In [2]:
#@title Mount your Google Drive
# If you run this notebook locally or on a cluster (i.e. not on Google Colab)
# you can delete this cell which is specific to Google Colab. You may also
# change the paths for data/logs in Arguments below.
%matplotlib inline
%load_ext autoreload
%autoreload 2

from google.colab import drive
drive.mount('/content/gdrive')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Mounted at /content/gdrive


In [3]:
!pip install transformers datasets evaluate rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.0-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━

In [4]:
import sys
import os
import shutil
import warnings
import pandas as pd
import numpy as np
import json
import torch
import evaluate

from transformers import (
  AutoModelForSeq2SeqLM, 
  DataCollatorForSeq2Seq,
  Seq2SeqTrainingArguments, 
  Seq2SeqTrainer,
  AutoTokenizer, 
  pipeline
) 

from pathlib import Path
from datasets import load_dataset, load_metric

In [5]:
# Paths

PROJ_DIR = Path('/content/gdrive/MyDrive/IFT6759/quick-recipe')
LOG_DIR = PROJ_DIR / 'logs'

LOG_DIR.mkdir(parents=True, exist_ok=True) 

if str(PROJ_DIR) not in sys.path:
    sys.path.insert(0, str(PROJ_DIR))

MODEL_DIR = PROJ_DIR / 'models'
DATA_DIR = PROJ_DIR / 'youcook2'
ANNOTATED_DF_PATH = str(DATA_DIR / 'reviewed_0812.csv')
TRAIN_FRAC = 0.7
MAX_INPUT_LENGTH = 1024
MAX_SUMMARY_LENGTH = 128

RANDOM_SEED = 23456
np.random.seed(RANDOM_SEED)

In [6]:
df = pd.read_csv(ANNOTATED_DF_PATH)

In [7]:
key_sentences = df[df['IsUsefulSentence'] == 1]

In [8]:
len(key_sentences)

3569

In [9]:
indices = list(range(len(key_sentences)))
np.random.shuffle(indices)
train_len = int(TRAIN_FRAC * len(key_sentences))

In [10]:
sentences, instructions = key_sentences['Sentence'].to_numpy() , key_sentences['Key steps'].to_numpy()

In [11]:
train_sentences, train_instructions = sentences[:train_len], instructions[:train_len]
val_sentences, val_instructions = sentences[train_len:], instructions[train_len:]

In [13]:
len(train_sentences), len(val_sentences)

(2498, 1071)

In [14]:
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=MAX_INPUT_LENGTH)

In [15]:
prefix = "summarize: "

def preprocess_function(examples, max_input_length=MAX_INPUT_LENGTH, max_summary_length=MAX_SUMMARY_LENGTH):
    inputs = [prefix + doc for doc in examples["text"]]
    
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=max_summary_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [16]:
tokenizer('hi how are you', max_length=MAX_INPUT_LENGTH, truncation=True)

{'input_ids': [7102, 149, 33, 25, 1], 'attention_mask': [1, 1, 1, 1, 1]}

In [17]:
def generate_encodings(sentences, instructions, tokenizer, max_input_length=MAX_INPUT_LENGTH, max_summary_length=MAX_SUMMARY_LENGTH):
    examples = []
    for sentence, instruction in zip(list(sentences), list(instructions)):
        try:            
            sentence = str(sentence)
            instruction = str(instruction)
            example = {'text': sentence, 'summary': instruction}
            model_inputs = tokenizer(sentence, max_length=max_input_length, truncation=True)
            labels = tokenizer(text_target=instruction, max_length=max_summary_length, truncation=True)
            model_inputs['labels'] = labels['input_ids']
            example['input_ids'] = model_inputs['input_ids']
            example['attention_mask'] = model_inputs['attention_mask']
            example['labels'] = model_inputs['labels']
            examples.append(example)
        except Exception as e:
            print(sentence, instruction)
            continue
    
    return examples

In [18]:
class YouCookDatasetForKnowledgeExtraction(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        item = self.encodings[idx]
        item['input_ids'] = torch.tensor(item['input_ids'])
        item['attention_mask'] = torch.tensor(item['attention_mask'])
        item['labels'] = torch.tensor(item['labels'])
        return item

    def __len__(self):
        return len(self.encodings)

In [19]:
train_encodings = generate_encodings(train_sentences, train_instructions, tokenizer)
val_encodings = generate_encodings(val_sentences, val_instructions, tokenizer)

train_dataset = YouCookDatasetForKnowledgeExtraction(train_encodings)
val_dataset = YouCookDatasetForKnowledgeExtraction(val_encodings)

In [21]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [22]:
rouge = evaluate.load("rouge")

In [23]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [24]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [25]:
MODEL_SAVE_DIR = MODEL_DIR / 'youcook_t5_small'
LOG_SAVE_DIR = LOG_DIR / 'youcook_t5_small'

MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True) 
LOG_SAVE_DIR.mkdir(parents=True, exist_ok=True) 

In [27]:
MODEL_SAVE_DIR, LOG_SAVE_DIR

(PosixPath('/content/gdrive/MyDrive/IFT6759/quick-recipe/models/youcook_t5_small'),
 PosixPath('/content/gdrive/MyDrive/IFT6759/quick-recipe/logs/youcook_t5_small'))

In [28]:
training_args = Seq2SeqTrainingArguments(
    output_dir=str(MODEL_SAVE_DIR),
    logging_dir=str(LOG_SAVE_DIR),
    logging_steps=10,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,4.3892,4.084556,0.2277,0.0825,0.2152,0.2151,16.1382
2,3.4152,2.953156,0.3395,0.144,0.3274,0.3273,11.888
3,3.1465,2.507384,0.4373,0.203,0.4296,0.4292,7.6218
4,2.6553,2.350186,0.4575,0.2164,0.4486,0.4482,6.9972
5,2.5136,2.265012,0.4757,0.2251,0.4669,0.4662,6.5854
6,2.5441,2.212165,0.4871,0.2367,0.4774,0.4771,6.3548
7,2.2922,2.17291,0.5001,0.2474,0.4899,0.4899,6.2082
8,2.5903,2.153184,0.5077,0.2541,0.4972,0.4976,6.1858
9,2.6419,2.139508,0.5095,0.257,0.4996,0.4996,6.0868
10,2.3747,2.134829,0.5086,0.2578,0.4989,0.4989,6.1111


  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])
  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])
  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])
  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])
  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])
  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])
  it

TrainOutput(global_step=400, training_loss=2.992163128852844, metrics={'train_runtime': 181.1589, 'train_samples_per_second': 137.89, 'train_steps_per_second': 2.208, 'total_flos': 687620411424768.0, 'train_loss': 2.992163128852844, 'epoch': 10.0})

## Inference and metrics calculation

In [31]:
MODEL_CHECKPOINT_PATH = str(MODEL_SAVE_DIR / 'checkpoint-400')

In [37]:
summarizer = pipeline("summarization", model=MODEL_CHECKPOINT_PATH)

In [98]:
total_true_pos = 0
total_num_predicted = 0
total_num_gold = 0

for index in range(len(val_dataset)):
  if (index+1) % 50 == 0:
    print(f"Processing example {index+1}")
  text = val_dataset[index]['text']
  summary = val_dataset[index]['summary']
  text_words = text.split(' ')
  summary_words = summary.split(' ')
  max_len = len(text_words)
  predictions = summarizer(text, min_length=3, max_length=max_len)
  predicted_words = set(predictions[0]['summary_text'].split(' '))
  # print("Text: ", text)
  # print("Predicted: ", predictions)
  # print("Actual: ", summary)
  true_pos = len(set(predicted_words) & set(summary_words))
  num_predicted = len(set(predicted_words))
  num_gold = len(set(summary_words))
  total_true_pos += true_pos
  total_num_predicted += num_predicted
  total_num_gold += num_gold

  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])


Processing example 50
Processing example 100
Processing example 150
Processing example 200
Processing example 250
Processing example 300
Processing example 350
Processing example 400
Processing example 450
Processing example 500
Processing example 550
Processing example 600
Processing example 650
Processing example 700
Processing example 750
Processing example 800
Processing example 850
Processing example 900
Processing example 950
Processing example 1000
Processing example 1050


In [105]:
precision = round(total_true_pos / total_num_predicted, 2)
recall = round(total_true_pos / total_num_gold, 2)
f1 = round(2*(precision*recall) / (precision+recall), 2)

print(f"Total true positives (predicted words overlap with gold words): {total_true_pos}")
print(f"Total predicted words: {total_num_predicted}")
print(f"Total gold words: {total_num_gold}")
print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")

Total true positives (predicted words overlap with gold words): 1722
Total predicted words: 5673
Total gold words: 3795
Precision: 0.3, Recall: 0.45, F1: 0.36
