In [None]:
#@title Mount your Google Drive
# If you run this notebook locally or on a cluster (i.e. not on Google Colab)
# you can delete this cell, which is specific to Google Colab. You may also
# need to change the data/log paths defined in the "# Paths" cell below.
# `%matplotlib inline` renders figures in the cell output; `autoreload 2`
# re-imports modules before each cell runs so edits to them are picked up.
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Mount Google Drive at /content/gdrive; the project directory used below
# (PROJ_DIR) lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:

!pip install transformers datasets evaluate rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m89.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [None]:
import sys
import os
import shutil
import warnings
import pandas as pd
import numpy as np
import json
import torch
import evaluate

from transformers import (
  AutoModelForSeq2SeqLM, 
  DataCollatorForSeq2Seq,
  Seq2SeqTrainingArguments, 
  Seq2SeqTrainer,
  AutoTokenizer, 
  pipeline
) 

from pathlib import Path
from datasets import load_dataset, load_metric

In [None]:
# Paths and run-wide constants

PROJ_DIR = Path('/content/gdrive/MyDrive/IFT6759/quick-recipe')
DATA_DIR = PROJ_DIR / 'youcook2'
MODEL_DIR = PROJ_DIR / 'models'
LOG_DIR = PROJ_DIR / 'logs'
MODEL_SAVE_DIR = MODEL_DIR / 'youcook_BART_Coref'
LOG_SAVE_DIR = LOG_DIR / 'youcook_BART_Coref'

# Create every output folder up front (missing parents included); existing
# folders are left untouched.
for _out_dir in (LOG_DIR, MODEL_SAVE_DIR, LOG_SAVE_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Put the project root at the front of the import path, once.
_proj_dir_str = str(PROJ_DIR)
if _proj_dir_str not in sys.path:
    sys.path.insert(0, _proj_dir_str)

# Input files
ANNOTATED_DF_PATH = str(DATA_DIR / 'reviewed_0812_coref_aligned.csv')
SPLIT_DF_PATH = str(DATA_DIR / 'train_val_split.csv')

# Fraction of the training sentences kept for training (the rest become validation)
TRAIN_FRAC = 0.7
# Token-length caps applied when tokenizing inputs and targets
MAX_INPUT_LENGTH = 1024
MAX_SUMMARY_LENGTH = 128

# Seed NumPy's global RNG so the train/validation shuffle is repeatable
RANDOM_SEED = 23456
np.random.seed(RANDOM_SEED)

In [None]:
# Load the annotated sentence table from the CSV at ANNOTATED_DF_PATH.
df = pd.read_csv(ANNOTATED_DF_PATH)

In [None]:
# Keep only the rows whose 'IsUsefulSentence' flag equals 1.
key_sentences = df[df['IsUsefulSentence'] == 1]

In [None]:
# Number of rows flagged as useful sentences.
len(key_sentences)

3569

In [None]:
# Video-level split: the split file labels each VideoUrl as 'train' or 'val'.
# Videos labelled 'val' are used as the held-out test set here; a separate
# validation set is carved out of the training sentences further down.
split_df = pd.read_csv(SPLIT_DF_PATH)

is_train_video = split_df['Split'] == 'train'
is_val_video = split_df['Split'] == 'val'
train_video_urls = list(split_df.loc[is_train_video, 'VideoUrl'].values)
test_video_urls = list(split_df.loc[is_val_video, 'VideoUrl'].values)

train_df = df[df['VideoUrl'].isin(train_video_urls)]
test_df = df[df['VideoUrl'].isin(test_video_urls)]

# Only rows flagged as useful sentences take part in training / evaluation.
train_useful = train_df[train_df['IsUsefulSentence'] == 1]
test_useful = test_df[test_df['IsUsefulSentence'] == 1]

train_sentences = train_useful['Sentence_Coref'].to_numpy()
train_instructions = train_useful['Key steps'].to_numpy()
test_sentences = test_useful['Sentence_Coref'].to_numpy()
test_instructions = test_useful['Key steps'].to_numpy()

# Split val set from within train: shuffle the row positions, keep the first
# TRAIN_FRAC of them for training and the remainder for validation.
indices = list(range(len(train_sentences)))
np.random.shuffle(indices)
train_len = int(TRAIN_FRAC * len(train_sentences))
train_idx, val_idx = indices[:train_len], indices[train_len:]

# Validation arrays are taken first, while train_* still hold the full set.
val_sentences = train_sentences[val_idx]
val_instructions = train_instructions[val_idx]
train_sentences = train_sentences[train_idx]
train_instructions = train_instructions[train_idx]

In [None]:
# indices = list(range(len(key_sentences)))
# np.random.shuffle(indices)
# train_len = int(TRAIN_FRAC * len(key_sentences))

In [None]:
# sentences, instructions = key_sentences['Sentence'].to_numpy() , key_sentences['Key steps'].to_numpy()

In [None]:
# train_sentences, train_instructions = sentences[:train_len], instructions[:train_len]
# val_sentences, val_instructions = sentences[train_len:], instructions[train_len:]

In [None]:
# Sanity check: sentences and instructions must have matching lengths in each split.
len(train_sentences), len(val_sentences), len(train_instructions), len(val_instructions)

(1755, 753, 1755, 753)

In [None]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it with its
# maximum length set to MAX_INPUT_LENGTH.
checkpoint = "sshleifer/distilbart-xsum-12-3"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=MAX_INPUT_LENGTH)

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# Text prepended to every input document (empty: no task prefix).
prefix = ""

def preprocess_function(examples, max_input_length=MAX_INPUT_LENGTH, max_summary_length=MAX_SUMMARY_LENGTH):
    """Tokenize a batch of documents and their target summaries.

    Args:
        examples: mapping with a "text" entry (iterable of input strings) and a
            "summary" entry (the corresponding targets).
        max_input_length: truncation length for the tokenized inputs.
        max_summary_length: truncation length for the tokenized targets.

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under "labels".
    """
    prefixed_docs = []
    for doc in examples["text"]:
        prefixed_docs.append(prefix + doc)

    model_inputs = tokenizer(prefixed_docs, max_length=max_input_length, truncation=True)
    target_encoding = tokenizer(
        text_target=examples["summary"],
        max_length=max_summary_length,
        truncation=True,
    )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs



In [None]:
# Quick look at what the tokenizer returns for a short string.
tokenizer('hi how are you', max_length=MAX_INPUT_LENGTH, truncation=True)

{'input_ids': [0, 3592, 141, 32, 47, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
def generate_encodings(sentences, instructions, tokenizer, max_input_length=MAX_INPUT_LENGTH, max_summary_length=MAX_SUMMARY_LENGTH):
    """Tokenize paired sentences and instructions into per-example dicts.

    Args:
        sentences: iterable of source texts; each item is converted with ``str()``.
        instructions: iterable of target texts, paired positionally with
            ``sentences`` and also converted with ``str()``.
        tokenizer: callable invoked as ``tokenizer(text, max_length=..., truncation=True)``
            for inputs and ``tokenizer(text_target=..., max_length=..., truncation=True)``
            for targets; its results are indexed by 'input_ids' / 'attention_mask'.
        max_input_length: truncation length for the inputs.
        max_summary_length: truncation length for the targets.

    Returns:
        A list of dicts with keys 'text', 'summary', 'input_ids',
        'attention_mask' and 'labels'. Pairs that fail to tokenize are skipped
        (best effort) and reported on stdout together with the error.
    """
    examples = []
    for sentence, instruction in zip(sentences, instructions):
        try:
            # NOTE(review): str() turns a missing value (e.g. NaN) into the
            # literal text 'nan' rather than skipping it — confirm the input
            # columns contain no missing values.
            sentence = str(sentence)
            instruction = str(instruction)
            model_inputs = tokenizer(sentence, max_length=max_input_length, truncation=True)
            labels = tokenizer(text_target=instruction, max_length=max_summary_length, truncation=True)
            example = {
                'text': sentence,
                'summary': instruction,
                'input_ids': model_inputs['input_ids'],
                'attention_mask': model_inputs['attention_mask'],
                'labels': labels['input_ids'],
            }
        except Exception as e:
            # Keep going, but say why the pair was dropped so a shrinking
            # dataset can be diagnosed.
            print(f"Skipping example ({type(e).__name__}: {e}):", sentence, instruction)
            continue
        examples.append(example)

    return examples

In [None]:
class YouCookDatasetForKnowledgeExtraction(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized examples.

    Args:
        encodings: sequence of dicts. Each dict must provide 'input_ids',
            'attention_mask' and 'labels'; any other keys (e.g. 'text',
            'summary') are returned unchanged.
    """

    # Keys whose values are converted to tensors on access.
    _TENSOR_KEYS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a fresh dict for example ``idx`` with the token fields as tensors."""
        # Work on a shallow copy: writing the tensors back into the stored
        # example would make every later access call torch.tensor() on a
        # tensor, which emits a copy-construct UserWarning on each field.
        item = dict(self.encodings[idx])
        for key in self._TENSOR_KEYS:
            item[key] = torch.tensor(item[key])
        return item

    def __len__(self):
        """Number of examples."""
        return len(self.encodings)

In [None]:

# Tokenize each split (train, then val, then test) ...
train_encodings, val_encodings, test_encodings = [
    generate_encodings(split_sentences, split_instructions, tokenizer)
    for split_sentences, split_instructions in (
        (train_sentences, train_instructions),
        (val_sentences, val_instructions),
        (test_sentences, test_instructions),
    )
]

# ... and wrap each list of examples in the torch dataset class.
train_dataset, val_dataset, test_dataset = [
    YouCookDatasetForKnowledgeExtraction(split_encodings)
    for split_encodings in (train_encodings, val_encodings, test_encodings)
]

In [None]:
# Collator that batches the tokenized examples for the seq2seq trainer.
# NOTE(review): `model` receives the checkpoint *name* (a str), not the model
# object that is loaded in a later cell — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
# ROUGE metric object; compute_metrics calls `rouge.compute(...)` on decoded text.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. ``-100``
            entries are treated as padding in both.

    Returns:
        Dict mapping each metric name returned by ``rouge.compute`` (plus
        ``"gen_len"``, the mean number of non-pad prediction tokens) to its
        value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Predictions may arrive wrapped in a tuple; the generated ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a real token id, so swap it for the pad id before decoding;
    # otherwise decoding can fail and gen_len would count it as a token.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Load the pretrained seq2seq model weights for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

In [None]:
# Show where checkpoints and logs will be written.
MODEL_SAVE_DIR, LOG_SAVE_DIR

(PosixPath('/content/gdrive/MyDrive/IFT6759/quick-recipe/models/youcook_BART_Coref'),
 PosixPath('/content/gdrive/MyDrive/IFT6759/quick-recipe/logs/youcook_BART_Coref'))

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, log every
# 10 steps, write checkpoints to MODEL_SAVE_DIR and logs to LOG_SAVE_DIR.
training_args = Seq2SeqTrainingArguments(
    output_dir=str(MODEL_SAVE_DIR),
    logging_dir=str(LOG_SAVE_DIR),
    logging_steps=10,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_strategy="epoch",
    # Caps how many checkpoints are kept on disk; older ones are removed.
    save_total_limit=3,
    # load_best_model_at_end=False,
    num_train_epochs=6,
    # Evaluation generates sequences, so compute_metrics receives token ids to decode.
    predict_with_generate=True,
    # Mixed-precision training. NOTE(review): presumably requires a GPU runtime — confirm.
    fp16=True,
    push_to_hub=False,
)

# Trainer wiring: train on train_dataset, evaluate on val_dataset with the
# ROUGE-based compute_metrics defined above.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Runs the full fine-tuning loop (long-running cell).
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.3316,1.976171,0.4128,0.19,0.3989,0.3982,12.8274
2,1.6416,1.830468,0.4302,0.2064,0.4153,0.4148,12.5737
3,1.5989,1.783984,0.4343,0.2098,0.4205,0.4204,12.761
4,1.4215,1.77466,0.4373,0.2161,0.4214,0.4218,12.6826
5,1.1523,1.770243,0.436,0.2109,0.4222,0.4223,12.7703
6,1.1973,1.768727,0.4368,0.2131,0.4236,0.4239,12.7769


  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])
  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])
  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])
  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])
  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])


TrainOutput(global_step=330, training_loss=1.687768233906139, metrics={'train_runtime': 382.6792, 'train_samples_per_second': 27.517, 'train_steps_per_second': 0.862, 'total_flos': 1022177715763200.0, 'train_loss': 1.687768233906139, 'epoch': 6.0})

## Inference and metrics calculation

In [None]:
# Checkpoint directory used for inference. The training run above reported
# global_step=330 for 6 epochs, so step 275 appears to be the end of epoch 5.
# NOTE(review): hard-coded step number — update it if the number of training
# steps changes (different data size, batch size or epoch count).
MODEL_CHECKPOINT_PATH = str(MODEL_SAVE_DIR / 'checkpoint-275')

In [None]:
# Build a summarization pipeline from the saved checkpoint directory.
summarizer = pipeline("summarization", model=MODEL_CHECKPOINT_PATH)

In [None]:
# Word-overlap evaluation of the fine-tuned summarizer on the test split.
# Running totals over all examples (used for micro-averaged precision/recall).
total_true_pos = 0
total_num_predicted = 0
total_num_gold = 0
# Per-example records, assembled into a results table in a later cell.
text_list = []
summary_list = []
predictions_list = []
true_positive_list = []
num_predicted_list = []
num_gold_list = []

for index in range(len(test_dataset)):
  if (index+1) % 50 == 0:
    print(f"Processing example {index+1}")
  # Fetch the example once: every dataset access rebuilds its tensors.
  example = test_dataset[index]
  text = example['text']
  summary = example['summary']
  text_words = text.split(' ')
  # Cap the generated length at the number of space-separated source words.
  max_len = len(text_words)
  predictions = summarizer(text, min_length=3, max_length=max_len)
  predicted_text = predictions[0]['summary_text']
  # Overlap is measured on the sets of unique space-separated words.
  predicted_words = set(predicted_text.split(' '))
  gold_words = set(summary.split(' '))
  true_pos = len(predicted_words & gold_words)
  num_predicted = len(predicted_words)
  num_gold = len(gold_words)
  total_true_pos += true_pos
  total_num_predicted += num_predicted
  total_num_gold += num_gold
  text_list.append(text)
  summary_list.append(summary)
  predictions_list.append(predicted_text)
  true_positive_list.append(true_pos)
  num_predicted_list.append(num_predicted)
  num_gold_list.append(num_gold)

  item['input_ids'] = torch.tensor(item['input_ids'])
  item['attention_mask'] = torch.tensor(item['attention_mask'])
  item['labels'] = torch.tensor(item['labels'])


Processing example 50
Processing example 100
Processing example 150
Processing example 200
Processing example 250
Processing example 300
Processing example 350
Processing example 400
Processing example 450
Processing example 500
Processing example 550
Processing example 600
Processing example 650
Processing example 700
Processing example 750
Processing example 800
Processing example 850
Processing example 900
Processing example 950
Processing example 1000
Processing example 1050


In [None]:
# Display the generated instructions collected by the inference loop.
predictions_list

['place dough in caputo flour',
 'flip pizza',
 'sprinkle the surface',
 'squish pizza',
 'dimple pizza',
 'Do a quick stretch while rotating',
 'put pizza on the backs of hands',
 'put sauce on pizza',
 'put cheese on',
 'put on olive oil',
 'place pizza on stone',
 'rotrot pizza',
 'put olive oil directly on',
 'cut pizza',
 'stretch dough',
 'add tomato sauce',
 'finish margarita pizza',
 'add tomato sauce',
 'leave pizza',
 'lay bread on bread',
 'lay basil',
 'Press down the mixture',
 'pop dough in oven',
 'keep pizza on pizza stone',
 'cut into bite size pieces',
 'take peanut butter',
 'combine ingredients',
 'put in chili sauce',
 'put in soy sauce',
 'mix salt',
 'heat pan',
 'put in garlic',
 'put ginger in chicken',
 'remove fat',
 'put bread in fan',
 'stir fry mixture',
 'coast',
 'add peanut butter mixture',
 'cook peanut butter',
 'drain water',
 'put tofu on paper towel',
 'take green onion, chopped up carrot',
 'combine egg whites',
 'take green onion',
 'add water',


In [None]:
# Rebind predictions_list to a shallow copy of itself (same items, same order).
_predictions = list(predictions_list)
predictions_list = _predictions

In [None]:
# One row per evaluated example: source text, gold summary, model prediction
# and the word-overlap counts computed in the inference loop.
results_df = pd.DataFrame(
    dict(
        text=text_list,
        summary=summary_list,
        prediction=predictions_list,
        true_positives=true_positive_list,
        num_predicted=num_predicted_list,
        num_gold=num_gold_list,
    )
)

In [None]:
# Write the per-example results to the log directory (index column omitted).
results_df.to_csv(str(LOG_SAVE_DIR / 'val_performance.csv'), index=False)

In [None]:
# Read the saved file back and display it.
pd.read_csv(str(LOG_SAVE_DIR / 'val_performance.csv'))

Unnamed: 0,text,summary,prediction,true_positives,num_predicted,num_gold
0,so we ' ve placed the dough directly into the ...,place dough in caputo flour,place dough in caputo flour,5,5,5
1,and then we give the pizza a flip as i ' ve re...,flip dough,flip pizza,1,2,2
2,"but that 's what we do anyway , we sprinkle th...",sprinkle the surface,sprinkle the surface,3,3,3
3,we just give a squish with our palm and make t...,squish dough with palm; flatten center,squish pizza,1,2,6
4,"we dimple the rest of the pizza , moving the p...",dimple the rest of pizza; move the pizza around,dimple pizza,2,2,8
...,...,...,...,...,...,...
1056,"so of vegetable oil into a large , deep and he...",oil skillet,add vegetable oil,1,3,2
1057,"best , if you have one as one holds , and dist...",fry at 365 degrees,heat oil,0,2,4
1058,"the chicken pieces in the skillet and fry , th...",fry for 20 minutes,fry chicken for 15 minutes,3,6,4
1059,"pieces , several times during the cooking time...",drain chicken,drain chicken,2,2,2


In [None]:
# Micro-averaged word-overlap precision / recall / F1 over all evaluated examples.
# The ratios stay unrounded until the end so F1 is not derived from
# already-rounded precision and recall; an empty total scores 0.0 instead of
# raising ZeroDivisionError.
_precision_raw = total_true_pos / total_num_predicted if total_num_predicted else 0.0
_recall_raw = total_true_pos / total_num_gold if total_num_gold else 0.0
_pr_sum = _precision_raw + _recall_raw
_f1_raw = 2 * _precision_raw * _recall_raw / _pr_sum if _pr_sum else 0.0

precision = round(_precision_raw, 2)
recall = round(_recall_raw, 2)
f1 = round(_f1_raw, 2)

print(f"Total true positives (predicted words overlap with gold words): {total_true_pos}")
print(f"Total predicted words: {total_num_predicted}")
print(f"Total gold words: {total_num_gold}")
print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")

Total true positives (predicted words overlap with gold words): 1895
Total predicted words: 3490
Total gold words: 4154
Precision: 0.54, Recall: 0.46, F1: 0.5


## Generate Knowledge Extraction metrics based on best Key-clip Prediction model's predictions

In [None]:
def best_keyclip_prediction_true_positives(row):
    """Return the row's true-positive count only when it was predicted useful.

    Args:
        row: mapping-like object with 'IsPredUseful' and 'true_positives' entries.

    Returns:
        ``row['true_positives']`` if ``row['IsPredUseful'] == 1``, otherwise 0.
    """
    if row['IsPredUseful'] == 1:
        return row['true_positives']
    return 0

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


def generate_metrics_from_master_and_val_df(master_df, val_df):
    """Word-overlap precision/recall/F1, counting hits only on rows predicted useful.

    The two frames are inner-joined on ``master_df['Sentence'] == val_df['text']``.
    True positives are taken from 'true_positives' only where 'IsPredUseful'
    equals 1 (0 elsewhere); 'num_predicted' and 'num_gold' are summed over all
    joined rows.

    NOTE(review): in this notebook the saved 'text' values come from the
    'Sentence_Coref' column, so rows whose text differs from the master
    'Sentence' value would presumably drop out of the inner join, and duplicate
    sentences would multiply rows — confirm the join key.

    Args:
        master_df: DataFrame with at least 'Sentence' and 'IsPredUseful'.
        val_df: DataFrame with at least 'text', 'true_positives',
            'num_predicted' and 'num_gold'.

    Returns:
        ``(precision, recall, f1)`` rounded to 2 decimals. F1 is computed from
        the unrounded precision and recall; a ratio with a zero denominator is
        reported as 0.0.
    """
    combined_df = pd.merge(master_df, val_df, left_on='Sentence', right_on='text', how='inner')
    # Vectorized form of "true_positives if IsPredUseful == 1 else 0"; unlike a
    # row-wise apply it also works when the join produces no rows.
    combined_df['best_kc_pred_true_positives'] = combined_df['true_positives'].where(
        combined_df['IsPredUseful'] == 1, 0
    )
    total_num_gold = combined_df['num_gold'].sum()
    total_num_predicted = combined_df['num_predicted'].sum()
    total_true_pos = combined_df['best_kc_pred_true_positives'].sum()

    precision_raw = _safe_ratio(total_true_pos, total_num_predicted)
    recall_raw = _safe_ratio(total_true_pos, total_num_gold)
    f1_raw = _safe_ratio(2 * precision_raw * recall_raw, precision_raw + recall_raw)

    precision = round(precision_raw, 2)
    recall = round(recall_raw, 2)
    f1 = round(f1_raw, 2)

    print(f"Total true positives (predicted words overlap with gold words): {total_true_pos}")
    print(f"Total predicted words: {total_num_predicted}")
    print(f"Total gold words: {total_num_gold}")
    print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")

    return precision, recall, f1

In [None]:
# Load the master table (pickle) and the per-example results CSV written earlier.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
master_df = pd.read_pickle(DATA_DIR / 'full_master_updated.pkl')
val_df = pd.read_csv(str(LOG_SAVE_DIR / 'val_performance.csv'))

In [None]:
# Print and return (precision, recall, f1) for the joined master / results tables.
generate_metrics_from_master_and_val_df(master_df, val_df)

Total true positives (predicted words overlap with gold words): 532
Total predicted words: 1278
Total gold words: 1544
Precision: 0.42, Recall: 0.34, F1: 0.38


(0.42, 0.34, 0.38)