In [None]:
#@title Mount your Google Drive
# If you run this notebook locally or on a cluster (i.e. not on Google Colab)
# you can delete this cell, which is specific to Google Colab. You may also
# change the data/log paths in the "# Paths" cell below.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload imported modules automatically so edits to project code are picked up.
%load_ext autoreload
%autoreload 2

# Mount Google Drive at /content/gdrive; the project paths below live under it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install transformers datasets evaluate rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m88.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━

In [None]:
import sys
import os
import shutil
import warnings
import pandas as pd
import numpy as np
import json
import torch
import evaluate

from transformers import (
  AutoModelForSeq2SeqLM, 
  DataCollatorForSeq2Seq,
  Seq2SeqTrainingArguments, 
  Seq2SeqTrainer,
  AutoTokenizer, 
  pipeline
) 

from pathlib import Path
from datasets import load_dataset, load_metric

In [None]:
# Paths

# Project root on the mounted Google Drive; everything else hangs off it.
PROJ_DIR = Path('/content/gdrive/MyDrive/IFT6759/quick-recipe')
LOG_DIR = PROJ_DIR.joinpath('logs')

LOG_DIR.mkdir(exist_ok=True, parents=True)

# Put the project root at the front of the import path (only once).
if str(PROJ_DIR) not in sys.path:
    sys.path.insert(0, str(PROJ_DIR))

# Model / data locations.
MODEL_DIR = PROJ_DIR.joinpath('models')
MAIN_DATA_DIR = PROJ_DIR.joinpath('data')
DATA_DIR = PROJ_DIR.joinpath('youcook2')

# Run-specific output folders (checkpoints and logs).
MODEL_SAVE_DIR = MODEL_DIR.joinpath('youcook_BART_2')
LOG_SAVE_DIR = LOG_DIR.joinpath('youcook_BART_2')

MODEL_SAVE_DIR.mkdir(exist_ok=True, parents=True)
LOG_SAVE_DIR.mkdir(exist_ok=True, parents=True)

# Input CSVs: annotated sentences and the per-video train/val assignment.
ANNOTATED_DF_PATH = str(DATA_DIR.joinpath('reviewed_0812_coref_aligned.csv'))
SPLIT_DF_PATH = str(DATA_DIR.joinpath('train_val_split.csv'))

# Fraction of the training sentences kept for training (the rest is validation).
TRAIN_FRAC = 0.7
# Truncation lengths (in tokens) for model inputs and target summaries.
MAX_INPUT_LENGTH = 1024
MAX_SUMMARY_LENGTH = 128

# Seed NumPy's global RNG so the shuffle-based split below is repeatable.
RANDOM_SEED = 23456
np.random.seed(RANDOM_SEED)

In [None]:
# Load the annotated dataset; the cells below read its 'Sentence',
# 'Key steps', 'IsUsefulSentence' and 'VideoUrl' columns.
df = pd.read_csv(ANNOTATED_DF_PATH)

In [None]:
# Keep only the rows annotated as useful sentences (IsUsefulSentence == 1).
key_sentences = df[df['IsUsefulSentence'] == 1]

In [None]:
# Number of useful sentences available across all splits.
len(key_sentences)

3569

In [None]:
# Per-video split assignment: videos marked 'train' feed training/validation,
# videos marked 'val' are held out and used as the test set here.
split_df = pd.read_csv(SPLIT_DF_PATH)

is_train_video = split_df['Split'] == 'train'
is_val_video = split_df['Split'] == 'val'
train_video_urls = list(split_df[is_train_video]['VideoUrl'].values)
test_video_urls = list(split_df[is_val_video]['VideoUrl'].values)

train_df = df[df['VideoUrl'].isin(train_video_urls)]
test_df = df[df['VideoUrl'].isin(test_video_urls)]

# Only useful sentences have a key-step annotation to learn from.
useful_train_rows = train_df[train_df['IsUsefulSentence'] == 1]
useful_test_rows = test_df[test_df['IsUsefulSentence'] == 1]

train_sentences = useful_train_rows['Sentence'].to_numpy()
train_instructions = useful_train_rows['Key steps'].to_numpy()
test_sentences = useful_test_rows['Sentence'].to_numpy()
test_instructions = useful_test_rows['Key steps'].to_numpy()

# Split val set from within train
indices = list(range(len(train_sentences)))
np.random.shuffle(indices)
train_len = int(TRAIN_FRAC * len(train_sentences))

kept_for_train = indices[:train_len]
kept_for_val = indices[train_len:]

# Carve out validation first: the train arrays are overwritten right after.
val_sentences = train_sentences[kept_for_val]
val_instructions = train_instructions[kept_for_val]
train_sentences = train_sentences[kept_for_train]
train_instructions = train_instructions[kept_for_train]

In [None]:
# indices = list(range(len(key_sentences)))
# np.random.shuffle(indices)
# train_len = int(TRAIN_FRAC * len(key_sentences))

In [None]:
# sentences, instructions = key_sentences['Sentence'].to_numpy() , key_sentences['Key steps'].to_numpy()

In [None]:
# train_sentences, train_instructions = sentences[:train_len], instructions[:train_len]
# val_sentences, val_instructions = sentences[train_len:], instructions[train_len:]

In [None]:
# Sanity check: sentences and instructions must have matching lengths in each
# split (the last entry previously repeated `val_sentences` by mistake).
len(train_sentences), len(val_sentences), len(train_instructions), len(val_instructions)

(1755, 753, 1755, 753)

In [None]:
# Pretrained checkpoint name; the same string is reused below to load the model.
checkpoint = "sshleifer/distilbart-xsum-12-3"
# Load the matching tokenizer, with its maximum length set to MAX_INPUT_LENGTH.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=MAX_INPUT_LENGTH)

In [None]:
# Optional task prefix prepended to every input text (empty for this model).
prefix = ""

def preprocess_function(examples, max_input_length=MAX_INPUT_LENGTH, max_summary_length=MAX_SUMMARY_LENGTH):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: mapping with a "text" list (inputs) and a "summary" list (targets).
        max_input_length: truncation length applied to the inputs.
        max_summary_length: truncation length applied to the targets.

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under "labels".
    """
    inputs = []
    for doc in examples["text"]:
        inputs.append(prefix + doc)

    encoded_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    encoded_targets = tokenizer(text_target=examples["summary"], max_length=max_summary_length, truncation=True)

    encoded_inputs["labels"] = encoded_targets["input_ids"]
    return encoded_inputs



In [None]:
# Smoke test: tokenize a short sentence to inspect the returned
# input_ids / attention_mask.
tokenizer('hi how are you', max_length=MAX_INPUT_LENGTH, truncation=True)

{'input_ids': [0, 3592, 141, 32, 47, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
def generate_encodings(sentences, instructions, tokenizer, max_input_length=MAX_INPUT_LENGTH, max_summary_length=MAX_SUMMARY_LENGTH):
    """Tokenize aligned (sentence, instruction) pairs into seq2seq examples.

    Args:
        sentences: iterable of source texts (model inputs).
        instructions: iterable of target texts, aligned with ``sentences``.
        tokenizer: callable tokenizer. It is called once with the source text
            and once with ``text_target=`` for the target text.
        max_input_length: truncation length for the source text.
        max_summary_length: truncation length for the target text.

    Returns:
        A list of dicts with keys ``text``, ``summary``, ``input_ids``,
        ``attention_mask`` and ``labels`` (token ids as returned by the
        tokenizer). Pairs that fail to encode are reported and skipped, so the
        result may be shorter than the input.
    """
    examples = []
    for sentence, instruction in zip(sentences, instructions):
        try:
            # NOTE(review): str() turns missing values (NaN / None) into the
            # literal strings 'nan' / 'None' -- confirm the annotated data has
            # no missing sentences or key steps.
            sentence = str(sentence)
            instruction = str(instruction)
            model_inputs = tokenizer(sentence, max_length=max_input_length, truncation=True)
            labels = tokenizer(text_target=instruction, max_length=max_summary_length, truncation=True)
            examples.append({
                'text': sentence,
                'summary': instruction,
                'input_ids': model_inputs['input_ids'],
                'attention_mask': model_inputs['attention_mask'],
                'labels': labels['input_ids'],
            })
        except Exception as e:
            # Best effort: keep going, but say why the pair was dropped instead
            # of discarding the error.
            print(f"Skipping example ({type(e).__name__}: {e}):", sentence, instruction)
            continue

    return examples

In [None]:
class YouCookDatasetForKnowledgeExtraction(torch.utils.data.Dataset):
    """Torch dataset over pre-tokenized knowledge-extraction examples.

    Each stored encoding is a dict holding at least ``input_ids``,
    ``attention_mask`` and ``labels``; any other keys (e.g. the raw ``text`` /
    ``summary`` strings) are passed through unchanged.
    """

    # Keys whose values are converted to tensors on access.
    _TENSOR_KEYS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, encodings):
        """Store the sequence of encoding dicts (not copied)."""
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a new dict for example ``idx`` with the id fields as tensors.

        A shallow copy is converted rather than the stored dict, so repeated
        access never re-wraps an existing tensor and the caller's encodings
        are left untouched.
        """
        item = dict(self.encodings[idx])
        for key in self._TENSOR_KEYS:
            item[key] = torch.tensor(item[key])
        return item

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.encodings)

In [None]:
# Tokenize every split into lists of example dicts.
train_encodings = generate_encodings(train_sentences, train_instructions, tokenizer)
val_encodings = generate_encodings(val_sentences, val_instructions, tokenizer)
test_encodings = generate_encodings(test_sentences, test_instructions, tokenizer)

# Wrap the encodings as torch datasets for the trainer and the evaluation loop.
train_dataset = YouCookDatasetForKnowledgeExtraction(train_encodings)
val_dataset = YouCookDatasetForKnowledgeExtraction(val_encodings)
test_dataset = YouCookDatasetForKnowledgeExtraction(test_encodings)

In [None]:
# Collator that batches the tokenized examples for the seq2seq trainer.
# NOTE(review): `checkpoint` is the checkpoint-name string, not a loaded model
# instance -- confirm that passing it as `model=` is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
# ROUGE metric used by compute_metrics during evaluation.
rouge = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an eval pass.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays; the
            predictions may arrive wrapped in a tuple, in which case the first
            element is used.

    Returns:
        Dict with the ROUGE results plus ``gen_len`` (mean number of non-pad
        tokens per prediction), all rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Map it to the pad token in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length is counted after the -100 replacement so padding is not counted.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Load the pretrained seq2seq model that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Show where checkpoints and logs for this run are written.
MODEL_SAVE_DIR, LOG_SAVE_DIR

(PosixPath('/content/gdrive/MyDrive/IFT6759/quick-recipe/models/youcook_BART_2'),
 PosixPath('/content/gdrive/MyDrive/IFT6759/quick-recipe/logs/youcook_BART_2'))

In [1]:
# Fine-tuning configuration: checkpoints go to MODEL_SAVE_DIR, logs to LOG_SAVE_DIR.
training_args = Seq2SeqTrainingArguments(
    output_dir=str(MODEL_SAVE_DIR),
    logging_dir=str(LOG_SAVE_DIR),
    logging_steps=10,
    # Evaluate and save on the same (per-epoch) schedule.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_strategy="epoch",
    # NOTE(review): with a checkpoint limit, older checkpoints may be removed;
    # the inference section below loads a hard-coded checkpoint directory.
    save_total_limit=3,
    num_train_epochs=10,
    # Evaluation uses generated sequences, which compute_metrics decodes.
    predict_with_generate=True,
    # NOTE(review): mixed precision presumably requires a GPU runtime -- confirm.
    fp16=True,
    push_to_hub=False,
)

# Train on the training split and evaluate on the validation split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

## Inference and metrics calculation

In [None]:
# Checkpoint directory used for inference.
# NOTE(review): the step number is hard-coded -- confirm this checkpoint exists
# after training and is the one intended for evaluation.
MODEL_CHECKPOINT_PATH = str(MODEL_SAVE_DIR / 'checkpoint-220')

In [None]:
# Summarization pipeline backed by the fine-tuned checkpoint.
summarizer = pipeline("summarization", model=MODEL_CHECKPOINT_PATH)

In [None]:
# Word-overlap evaluation on the validation set. For every example the
# predicted and gold key steps are compared as sets of space-separated words;
# running totals feed the corpus-level scores and the per-example lists feed
# the results DataFrame built below.
total_true_pos = 0
total_num_predicted = 0
total_num_gold = 0
text_list = []
summary_list = []
predictions_list = []
true_positive_list = []
num_predicted_list = []
num_gold_list = []

for index in range(len(val_dataset)):
  if (index+1) % 50 == 0:
    print(f"Processing example {index+1}")
  # Fetch the example once: every __getitem__ call rebuilds its tensors.
  example = val_dataset[index]
  text = example['text']
  summary = example['summary']
  text_words = text.split(' ')
  summary_words = summary.split(' ')
  # Cap the generated length at the number of words in the input.
  max_len = len(text_words)
  predictions = summarizer(text, min_length=3, max_length=max_len)
  predicted_text = predictions[0]['summary_text']
  predicted_words = set(predicted_text.split(' '))
  gold_words = set(summary_words)
  # print("Text: ", text)
  # print("Predicted: ", predictions)
  # print("Actual: ", summary)
  true_pos = len(predicted_words & gold_words)
  num_predicted = len(predicted_words)
  num_gold = len(gold_words)
  total_true_pos += true_pos
  total_num_predicted += num_predicted
  total_num_gold += num_gold
  text_list.append(text)
  summary_list.append(summary)
  predictions_list.append(predicted_text)
  true_positive_list.append(true_pos)
  num_predicted_list.append(num_predicted)
  num_gold_list.append(num_gold)


Processing example 50
Processing example 100
Processing example 150
Processing example 200
Processing example 250
Processing example 300
Processing example 350
Processing example 400
Processing example 450
Processing example 500
Processing example 550
Processing example 600
Processing example 650
Processing example 700
Processing example 750


In [None]:
# Inspect the generated key steps for the validation examples.
predictions_list

['break apart',
 'Add paneer pieces',
 'cover baking',
 'mix dough',
 'mix eggs, parmesan cheese, reheat bacon',
 'add flour, salt baking powder, baking soda, sugar',
 'season salt, flour dust',
 'boil seaweed',
 'dip pork inside',
 'add olive oil',
 'add water',
 'add olive oil',
 'start with almond butter',
 'add water',
 'cut onion',
 'toss potatoes',
 'place lettuce',
 'pull shrimp in batter',
 'add chives',
 'put potatoes in butter',
 'add brown pan',
 'smash potatoes',
 'saute onions and',
 'put roti over',
 'cook yolks',
 'boil water',
 'stir mixture',
 'add meatloaf',
 'saute vegetables',
 'add butter',
 'slice onions',
 'Add salt',
 'make carbonara',
 'add baking powder, salt, pepper, mix orexin',
 'put in green onions',
 'fry bacon',
 'stir mixture',
 'put parmesan on top',
 'bring to boil',
 'fold dough',
 'mix mixture',
 'season with salt and pepper',
 'stir mixture',
 'add pepper',
 'power pork loin, shrimp',
 'spread mixture on top of meatloaf',
 'cook garlic, mushrooms',

In [None]:
# Rebind predictions_list to a fresh shallow copy of itself.
_predictions = list(predictions_list)
predictions_list = _predictions

In [None]:
# One row per validation example: input text, gold summary, model prediction
# and the word-overlap counts collected in the evaluation loop.
results_df = pd.DataFrame({'text': text_list,
                'summary': summary_list,
                'prediction': predictions_list,
                'true_positives': true_positive_list,
                'num_predicted': num_predicted_list,
                'num_gold': num_gold_list
                })

In [None]:
# Persist the per-example results; this file is read back further below.
results_df.to_csv(str(LOG_SAVE_DIR / 'val_performance.csv'), index=False)

In [None]:
# Corpus-level word-overlap scores from the totals accumulated above.
# F1 is derived from the unrounded precision/recall (rounding happens only for
# reporting), and empty totals yield 0.0 instead of a division by zero.
raw_precision = total_true_pos / total_num_predicted if total_num_predicted else 0.0
raw_recall = total_true_pos / total_num_gold if total_num_gold else 0.0
precision_recall_sum = raw_precision + raw_recall

precision = round(raw_precision, 2)
recall = round(raw_recall, 2)
f1 = round(2 * raw_precision * raw_recall / precision_recall_sum, 2) if precision_recall_sum else 0.0

print(f"Total true positives (predicted words overlap with gold words): {total_true_pos}")
print(f"Total predicted words: {total_num_predicted}")
print(f"Total gold words: {total_num_gold}")
print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")

Total true positives (predicted words overlap with gold words): 1260
Total predicted words: 2182
Total gold words: 2867
Precision: 0.58, Recall: 0.44, F1: 0.5


## Generate Knowledge Extraction metrics based on best Key-clip Prediction model's predictions

In [None]:
def best_keyclip_prediction_true_positives(row):
    """Return the row's true-positive count, or 0 if the row was not predicted useful.

    Args:
        row: mapping-like record with 'IsPredUseful' and 'true_positives' entries.
    """
    if row['IsPredUseful'] == 1:
        return row['true_positives']
    return 0

def generate_metrics_from_master_and_val_df(master_df, val_df):
  """Score knowledge extraction when only predicted-useful sentences count.

  The two frames are inner-joined on ``master_df['Sentence'] == val_df['text']``.
  True positives are kept only for rows where ``IsPredUseful == 1``; predicted
  and gold word totals are summed over all joined rows.

  Args:
    master_df: frame with 'Sentence' and 'IsPredUseful' columns.
    val_df: frame with 'text', 'true_positives', 'num_predicted' and
      'num_gold' columns.

  Returns:
    ``(precision, recall, f1)`` rounded to 2 decimals. F1 is computed from the
    unrounded precision and recall; any score with a zero denominator is 0.0.
  """
  # NOTE(review): duplicate sentences on either side multiply rows in an inner
  # join -- confirm 'Sentence' / 'text' are unique enough for these totals.
  combined_df = pd.merge(master_df, val_df, left_on='Sentence', right_on='text', how='inner')
  # Vectorised form of "true_positives if IsPredUseful == 1 else 0"; unlike a
  # row-wise apply it also works when the join produces no rows.
  combined_df['best_kc_pred_true_positives'] = combined_df['true_positives'].where(
      combined_df['IsPredUseful'] == 1, 0)
  total_num_gold = combined_df['num_gold'].sum()
  total_num_predicted = combined_df['num_predicted'].sum()
  total_true_pos = combined_df['best_kc_pred_true_positives'].sum()

  raw_precision = float(total_true_pos) / float(total_num_predicted) if total_num_predicted else 0.0
  raw_recall = float(total_true_pos) / float(total_num_gold) if total_num_gold else 0.0
  precision_recall_sum = raw_precision + raw_recall

  precision = round(raw_precision, 2)
  recall = round(raw_recall, 2)
  f1 = round(2 * raw_precision * raw_recall / precision_recall_sum, 2) if precision_recall_sum else 0.0

  print(f"Total true positives (predicted words overlap with gold words): {total_true_pos}")
  print(f"Total predicted words: {total_num_predicted}")
  print(f"Total gold words: {total_num_gold}")
  print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")
  
  return precision, recall, f1

In [None]:
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
master_df = pd.read_pickle(MAIN_DATA_DIR / 'full_master_updated.pkl')
# Per-example validation results saved earlier in this notebook.
val_df = pd.read_csv(str(LOG_SAVE_DIR / 'val_performance.csv'))

In [None]:
# Precision / recall / F1 when only sentences predicted useful contribute true positives.
generate_metrics_from_master_and_val_df(master_df, val_df)

Total true positives (predicted words overlap with gold words): 1044
Total predicted words: 2095
Total gold words: 2756
Precision: 0.5, Recall: 0.38, F1: 0.43


(0.5, 0.38, 0.43)