In [111]:
#@title Mount your Google Drive
# If you run this notebook locally or on a cluster (i.e. not on Google Colab)
# you can delete this cell which is specific to Google Colab. You may also
# change the paths for data/logs in Arguments below.
%matplotlib inline
%load_ext autoreload
%autoreload 2

# from google.colab import drive
# drive.mount('/content/gdrive')

In [21]:
import sys
import os
import shutil
import warnings
import pandas as pd
import numpy as np
import json
import torch

from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from pathlib import Path
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer

In [38]:
# Paths

# PROJ_DIR = Path('/content/gdrive/MyDrive/IFT6135/assignment1_release')
# LOG_DIR = PROJ_DIR / 'logs'

# LOG_DIR.mkdir(parents=True, exist_ok=True) 

# if str(PROJ_DIR) not in sys.path:
#     sys.path.insert(0, str(PROJ_DIR))

DATA_DIR = Path().cwd().parent / 'youcook2'
ANNOTATED_DF_PATH = str(DATA_DIR / 'reviewed_0812.csv')
TRAIN_FRAC = 0.7
MAX_INPUT_LENGTH = 1024
MAX_SUMMARY_LENGTH = 128

RANDOM_SEED = 23456
np.random.seed(RANDOM_SEED)

In [3]:
df = pd.read_csv(ANNOTATED_DF_PATH)

In [4]:
key_sentences = df[df['IsUsefulSentence'] == 1]

In [5]:
len(key_sentences)

3569

In [6]:
indices = list(range(len(key_sentences)))
np.random.shuffle(indices)
train_len = int(TRAIN_FRAC * len(key_sentences))

In [7]:
sentences, instructions = key_sentences['Sentence'].to_numpy() , key_sentences['Key steps'].to_numpy()

In [8]:
train_sentences, train_instructions = sentences[:train_len], instructions[:train_len]
val_sentences, val_instructions = sentences[train_len:], instructions[train_len:]

In [9]:
val_instructions

array(['take egg, Italian herb, granulated garlic, red pepper flakes',
       'add black pepper, white pepper and salt', 'add parmesan cheese',
       ..., 'fry for 20 minutes', 'drain chicken', 'season chicken'],
      dtype=object)

In [10]:
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")

Found cached dataset billsum (/Users/jonathanlim/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc)


In [11]:
billsum = billsum.train_test_split(test_size=0.2)

Loading cached split indices for dataset at /Users/jonathanlim/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc/cache-f1aa08105ba921e4.arrow and /Users/jonathanlim/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc/cache-a665f04d753b781c.arrow


In [65]:
for data in billsum['train']:
    print(data)
    break

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nThe Legislature finds and declares all of the following:\n(a)  In 1977, the United States Food and Drug Administration (FDA) concluded that feeding livestock low doses of antibiotics from antibiotic classes that are used in human disease treatment could promote the development of antibiotic-resistance in bacteria and pose a risk to human health. The FDA, however, did not act in response to these findings, despite laws requiring the agency to do so.\n(b)  The FDA issued voluntary guidance in December 2013 on the nontherapeutic use of antibiotics; however, this guidance is unlikely to significantly reduce the nontherapeutic use of antibiotics in livestock because of a broad exemption allowing for the use of antibiotics for disease prevention.\n(c)  Not only do antibiotic-resistant bacteria affect the health of our society, but they also have a monetary impact. In 1998, the National Academy of Sciences n

In [63]:
from transformers import AutoTokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=MAX_INPUT_LENGTH)

In [60]:
prefix = "summarize: "


def preprocess_function(examples, max_input_length=MAX_INPUT_LENGTH, max_summary_length=MAX_SUMMARY_LENGTH):
    inputs = [prefix + doc for doc in examples["text"]]
    
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=MAX_SUMMARY_LENGTH, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [72]:
tokenizer('hi how are you', max_length=MAX_INPUT_LENGTH, truncation=True)

{'input_ids': [7102, 149, 33, 25, 1], 'attention_mask': [1, 1, 1, 1, 1]}

In [93]:
list(train_instructions)[0]

'place dough in caputo flour'

In [100]:
def generate_encodings(sentences, instructions, tokenizer, max_input_length=MAX_INPUT_LENGTH, max_summary_length=MAX_SUMMARY_LENGTH):
    examples = []
    for sentence, instruction in zip(list(sentences), list(instructions)):
        try:            
            sentence = str(sentence)
            instruction = str(instruction)
            example = {'text': sentence, 'summary': instruction}
            model_inputs = tokenizer(sentence, max_length=max_input_length, truncation=True)
            labels = tokenizer(text_target=instruction, max_length=max_summary_length, truncation=True)
            model_inputs['labels'] = labels['input_ids']
            example['input_ids'] = model_inputs['input_ids']
            example['attention_mask'] = model_inputs['attention_mask']
            example['labels'] = model_inputs['labels']
            examples.append(example)
        except Exception as e:
            print(sentence, instruction)
            continue
    
    return examples

In [105]:
class YouCookDatasetForKnowledgeExtraction(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        item = self.encodings[idx]
        item['input_ids'] = torch.tensor(item['input_ids'])
        item['attention_mask'] = torch.tensor(item['attention_mask'])
        item['labels'] = torch.tensor(item['labels'])
        return item

    def __len__(self):
        return len(self.encodings)

In [108]:
train_encodings = generate_encodings(train_sentences, train_instructions, tokenizer)
val_encodings = generate_encodings(val_sentences, val_instructions, tokenizer)

train_dataset = YouCookDatasetForKnowledgeExtraction(train_encodings)
val_dataset = YouCookDatasetForKnowledgeExtraction(val_encodings)

In [62]:
# tokenized_billsum = billsum.map(preprocess_function, batched=True)
# index = 1
# for idx, datapoint in enumerate(tokenized_billsum['train']):
#     if idx == index:
#         print(datapoint)
#         break
        
# print(len(datapoint['text']))
# print(len(datapoint['input_ids']))
# print(len(datapoint['attention_mask']))
# print(len(datapoint['labels']))

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [16]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [17]:
import evaluate

rouge = evaluate.load("rouge")

In [18]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [19]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="youcook_t5_small",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()