In [1]:
from SQP1Dataset import initialize_datasets, SQP1Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5Tokenizer, T5Model, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import Optimizer, AdamW
from tqdm.notebook import tqdm

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained('t5-base')

datasets = initialize_datasets('../data/train.json', '../data/dev.json', tokenizer)
print(datasets['train'][0])
train_dataloader = DataLoader(datasets['train'],
                                   batch_size=64,
                                   shuffle=False,
                                   collate_fn=SQP1Dataset.collate_fn)

validation_dataloader = DataLoader(datasets['dev'],
                                   batch_size=64,
                                   shuffle=False,
                                   collate_fn=SQP1Dataset.collate_fn)

# print(validation_dataloader)
# for data in validation_dataloader:
#   print(data)
batch = next(iter(validation_dataloader))
print(batch['input_ids'])

print(f"{len(datasets['dev'])}")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


SQP1Example(question='Are more people today related to Genghis Khan than Julius Caesar?', decompositions=['How many kids did Julius Caesar have?', 'How many kids did Genghis Khan have?', 'Is #2 greater than #1?'])
['Will the Albany in Georgia reach a hundred thousand occupants before the one in New York?', 'Is the language used in Saint Vincent and the Grenadines rooted in English?', 'Is greed the most prevalent of the Seven Deadly Sins?', 'Would the top of Mount Fuji stick out of the Sea of Japan? ', "Was Lil Jon's top ranked Billboard song a collaboration with a member of The Lox?", 'Is Miami a city on the American West Coast?', 'Can the Swiss Guard fill the Virginia General Assembly chairs?', 'Did any country in Portuguese Colonial War share Switzerlands role in WWII?', 'Would a Pict be confused by Old English?', 'Could Lil Wayne legally operate a vehicle on his own at the beginning of his career?', 'Are you likely to find a crucifix in Karachi?', "Was a person sold a Creative Commo

In [7]:
#### THIS WORKS WITHOUT ERRORS
input_question = "Are more people today related to Genghis Khan than Julius Caesar?"
decompositions = [
            "How many kids did Julius Caesar have?",
            "How many kids did Genghis Khan have?",
            "Is #2 greater than #1?"
        ]

inputs = tokenizer(input_question, return_tensors="pt", padding=True, truncation=True).input_ids
outputs = tokenizer("<SEP>".join(decompositions), return_tensors="pt", padding=True, truncation=True).input_ids
print(inputs)
print(outputs)

model.train()
# Fine-tuning
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
for epoch in range(5):
    optimizer.zero_grad()
    oputs = model(input_ids=inputs, labels=outputs)
    loss = oputs.loss
    loss.backward()
    optimizer.step()

# evaluate model after training on one example 5 times
model.eval()
predictions = model.generate(input_ids=inputs)
print(tokenizer.decode(predictions[0], skip_special_tokens=True))


tensor([[ 1521,    72,   151,   469,  1341,    12,  5945,  5649,     7, 14420,
           145,  9983,   302, 26218,    58,     1]])
tensor([[  571,   186,  1082,   410,  9983,   302, 26218,    43,    58,     2,
           134,  8569,  3155,  7825,   186,  1082,   410,  5945,  5649,     7,
         14420,    43,    58,     2,   134,  8569,  3155,   196,     7, 15493,
          2123,   145,  7172,    58,     1]])




Are more people related to Julius Caesar than Julius Caesar?
['Are more people related to Julius Caesar than Julius Caesar?']
['Are more people related to Julius Caesar than Julius Caesar?']


In [2]:
def train_one_epoch(model: nn.Module, train_dataloader: DataLoader, optimizer: Optimizer, epoch: int):

    model.train()
    with tqdm(train_dataloader, desc=f"Train Ep {epoch}", total=len(train_dataloader)) as tq:
        for batch in tq:
            inputs = batch['input_ids'].input_ids
            outputs = batch['target_ids'].input_ids

            loss = model(input_ids=inputs, labels=outputs).loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


In [None]:
def evaluate(model: nn.Module, dataloader: DataLoader):
  model.eval()
  with torch.no_grad():
    with tqdm(dataloader, desc=f"Train Ep {epoch}", total=len(train_dataloader)) as tq:
      for batch in tq:
        inputs = batch['input_ids'].input_ids
        predictions = model.generate(input_ids=inputs)

        # text predictions
        print(tokenizer.decode(predictions[0], skip_special_tokens=True))


In [3]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
train_one_epoch(model, train_dataloader, optimizer, 1)

Train Ep 1:   0%|          | 0/33 [00:00<?, ?it/s]

['Are more people today related to Genghis Khan than Julius Caesar?', 'Could the members of The Police perform lawful arrests?', 'Would a Monoamine Oxidase candy bar cheer up a depressed friend?', 'Would a dog respond to bell before Grey seal?', 'Is a pound sterling valuable?', 'Is shrimp scampi definitely free of plastic?', 'Do the anchors on Rede Globo speak Chinese?', 'Is a Boeing 737 cost covered by Wonder Woman (2017 film) box office receipts?', 'Can you buy Casio products at Petco?', 'Did the Space Race use relay batons?', 'Are Christmas trees dissimilar to deciduous trees?', 'Does Biochemistry study gluons?', 'Did land owners elect their rulers in the Kingdom of Hungary?', 'Would Nancy Pelosi publicly denounce abortion?', 'Does Dragon Ball shows and movies fall short of Friday 13th number of projects?', 'Would a student of the class of 2017 have amnesia about 9/11?', 'Would a psychic who admits to hot reading be trustworthy?', 'Is average number of peas in a pod enough commas fo

KeyboardInterrupt: 