# Train UnifiedQA

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -qqqq transformers datasets sentencepiece wandb

In [None]:
import wandb
wandb.login()

In [None]:
!huggingface-cli login

In [None]:
!mkdir -p CausalQA/input
!cp -r /content/drive/MyDrive/CausalQA/input/* ./CausalQA/input/

In [None]:
!unzip ./CausalQA/input/original-splits.zip
!unzip ./CausalQA/input/random-splits.zip

In [1]:
import argparse
from argparse import Namespace
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    set_seed,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

In [2]:
# concatenate question+context with \\n as a separator
def build_input(batch):
    """Add an ``input`` column that joins each question with its context.

    Each processed question is joined to its processed context with a
    literal backslash-n token surrounded by single spaces. Rows whose
    context is ``None`` use the question on its own.

    Args:
        batch: Mapping holding the parallel sequences
            ``question_processed`` and ``context_processed``.

    Returns:
        The same ``batch`` object, with the new ``input`` list stored on it.
    """
    joined = []
    for question, context in zip(
        batch["question_processed"], batch["context_processed"]
    ):
        if context is None:
            joined.append(question)
        else:
            joined.append(question + " \\n " + context)
    batch["input"] = joined
    return batch

In [3]:
def generate_tokenizer(args):
  """Load the T5 tokenizer and seq2seq model for ``args.checkpoint``.

  Args:
    args: Object exposing a ``checkpoint`` attribute; its value is passed to
      ``from_pretrained`` for both the tokenizer and the model.

  Returns:
    A ``(tokenizer, model)`` tuple.
  """
  print("Load tokenizer and model...")
  checkpoint = args.checkpoint
  # Tuple elements are evaluated left to right: tokenizer first, then model.
  return (
      T5Tokenizer.from_pretrained(checkpoint),
      T5ForConditionalGeneration.from_pretrained(checkpoint),
  )

In [4]:
print("Define tokenize_function_train...")
def tokenize_function_train(batches):
    """Tokenize a batch of inputs and answers into model-ready features.

    Relies on the notebook-level ``tokenizer`` and ``args`` objects rather
    than taking them as parameters.

    Args:
        batches: Mapping with an ``input`` sequence (source texts) and an
            ``answer`` sequence (target texts).

    Returns:
        The tokenizer output for the inputs, padded/truncated to
        ``args.source_length``, with an added ``labels`` entry built from the
        answers (padded/truncated to ``args.target_length``).
    """
    shared_kwargs = {"padding": "max_length", "truncation": True}
    model_inputs = tokenizer(
        batches["input"], max_length=args.source_length, **shared_kwargs
    )
    targets = tokenizer(
        batches["answer"], max_length=args.target_length, **shared_kwargs
    )

    # Swap padding ids for -100, the label value that Hugging Face seq2seq
    # models conventionally exclude from the loss.
    pad_id = tokenizer.pad_token_id
    labels = []
    for answer_ids in targets["input_ids"]:
        labels.append([-100 if tok == pad_id else tok for tok in answer_ids])

    model_inputs["labels"] = labels
    return model_inputs

Define tokenize_function_train...


In [5]:
# Run configuration shared by the cells below.
args = Namespace()
# Pretrained checkpoint handed to from_pretrained for tokenizer and model.
args.checkpoint = "allenai/unifiedqa-v2-t5-base-1363200"
# CSV files read with load_dataset for training and evaluation.
args.train_file = "Webis-CausalQA-22-v-1.0/input/original-splits/squad2_train_original_split.csv"
args.eval_file = "Webis-CausalQA-22-v-1.0/input/original-splits/squad2_valid_original_split.csv"
args.epochs = 5
# Maximum token lengths for source / target sequences
# (the source length was originally 2048).
args.source_length = 1024
args.target_length = 100
# Per-device batch size for both training and evaluation.
args.batch_size = 2
args.seed = 42
# Worker processes used by Dataset.map.
args.num_procs = 8
args.output_directory = "Webis-CausalQA-22-v-1.0/models/original-splits/"

In [6]:
set_seed(args.seed)

In [7]:
tokenizer, model = generate_tokenizer(args)

Load tokenizer and model...


In [8]:
print("Load dataset from csv file")
train_dataset = load_dataset("csv", data_files=args.train_file)["train"]

Load dataset from csv file




  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
eval_dataset = load_dataset("csv", data_files=args.eval_file)["train"]



  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
print("Map build_input")
# Add the "input" column, then drop the context columns it was built from.
train_dataset = train_dataset.map(
    build_input,
    batched=True,
    load_from_cache_file=False,
    num_proc=args.num_procs,
).remove_columns(["context", "context_processed"])

Map build_input
            

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
print("Map build_input")
# Same preparation as the training split: build "input", drop the contexts.
eval_dataset = eval_dataset.map(
    build_input,
    batched=True,
    load_from_cache_file=False,
    num_proc=args.num_procs,
).remove_columns(["context", "context_processed"])

Map build_input
           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
print("Map tokenize_function_train")
# Tokenize inputs/answers, then drop the raw text columns that were encoded.
train_dataset = train_dataset.map(
    tokenize_function_train,
    batched=True,
    load_from_cache_file=False,
    num_proc=args.num_procs,
).remove_columns(["input", "answer"])

Map tokenize_function_train
           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
print("Map tokenize_function_train")
# Tokenize the evaluation split the same way as the training split.
eval_dataset = eval_dataset.map(
    tokenize_function_train,
    batched=True,
    load_from_cache_file=False,
    num_proc=args.num_procs,
).remove_columns(["input", "answer"])

Map tokenize_function_train
           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
print("DataCollator...")
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Aim for about ten log points per pass over the training set (assuming a
# single device). Clamp to 1 because the integer division yields 0 when the
# dataset has fewer than 10 * batch_size rows, and a zero step interval is
# not a usable value for logging_steps.
steps_per_epoch = len(train_dataset) // args.batch_size
log_steps = max(1, steps_per_epoch // 10)

DataCollator...


In [15]:
log_steps

147

In [16]:
print("Set TrainingArguments...")
train_args = Seq2SeqTrainingArguments(
    # Local output directory; push_to_hub=True enables uploading to the Hub.
    output_dir="andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad",
    push_to_hub=True,
    # Schedule and reproducibility, taken from the shared `args` namespace.
    num_train_epochs=args.epochs,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    seed=args.seed,
    # Checkpoint once per epoch; evaluate and log on a step interval.
    # NOTE(review): no eval_steps is given, so the evaluation interval is
    # presumably taken from logging_steps -- confirm against the installed
    # transformers version.
    save_strategy="epoch",
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=log_steps,
    # Reporting and verbosity.
    report_to="wandb",
    log_level="error",
)

Set TrainingArguments...


In [17]:
print("Seq2SeqTrainer...")
trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Seq2SeqTrainer...


/content/andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad is already a clone of https://huggingface.co/andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad. Make sure you pull the latest changes with `repo.git_pull()`.


In [18]:
print("Train...")
trainer.train()

Train...


ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mandreaschandra[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.1892,3.760969
2,0.1514,4.169361
3,0.0807,4.554286
4,0.0381,4.665435
5,0.0377,4.755996


TrainOutput(global_step=7395, training_loss=0.12706295333253603, metrics={'train_runtime': 5303.7895, 'train_samples_per_second': 2.788, 'train_steps_per_second': 1.394, 'total_flos': 1.80068848238592e+16, 'train_loss': 0.12706295333253603, 'epoch': 5.0})

In [19]:
trainer.push_to_hub(commit_message="Training Completed!")

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/850M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad
   428ca03..87fae07  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad
   428ca03..87fae07  main -> main

To https://huggingface.co/andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad
   87fae07..0aae5d4  main -> main

   87fae07..0aae5d4  main -> main



'https://huggingface.co/andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad/commit/87fae0706b7e77e07c75bd904edb2c3a8bd6f693'

# Load fine-tuned model

In [29]:
args.model = "andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad"

In [30]:
tokenizer = T5Tokenizer.from_pretrained(args.model)

In [32]:
finetuned_model = T5ForConditionalGeneration.from_pretrained(args.model)

In [34]:
# Reuse the evaluation CSV path from the shared config instead of repeating
# the literal, so the check below always runs on the split used for training-time evaluation.
valid_dataset = load_dataset("csv", data_files=args.eval_file)["train"]



  0%|          | 0/1 [00:00<?, ?it/s]

In [35]:
valid_dataset

Dataset({
    features: ['id', 'question', 'question_processed', 'context', 'context_processed', 'answer', 'answer_processed'],
    num_rows: 252
})

In [36]:
# Build the "input" column, then drop the context columns it was built from.
valid_dataset = valid_dataset.map(
    build_input,
    batched=True,
    load_from_cache_file=False,
    num_proc=args.num_procs,
).remove_columns(["context", "context_processed"])

             

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [37]:
valid_dataset["input"][0]

'what are two factors that directly effect how powerful a turing machine may or may not be? \\n (computational_complexity_theory) many types of turing machines are used to define complexity classes, such as deterministic turing machines, probabilistic turing machines, non-deterministic turing machines, quantum turing machines, symmetric turing machines and alternating turing machines. they are all equally powerful in principle, but when resources (such as time or space) are bounded, some of these may be more powerful than others.'

In [38]:
# Tokenize with the same function used for training, then drop the raw text.
valid_dataset = valid_dataset.map(
    tokenize_function_train,
    batched=True,
    load_from_cache_file=False,
    num_proc=args.num_procs,
).remove_columns(["input", "answer"])

           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [39]:
valid_dataset

Dataset({
    features: ['id', 'question', 'question_processed', 'answer_processed', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 252
})

In [41]:
sample = valid_dataset[0]

In [42]:
from pprint import pprint

In [43]:
sample['id']

'56e1b00ce3433e140042309f'

In [44]:
sample['question']

'What are two factors that directly effect how powerful a Turing machine may or may not be?'

In [45]:
sample['question_processed']

'what are two factors that directly effect how powerful a turing machine may or may not be?'

In [46]:
len(sample['input_ids'])

1024

In [47]:
print(len(sample['attention_mask']))

1024


In [48]:
print(sample['labels'])

[97, 42, 628, 97, 42, 628, 97, 42, 628, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [49]:
tokenizer.decode([97, 42, 628, 97, 42, 628, 97, 42, 628, 1])

'time or space time or space time or space</s>'

In [50]:
sample.keys()

dict_keys(['id', 'question', 'question_processed', 'answer_processed', 'input_ids', 'attention_mask', 'labels'])

In [51]:
import torch

In [57]:
# Wrap the encoder inputs as batch-of-one tensors for generate().
# Pick the device at run time so the cell also works on a CPU-only runtime
# instead of failing on a hard-coded 'cuda'.
device = "cuda" if torch.cuda.is_available() else "cpu"
sample_final = {
    k: torch.LongTensor([v]).to(device)
    for k, v in sample.items()
    if k in ("input_ids", "attention_mask")
}

In [58]:
sample_final.keys()

dict_keys(['input_ids', 'attention_mask'])

In [59]:
# Slice point just past the first end-of-sequence token in the labels; use
# the tokenizer's own EOS id rather than the magic number 1.
max_label = sample['labels'].index(tokenizer.eos_token_id) + 1

In [60]:
sample.keys()

dict_keys(['id', 'question', 'question_processed', 'answer_processed', 'input_ids', 'attention_mask', 'labels'])

In [61]:
# Generate with the checkpoint reloaded in this section (`finetuned_model`),
# not the in-memory `model` left over from the training cells, so the check
# does not depend on hidden kernel state. Move it to wherever the input
# tensors live before generating.
finetuned_model.to(sample_final["input_ids"].device)
out = finetuned_model.generate(**sample_final)
print("Question:", sample['question'])
# skip_special_tokens drops <pad> and </s> so the two strings compare cleanly.
print("Reference:", tokenizer.decode(sample['labels'][:max_label], skip_special_tokens=True))
print("Predict:", tokenizer.decode(out[0], skip_special_tokens=True))

Question: What are two factors that directly effect how powerful a Turing machine may or may not be?
Reference: time or space time or space time or space</s>
Predict: <pad> time or space</s>


