# Train UnifiedQA

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -qqq transformers datasets sentencepiece wandb rouge_score

[K     |████████████████████████████████| 5.5 MB 6.8 MB/s 
[K     |████████████████████████████████| 451 kB 56.6 MB/s 
[K     |████████████████████████████████| 1.3 MB 61.6 MB/s 
[K     |████████████████████████████████| 1.9 MB 58.2 MB/s 
[K     |████████████████████████████████| 182 kB 55.0 MB/s 
[K     |████████████████████████████████| 7.6 MB 44.8 MB/s 
[K     |████████████████████████████████| 115 kB 70.3 MB/s 
[K     |████████████████████████████████| 212 kB 73.5 MB/s 
[K     |████████████████████████████████| 127 kB 72.4 MB/s 
[K     |████████████████████████████████| 168 kB 64.4 MB/s 
[K     |████████████████████████████████| 182 kB 50.9 MB/s 
[K     |████████████████████████████████| 63 kB 1.5 MB/s 
[K     |████████████████████████████████| 166 kB 69.9 MB/s 
[K     |████████████████████████████████| 166 kB 69.5 MB/s 
[K     |████████████████████████████████| 162 kB 74.5 MB/s 
[K     |████████████████████████████████| 162 kB 70.2 MB/s 
[K     |██████████████████

In [3]:
import wandb
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/tokens .
    
Token: 
Add token as git credential? (Y/n) n
Token is valid.
Your token has been saved to /root/.huggingface/token
Login successful


In [5]:
!mkdir -p CausalQA/input
!cp -r /content/drive/MyDrive/CausalQA/input/* ./CausalQA/input/

In [6]:
!unzip ./CausalQA/input/original-splits.zip
!unzip ./CausalQA/input/random-splits.zip

Archive:  ./CausalQA/input/original-splits.zip
   creating: Webis-CausalQA-22-v-1.0/input/original-splits/
  inflating: Webis-CausalQA-22-v-1.0/input/original-splits/eli5_train_original_split.csv  
  inflating: Webis-CausalQA-22-v-1.0/input/original-splits/msmarco_valid_original_split.csv  
  inflating: Webis-CausalQA-22-v-1.0/input/original-splits/searchqa_train_original_split.csv  
  inflating: Webis-CausalQA-22-v-1.0/input/original-splits/newsqa_train_original_split.csv  
  inflating: Webis-CausalQA-22-v-1.0/input/original-splits/naturalquestions_valid_original_split.csv  
  inflating: Webis-CausalQA-22-v-1.0/input/original-splits/hotpotqa_valid_original_split.csv  
  inflating: Webis-CausalQA-22-v-1.0/input/original-splits/searchqa_valid_original_split.csv  
  inflating: Webis-CausalQA-22-v-1.0/input/original-splits/triviaqa_valid_original_split.csv  
  inflating: Webis-CausalQA-22-v-1.0/input/original-splits/naturalquestions_train_original_split.csv  
  inflating: Webis-CausalQA-2

In [7]:
import warnings
warnings.simplefilter('ignore')

In [8]:
import argparse
from argparse import Namespace
from collections import Counter
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    set_seed,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from rouge_score import rouge_scorer, scoring

In [9]:
# Join question and context with a literal backslash-n (" \\n ") as the separator.
def build_input(batch):
    """Add an ``"input"`` column built from the processed question and context.

    Args:
        batch: mapping with the parallel lists ``"question_processed"`` and
            ``"context_processed"``.

    Returns:
        The same ``batch`` object, with ``batch["input"]`` set to one string per
        row: ``question + " \\n " + context``, or just the question when the
        context is ``None``.
    """
    merged = []
    for question, context in zip(
        batch["question_processed"], batch["context_processed"]
    ):
        if context is None:
            # No context available: the question alone is the model input.
            merged.append(question)
        else:
            merged.append(question + " \\n " + context)
    batch["input"] = merged
    return batch

In [10]:
def generate_tokenizer(args):
  """Load the tokenizer and the seq2seq model named in ``args``.

  Args:
    args: object exposing ``tokenizer_ckp`` (checkpoint the tokenizer is loaded
      from) and ``checkpoint`` (checkpoint the model weights are loaded from).

  Returns:
    A ``(tokenizer, model)`` tuple.
  """
  print("Load tokenizer and model...")
  # The tokenizer is loaded first, then the model, as two separate checkpoints.
  return (
      T5Tokenizer.from_pretrained(args.tokenizer_ckp),
      T5ForConditionalGeneration.from_pretrained(args.checkpoint),
  )

In [11]:
print("Define tokenize_function_train...")
def tokenize_function_train(batches):
    """Tokenize a batch of inputs and answers into model features.

    Reads the notebook-level globals ``tokenizer`` and ``args``.

    Args:
        batches: mapping with the parallel lists ``"input"`` and ``"answer"``.

    Returns:
        The tokenizer output for ``batches["input"]`` (padded/truncated to
        ``args.source_length``) with an extra ``"labels"`` entry: the answer
        token ids (padded/truncated to ``args.target_length``) in which every
        pad token id is replaced by -100.
    """
    shared = dict(padding="max_length", truncation=True)
    features = tokenizer(batches["input"], max_length=args.source_length, **shared)
    targets = tokenizer(batches["answer"], max_length=args.target_length, **shared)

    pad_id = tokenizer.pad_token_id
    # -100 marks padded label positions; presumably so the loss skips them
    # (it is the default ignore index of PyTorch's cross-entropy) - confirm.
    features["labels"] = [
        [-100 if token == pad_id else token for token in row]
        for row in targets["input_ids"]
    ]
    return features

Define tokenize_function_train...


In [12]:
# Run configuration shared by the cells below.
# NOTE(review): tokenizer_ckp names the fine-tuned Hub repository while checkpoint
# names the base model - confirm that loading the tokenizer from there is intended.
# NOTE(review): output_directory is not read by any cell visible in this notebook.
args = Namespace(**{
    "tokenizer_ckp": "andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad",
    "checkpoint": "allenai/unifiedqa-v2-t5-base-1363200",
    "train_file": "Webis-CausalQA-22-v-1.0/input/original-splits/squad2_train_original_split.csv",
    "eval_file": "Webis-CausalQA-22-v-1.0/input/original-splits/squad2_valid_original_split.csv",
    "epochs": 5,
    # The original setup used source_length=2048; halved here.
    "source_length": 1024,
    "target_length": 100,
    "batch_size": 2,
    "seed": 42,
    "num_procs": 8,
    "output_directory": "Webis-CausalQA-22-v-1.0/models/original-splits/",
})

In [13]:
set_seed(args.seed)

In [14]:
tokenizer, model = generate_tokenizer(args)

Load tokenizer and model...


Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.43k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [15]:
print("Load dataset from csv file")
train_dataset = load_dataset("csv", data_files=args.train_file)["train"]
eval_dataset = load_dataset("csv", data_files=args.eval_file)["train"]

Load dataset from csv file




Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-07322aba3d916733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-07322aba3d916733/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-bd5f6a419939b4bd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-bd5f6a419939b4bd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
print("Map build_input")
train_dataset = train_dataset.map(
    build_input, batched=True, load_from_cache_file=False, num_proc=args.num_procs
)
train_dataset = train_dataset.remove_columns(["context", "context_processed"])

Map build_input
            

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
print("Map build_input")
eval_dataset = eval_dataset.map(
    build_input, batched=True, load_from_cache_file=False, num_proc=args.num_procs
)
eval_dataset = eval_dataset.remove_columns(["context", "context_processed"])

Map build_input
          

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [18]:
def get_most_common(batch):
  """Reduce each tab-separated answer string to its most frequent variant.

  Args:
    batch: mapping whose ``'answer'`` entry is a list of strings; each string
      may hold several answers separated by tab characters.

  Returns:
    The same ``batch`` object with ``batch['answer']`` replaced by a list
    holding, per row, the variant that occurs most often.
  """
  batch['answer'] = [
      Counter(raw.split('\t')).most_common(1)[0][0]
      for raw in batch['answer']
  ]
  return batch

In [19]:
eval_dataset = eval_dataset.map(get_most_common, batched=True, num_proc=args.num_procs)

           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [20]:
print("Map tokenize_function_train")
train_dataset = train_dataset.map(
    tokenize_function_train,
    batched=True,
    load_from_cache_file=False,
    num_proc=args.num_procs,
)
train_dataset = train_dataset.remove_columns(["input", "answer"])

Map tokenize_function_train
          

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
print("Map tokenize_function_train")
eval_dataset = eval_dataset.map(
    tokenize_function_train,
    batched=True,
    load_from_cache_file=False,
    num_proc=args.num_procs,
)
eval_dataset = eval_dataset.remove_columns(["input", "answer"])

Map tokenize_function_train
           

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [22]:
print("DataCollator...")
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

DataCollator...


In [23]:
# Logging interval in optimizer steps: one twentieth of the batches in the training set.
# Clamp to at least 1 - for fewer than 20 * batch_size rows the integer division
# yields 0, which is not a usable interval for step-based logging.
log_steps = max(1, (len(train_dataset) // args.batch_size) // 20)

In [24]:
log_steps

73

In [25]:
print("Set TrainingArguments...")
# Keyword options for the training run, gathered in one place before construction.
training_options = dict(
    # Local output folder; with push_to_hub=True a Hub repository of the same
    # name is cloned into it when the trainer is created.
    output_dir="andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad",
    report_to='wandb',
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    num_train_epochs=args.epochs,
    save_strategy='epoch',
    # No eval_steps is given; the training log reports validation loss at the
    # same cadence as logging_steps.
    evaluation_strategy='steps',
    logging_strategy='steps',
    logging_steps=log_steps,
    push_to_hub=True,
    log_level='error',
    seed=args.seed,
)
train_args = Seq2SeqTrainingArguments(**training_options)

Set TrainingArguments...


In [26]:
print("Seq2SeqTrainer...")
trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Seq2SeqTrainer...


Cloning https://huggingface.co/andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad into local empty directory.


Download file pytorch_model.bin:   0%|          | 3.43k/850M [00:00<?, ?B/s]

Download file training_args.bin:  98%|#########8| 3.48k/3.55k [00:00<?, ?B/s]

Download file spiece.model:   0%|          | 3.44k/773k [00:00<?, ?B/s]

Clean file training_args.bin:  28%|##8       | 1.00k/3.55k [00:00<?, ?B/s]

Clean file spiece.model:   0%|          | 1.00k/773k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/850M [00:00<?, ?B/s]

In [27]:
print("Train...")
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mandreaschandra[0m. Use [1m`wandb login --relogin`[0m to force relogin


Train...


Step,Training Loss,Validation Loss
73,0.7378,1.18369
146,0.6984,0.891821
219,0.4511,0.83421
292,0.4696,0.764182
365,0.295,0.799609
438,0.266,0.777346
511,0.2372,0.859212
584,0.2881,0.843963
657,0.2578,0.830636
730,0.2733,0.822849


Step,Training Loss,Validation Loss
73,0.7378,1.18369
146,0.6984,0.891821
219,0.4511,0.83421
292,0.4696,0.764182
365,0.295,0.799609
438,0.266,0.777346
511,0.2372,0.859212
584,0.2881,0.843963
657,0.2578,0.830636
730,0.2733,0.822849


TrainOutput(global_step=7395, training_loss=0.12706294928030681, metrics={'train_runtime': 8424.095, 'train_samples_per_second': 1.755, 'train_steps_per_second': 0.878, 'total_flos': 1.80068848238592e+16, 'train_loss': 0.12706294928030681, 'epoch': 5.0})

In [28]:
trainer.push_to_hub(commit_message="Training Completed!")

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad
   4333271..6cadc45  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad
   4333271..6cadc45  main -> main



'https://huggingface.co/andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad/commit/6cadc45bbef81bfb52be90d7002e6d1c34eaf538'

# Load the fine-tuned model

In [None]:
args.model = "andreaschandra/unifiedqa-v2-t5-base-1363200-finetuned-causalqa-squad"

In [None]:
tokenizer = T5Tokenizer.from_pretrained(args.model)

In [None]:
finetuned_model = T5ForConditionalGeneration.from_pretrained(args.model)

In [None]:
valid_dataset = load_dataset("csv", data_files="Webis-CausalQA-22-v-1.0/input/original-splits/squad2_valid_original_split.csv")["train"]

In [None]:
valid_dataset

In [None]:
valid_dataset = valid_dataset.map(
    build_input, batched=True, load_from_cache_file=False, num_proc=args.num_procs
)
valid_dataset = valid_dataset.remove_columns(["context", "context_processed"])

In [None]:
valid_dataset["input"][0]

In [None]:
# Collapse tab-separated reference answers to the most frequent variant before
# tokenizing, the same preprocessing applied to eval_dataset (built from the same
# CSV file). Without it the labels contain every answer variant concatenated.
valid_dataset = valid_dataset.map(
    get_most_common, batched=True, num_proc=args.num_procs
)
# Tokenize inputs and answers, then drop the raw text columns.
valid_dataset = valid_dataset.map(
    tokenize_function_train,
    batched=True,
    load_from_cache_file=False,
    num_proc=args.num_procs,
)
valid_dataset = valid_dataset.remove_columns(["input", "answer"])

In [None]:
valid_dataset

In [None]:
sample = valid_dataset[0]

In [None]:
from pprint import pprint

In [None]:
sample['id']

In [None]:
sample['question']

In [None]:
sample['question_processed']

In [None]:
len(sample['input_ids'])

In [None]:
print(len(sample['attention_mask']))

In [None]:
print(sample['labels'])

In [None]:
tokenizer.decode([97, 42, 628, 97, 42, 628, 97, 42, 628, 1])

In [None]:
sample.keys()

In [None]:
import torch

In [None]:
# Build a batch of size one from the sample, keeping only the model inputs.
# Use the GPU when one is present and fall back to the CPU otherwise, instead of
# failing on runtimes without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
sample_final = {
    k: torch.LongTensor([v]).to(device)  # wrap in a list to add the batch dimension
    for k, v in sample.items()
    if k in ('input_ids', 'attention_mask')
}

In [None]:
sample_final.keys()

In [None]:
max_label = sample['labels'].index(1) + 1

In [None]:
sample.keys()

In [None]:
# Generate with the checkpoint reloaded from the Hub (`finetuned_model`), not the
# `model` object left over from training, so this section actually checks the
# published weights and also runs on a fresh kernel.
# The model is moved to the device that already holds the input tensors.
finetuned_model = finetuned_model.to(sample_final['input_ids'].device)
# Allow answers as long as the tokenized targets (args.target_length) rather than
# the library's default generation length.
out = finetuned_model.generate(**sample_final, max_length=args.target_length)
print("Question:", sample['question'])
# skip_special_tokens drops pad/eos markers so reference and prediction are comparable.
print("Reference:", tokenizer.decode(sample['labels'][:max_label], skip_special_tokens=True))
print("Predict:", tokenizer.decode(out[0], skip_special_tokens=True))