# Train UnifiedQA

In [None]:
!pip install transformers datasets sentencepiece

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Create the full destination path first: `cp` does not create ./CausalQA/input/
# itself, and -p keeps this cell re-runnable when the directory already exists.
!mkdir -p ./CausalQA/input
!cp -r /content/drive/MyDrive/CausalQA/input/* ./CausalQA/input/

In [None]:
!unzip ./CausalQA/input/original-splits.zip
!unzip ./CausalQA/input/random-splits.zip

In [None]:
import argparse
from argparse import Namespace
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    set_seed,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

In [None]:
# concatenate question+context with \\n as a separator
def build_input(batch):
    """Add an "input" column joining each processed question with its context.

    Args:
        batch: Mapping with parallel "question_processed" and
            "context_processed" sequences.

    Returns:
        The same ``batch`` object, with ``batch["input"]`` set to a list in
        which every entry is ``question + " \\n " + context``, or the bare
        question when the context is ``None``.
    """
    combined = []
    for question, context in zip(
        batch["question_processed"], batch["context_processed"]
    ):
        if context is None:
            # No context available: the question alone is the model input.
            combined.append(question)
        else:
            combined.append(question + " \\n " + context)
    batch["input"] = combined
    return batch

In [None]:
def generate_tokenizer(args):
    """Load the tokenizer and the seq2seq model for ``args.checkpoint``.

    Args:
        args: Object exposing a ``checkpoint`` attribute; its value is passed
            to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    print("Load tokenizer and model...")
    checkpoint = args.checkpoint
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        T5Tokenizer.from_pretrained(checkpoint),
        T5ForConditionalGeneration.from_pretrained(checkpoint),
    )

In [None]:
print("Define tokenize_function_train...")
def tokenize_function_train(batches):
    """Tokenize a batch of inputs and answers into fixed-length features.

    Reads the notebook-level ``tokenizer`` and ``args`` objects, so both must
    be defined before this function is called.

    Args:
        batches: Mapping with an "input" and an "answer" sequence of strings.

    Returns:
        The tokenizer output for ``batches["input"]`` (padded/truncated to
        ``args.source_length``) with an added "labels" entry: the answer token
        ids (padded/truncated to ``args.target_length``) in which every pad
        token id is replaced by -100.
    """
    def _encode(texts, max_length):
        # Both sides use the same fixed-length padding and truncation policy.
        return tokenizer(
            texts,
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )

    features = _encode(batches["input"], args.source_length)
    answer_ids = _encode(batches["answer"], args.target_length)["input_ids"]

    pad_id = tokenizer.pad_token_id
    # -100 marks padding positions in the labels; presumably this is the value
    # the training loss ignores (confirm against the library documentation).
    features["labels"] = [
        [-100 if token == pad_id else token for token in ids]
        for ids in answer_ids
    ]
    return features

Define tokenize_function_train...


In [None]:
def _model_output_path(output_directory, train_file):
    """Join ``output_directory`` with the dataset prefix of ``train_file``.

    The prefix is the part of the file name before the first underscore,
    e.g. ``squad2`` for ``.../squad2_train_original_split.csv``.
    """
    import os  # local import keeps this cell self-contained

    # Look only at the file name, so underscores in directory names cannot
    # shift the slice, and a name without "_" is kept whole (minus extension).
    stem = os.path.splitext(os.path.basename(train_file))[0]
    return os.path.join(output_directory, stem.split("_", 1)[0])


def train_unifiedqa(args):
    """Fine-tune the notebook-level ``model`` on the CSV file in ``args``.

    Reads the module-level ``tokenizer`` and ``model`` objects as well as the
    ``build_input`` and ``tokenize_function_train`` helpers, so those must be
    defined before calling this function.

    Args:
        args: Namespace-like object providing ``train_file``, ``num_procs``,
            ``steps``, ``batch_size``, ``seed`` and ``output_directory``.

    Side effects:
        Trains ``model`` through ``Seq2SeqTrainer`` and saves the result under
        ``output_directory/<dataset prefix of train_file>``.
    """
    print("Load dataset from csv file")
    train_dataset = load_dataset("csv", data_files=args.train_file)["train"]

    print("Map build_input")
    train_dataset = train_dataset.map(
        build_input, batched=True, load_from_cache_file=False, num_proc=args.num_procs
    )
    # The raw context columns are no longer needed once "input" exists.
    train_dataset = train_dataset.remove_columns(["context", "context_processed"])

    print("Map tokenize_function_train")
    train_dataset = train_dataset.map(
        tokenize_function_train,
        batched=True,
        load_from_cache_file=False,
        num_proc=args.num_procs,
    )
    # Drop the text columns now that they have been tokenized.
    train_dataset = train_dataset.remove_columns(["input", "answer"])

    print("DataCollator...")
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    # Log roughly ten times per run; clamp to 1 so runs shorter than ten steps
    # do not request a logging interval of 0.
    log_steps = max(1, args.steps // 10)

    print("Set TrainingArguments...")
    train_args = Seq2SeqTrainingArguments(
        "models",
        per_device_train_batch_size=args.batch_size,
        max_steps=args.steps,
        seed=args.seed,
        save_strategy="no",
        logging_strategy="steps",
        logging_steps=log_steps,
        save_total_limit=1,
    )

    print("Seq2SeqTrainer...")
    trainer = Seq2SeqTrainer(
        model=model,
        args=train_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    print("Train...")
    _ = trainer.train()

    print("Save model...")
    trainer.save_model(_model_output_path(args.output_directory, args.train_file))

In [None]:
# Run configuration shared by the training and evaluation cells below.
args = Namespace(
    # Model checkpoint and data locations
    checkpoint="allenai/unifiedqa-v2-t5-base-1363200",
    train_file="Webis-CausalQA-22-v-1.0/input/original-splits/squad2_train_original_split.csv",
    output_directory="Webis-CausalQA-22-v-1.0/models/original-splits/",
    # Maximum tokenized lengths; source_length was 2048 in the original setup
    source_length=1024,
    target_length=100,
    # Training schedule
    steps=6000,
    batch_size=2,
    seed=42,
    # Number of worker processes passed to Dataset.map
    num_procs=8,
)

In [None]:
set_seed(args.seed)

In [None]:
tokenizer, model = generate_tokenizer(args)

Load tokenizer and model...


In [None]:
train_unifiedqa(args)

In [None]:
!cp -r Webis-CausalQA-22-v-1.0/models/original-splits/squad2 /content/drive/MyDrive/CausalQA/models/

# Load the fine-tuned model

In [None]:
tokenizer = T5Tokenizer.from_pretrained(args.checkpoint)

In [None]:
finetuned_model = T5ForConditionalGeneration.from_pretrained('Webis-CausalQA-22-v-1.0/models/original-splits/squad2')

In [None]:
args.train_file

'Webis-CausalQA-22-v-1.0/input/original-splits/squad2_train_original_split.csv'

In [None]:
valid_dataset = load_dataset("csv", data_files="Webis-CausalQA-22-v-1.0/input/original-splits/squad2_valid_original_split.csv")



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
valid_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_processed', 'context', 'context_processed', 'answer', 'answer_processed'],
        num_rows: 252
    })
})

In [None]:
# Build the "input" column for the validation data, then drop the raw context
# columns. Note: this rebinds `valid_dataset`, so the cell is not re-runnable
# without reloading the CSV first.
valid_dataset = valid_dataset.map(
    build_input,
    batched=True,
    load_from_cache_file=False,
    num_proc=args.num_procs,
).remove_columns(["context", "context_processed"])

         

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Tokenize the validation data with the same function used for training, then
# drop the text columns that were consumed by tokenization.
valid_dataset = valid_dataset.map(
    tokenize_function_train,
    batched=True,
    load_from_cache_file=False,
    num_proc=args.num_procs,
).remove_columns(["input", "answer"])

          

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
valid_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_processed', 'answer_processed', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 252
    })
})

In [None]:
sample = valid_dataset['train'][0]

In [None]:
from pprint import pprint

In [None]:
sample.keys()

dict_keys(['id', 'question', 'question_processed', 'answer_processed', 'input_ids', 'attention_mask', 'labels'])

In [None]:
sample['id']

'56e1b00ce3433e140042309f'

In [None]:
sample['question']

'What are two factors that directly effect how powerful a Turing machine may or may not be?'

In [None]:
sample['question_processed']

'what are two factors that directly effect how powerful a turing machine may or may not be?'

In [None]:
len(sample['input_ids'])

1024

In [None]:
print(len(sample['attention_mask']))

1024


In [None]:
print(sample['labels'])

[97, 42, 628, 97, 42, 628, 97, 42, 628, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [None]:
tokenizer.decode([97, 42, 628, 97, 42, 628, 97, 42, 628, 1])

'time or space time or space time or space</s>'

In [None]:
sample.keys()

dict_keys(['id', 'question', 'question_processed', 'answer_processed', 'input_ids', 'attention_mask', 'labels'])

In [None]:
import torch

In [None]:
# Keep only the fields passed to generate(), each wrapped in a list so the
# resulting tensor has a leading batch dimension of 1.
sample_final = {
    key: torch.LongTensor([sample[key]])
    for key in sample
    if key in ('input_ids', 'attention_mask')
}

In [None]:
sample_final.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
max_label = sample['labels'].index(1) + 1

In [None]:
sample.keys()

dict_keys(['id', 'question', 'question_processed', 'answer_processed', 'input_ids', 'attention_mask', 'labels'])

In [None]:
# Replaces a leftover incomplete expression that raised NameError on Run All:
# show the reference answer text for this sample before generating.
sample['answer_processed']

In [None]:
# Generate with the checkpoint reloaded from disk (`finetuned_model`) rather
# than the in-memory `model`, so this cell evaluates the weights that were
# actually saved and does not depend on the training cells' leftover state.
out = finetuned_model.generate(**sample_final)
print("Question:", sample['question'])
# `max_label` cuts the labels right after the end-of-sequence token, which
# drops the -100 padding entries before decoding.
print("Reference:", tokenizer.decode(sample['labels'][:max_label]))
print("Predict:", tokenizer.decode(out[0]))



Question: What are two factors that directly effect how powerful a Turing machine may or may not be?
Reference: time or space time or space time or space</s>
Predict: <pad> time or space</s>


'<pad> time or space</s>'