In [1]:
import torch
import nlp
from transformers import T5Tokenizer
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="6"


In [2]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [3]:
from nlp import load_dataset

In [4]:
# train = load_dataset('text', data_files='data/codex-m/train.txt')['train']
# valid = load_dataset('text', data_files='data/codex-m/valid.txt')['train']
train = load_dataset('text', data_files='data/wikidata5m/train.txt')['train']
valid = load_dataset('text', data_files='data/wikidata5m/valid.txt')['train']

Using custom data configuration default
Using custom data configuration default


In [5]:
train[0]

{'text': 'predict tail: lalit kumar goel | instance of |\thuman being'}

In [6]:
def split_example(example):
    text = example['text'].split('\t')
    example['input_text'] = text[0]
    example['target_text'] = text[1]
    return example

def convert_to_features(example_batch):
    global tokenizer
    input_encodings = tokenizer.batch_encode_plus(example_batch['input_text'], pad_to_max_length=True, max_length=128)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target_text'], pad_to_max_length=True, max_length=32)

    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }

    return encodings


In [None]:
train = train.map(split_example)
train = train.map(convert_to_features, batched=True)

  0%|          | 0/42687362 [00:00<?, ?it/s]

In [None]:
train.column_names

In [None]:
valid = valid.map(split_example)
valid = valid.map(convert_to_features, batched=True)

In [10]:
train[0]

{'input_text': 'predict tail: a few good men | cast member |',
 'target_text': 'jack nicholson',
 'text': 'predict tail: a few good men | cast member |\tjack nicholson'}

In [None]:
columns = ['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask']
train.set_format(type='torch', columns=columns)
valid.set_format(type='torch', columns=columns)

In [None]:
len(train), len(valid)

In [None]:
torch.save(train, 'data/wikidata5m/train_data.pt')
torch.save(valid, 'data/wikidata5m/valid_data.pt')

In [14]:
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import numpy as np
import torch

from transformers import T5ForConditionalGeneration, T5Tokenizer, EvalPrediction
from transformers import (
    HfArgumentParser,
    DataCollator,
    Trainer,
    TrainingArguments,
    set_seed,
    DataCollatorForSeq2Seq,
)


logger = logging.getLogger(__name__)

class T2TDataCollator:
    
    """
    Take a list of samples from a Dataset and collate them into a batch.
    Returns:
        A dictionary of tensors
    """
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
    def __call__(self, batch):
        input_text = [example['input_text'] for example in batch]
        target_text = [example['target_text'] for example in batch]
        inputs_tokenized = self.tokenizer(input_text, padding='max_length', max_length=128, return_tensors="pt")
        outputs_tokenized = self.tokenizer(target_text, padding='max_length', max_length=32, return_tensors="pt")
        input_ids, attention_mask = inputs_tokenized.input_ids, inputs_tokenized.attention_mask
        labels, labels_attention_mask = outputs_tokenized.input_ids, outputs_tokenized.attention_mask
        labels[labels==0] = -100
#         print(len(batch))
    #     input_ids = torch.stack([example['input_ids'] for example in batch])
    #     lm_labels = torch.stack([example['target_ids'] for example in batch])
    #     lm_labels[lm_labels[:, :] == 0] = -100
    #     attention_mask = torch.stack([example['attention_mask'] for example in batch])
    #     decoder_attention_mask = torch.stack([example['target_attention_mask'] for example in batch])


        return {
            'input_ids': input_ids, 
            'attention_mask': attention_mask,
            'labels': labels, 
    #         'decoder_attention_mask': decoder_attention_mask
        }

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """
    train_file_path: Optional[str] = field(
        default='data/codex-m/train_data.pt',
        metadata={"help": "Path for cached train dataset"},
    )
    valid_file_path: Optional[str] = field(
        default='data/codex-m/valid_data.pt',
        metadata={"help": "Path for cached valid dataset"},
    )
    max_len: Optional[int] = field(
        default=128,
        metadata={"help": "Max input length for the source text"},
    )
    target_max_len: Optional[int] = field(
        default=32,
        metadata={"help": "Max input length for the target text"},
    )



In [15]:
import json
args_dict = {
  "model_name_or_path": 't5-small',
  "max_len": 128 ,
  "target_max_len": 32,
  "output_dir": './models/trainer',
  "overwrite_output_dir": True,
  "per_device_train_batch_size ": 64,
  "per_device_eval_batch_size ": 128,
  "train_batch_size": 64,
  "gradient_accumulation_steps": 4,
  "learning_rate": 1e-4,
  "num_train_epochs": 4,
  "do_train": True
}
with open('args.json', 'w') as f:
    json.dump(args_dict, f)
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath('args.json'))

In [16]:
training_args.per_device_eval_batch_size

8

In [17]:
training_args

TrainingArguments(output_dir=./models/trainer, overwrite_output_dir=True, do_train=True, do_eval=None, do_predict=False, evaluation_strategy=EvaluationStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, gradient_accumulation_steps=4, eval_accumulation_steps=None, learning_rate=0.0001, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=4, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/May01_14-03-50_puri, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=./models/trainer, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric

In [18]:
len(train)

371168

In [19]:
ta = TrainingArguments(output_dir='./models/trainer',
                      per_device_train_batch_size=64,
                      learning_rate=1e-4,
                      num_train_epochs=4,
                      overwrite_output_dir=True,
                      dataloader_num_workers=12,
                      prediction_loss_only=True,
                      do_train=True,
                      adafactor=True,
                      )

In [20]:
ta.per_device_train_batch_size

64

In [21]:
if (
    os.path.exists(training_args.output_dir)
    and os.listdir(training_args.output_dir)
    and training_args.do_train
    and not training_args.overwrite_output_dir
):
    raise ValueError(
        f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
    )

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    bool(training_args.local_rank != -1),
    training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)

# Set seed
set_seed(training_args.seed)

05/01/2021 14:03:59 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir=./models/trainer, overwrite_output_dir=True, do_train=True, do_eval=None, do_predict=False, evaluation_strategy=EvaluationStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, gradient_accumulation_steps=4, eval_accumulation_steps=None, learning_rate=0.0001, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=4, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/May01_14-03-50_puri, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=./models/trainer, disable_tqdm=False, remove_un

In [22]:
# tokenizer = T5Tokenizer.from_pretrained(
#     model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
#     cache_dir=model_args.cache_dir,
# )
# model = T5ForConditionalGeneration.from_pretrained(
#     model_args.model_name_or_path,
#     cache_dir=model_args.cache_dir,
# )
model = T5ForConditionalGeneration.from_pretrained('models/trainer/checkpoint-2000')

In [23]:
print('loading data')
# train_dataset  = torch.load(data_args.train_file_path)
# valid_dataset = torch.load(data_args.valid_file_path)
train_dataset = train
valid_dataset = valid
print('loading done')


loading data
loading done


In [24]:
data_collator = T2TDataCollator(tokenizer)

In [25]:
data_collator.tokenizer

PreTrainedTokenizer(name_or_path='t5-small', vocab_size=32100, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45>'

In [26]:
from transformers import Adafactor
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import AdamW

In [27]:
trainer = Trainer(
    model=model,
    args=ta,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=T2TDataCollator,
)


In [28]:
if ta.do_train:
    trainer.train(
        model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
    )
    trainer.save_model()
    # For convenience, we also re-save the tokenizer to the same directory,
    # so that you can share your model easily on huggingface.co/models =)
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)




TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/scratche/home/apoorv/transformer-kgc/kgc_env/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/scratche/home/apoorv/transformer-kgc/kgc_env/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/scratche/home/apoorv/transformer-kgc/kgc_env/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/scratche/home/apoorv/transformer-kgc/kgc_env/lib/python3.8/site-packages/nlp/arrow_dataset.py", line 714, in __getitem__
    return self._getitem(
  File "/scratche/home/apoorv/transformer-kgc/kgc_env/lib/python3.8/site-packages/nlp/arrow_dataset.py", line 702, in _getitem
    outputs = self._convert_outputs(
  File "/scratche/home/apoorv/transformer-kgc/kgc_env/lib/python3.8/site-packages/nlp/arrow_dataset.py", line 619, in _convert_outputs
    v = map_nested(command, v, **map_nested_kwargs)
  File "/scratche/home/apoorv/transformer-kgc/kgc_env/lib/python3.8/site-packages/nlp/utils/py_utils.py", line 191, in map_nested
    return function(data_struct)
TypeError: new(): invalid data type 'str'


In [None]:
# we will load the arguments from a json file, 
#make sure you save the arguments in at ./args.json


# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.


# Get datasets

# Initialize our Trainer

# Training

# Evaluation
results = {}
if training_args.do_eval and training_args.local_rank in [-1, 0]:
    logger.info("*** Evaluate ***")

    eval_output = trainer.evaluate()

    output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(eval_output.keys()):
            logger.info("  %s = %s", key, str(eval_output[key]))
            writer.write("%s = %s\n" % (key, str(eval_output[key])))

    results.update(eval_output)

return results