In [0]:
import torch
from tfrecord.torch.dataset import TFRecordDataset

# Location of the serialized training examples; no index file is supplied.
tfrecord_path, index_path = "train_samples.tfrecord", None

# The original call had a commented-out third `description` argument; it is
# still omitted here, so the dataset is built from the path and index only.
dataset = TFRecordDataset(tfrecord_path, index_path)

# Wrap the record stream in a DataLoader that yields batches of 32.
loader = torch.utils.data.DataLoader(dataset, batch_size=32)

# To peek at one batch while debugging:
#   data = next(iter(loader)); print(data.keys)

In [0]:
import collections

from absl import logging

from transformers import BertPreTrainedModel, BertModel
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn.functional as F


class TYDIQA(BertPreTrainedModel):
    """BERT-based question-answering model for the TyDi QA task.

    Three predictions are produced on top of a shared ``BertModel`` encoder:

    * per-token start logits and end logits (from ``qa_outputs``), and
    * answer-type logits over ``num_answer_types`` classes (from
      ``answer_type_output_dense`` applied to the encoder's second output).
    """

    def __init__(self, bert_config):
        """Build the encoder and the two linear heads from ``bert_config``."""
        # Zero-argument super() starts the MRO lookup right after TYDIQA, so
        # every parent initializer runs. The previous
        # `super(BertPreTrainedModel, self)` form skipped the direct parent.
        super().__init__(bert_config)
        self.num_answer_types = 5

        self.bert = BertModel(bert_config)

        # Two outputs per token: one start-position score, one end-position score.
        self.qa_outputs = nn.Linear(bert_config.hidden_size, 2)
        self.answer_type_output_dense = nn.Linear(bert_config.hidden_size, self.num_answer_types)

        self.init_weights()

    def forward(self,
                input_ids = None,
                attention_mask = None,
                token_type_ids = None,
                position_ids = None,
                head_mask = None,
                inputs_embeds = None,
                start_positions = None,
                end_positions = None,
                answer_types = None
    ):
        """Run the encoder and both heads; optionally compute the training loss.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds: forwarded unchanged to ``self.bert``.
            start_positions: integer (long) index of the answer start token per
                example, or ``None`` at inference time.
            end_positions: integer (long) index of the answer end token per
                example, or ``None`` at inference time.
            answer_types: integer (long) answer-type class per example, or
                ``None`` at inference time.

        Returns:
            Tuple ``(start_logits, end_logits, answer_type_logits, total_loss)``.
            ``total_loss`` is the mean of the start, end and answer-type
            cross-entropy losses when all three label tensors are given, and
            ``None`` otherwise.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # First encoder output: per-token hidden states
        # (presumably (batch, seq_len, hidden) -- confirm against BertModel).
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        # Split the size-2 last axis into separate start / end score tensors
        # and drop the now-singleton axis.
        start_logits, end_logits = logits.split(1, dim = -1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        # Second encoder output feeds the answer-type classifier.
        answer_type_output_layer = outputs[1]
        answer_type_logits = self.answer_type_output_dense(answer_type_output_layer)

        # Labels default to None, so the loss is only defined when the caller
        # supplies all of them; without this guard inference calls crashed.
        total_loss = None
        labels_given = (
            start_positions is not None
            and end_positions is not None
            and answer_types is not None
        )
        if labels_given:
            # F.cross_entropy == -mean(sum(one_hot(target) * log_softmax(logits))),
            # i.e. the same quantity the hand-written helpers used to compute.
            start_loss = F.cross_entropy(start_logits, start_positions)
            end_loss = F.cross_entropy(end_logits, end_positions)
            answer_type_loss = F.cross_entropy(answer_type_logits, answer_types)

            total_loss = (start_loss + end_loss + answer_type_loss) / 3.0

        return start_logits, end_logits, answer_type_logits, total_loss

In [0]:
# Build the training data source and the model to fine-tune.
tfrecord_path = "train_samples.tfrecord"
# Defined here (rather than inherited from an earlier cell) so this cell does
# not depend on hidden kernel state; no index file is used.
index_path = None
dataset = TFRecordDataset(tfrecord_path, index_path)
loader = torch.utils.data.DataLoader(dataset, batch_size=32)

# Load pretrained multilingual BERT weights into the TYDIQA architecture.
model = TYDIQA.from_pretrained('bert-base-multilingual-cased')

In [13]:
# Echo `dataset` to check what object the name currently refers to.
dataset

<torch.utils.data.dataloader.DataLoader at 0x7f436f3338d0>

In [11]:
# NOTE(review): `outputs` is only ever assigned as a local inside functions in
# this notebook, so at notebook scope this lookup raises NameError (see the
# captured output). Leftover debugging probe.
outputs[-1]

NameError: ignored

In [0]:
import collections
import json
import os

from absl import logging
from bert import modeling as bert_modeling
import tensorflow.compat.v1 as tf
import postproc

import torch
import argparse
from transformers import BertConfig, AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features
#from tydi_modeling_torch import TYDIQA
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import tqdm
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
from torch.utils.tensorboard import SummaryWriter

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` is a trap with argparse: ``bool("false")`` is ``True``
    because any non-empty string is truthy. This converter accepts the usual
    spellings and rejects everything else.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Expected a boolean value, got %r" % (value,))


parser = argparse.ArgumentParser(description='Arguments for running tydi')

# ---- Input / output locations -------------------------------------------
parser.add_argument(
    "--output_dir",
    default="outputs",
    type=str,
    help="Directory under which `checkpoint-<step>` folders are written during training.",
)

parser.add_argument(
    "--predict_file",
    default=None,
    type=str,
    help="TyDi json for predictions. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz. "
         "Used only for `--do_predict`.",
)

parser.add_argument(
    "--precomputed_predict_file",
    default=None,
    type=str,
    help="Precomputed TyDi prediction records (e.g. produced ahead of time by "
         "`prepare_tydi_data.py`). Used only for `--do_predict`.",
)

# ---- Example construction -----------------------------------------------
parser.add_argument(
    "--max_seq_length",
    default=512,
    type=int,
    help="The maximum total input sequence length after tokenization. Longer "
         "sequences are truncated and shorter ones are padded.",
)

parser.add_argument(
    "--doc_stride",
    default=128,
    type=int,
    help="When splitting up a long document into chunks, how much stride to "
         "take between chunks.",
)

parser.add_argument(
    "--max_question_length",
    default=64,
    # Was `type=str`, which turned any value given on the command line into a
    # string while the default stayed an int.
    type=int,
    help="The maximum number of tokens for the question. Longer questions are "
         "truncated to this length.",
)

# ---- Run mode and batch sizes -------------------------------------------
parser.add_argument(
    "--do_train",
    default=True,
    type=_str2bool,
    help="Whether to run training.",
)

parser.add_argument(
    "--train_batch_size",
    default=1,
    type=int,
    help="Total batch size for training.",
)

parser.add_argument(
    "--predict_batch_size",
    default=16,
    type=int,
    help="Total batch size for predictions.",
)

parser.add_argument(
    "--predict_file_shard_size",
    default=1000,
    type=int,
    help="The maximum number of examples to put into each temporary TF example file "
         "used as model input at prediction time.",
)

# ---- Optimization -------------------------------------------------------
parser.add_argument(
    "--learning_rate",
    default=5e-5,
    type=float,
    help="The initial learning rate for Adam.",
)

parser.add_argument(
    "--num_train_epochs",
    default=3,
    type=int,
    help="Total number of training epochs to perform.",
)

parser.add_argument(
    "--warmup_proportion",
    default=0.1,
    type=float,
    # `%` must be doubled: argparse %-formats help strings when rendering them.
    help="Proportion of training to perform linear learning rate warmup for. "
         "E.g., 0.1 = 10%% of training.",
)

parser.add_argument(
    "--save_checkpoints_steps",
    default=0.1,
    type=float,
    help="How often to save the model checkpoint. "
         "E.g., 0.1 = 10%% of training.",
)

parser.add_argument(
    "--iterations_per_loop",
    default=1000,
    type=int,
    help="How many steps to make in each estimator call.",
)

# ---- Prediction / preprocessing knobs -----------------------------------
parser.add_argument(
    "--max_answer_length",
    default=30,
    type=int,
    help="The maximum length of an answer that can be generated. This is needed "
         "because the start and end predictions are not conditioned on one another.",
)

parser.add_argument(
    "--include_unknowns",
    default=-1.0,
    type=float,
    help="If positive, probability of including answers of type `UNKNOWN`.",
)

parser.add_argument(
    "--verbose_logging",
    default=False,
    type=_str2bool,
    help="If true, all of the warnings related to data processing will be printed. "
         "A number of warnings are expected for a normal TyDi evaluation.",
)

parser.add_argument(
    "--max_passages",
    default=45,
    type=int,
    help="Maximum number of passages to consider for a "
         "single article. If an article contains more than "
         "this, they will be discarded during training. "
         "BERT's WordPiece vocabulary must be modified to include "
         "these within the [unused*] vocab IDs.",
)

parser.add_argument(
    "--max_position",
    default=45,
    type=int,
    help="Maximum passage position for which to generate special tokens.",
)

parser.add_argument(
    "--fail_on_invalid",
    default=True,
    type=_str2bool,
    help="Stop immediately on encountering an invalid example? "
         "If false, just print a warning and skip it.",
)

parser.add_argument(
    "--adam_epsilon",
    default=1e-8,
    type=float,
    help="Epsilon for the Adam optimizer.",
)

parser.add_argument(
    "--gradient_accumulation_steps",
    type=int,
    default=1,
    help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")

# Parse an empty argv so the notebook always gets the defaults instead of the
# Jupyter kernel's own command-line arguments.
args = parser.parse_args(args=[])





In [28]:
# Launch fine-tuning; `train` returns (global_step, mean training loss).
# NOTE(review): `train` is defined in a later cell, so that cell must be run
# before this one.
train(args, dataset, model)

INFO:absl:***** Running training *****
INFO:absl:  Num Epochs = 3
INFO:absl:  Let's start finetuning!
Epoch: 100%|██████████| 3/3 [05:44<00:00, 114.77s/it]


(45, 1.9299078902436628)

In [0]:
def train(args, train_dataset, model, tokenizer=None):
    """Fine-tune ``model`` on ``train_dataset`` and write periodic checkpoints.

    Args:
        args: namespace providing ``train_batch_size``, ``weight_decay``,
            ``learning_rate``, ``adam_epsilon``, ``num_train_epochs``,
            ``logging_steps`` and ``save_steps``. ``output_dir`` is used as the
            checkpoint root when present; otherwise the current directory.
        train_dataset: dataset whose batches are mappings containing
            ``input_ids``, ``input_mask``, ``segment_ids``, ``start_positions``,
            ``end_positions`` and ``answer_types``.
        model: module whose forward pass returns a sequence with the loss as
            its last element.
        tokenizer: optional object with ``save_pretrained``; when given it is
            saved alongside every model checkpoint.

    Returns:
        Tuple ``(global_step, mean_loss)``. ``mean_loss`` is ``0.0`` when the
        dataset produced no batches.
    """
    # TensorBoard writer for the running training loss.
    tb_writer = SummaryWriter()

    train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size)

    # Batches are moved to DEVICE below, so the parameters must live there too;
    # do it before the optimizer captures the parameters.
    model.to(DEVICE)

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)

    # The argument parser may not define --output_dir; fall back to the cwd.
    output_root = getattr(args, "output_dir", None) or "."

    def to_device(tensor):
        # The model expects integer (long) tensors on the training device.
        return tensor.long().to(DEVICE)

    # Train!
    logging.info("***** Running training *****")
    logging.info("  Num Epochs = %d", args.num_train_epochs)

    logging.info("  Let's start finetuning!")
    tr_loss, logging_loss = 0.0, 0.0
    global_step = 0
    model.train()
    try:
        for _ in tqdm.trange(args.num_train_epochs, desc = 'Epoch'):
            for batch in train_dataloader:
                outputs = model(
                    input_ids=to_device(batch["input_ids"]),
                    attention_mask=to_device(batch['input_mask']),
                    token_type_ids=to_device(batch['segment_ids']),
                    start_positions=to_device(batch["start_positions"]),
                    end_positions=to_device(batch["end_positions"]),
                    answer_types=to_device(batch["answer_types"])
                )

                # By convention of the model used here the loss is the last output.
                loss = outputs[-1]

                loss.backward()
                tr_loss += loss.item()
                optimizer.step()
                model.zero_grad()
                global_step += 1

                # Log the mean loss over the last `logging_steps` updates.
                if global_step % args.logging_steps == 0:
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save a checkpoint every `save_steps` updates.
                if global_step % args.save_steps == 0:
                    output_dir = os.path.join(output_root, "checkpoint-{}".format(global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    # Unwrap DataParallel / DistributedDataParallel containers.
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    if tokenizer is not None:
                        tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logging.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    logging.info("Saving optimizer state to %s", output_dir)
    finally:
        # Flush and release the event file even if training is interrupted.
        tb_writer.close()

    # Guard against an empty dataset (no optimizer step was taken).
    mean_loss = tr_loss / global_step if global_step else 0.0
    return global_step, mean_loss

In [0]:
dict_keys(['answer_types', 'segment_ids', 'input_ids', 'input_mask', 'language_id', 'start_positions', 'end_positions', 'unique_ids', 'example_index'])

[0m[01;34mbert[0m/                            run_tydi.py
data.py                          run_tydi_test.py
debug.py                         tf_io.py
[01;34mdev_samples[0m/                     tiny_dev.jsonl.gz
mbert_modified_vocab.txt         tokenization.py
[01;34mmulti_cased_L-12_H-768_A-12[0m/     train_samples_record_count.txt
multi_cased_L-12_H-768_A-12.zip  train_samples.tfrecord
postproc.py                      tydi_modeling.py
prepare_tydi_data.py             [01;32mtydiqa.tape[0m*
preproc.py                       tydiqa-v1.0-dev.jsonl.gz
[01;34m__pycache__[0m/                     tydiqa-v1.0-train.jsonl.gz
README.md
