In [1]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import os
import json
import itertools
from itertools import product, permutations
from random import sample

In [3]:
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [4]:
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertForMaskedLM, BertConfig
from pytorch_pretrained_bert.optimization import BertAdam
from run_child_finetuning import *

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [5]:
BERT_DIR = '/nas/pretrain-bert/pretrain-pytorch/bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained('/nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt')

06/09/2019 14:55:34 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt


In [6]:
def assert_in_bert_vocab(tokens):
    for token in tokens:
        if isinstance(token, str):  # entities
            assert token.lower() in tokenizer.vocab, token + '->' + str(tokenizer.tokenize(token))
        elif isinstance(token, tuple):  # relations
            assert len(token) == 2, str(token)
            for rel in token:
                rel = rel.split('..')[0]
                assert rel in tokenizer.vocab, rel + '->' + str(tokenizer.tokenize(rel))

In [7]:
fruits = ['apple', 'banana', 'pear', 'orange', 'peach', 'berry', 'plum', 'pinapple', 'melon', 'cherry', 'grape', 'lemon',
          'papaya', 'durian', 'kiwi', 'mongo', 'date', 'jujube', 'watermelon']
len(fruits)
# http://www.manythings.org/vocabulary/lists/e/words.php?f=fruit

19

In [8]:
animals = ['dog', 'cat', 'pig', 'chicken', 'hen', 'cock', 'duck', 'goose', 'monkey', 'tiger', 'bird', 'bear', 'lion', 'bee', 'ant', 'elephant']
len(animals)
# see more at http://www.manythings.org/vocabulary/lists/a/words.php?f=animals_1
# http://www.manythings.org/vocabulary/lists/a/
# especially http://www.manythings.org/vocabulary/lists/a/words.php?f=classroom_1  things in classroom

16

In [9]:
male_names = ['James', 'John', 'Robert', ]#'Michael', 'David', 'Paul', 'Jeff', 'Daniel', 'Charles', 'Thomas']
female_names = ['Mary', 'Linda', 'Jennifer', ]#'Maria', 'Susan', 'Lisa', 'Sandra', 'Barbara', 'Patricia', 'Elizabeth']
len(male_names)
len(female_names)
people_names = (male_names, female_names)
assert_in_bert_vocab(male_names)
assert_in_bert_vocab(female_names)

3

3

In [10]:
spatial_relations = (
    ('above', 'below'), 
    ('in front of/in the front', 'behind/in the back'), 
    ('on the left..side of', 'on the right..side of')
)
people_adj_relations = (
    ('taller..than', 'shorter..than'), 
#     ('thinner..than', 'fatter..than'),   # fatter not in BERT vocab
    ('younger..than', 'older..than'), 
#     ('stronger..than', 'weaker..than'), 
#     ('faster..than', 'slower..than'),
#     ('richer..than', 'poorer..than')
)
animal_adj_relations = (
    ('thinner..than', 'fatter..than'), 
    ('younger..than', 'older..than'), 
    ('stronger..than', 'weaker..than'), 
    ('faster..than', 'slower..than')
)
object_adj_relations = (
    ('bigger..than', 'smaller..than'), 
    ('heavier..than', 'lighter..than'), 
    ('better..than', 'worse..than')
)
assert_in_bert_vocab(people_adj_relations)

In [11]:
rel2entypes = {
#     spatial_relations: [fruits, animals, people_names],
    people_adj_relations: [people_names],
#     animal_adj_relations: [animals],
#     object_adj_relations: [fruits, animals]
}

In [12]:
twoent_A_template = 'is {dt} {ent0} {rel} {dt} {ent1}'
twoent_B_template = '{dt} {ent} is {pred}'
twoent_template = '"{A}?" "{conj} {B}."'

In [13]:
def reverse(l):
    return list(reversed(l)) if isinstance(l, list) else tuple(reversed(l))

In [14]:
def mask(ent_str):
    tokens = ent_str.strip().split()
    if len(tokens) == 1:
        return '[%s]' % tokens[0]
    elif len(tokens) == 2:
        assert tokens[0] == 'the', ent_str
        return '%s [%s]' % (tokens[0], tokens[1])
    else:
        assert False, ent_str

In [15]:
def get_conj(join_type, A, B):
    if join_type == 'no':
        return 'no,'
    return 'yes,'
    assert join_type == 'yes'
    subB = B.split('is')[0].split()[-1]
    w0, w1, w2 = A.split()[: 3]
    assert w0 == 'Is'
    subA = w1 if w1 != 'the' else w2
    if subA == subB and 'not' not in B:  # B is repeating A
        return 'Yes,'
    else:
        return 'Yes, in other words,'

In [134]:
def make_sentences(A_template, B_template, join_template,
                   index=-1, orig_sentence='', entities=["John", "Mary"], entity_substitutes=None, determiner="", 
                   relations=[],
                   packed_relations=["rel/~rel", "rev_rel/~rev_rel"], packed_relation_substitutes=None, relation_suffix="",
                   packed_predicates=["pred0/~pred0", "pred1/~pred1"], predicate_substitutes=None,
                   predicate_dichotomy=True, reverse_causal=False):
#     assert entities[0].lower() in tokenizer.vocab , entities[0]
#     assert entities[1].lower() in tokenizer.vocab , entities[1]
    determiner = 'the' if entities[0].islower() else ''
    relations, predicates = ([r.replace('..', ' ') for r in relations], [r.split('..')[0] for r in relations]) \
        if '..' in relations[0] else ([r.split('/')[0] for r in relations], [r.split('/')[-1] for r in relations])
    neg_predicates = ['not ' + p for p in predicates]
    As = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_suffix=relation_suffix) 
          for ent0, ent1, rel in [entities + relations[:1], reverse(entities) + reverse(relations)[:1]]]
    negAs = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_suffix=relation_suffix) 
             for ent0, ent1, rel in [entities + reverse(relations)[:1], reverse(entities) + relations[:1]]]
    
    Bs = [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, predicates)]
    negBs = [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, neg_predicates)]
    if predicate_dichotomy:
        Bs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, reversed(neg_predicates))]
        negBs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, reversed(predicates))]
    
    def form_sentences(sentence_template, join_type, As, Bs):
        return [" ".join(sentence_template.format(A=A, B=B, conj=get_conj(join_type, A, B)).split()) for A, B in itertools.product(As, Bs)]
    
    yes_sentences = []
    for A, B in [(As, Bs), (negAs, negBs)]:
        yes_sentences += form_sentences(join_template, 'yes', A, B)
#     yes_sentences = list(itertools.chain.from_iterable([form_sentences(join_template, 'yes', A, B) for A, B in [(As, Bs), (negAs, negBs)]]))

    no_sentences = []
    for A, B in [(As, negBs), (negAs, Bs)]:
        no_sentences += form_sentences(join_template, 'no', A, B)
        
    return yes_sentences + no_sentences
    
# make_sentences(
#     twoent_A_template, twoent_B_template, twoent_template, entities=['apple', 'banana'], determiner='', relations=['taller..than', 'shorter..than'])

In [180]:
sentence_groups = []
for relations, entity_types in rel2entypes.items():
    sentences = []
    ent_pairs = []
    for entities in entity_types:
        if isinstance(entities, list):
            ent_pairs += permutations(entities, 2)
        else:
            assert isinstance(entities, tuple) and len(entities) == 2  # people_names
            ent_pairs += product(entities[0], entities[1])
            ent_pairs += product(entities[1], entities[0])
    for (rel, ent_pair) in product(relations, ent_pairs):
#         yes_sent, no_sent = make_sentences(twoent_A_template, twoent_B_template, twoent_template, entities=list(ent_pair), relations=rel)
#         sentences += (yes_sent + no_sent)
        sentences +=  make_sentences(twoent_A_template, twoent_B_template, twoent_template, entities=list(ent_pair), relations=rel)
    sample(sentences, 20)
    sentence_groups.append(sentences)

NameError: name 'make_sentences' is not defined

In [115]:
len(sentence_groups)
[len(sg) for sg in sentence_groups]

4

[78432, 38400, 32768, 59232]

In [16]:
def comparative2superlative(comparative_form, structured=False):
    assert comparative_form.endswith('er'), comparative_form
    superlative_form = 'the ' + comparative_form[:-2] + 'est' \
        if not structured else 'the ' + comparative_form + ' st'
    return superlative_form

In [17]:
def make_relational_atoms(relational_template, entities, relations):
    neg_relations = ["isn't " + r for r in relations]
    relations = ["is " + r for r in relations]
    atoms = [relational_template.format(ent0=ent0, ent1=ent1, rel=rel) 
             for ent0, ent1, rel in [entities + relations[:1], reverse(entities) + reverse(relations)[:1]]]
    atoms += [relational_template.format(ent0=ent0, ent1=ent1, rel=rel) 
              for ent0, ent1, rel in [entities + reverse(neg_relations)[:1], reverse(entities) + neg_relations[:1]]]
    return atoms

In [44]:
transitive_P_template = '{ent0} {rel} {ent1} .'
transitive_wh_QA_template = '{which} is {pred} ? {ent} .'
transitive_yesno_QA_template = 'is {ent0} {rel} {ent1} ? {ans} .'

def make_transitive(P_template, wh_QA_template, yesno_QA_template, join_template,
                   index=-1, orig_sentence='', entities=["John", "Mary", "Susan"], entity_substitutes=None, determiner="", 
                   relations=('taller..than', 'shorter..than'), maybe=True, structured=False,
                   packed_predicates=["pred0/~pred0", "pred1/~pred1"], predicate_substitutes=None,
                   predicate_dichotomy=True, reverse_causal=False):
    if entities[0].islower():
        entities = ['the ' + e for e in entities]
#     print('relations =', relations)
    relations, predicates = ([r.replace('..', ' ') for r in relations], [r.split('..')[0] for r in relations]) \
        if '..' in relations[0] else ([r.split('/')[0] for r in relations], [r.split('/')[-1] for r in relations])
#     print('relations =', relations, 'predicates =', predicates)
    predicates = [comparative2superlative(p, structured=structured) for p in predicates]
    
    P0_entities, P1_entities = ([entities[0], entities[1]], [entities[1], entities[2]]) \
        if not maybe else ([entities[0], entities[1]], [entities[0], entities[2]])
    P0 = make_relational_atoms(P_template, P0_entities, relations)
    P1 = make_relational_atoms(P_template, P1_entities, relations)
        
    wh_pronoun = 'which' if entities[0].startswith('the') else 'who'
    wh_QA = [wh_QA_template.format(which=wh_pronoun, pred=pred, ent=ent) 
             for pred, ent in [(predicates[0], mask(entities[0])), (predicates[-1], mask(entities[-1] if not maybe else 'unknown'))]]
    
    def _maybe(s):
         return s if not maybe else 'maybe'
    yesno_entities = (entities[0], entities[-1]) if not maybe else (entities[1], entities[-1])
    yesno_QA = [yesno_QA_template.format(ent0=ent0, ent1=ent1, rel=rel, ans=ans) 
                for ent0, ent1, rel, ans in [
                    (yesno_entities[0], yesno_entities[-1], relations[0], mask(_maybe('yes'))), 
                    (yesno_entities[0], yesno_entities[-1], relations[-1], mask(_maybe('no'))),
                    (yesno_entities[-1], yesno_entities[0], relations[-1], mask(_maybe('yes'))),
                    (yesno_entities[-1], yesno_entities[0], relations[0], mask(_maybe('no')))]]
    
    Ps = [(p0, p1) for p0, p1 in list(product(P0, P1)) + list(product(P1, P0))]
    QAs = wh_QA + yesno_QA
    
    def get_rel(atom):
        for rel in relations:
#             assert rel.startswith('is')
            rel = rel.split()[0]  # "taller than" -> "taller"
            if rel in atom:
                return rel
        assert False
    sentences = [p0 + ' ' + p1 + ' ||| ' + qas for (p0, p1), qas in product(Ps, QAs)
                 if not structured or get_rel(p0) == get_rel(p1) == get_rel(qas)]
#     sentences = [s.replace('er st ', 'est ') for s in sentences]
    return sentences

sentences = make_transitive(transitive_P_template, transitive_wh_QA_template, transitive_yesno_QA_template, None, maybe=False, structured=False)
# len(sentences)
sample(sentences, 20)
sentences

['John is taller than Mary . Mary is taller than Susan . ||| is Susan shorter than John ? [yes] .',
 'John is taller than Mary . Susan is shorter than Mary . ||| is Susan shorter than John ? [yes] .',
 "Mary isn't taller than John . Mary is taller than Susan . ||| is Susan shorter than John ? [yes] .",
 "Susan isn't taller than Mary . John is taller than Mary . ||| who is the tallest ? [John] .",
 "John is taller than Mary . Susan isn't taller than Mary . ||| is John shorter than Susan ? [no] .",
 "Mary is shorter than John . Mary isn't shorter than Susan . ||| is Susan shorter than John ? [yes] .",
 "John isn't shorter than Mary . Mary is taller than Susan . ||| is Susan taller than John ? [no] .",
 "Mary is taller than Susan . John isn't shorter than Mary . ||| is Susan taller than John ? [no] .",
 "Mary is shorter than John . Susan isn't taller than Mary . ||| is Susan taller than John ? [no] .",
 "Mary is shorter than John . Susan isn't taller than Mary . ||| who is the shortest ? 

['John is taller than Mary . Mary is taller than Susan . ||| who is the tallest ? [John] .',
 'John is taller than Mary . Mary is taller than Susan . ||| who is the shortest ? [Susan] .',
 'John is taller than Mary . Mary is taller than Susan . ||| is John taller than Susan ? [yes] .',
 'John is taller than Mary . Mary is taller than Susan . ||| is John shorter than Susan ? [no] .',
 'John is taller than Mary . Mary is taller than Susan . ||| is Susan shorter than John ? [yes] .',
 'John is taller than Mary . Mary is taller than Susan . ||| is Susan taller than John ? [no] .',
 'John is taller than Mary . Susan is shorter than Mary . ||| who is the tallest ? [John] .',
 'John is taller than Mary . Susan is shorter than Mary . ||| who is the shortest ? [Susan] .',
 'John is taller than Mary . Susan is shorter than Mary . ||| is John taller than Susan ? [yes] .',
 'John is taller than Mary . Susan is shorter than Mary . ||| is John shorter than Susan ? [no] .',
 'John is taller than Mary

In [41]:
'a' + ' .'*random.randint(0, 10) + ' ' + 'b' +  ' .'*random.randint(0, 10) + ' ' +  'c'
len(None)

'a . . . b . . . c'

TypeError: object of type 'NoneType' has no len()

In [23]:
sentence_groups = []
maybe = False
for relations, entity_types in rel2entypes.items():
    sentences = []
    ent_tuples = []
    for entities in entity_types:
        if isinstance(entities, list):
            ent_tuples += permutations(entities, 3)
        else:
            assert isinstance(entities, tuple) and len(entities) == 2  # people_names
            ent_tuples += permutations(entities[0] + entities[1], 3)
    for (rel, ent_tuple) in product(relations, ent_tuples):
        sentences += make_transitive(transitive_P_template, transitive_wh_QA_template, transitive_yesno_QA_template, None, 
                            entities=list(ent_tuple), relations=rel, maybe=False, structured=True)
        if maybe:
            sentences += make_transitive(transitive_P_template, transitive_wh_QA_template, transitive_yesno_QA_template, None, 
                                entities=list(ent_tuple), relations=rel, maybe=True, structured=True)
    sample(sentences, 20)
    print('num_sent =', len(sentences), '->', len(set(sentences)))
    sentence_groups.append(sentences)

['James is older than Jennifer . Jennifer is older than John . ||| is James older than John ? [yes] .',
 "James is younger than Jennifer . James isn't younger than Linda . ||| who is the younger st ? [Linda] .",
 "Linda is shorter than Mary . Linda isn't shorter than Robert . ||| is Mary shorter than Robert ? [no] .",
 'Linda is shorter than Robert . John is shorter than Linda . ||| who is the shorter st ? [John] .',
 'Mary is older than Robert . John is older than Mary . ||| is Robert older than John ? [no] .',
 "Jennifer isn't younger than Robert . James is younger than Robert . ||| is Jennifer younger than James ? [no] .",
 "Mary is shorter than Jennifer . Mary isn't shorter than John . ||| who is the shorter st ? [John] .",
 "Linda isn't taller than Robert . Linda is taller than John . ||| who is the taller st ? [Robert] .",
 "Robert isn't younger than Mary . Mary isn't younger than Linda . ||| is Robert younger than Linda ? [no] .",
 "Jennifer isn't taller than Linda . Mary isn't 

num_sent = 11520 -> 11520


In [247]:
import argparse
parser = argparse.ArgumentParser()

parser.add_argument("--max_seq_length",
                    default=128,
                    type=int,
                    help="The maximum total input sequence length after WordPiece tokenization. \n"
                         "Sequences longer than this will be truncated, and sequences shorter \n"
                         "than this will be padded.")
parser.add_argument("--do_train",
                    action='store_true',
                    help="Whether to run training.")
parser.add_argument("--do_eval",
                    action='store_true',
                    help="Whether to run eval on the dev set.")
parser.add_argument("--train_batch_size",
                    default=32,
                    type=int,
                    help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
                    default=32,
                    type=int,
                    help="Total batch size for eval.")
parser.add_argument("--learning_rate",
                    default=3e-5,
                    type=float,
                    help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
                    default=3.0,
                    type=float,
                    help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion",
                    default=0.1,
                    type=float,
                    help="Proportion of training to perform linear learning rate warmup for. "
                         "E.g., 0.1 = 10%% of training.")
parser.add_argument("--no_cuda",
                    action='store_true',
                    help="Whether not to use CUDA when available")
parser.add_argument("--do_lower_case",
                    action='store_true',
                    help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument('--seed',
                    type=int,
                    default=42,
                    help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
                    type=int,
                    default=1,
                    help="Number of updates steps to accumualte before performing a backward/update pass.")
parser.add_argument("--dev_percent",
                    default=0.5,
                    type=float)
# args = parser.parse_args(['--output_dir', '/home'])
args = parser.parse_args([])
args.do_lower_case = True
args.do_train = True
args.do_eval = True
args.eval_batch_size = 128
args.learning_rate = 1e-4
args.num_train_epochs = 100
print(args)

_StoreAction(option_strings=['--max_seq_length'], dest='max_seq_length', nargs=None, const=None, default=128, type=<class 'int'>, choices=None, help='The maximum total input sequence length after WordPiece tokenization. \nSequences longer than this will be truncated, and sequences shorter \nthan this will be padded.', metavar=None)

_StoreTrueAction(option_strings=['--do_train'], dest='do_train', nargs=0, const=True, default=False, type=None, choices=None, help='Whether to run training.', metavar=None)

_StoreTrueAction(option_strings=['--do_eval'], dest='do_eval', nargs=0, const=True, default=False, type=None, choices=None, help='Whether to run eval on the dev set.', metavar=None)

_StoreAction(option_strings=['--train_batch_size'], dest='train_batch_size', nargs=None, const=None, default=32, type=<class 'int'>, choices=None, help='Total batch size for training.', metavar=None)

_StoreAction(option_strings=['--eval_batch_size'], dest='eval_batch_size', nargs=None, const=None, default=32, type=<class 'int'>, choices=None, help='Total batch size for eval.', metavar=None)

_StoreAction(option_strings=['--learning_rate'], dest='learning_rate', nargs=None, const=None, default=3e-05, type=<class 'float'>, choices=None, help='The initial learning rate for Adam.', metavar=None)

_StoreAction(option_strings=['--num_train_epochs'], dest='num_train_epochs', nargs=None, const=None, default=3.0, type=<class 'float'>, choices=None, help='Total number of training epochs to perform.', metavar=None)

_StoreAction(option_strings=['--warmup_proportion'], dest='warmup_proportion', nargs=None, const=None, default=0.1, type=<class 'float'>, choices=None, help='Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training.', metavar=None)

_StoreTrueAction(option_strings=['--no_cuda'], dest='no_cuda', nargs=0, const=True, default=False, type=None, choices=None, help='Whether not to use CUDA when available', metavar=None)

_StoreTrueAction(option_strings=['--do_lower_case'], dest='do_lower_case', nargs=0, const=True, default=False, type=None, choices=None, help='Whether to lower case the input text. True for uncased models, False for cased models.', metavar=None)

_StoreAction(option_strings=['--seed'], dest='seed', nargs=None, const=None, default=42, type=<class 'int'>, choices=None, help='random seed for initialization', metavar=None)

_StoreAction(option_strings=['--gradient_accumulation_steps'], dest='gradient_accumulation_steps', nargs=None, const=None, default=1, type=<class 'int'>, choices=None, help='Number of updates steps to accumualte before performing a backward/update pass.', metavar=None)

Namespace(do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=0.0001, max_seq_length=128, no_cuda=False, num_train_epochs=100, seed=42, train_batch_size=32, warmup_proportion=0.1)


In [243]:
child_dataset = CHILDDataset(tokenizer, sentence_groups[0], dev_percent=0.5)
train_features = child_dataset.get_train_features()
num_train_steps = int(
    len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
print('num_train_steps =', num_train_steps)
eval_features = child_dataset.get_dev_features()

train_dataset = child_dataset.build_dataset(train_features)
eval_dataset = child_dataset.build_dataset(eval_features)

num_train_steps = 10800


In [250]:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {} n_gpu: {}".format(
        device, n_gpu))

args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)

# Prepare model
# model = BertForMaskedLM.from_pretrained(BERT_DIR)
CONFIG_NAME = 'bert_config_small.json'
config = BertConfig(os.path.join(BERT_DIR, CONFIG_NAME))
model = BertForMaskedLM(config)
_ = model.to(device)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

06/09/2019 10:05:44 - INFO - run_child_finetuning -   device: cuda n_gpu: 1


<torch._C.Generator at 0x7f7d219e97b0>

In [252]:
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=args.learning_rate,
                     warmup=args.warmup_proportion,
                     t_total=num_train_steps)

In [253]:
# train_sampler = RandomSampler(train_dataset)
# train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
# eval_sampler = SequentialSampler(eval_dataset)
# eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

logger.info("Epoch 0")
logger.info("Evaluating on train set...")
validate(model, train_dataset, device)
logger.info("Evaluating on valid set...")
validate(model, eval_dataset, device)

global_step = 0
for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
    _ = model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
#     for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
    for step, batch_idx in enumerate(get_batch_index(len(train_dataset), args.train_batch_size, randomized=True)):
        batch = tuple(t[batch_idx] for t in train_dataset.tensors)
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
        loss = model(input_ids, segment_ids, input_mask, lm_label_ids)
        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        loss.backward()
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % args.gradient_accumulation_steps == 0:
            # modify learning rate with special warm up BERT uses
            lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_steps, args.warmup_proportion)
            if global_step % 1000 == 0:
                print('global_step %d, lr = %f' % (global_step, lr_this_step))
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

    if args.do_eval:
        logger.info("Epoch %d" % (epoch + 1))
        logger.info("Evaluating on train set...")
        validate(model, train_dataset, device)
        logger.info("Evaluating on valid set...")
        validate(model, eval_dataset, device)





Epoch:   0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A

global_step 0, lr = 0.000000


06/09/2019 10:10:32 - INFO - run_child_finetuning -   Epoch 1
06/09/2019 10:10:32 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:11:00 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:11:00 - INFO - run_child_finetuning -     eval_accuracy = 0.3390625
06/09/2019 10:11:00 - INFO - run_child_finetuning -     eval_loss = 9.694811651441785
06/09/2019 10:11:00 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:11:28 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:11:28 - INFO - run_child_finetuning -     eval_accuracy = 0.32760416666666664
06/09/2019 10:11:28 - INFO - run_child_finetuning -     eval_loss = 9.699780379401313




Epoch:   1%|          | 1/100 [01:06<1:49:31, 66.37s/it][A[A[A[A06/09/2019 10:11:39 - INFO - run_child_finetuning -   Epoch 2
06/09/2019 10:11:39 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:12:07 - INFO - run_child_finetuning -   ***

global_step 1000, lr = 0.000093


06/09/2019 10:12:45 - INFO - run_child_finetuning -   Epoch 3
06/09/2019 10:12:45 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:13:13 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:13:13 - INFO - run_child_finetuning -     eval_accuracy = 0.3390625
06/09/2019 10:13:13 - INFO - run_child_finetuning -     eval_loss = 3.257909724447462
06/09/2019 10:13:13 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:13:40 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:13:40 - INFO - run_child_finetuning -     eval_accuracy = 0.32760416666666664
06/09/2019 10:13:40 - INFO - run_child_finetuning -     eval_loss = 3.2719171391593087




Epoch:   3%|▎         | 3/100 [03:19<1:47:13, 66.32s/it][A[A[A[A06/09/2019 10:13:51 - INFO - run_child_finetuning -   Epoch 4
06/09/2019 10:13:51 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:14:19 - INFO - run_child_finetuning -   **

global_step 2000, lr = 0.000081


06/09/2019 10:16:03 - INFO - run_child_finetuning -   Epoch 6
06/09/2019 10:16:03 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:16:31 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:16:31 - INFO - run_child_finetuning -     eval_accuracy = 0.4223090277777778
06/09/2019 10:16:31 - INFO - run_child_finetuning -     eval_loss = 1.4861090461413065
06/09/2019 10:16:31 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:16:59 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:16:59 - INFO - run_child_finetuning -     eval_accuracy = 0.4110243055555556
06/09/2019 10:16:59 - INFO - run_child_finetuning -     eval_loss = 1.500312285953098




Epoch:   6%|▌         | 6/100 [06:37<1:44:00, 66.38s/it][A[A[A[A06/09/2019 10:17:10 - INFO - run_child_finetuning -   Epoch 7
06/09/2019 10:17:10 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:17:38 - INFO - run_child_finetunin

global_step 3000, lr = 0.000072


06/09/2019 10:19:22 - INFO - run_child_finetuning -   Epoch 9
06/09/2019 10:19:22 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:19:50 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:19:50 - INFO - run_child_finetuning -     eval_accuracy = 0.4223090277777778
06/09/2019 10:19:50 - INFO - run_child_finetuning -     eval_loss = 1.369037503666348
06/09/2019 10:19:50 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:20:17 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:20:17 - INFO - run_child_finetuning -     eval_accuracy = 0.4110243055555556
06/09/2019 10:20:17 - INFO - run_child_finetuning -     eval_loss = 1.3817288875579834




Epoch:   9%|▉         | 9/100 [09:56<1:40:18, 66.14s/it][A[A[A[A06/09/2019 10:20:28 - INFO - run_child_finetuning -   Epoch 10
06/09/2019 10:20:28 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:20:56 - INFO - run_child_finetuni

global_step 4000, lr = 0.000063


06/09/2019 10:22:40 - INFO - run_child_finetuning -   Epoch 12
06/09/2019 10:22:40 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:23:08 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:23:08 - INFO - run_child_finetuning -     eval_accuracy = 0.4223090277777778
06/09/2019 10:23:08 - INFO - run_child_finetuning -     eval_loss = 1.3473159684075249
06/09/2019 10:23:08 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:23:36 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:23:36 - INFO - run_child_finetuning -     eval_accuracy = 0.4110243055555556
06/09/2019 10:23:36 - INFO - run_child_finetuning -     eval_loss = 1.3603505068355137




Epoch:  12%|█▏        | 12/100 [13:14<1:37:00, 66.14s/it][A[A[A[A06/09/2019 10:23:46 - INFO - run_child_finetuning -   Epoch 13
06/09/2019 10:23:46 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:24:14 - INFO - run_child_finet

global_step 5000, lr = 0.000054


06/09/2019 10:24:52 - INFO - run_child_finetuning -   Epoch 14
06/09/2019 10:24:52 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:25:20 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:25:20 - INFO - run_child_finetuning -     eval_accuracy = 0.4223090277777778
06/09/2019 10:25:20 - INFO - run_child_finetuning -     eval_loss = 1.3357309381167093
06/09/2019 10:25:20 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:25:48 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:25:48 - INFO - run_child_finetuning -     eval_accuracy = 0.4110243055555556
06/09/2019 10:25:48 - INFO - run_child_finetuning -     eval_loss = 1.3490168280071682




Epoch:  14%|█▍        | 14/100 [15:26<1:34:36, 66.01s/it][A[A[A[A06/09/2019 10:25:58 - INFO - run_child_finetuning -   Epoch 15
06/09/2019 10:25:58 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:26:26 - INFO - run_child_finet

global_step 6000, lr = 0.000044


06/09/2019 10:28:10 - INFO - run_child_finetuning -   Epoch 17
06/09/2019 10:28:10 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:28:38 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:28:38 - INFO - run_child_finetuning -     eval_accuracy = 0.44973958333333336
06/09/2019 10:28:38 - INFO - run_child_finetuning -     eval_loss = 1.3007791850301955
06/09/2019 10:28:38 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:29:06 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:29:06 - INFO - run_child_finetuning -     eval_accuracy = 0.43914930555555554
06/09/2019 10:29:06 - INFO - run_child_finetuning -     eval_loss = 1.314843883779314




Epoch:  17%|█▋        | 17/100 [18:44<1:31:17, 65.99s/it][A[A[A[A06/09/2019 10:29:17 - INFO - run_child_finetuning -   Epoch 18
06/09/2019 10:29:17 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:29:45 - INFO - run_child_fine

global_step 7000, lr = 0.000035


06/09/2019 10:31:27 - INFO - run_child_finetuning -   Epoch 20
06/09/2019 10:31:27 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:31:55 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:31:55 - INFO - run_child_finetuning -     eval_accuracy = 0.44973958333333336
06/09/2019 10:31:55 - INFO - run_child_finetuning -     eval_loss = 1.2851258847448561
06/09/2019 10:31:55 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:32:24 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:32:24 - INFO - run_child_finetuning -     eval_accuracy = 0.43914930555555554
06/09/2019 10:32:24 - INFO - run_child_finetuning -     eval_loss = 1.2990937895245023




Epoch:  20%|██        | 20/100 [22:02<1:27:51, 65.89s/it][A[A[A[A06/09/2019 10:32:32 - INFO - run_child_finetuning -   Epoch 21
06/09/2019 10:32:32 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:33:00 - INFO - run_child_fin

global_step 8000, lr = 0.000026


06/09/2019 10:34:41 - INFO - run_child_finetuning -   Epoch 23
06/09/2019 10:34:41 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:35:08 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:35:08 - INFO - run_child_finetuning -     eval_accuracy = 0.44973958333333336
06/09/2019 10:35:08 - INFO - run_child_finetuning -     eval_loss = 1.2800932976934645
06/09/2019 10:35:08 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:35:36 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:35:36 - INFO - run_child_finetuning -     eval_accuracy = 0.43914930555555554
06/09/2019 10:35:36 - INFO - run_child_finetuning -     eval_loss = 1.2938868072297838




Epoch:  23%|██▎       | 23/100 [25:14<1:23:07, 64.77s/it][A[A[A[A06/09/2019 10:35:45 - INFO - run_child_finetuning -   Epoch 24
06/09/2019 10:35:45 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:36:13 - INFO - run_child_fin

global_step 9000, lr = 0.000017


06/09/2019 10:37:57 - INFO - run_child_finetuning -   Epoch 26
06/09/2019 10:37:57 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:38:25 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:38:25 - INFO - run_child_finetuning -     eval_accuracy = 0.44973958333333336
06/09/2019 10:38:25 - INFO - run_child_finetuning -     eval_loss = 1.2780342843797472
06/09/2019 10:38:25 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:38:53 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:38:53 - INFO - run_child_finetuning -     eval_accuracy = 0.43914930555555554
06/09/2019 10:38:53 - INFO - run_child_finetuning -     eval_loss = 1.2919086231125725




Epoch:  26%|██▌       | 26/100 [28:31<1:20:44, 65.46s/it][A[A[A[A06/09/2019 10:39:04 - INFO - run_child_finetuning -   Epoch 27
06/09/2019 10:39:04 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:39:31 - INFO - run_child_fin

global_step 10000, lr = 0.000007


06/09/2019 10:40:10 - INFO - run_child_finetuning -   Epoch 28
06/09/2019 10:40:10 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:40:38 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:40:38 - INFO - run_child_finetuning -     eval_accuracy = 0.4519097222222222
06/09/2019 10:40:38 - INFO - run_child_finetuning -     eval_loss = 1.277630979484982
06/09/2019 10:40:38 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:41:06 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:41:06 - INFO - run_child_finetuning -     eval_accuracy = 0.43697916666666664
06/09/2019 10:41:06 - INFO - run_child_finetuning -     eval_loss = 1.2915024439493814




Epoch:  28%|██▊       | 28/100 [30:44<1:19:02, 65.87s/it][A[A[A[A06/09/2019 10:41:16 - INFO - run_child_finetuning -   Epoch 29
06/09/2019 10:41:16 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:41:44 - INFO - run_child_finet

global_step 11000, lr = -0.000002


06/09/2019 10:43:30 - INFO - run_child_finetuning -   Epoch 31
06/09/2019 10:43:30 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:43:58 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:43:58 - INFO - run_child_finetuning -     eval_accuracy = 0.4519097222222222
06/09/2019 10:43:58 - INFO - run_child_finetuning -     eval_loss = 1.2775861925548977
06/09/2019 10:43:58 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:44:25 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:44:25 - INFO - run_child_finetuning -     eval_accuracy = 0.43697916666666664
06/09/2019 10:44:25 - INFO - run_child_finetuning -     eval_loss = 1.2914685328801474




Epoch:  31%|███       | 31/100 [34:03<1:16:15, 66.31s/it][A[A[A[A06/09/2019 10:44:36 - INFO - run_child_finetuning -   Epoch 32
06/09/2019 10:44:36 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:45:04 - INFO - run_child_fine

global_step 12000, lr = -0.000011


06/09/2019 10:46:47 - INFO - run_child_finetuning -   Epoch 34
06/09/2019 10:46:47 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:47:14 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:47:14 - INFO - run_child_finetuning -     eval_accuracy = 0.4519097222222222
06/09/2019 10:47:14 - INFO - run_child_finetuning -     eval_loss = 1.2771676964230008
06/09/2019 10:47:14 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:47:42 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:47:42 - INFO - run_child_finetuning -     eval_accuracy = 0.43697916666666664
06/09/2019 10:47:42 - INFO - run_child_finetuning -     eval_loss = 1.2910632411638896




Epoch:  34%|███▍      | 34/100 [37:20<1:12:16, 65.70s/it][A[A[A[A06/09/2019 10:47:54 - INFO - run_child_finetuning -   Epoch 35
06/09/2019 10:47:54 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:48:21 - INFO - run_child_fine

global_step 13000, lr = -0.000020


06/09/2019 10:50:02 - INFO - run_child_finetuning -   Epoch 37
06/09/2019 10:50:02 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:50:30 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:50:30 - INFO - run_child_finetuning -     eval_accuracy = 0.4519097222222222
06/09/2019 10:50:30 - INFO - run_child_finetuning -     eval_loss = 1.275437773598565
06/09/2019 10:50:30 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:50:58 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:50:58 - INFO - run_child_finetuning -     eval_accuracy = 0.43697916666666664
06/09/2019 10:50:58 - INFO - run_child_finetuning -     eval_loss = 1.2893267750740052




Epoch:  37%|███▋      | 37/100 [40:36<1:08:23, 65.14s/it][A[A[A[A06/09/2019 10:51:06 - INFO - run_child_finetuning -   Epoch 38
06/09/2019 10:51:06 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:51:34 - INFO - run_child_finet

global_step 14000, lr = -0.000030


06/09/2019 10:52:10 - INFO - run_child_finetuning -   Epoch 39
06/09/2019 10:52:10 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:52:38 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:52:38 - INFO - run_child_finetuning -     eval_accuracy = 0.4519097222222222
06/09/2019 10:52:38 - INFO - run_child_finetuning -     eval_loss = 1.2734754257731968
06/09/2019 10:52:38 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:53:06 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:53:06 - INFO - run_child_finetuning -     eval_accuracy = 0.43697916666666664
06/09/2019 10:53:06 - INFO - run_child_finetuning -     eval_loss = 1.28737782769733




Epoch:  39%|███▉      | 39/100 [42:44<1:05:48, 64.74s/it][A[A[A[A06/09/2019 10:53:18 - INFO - run_child_finetuning -   Epoch 40
06/09/2019 10:53:18 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:53:46 - INFO - run_child_finetu

global_step 15000, lr = -0.000039


06/09/2019 10:55:31 - INFO - run_child_finetuning -   Epoch 42
06/09/2019 10:55:31 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:55:59 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:55:59 - INFO - run_child_finetuning -     eval_accuracy = 0.4519097222222222
06/09/2019 10:55:59 - INFO - run_child_finetuning -     eval_loss = 1.2693544705708821
06/09/2019 10:55:59 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:56:27 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:56:27 - INFO - run_child_finetuning -     eval_accuracy = 0.43697916666666664
06/09/2019 10:56:27 - INFO - run_child_finetuning -     eval_loss = 1.2828620976871914




Epoch:  42%|████▏     | 42/100 [46:05<1:03:44, 65.93s/it][A[A[A[A06/09/2019 10:56:36 - INFO - run_child_finetuning -   Epoch 43
06/09/2019 10:56:36 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:57:03 - INFO - run_child_fine

global_step 16000, lr = -0.000048


06/09/2019 10:58:47 - INFO - run_child_finetuning -   Epoch 45
06/09/2019 10:58:47 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 10:59:15 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:59:15 - INFO - run_child_finetuning -     eval_accuracy = 0.4509548611111111
06/09/2019 10:59:15 - INFO - run_child_finetuning -     eval_loss = 1.2655728247430589
06/09/2019 10:59:15 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 10:59:43 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 10:59:43 - INFO - run_child_finetuning -     eval_accuracy = 0.4379340277777778
06/09/2019 10:59:43 - INFO - run_child_finetuning -     eval_loss = 1.2796800454457602




Epoch:  45%|████▌     | 45/100 [49:21<1:00:09, 65.63s/it][A[A[A[A06/09/2019 10:59:53 - INFO - run_child_finetuning -   Epoch 46
06/09/2019 10:59:53 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:00:21 - INFO - run_child_finet

global_step 17000, lr = -0.000057


06/09/2019 11:02:05 - INFO - run_child_finetuning -   Epoch 48
06/09/2019 11:02:05 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:02:33 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:02:33 - INFO - run_child_finetuning -     eval_accuracy = 0.45199652777777777
06/09/2019 11:02:33 - INFO - run_child_finetuning -     eval_loss = 1.2638536400265163
06/09/2019 11:02:33 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:03:00 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:03:00 - INFO - run_child_finetuning -     eval_accuracy = 0.4368923611111111
06/09/2019 11:03:00 - INFO - run_child_finetuning -     eval_loss = 1.2784057392014399




Epoch:  48%|████▊     | 48/100 [52:39<57:02, 65.81s/it][A[A[A[A06/09/2019 11:03:11 - INFO - run_child_finetuning -   Epoch 49
06/09/2019 11:03:11 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:03:38 - INFO - run_child_finetu

global_step 18000, lr = -0.000067


06/09/2019 11:05:22 - INFO - run_child_finetuning -   Epoch 51
06/09/2019 11:05:22 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:05:49 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:05:49 - INFO - run_child_finetuning -     eval_accuracy = 0.4509548611111111
06/09/2019 11:05:49 - INFO - run_child_finetuning -     eval_loss = 1.2582731948958503
06/09/2019 11:05:49 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:06:17 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:06:17 - INFO - run_child_finetuning -     eval_accuracy = 0.4379340277777778
06/09/2019 11:06:17 - INFO - run_child_finetuning -     eval_loss = 1.271828391816881




Epoch:  51%|█████     | 51/100 [55:55<53:39, 65.69s/it][A[A[A[A06/09/2019 11:06:28 - INFO - run_child_finetuning -   Epoch 52
06/09/2019 11:06:28 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:06:56 - INFO - run_child_finetuni

global_step 19000, lr = -0.000076


06/09/2019 11:07:35 - INFO - run_child_finetuning -   Epoch 53
06/09/2019 11:07:35 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:08:03 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:08:03 - INFO - run_child_finetuning -     eval_accuracy = 0.45078125
06/09/2019 11:08:03 - INFO - run_child_finetuning -     eval_loss = 1.2580892933739556
06/09/2019 11:08:03 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:08:31 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:08:31 - INFO - run_child_finetuning -     eval_accuracy = 0.4381076388888889
06/09/2019 11:08:31 - INFO - run_child_finetuning -     eval_loss = 1.2724553810225592




Epoch:  53%|█████▎    | 53/100 [58:09<51:53, 66.24s/it][A[A[A[A06/09/2019 11:08:42 - INFO - run_child_finetuning -   Epoch 54
06/09/2019 11:08:42 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:09:10 - INFO - run_child_finetuning -   

global_step 20000, lr = -0.000085


06/09/2019 11:10:54 - INFO - run_child_finetuning -   Epoch 56
06/09/2019 11:10:54 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:11:22 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:11:22 - INFO - run_child_finetuning -     eval_accuracy = 0.45078125
06/09/2019 11:11:22 - INFO - run_child_finetuning -     eval_loss = 1.2557978232701619
06/09/2019 11:11:22 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:11:49 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:11:49 - INFO - run_child_finetuning -     eval_accuracy = 0.4381076388888889
06/09/2019 11:11:49 - INFO - run_child_finetuning -     eval_loss = 1.2688145054711235




Epoch:  56%|█████▌    | 56/100 [1:01:27<48:30, 66.15s/it][A[A[A[A06/09/2019 11:12:00 - INFO - run_child_finetuning -   Epoch 57
06/09/2019 11:12:00 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:12:28 - INFO - run_child_finetuning - 

global_step 21000, lr = -0.000094


06/09/2019 11:14:10 - INFO - run_child_finetuning -   Epoch 59
06/09/2019 11:14:10 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:14:38 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:14:38 - INFO - run_child_finetuning -     eval_accuracy = 0.45078125
06/09/2019 11:14:38 - INFO - run_child_finetuning -     eval_loss = 1.2543529285324944
06/09/2019 11:14:38 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:15:06 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:15:06 - INFO - run_child_finetuning -     eval_accuracy = 0.4381076388888889
06/09/2019 11:15:06 - INFO - run_child_finetuning -     eval_loss = 1.2681733555263943




Epoch:  59%|█████▉    | 59/100 [1:04:44<44:49, 65.59s/it][A[A[A[A06/09/2019 11:15:16 - INFO - run_child_finetuning -   Epoch 60
06/09/2019 11:15:16 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:15:44 - INFO - run_child_finetuning - 

global_step 22000, lr = -0.000104


06/09/2019 11:17:31 - INFO - run_child_finetuning -   Epoch 62
06/09/2019 11:17:31 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:17:59 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:17:59 - INFO - run_child_finetuning -     eval_accuracy = 0.4519097222222222
06/09/2019 11:17:59 - INFO - run_child_finetuning -     eval_loss = 1.2535286770926581
06/09/2019 11:17:59 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:18:27 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:18:27 - INFO - run_child_finetuning -     eval_accuracy = 0.43697916666666664
06/09/2019 11:18:27 - INFO - run_child_finetuning -     eval_loss = 1.267613332801395




Epoch:  62%|██████▏   | 62/100 [1:08:05<42:13, 66.66s/it][A[A[A[A06/09/2019 11:18:37 - INFO - run_child_finetuning -   Epoch 63
06/09/2019 11:18:37 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:19:05 - INFO - run_child_finet

global_step 23000, lr = -0.000113


06/09/2019 11:19:42 - INFO - run_child_finetuning -   Epoch 64
06/09/2019 11:19:42 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:20:09 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:20:09 - INFO - run_child_finetuning -     eval_accuracy = 0.44991319444444444
06/09/2019 11:20:09 - INFO - run_child_finetuning -     eval_loss = 1.2541112303733826
06/09/2019 11:20:09 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:20:37 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:20:37 - INFO - run_child_finetuning -     eval_accuracy = 0.43897569444444445
06/09/2019 11:20:37 - INFO - run_child_finetuning -     eval_loss = 1.267833666006724




Epoch:  64%|██████▍   | 64/100 [1:10:15<39:27, 65.76s/it][A[A[A[A06/09/2019 11:20:46 - INFO - run_child_finetuning -   Epoch 65
06/09/2019 11:20:46 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:21:14 - INFO - run_child_fine

global_step 24000, lr = -0.000122


06/09/2019 11:22:58 - INFO - run_child_finetuning -   Epoch 67
06/09/2019 11:22:58 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:23:27 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:23:27 - INFO - run_child_finetuning -     eval_accuracy = 0.45078125
06/09/2019 11:23:27 - INFO - run_child_finetuning -     eval_loss = 1.2558889269828797
06/09/2019 11:23:27 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:23:55 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:23:55 - INFO - run_child_finetuning -     eval_accuracy = 0.4381076388888889
06/09/2019 11:23:55 - INFO - run_child_finetuning -     eval_loss = 1.2680187635951572




Epoch:  67%|██████▋   | 67/100 [1:13:33<36:17, 65.97s/it][A[A[A[A06/09/2019 11:24:05 - INFO - run_child_finetuning -   Epoch 68
06/09/2019 11:24:05 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:24:33 - INFO - run_child_finetuning - 

global_step 25000, lr = -0.000131


06/09/2019 11:26:20 - INFO - run_child_finetuning -   Epoch 70
06/09/2019 11:26:20 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:26:48 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:26:48 - INFO - run_child_finetuning -     eval_accuracy = 0.44973958333333336
06/09/2019 11:26:48 - INFO - run_child_finetuning -     eval_loss = 1.256232378217909
06/09/2019 11:26:48 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:27:16 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:27:16 - INFO - run_child_finetuning -     eval_accuracy = 0.43914930555555554
06/09/2019 11:27:16 - INFO - run_child_finetuning -     eval_loss = 1.2710346884197659




Epoch:  70%|███████   | 70/100 [1:16:54<33:23, 66.77s/it][A[A[A[A06/09/2019 11:27:28 - INFO - run_child_finetuning -   Epoch 71
06/09/2019 11:27:28 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:27:56 - INFO - run_child_fine

global_step 26000, lr = -0.000141


06/09/2019 11:29:40 - INFO - run_child_finetuning -   Epoch 73
06/09/2019 11:29:40 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:30:08 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:30:08 - INFO - run_child_finetuning -     eval_accuracy = 0.4509548611111111
06/09/2019 11:30:08 - INFO - run_child_finetuning -     eval_loss = 1.2545752935939365
06/09/2019 11:30:08 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:30:36 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:30:36 - INFO - run_child_finetuning -     eval_accuracy = 0.4379340277777778
06/09/2019 11:30:36 - INFO - run_child_finetuning -     eval_loss = 1.269049670961168




Epoch:  73%|███████▎  | 73/100 [1:20:14<29:58, 66.62s/it][A[A[A[A06/09/2019 11:30:46 - INFO - run_child_finetuning -   Epoch 74
06/09/2019 11:30:46 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:31:14 - INFO - run_child_finetu

global_step 27000, lr = -0.000150


06/09/2019 11:32:58 - INFO - run_child_finetuning -   Epoch 76
06/09/2019 11:32:58 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:33:26 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:33:26 - INFO - run_child_finetuning -     eval_accuracy = 0.4599826388888889
06/09/2019 11:33:26 - INFO - run_child_finetuning -     eval_loss = 1.2468693004714118
06/09/2019 11:33:26 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:35:11 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:35:39 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:35:39 - INFO - run_child_finetuning -     eval_accuracy = 0.4732638888888889
06/09/2019 11:35:39 - INFO - run_child_finetuning -     eval_loss = 1.2132148491011725
06/09/2019 11:35:39 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:36:06 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:36:06 

global_step 29000, lr = -0.000169


06/09/2019 11:38:28 - INFO - run_child_finetuning -   Epoch 81
06/09/2019 11:38:28 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:38:56 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:38:56 - INFO - run_child_finetuning -     eval_accuracy = 0.5042534722222223
06/09/2019 11:38:56 - INFO - run_child_finetuning -     eval_loss = 1.1616202606095207
06/09/2019 11:38:56 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:39:24 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:39:24 - INFO - run_child_finetuning -     eval_accuracy = 0.4915798611111111
06/09/2019 11:39:24 - INFO - run_child_finetuning -     eval_loss = 1.1748484041955736




Epoch:  81%|████████  | 81/100 [1:29:02<20:53, 65.97s/it][A[A[A[A06/09/2019 11:39:34 - INFO - run_child_finetuning -   Epoch 82
06/09/2019 11:39:34 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:40:02 - INFO - run_child_finet

global_step 30000, lr = -0.000178


06/09/2019 11:41:47 - INFO - run_child_finetuning -   Epoch 84
06/09/2019 11:41:47 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:42:14 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:42:14 - INFO - run_child_finetuning -     eval_accuracy = 0.5082465277777778
06/09/2019 11:42:14 - INFO - run_child_finetuning -     eval_loss = 1.076478154791726
06/09/2019 11:42:14 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:42:42 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:42:42 - INFO - run_child_finetuning -     eval_accuracy = 0.4957465277777778
06/09/2019 11:42:42 - INFO - run_child_finetuning -     eval_loss = 1.0900266740057203




Epoch:  84%|████████▍ | 84/100 [1:32:20<17:36, 66.05s/it][A[A[A[A06/09/2019 11:42:53 - INFO - run_child_finetuning -   Epoch 85
06/09/2019 11:42:53 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:43:20 - INFO - run_child_finetu

global_step 31000, lr = -0.000187


06/09/2019 11:45:05 - INFO - run_child_finetuning -   Epoch 87
06/09/2019 11:45:05 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:45:33 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:45:33 - INFO - run_child_finetuning -     eval_accuracy = 0.5419270833333333
06/09/2019 11:45:33 - INFO - run_child_finetuning -     eval_loss = 0.9918538702858819
06/09/2019 11:45:33 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:46:01 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:46:01 - INFO - run_child_finetuning -     eval_accuracy = 0.5289930555555555
06/09/2019 11:46:01 - INFO - run_child_finetuning -     eval_loss = 1.0008921066919962




Epoch:  87%|████████▋ | 87/100 [1:35:39<14:19, 66.13s/it][A[A[A[A06/09/2019 11:46:11 - INFO - run_child_finetuning -   Epoch 88
06/09/2019 11:46:11 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:46:39 - INFO - run_child_finet

global_step 32000, lr = -0.000196


06/09/2019 11:47:17 - INFO - run_child_finetuning -   Epoch 89
06/09/2019 11:47:17 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:47:45 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:47:45 - INFO - run_child_finetuning -     eval_accuracy = 0.56171875
06/09/2019 11:47:45 - INFO - run_child_finetuning -     eval_loss = 0.9563338147269355
06/09/2019 11:47:45 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:48:13 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:48:13 - INFO - run_child_finetuning -     eval_accuracy = 0.5494791666666666
06/09/2019 11:48:13 - INFO - run_child_finetuning -     eval_loss = 0.9669986453321245




Epoch:  89%|████████▉ | 89/100 [1:37:51<12:08, 66.23s/it][A[A[A[A06/09/2019 11:48:24 - INFO - run_child_finetuning -   Epoch 90
06/09/2019 11:48:24 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:48:51 - INFO - run_child_finetuning - 

global_step 33000, lr = -0.000206


06/09/2019 11:50:36 - INFO - run_child_finetuning -   Epoch 92
06/09/2019 11:50:36 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:51:04 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:51:04 - INFO - run_child_finetuning -     eval_accuracy = 0.5755208333333334
06/09/2019 11:51:04 - INFO - run_child_finetuning -     eval_loss = 0.9096341941091749
06/09/2019 11:51:04 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:51:32 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:51:32 - INFO - run_child_finetuning -     eval_accuracy = 0.56328125
06/09/2019 11:51:32 - INFO - run_child_finetuning -     eval_loss = 0.9205585459868113




Epoch:  92%|█████████▏| 92/100 [1:41:10<08:49, 66.14s/it][A[A[A[A06/09/2019 11:51:43 - INFO - run_child_finetuning -   Epoch 93
06/09/2019 11:51:43 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:52:11 - INFO - run_child_finetuning - 

global_step 34000, lr = -0.000215


06/09/2019 11:53:54 - INFO - run_child_finetuning -   Epoch 95
06/09/2019 11:53:54 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:54:21 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:54:21 - INFO - run_child_finetuning -     eval_accuracy = 0.5769965277777778
06/09/2019 11:54:21 - INFO - run_child_finetuning -     eval_loss = 0.8847115721967486
06/09/2019 11:54:21 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:54:49 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:54:49 - INFO - run_child_finetuning -     eval_accuracy = 0.5650173611111111
06/09/2019 11:54:49 - INFO - run_child_finetuning -     eval_loss = 0.8932965106434292




Epoch:  95%|█████████▌| 95/100 [1:44:27<05:29, 65.95s/it][A[A[A[A06/09/2019 11:54:59 - INFO - run_child_finetuning -   Epoch 96
06/09/2019 11:54:59 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:55:27 - INFO - run_child_finet

global_step 35000, lr = -0.000224


06/09/2019 11:57:11 - INFO - run_child_finetuning -   Epoch 98
06/09/2019 11:57:11 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:57:38 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:57:38 - INFO - run_child_finetuning -     eval_accuracy = 0.6711805555555556
06/09/2019 11:57:38 - INFO - run_child_finetuning -     eval_loss = 0.7093107594384087
06/09/2019 11:57:38 - INFO - run_child_finetuning -   Evaluating on valid set...
06/09/2019 11:58:06 - INFO - run_child_finetuning -   ***** Eval results *****
06/09/2019 11:58:06 - INFO - run_child_finetuning -     eval_accuracy = 0.6711805555555556
06/09/2019 11:58:06 - INFO - run_child_finetuning -     eval_loss = 0.7124807761775123




Epoch:  98%|█████████▊| 98/100 [1:47:44<02:11, 65.84s/it][A[A[A[A06/09/2019 11:58:17 - INFO - run_child_finetuning -   Epoch 99
06/09/2019 11:58:17 - INFO - run_child_finetuning -   Evaluating on train set...
06/09/2019 11:58:44 - INFO - run_child_finet