In [360]:
import json
import numpy as np
import torch as t
import torch.nn as nn
from torch.utils.data import Dataset

In [361]:
from collections import namedtuple
from nltk.tokenize import WordPunctTokenizer
import bisect

In [363]:
from model.modules.attention import BidirectionalAttention, SelfAttention
from model.util import get_device
from model.predictor import PredictorConfig, GRUConfig, PredictorModel, DocQAPredictor
from model.corpus import CorpusStats
from model.trainer import Trainer

In [364]:
Data = namedtuple('Data', ['paras', 
                           'qid_to_para', 
                           'qid_to_q', 
                           'tokenized_paras', 
                           'qid_to_tokenized_q', 
                           'encoded_paras', 
                           'qid_to_encoded_qtext', 
                           'qid_to_encoded_ans'])

WordVectors = namedtuple('WordVectors', ['vectors', 
                                         'word_to_idx', 
                                         'idx_to_word'])

VectorEncodedData = namedtuple('VectorEncodedData', ['qids', 
                                                     'paras', 
                                                     'qid_to_q', 
                                                     'qid_to_para', 
                                                     'qid_to_ans',
                                                     'vectors'])

Mapping = namedtuple('Mapping', ['all_tokens', 
                                 'all_chars', 
                                 'token_to_id', 
                                 'id_to_token', 
                                 'char_to_id', 
                                 'id_to_char'])

EncodedSample = namedtuple("EncodedSample", ['question_words',
                                             'question_id',
                                             'context_words',
                                             'span_starts',
                                             'span_ends',
                                             'question_chars',
                                             'context_chars'])

In [218]:
processor = lambda txt: txt.lower()
tokenizer = WordPunctTokenizer()
Token = namedtuple('Tokenized', ['words', 'spans'])
def tokenize(text):
    text = processor(text)
    spans = list(tokenizer.span_tokenize(text))
    words = [text[span_start:span_end] for span_start, span_end in spans]
    return Token(words, spans)

In [249]:
def read_raw_dict(data_file, data):
    with open(data_file, 'r') as f:
        data_dict = json.load(f)
    for doc in data_dict['data']:
        json_paragraphs = doc['paragraphs']
        for para in json_paragraphs:
            try:
                ctx = para['context']
            except TypeError:
                continue
            data.paras.append(ctx)
            cur_para_idx = len(data.paras) - 1
            qas = para['qas']
            for qa in qas:
                data.qid_to_para[qa['id']] = cur_para_idx
                answer = (qa['answers'][0]['answer_start'], qa['answers'][0]['answer_start'] + len(qa['answers'][0]['text']))
                question = {'text': qa['question'], 'answer': answer}
                data.qid_to_q[qa['id']] = question
    return data

In [250]:
def tokenize_data(data, mapping, build_new_mapping):
    if build_new_mapping:
        mapping = Mapping(set(), set(), {}, {}, {}, {})
    for para in data.paras:
        ptkn = tokenize(para)
        if build_new_mapping:
            mapping.all_tokens.update(ptkn.words)
            mapping.all_chars.update(''.join(ptkn.words))
        data.tokenized_paras.append(ptkn)
    
    
    for qid, qobj in data.qid_to_q.items():
        qtkn = tokenize(qobj['text'])
        if build_new_mapping:
            mapping.all_tokens.update(qtkn.words)
            mapping.all_chars.update(''.join(qtkn.words))
    
        pid = data.qid_to_para[qid]
        ptkn = data.tokenized_paras[pid]
        ptkn_starts, ptkn_ends = zip(*ptkn.spans)
        ans_first_tkn = bisect.bisect_right(ptkn_starts, qobj['answer'][0]) - 1
        ans_last_tkn = bisect.bisect_left(ptkn_ends, qobj['answer'][1])
        data.qid_to_tokenized_q[qid] = {'tokens': qtkn.words, 'answer': (ans_first_tkn, ans_last_tkn)}
    return data, mapping

In [262]:
def build_mapping(mapping):
    mapping.id_to_token[1] = '<UNK>'
    mapping.id_to_char[1] = '<UNK>'
    mapping.token_to_id.update(dict(map(reversed, enumerate(mapping.all_tokens, 2))))
    mapping.id_to_token.update(dict(enumerate(mapping.all_tokens, 2)))
    mapping.char_to_id.update(dict(map(reversed, enumerate(mapping.all_chars, 2))))
    mapping.id_to_char.update(dict(enumerate(mapping.all_chars, 2)))
    return mapping

In [263]:
def encode_data(data, mapping):
    data = data._replace(encoded_paras=[[mapping.token_to_id.get(word, 1) for word in para.words] for para in data.tokenized_paras])
    for qid, qobj in data.qid_to_tokenized_q.items():
        encoded_question = [mapping.token_to_id.get(word, 1) for word in qobj['tokens']]
        para_len = len(data.encoded_paras[data.qid_to_para[qid]])
        ans = qobj['answer']
        encoded_answer = [1 if (i >= ans[0] and i <= ans[1]) else 0 for i in range(para_len)]
        data.qid_to_encoded_qtext[qid] = encoded_question
        data.qid_to_encoded_ans[qid] = encoded_answer
    return data

In [264]:
def print_samples(data, mapping, num_questions=1):
    i=1
    for qid, pid in data.qid_to_para.items():
        if i > num_questions:
            break
        i+= 1
    
        orig_qobj = data.qid_to_q[qid]
        orig_span = orig_qobj['answer']
    
        orig_q = orig_qobj['text']
        orig_p = data.paras[pid]
        orig_a = orig_p[orig_qobj['answer'][0]:orig_qobj['answer'][1]]
        print('Original q: {}'.format(orig_q))
        print('Original p: {}'.format(orig_p))
        print('Original a: {}'.format(orig_a))
        print()
    
        q = data.qid_to_encoded_qtext[qid]
        p = data.encoded_paras[pid]
        a = data.qid_to_encoded_ans[qid]
    
        q_text = ' '.join([mapping.id_to_token[idx] for idx in q])
        p_text = ' '.join([mapping.id_to_token[idx] for idx in p])
        a_text = ' '.join([mapping.id_to_token[tok] for idx, tok in enumerate(p) if a[idx] == 1])
    
        print('q: {}'.format(q_text))
        print('p: {}'.format(p_text))
        print('a: {}'.format(a_text))
        print()

In [254]:
def parse_squad(data_file, build_new_mapping=False, mapping=None):
    data = Data([], {}, {}, [], {}, [], {}, {})
    if build_new_mapping:
        mapping = Mapping(set(), set(), {}, {}, {}, {})
    data = read_raw_dict(data_file, data)
    data, mapping = tokenize_data(data, mapping, build_new_mapping)
    if build_new_mapping:
        mapping = build_mapping(mapping)
    data = encode_data(data, mapping)
    return data, mapping

In [266]:
print_samples(train_data, mapping)
print_samples(dev_data, mapping, num_questions=10)

Original q: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Original p: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Original a: Saint Bernadette Soubirous

q: to whom did the virgin mary allegedly appear in 1858 in lourdes france ?
p: architecturally , the school has a catholic character . atop the main building ' s gold d

In [292]:
def read_vectors_file(vectors_file):
    vectors_list = []
    vocab = []
    with open(vectors_file, 'r') as f:
        for line in f:
            word, vec_data = line.rstrip("\n").split(" ", 1)
            vector = np.array([float(num) for num in vec_data.split(" ")], dtype=np.float32)
            vectors_list.append(vector)
            vocab.append(word)
    pad_vec = np.zeros_like(vectors_list[0])
    vectors_list.insert(0, pad_vec)
    vocab.insert(0, '<PAD>')
    unk_vec = np.random.randn((vectors_list[0].shape[0]))
    vectors_list.insert(1, unk_vec)
    vocab.insert(1, '<UNK>')
    return WordVectors(vectors=np.stack(vectors_list), idx_to_word=dict(enumerate(vocab)), word_to_idx=dict(map(reversed, enumerate(vocab))))

In [308]:
def encode_dataset_with_vectors(data, mapping, starting_vectors, filter_vectors=False):
    if filter_vectors:
        dataset_mapping_id_to_vectors_idx = {0: 0, 1: 1} # pad->pad, unk->unk
        for idx, word in mapping.id_to_token.items():
            vectors_idx = starting_vectors.word_to_idx.get(word, 1)
            dataset_mapping_id_to_vectors_idx[idx] = vectors_idx
        used_vector_indices = list(sorted(dataset_mapping_id_to_vectors_idx.values()))    
        
        compact_vectors = np.take(starting_vectors.vectors, used_vector_indices, axis=0)
        old_vector_idx_to_compact_vector_idx = dict(map(reversed, enumerate(used_vector_indices)))
        word_to_compact_vector_idx = {word: old_vector_idx_to_compact_vector_idx[idx] for word, idx in starting_vectors.word_to_idx.items() if idx in used_vector_indices}
        compact_vector_idx_to_word = {old_vector_idx_to_compact_vector_idx[idx]: word for word, idx in starting_vectors.idx_to_word.items() if idx in used_vector_indices}
        vectors = WordVectors(vectors=compact_vectors, word_to_idx=word_to_compact_vector_idx, idx_to_word=compact_vector_idx_to_word)
    else:
        vectors = starting_vectors
        
    dataset_mapping_id_to_vectors_idx = {idx: vectors.word_to_idx.get(word, 1) for idx, word in mapping.id_to_token.items()}
    encoded_qid_to_q = {qid: [dataset_mapping_id_to_vectors_idx[tok_id] for tok_id in qobj] for qid, qobj in data.qid_to_encoded_qtext.items()}
    encoded_paras = [[dataset_mapping_id_to_vectors_idx[tok_id] for tok_id in para] for para in data.encoded_paras]
    
    vector_encoded_data = VectorEncodedData(qids=list(data.qid_to_para.keys()),
                                            paras=encoded_paras,
                                            qid_to_q=encoded_qid_to_q,
                                            qid_to_para=data.qid_to_para,
                                            qid_to_ans=data.qid_to_encoded_ans,
                                            vectors=vectors)
    return vector_encoded_data

In [378]:
class SQACorpus():
    def __init__(self, encoded_data):
        self.stats = CorpusStats(
            n_contexts=len(encoded_data.paras),
            n_questions=len(encoded_data.qids),
            n_answerable=len(encoded_data.qids),
            n_unanswerable=0,
            max_context_len=max(len(para) for para in encoded_data.paras),
            max_q_len=max(len(qtext) for qtext in encoded_data.qid_to_q.values()),
            max_word_len=0,
            single_answer=True,
            word_vocab_size=0,
            char_vocab_size=0)
        self.idx_to_word = encoded_data.vectors.idx_to_word
        self.encoded_data = encoded_data
        self.samples = []
        for qid in encoded_data.qids:
            q_words = np.array(encoded_data.qid_to_q[qid])
            ctx_words = np.array(encoded_data.paras[encoded_data.qid_to_para[qid]])
            spans = encoded_data.qid_to_ans[qid]
            span_starts = np.array([spans.index(1)])
            span_ends = np.array([len(spans) - list(reversed(ans)).index(1)])
            q_chars = np.array((len(encoded_data.qid_to_q[qid]), 1))
            ctx_chars = np.array((ctx_words.shape[0], 1))
            self.samples.append(EncodedSample(q_words, qid, ctx_words, span_starts, span_ends, q_chars, ctx_chars))  

        
class SQADataset(Dataset):
    def __init__(self, encoded_data):
        self.corpus = SQACorpus(encoded_data)
        self.source_file = 'data/original/dev.json'
        
    def __len__(self):
        return len(self.corpus.samples)
    
    def __getitem__(self, idx):
        return self.corpus.samples[idx]
    
    def get_answer_texts(self, answer_token_idxs):
        return {qid: self.get_single_answer_text(qid, span_start, span_end) for qid, (span_start, span_end) in answer_token_idxs.items()}
            
    def get_single_answer_text(self, qid, span_start, span_end):
        # TODO: Should this be +1 for end
        return " ".join(self.corpus.encoded_data.paras[self.corpus.encoded_data.qid_to_para[qid]][span_start: span_end])


In [385]:
class WordEmbeddor(nn.Module):
    def __init__(self, vectors):
        super().__init__()
        self.embedding_dim = vectors.shape[1]
        embedding_matrix = t.Tensor(vectors)
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
    
    def forward(self, words, chars):
        return self.embedding(words)

In [386]:
def initialize_model(word_vecs, use_cuda=True, rnn_hidden_size=128, rnn_num_layers=2, dropout=0.1, bidirectional=True, attention_linear_hidden_size=128, use_self_attention=True, batch_size=32):
    device = get_device(not use_cuda)
    predictor_config = PredictorConfig(
        gru=GRUConfig(
            rnn_hidden_size,
            rnn_num_layers,
            dropout,
            not bidirectional),
        dropout_prob=dropout,
        attention_linear_hidden_size=attention_linear_hidden_size,
        use_self_attention=use_self_attention,
        batch_size=batch_size)
    embeddor = WordEmbeddor(word_vecs)
    predictor_model = DocQAPredictor(embeddor, predictor_config).to(device)
    return predictor_model

In [255]:
train_data, mapping = parse_squad('data/original/train.json', build_new_mapping=True)
dev_data, mapping = parse_squad('data/original/dev.json', mapping=mapping)

In [298]:
glove_vectors = read_vectors_file('data/word-vectors/glove/glove.6B.100d.txt')

In [309]:
glove_encoded_train_data = encode_dataset_with_vectors(train_data, mapping, glove_vectors, filter_vectors=True)

In [310]:
train_vecs = glove_encoded_train_data.vectors

In [311]:
glove_encoded_dev_data = encode_dataset_with_vectors(dev_data, mapping, train_vecs, filter_vectors=False)

In [379]:
train_dataset = SQADataset(glove_encoded_train_data)

In [381]:
sample = train_dataset[0]
max(word.size for word in sample.question_chars)

1

In [382]:
dev_dataset = SQADataset(glove_encoded_dev_data)

In [387]:
test_model = initialize_model(train_vecs.vectors, 
                              use_cuda=False, 
                              rnn_hidden_size=64, 
                              rnn_num_layers=1, 
                              dropout=0, 
                              bidirectional=False, 
                              use_self_attention=False, 
                              batch_size=6)

In [388]:
training_config = Trainer.TrainingConfig(learning_rate=2e-3,weight_decay=1e-5,num_epochs=20,batch_size=5,max_question_size=0,max_context_size=0,device=get_device(True),loader_num_workers=0,model_checkpoint_path='jupyter.pth')

In [None]:
Trainer.train_model(test_model, train_dataset, dev_dataset, training_config, debug=False)

Epoch 1:   0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/17520 [00:00<?, ?it/s][A
Batch 1:   0%|          | 0/17520 [00:00<?, ?it/s][A
Batch 1:   0%|          | 0/17520 [00:00<?, ?it/s, loss=9.73][A
Batch 1:   0%|          | 1/17520 [00:00<3:44:00,  1.30it/s, loss=9.73][A
Batch 2:   0%|          | 1/17520 [00:00<3:47:03,  1.29it/s, loss=9.73][A
Batch 2:   0%|          | 1/17520 [00:01<5:34:02,  1.14s/it, loss=9.21][A
Batch 2:   0%|          | 2/17520 [00:01<2:47:57,  1.74it/s, loss=9.21][A
Batch 3:   0%|          | 2/17520 [00:01<2:49:37,  1.72it/s, loss=9.21][A
Batch 3:   0%|          | 2/17520 [00:01<4:13:08,  1.15it/s, loss=9.27][A
Batch 3:   0%|          | 3/17520 [00:01<2:49:03,  1.73it/s, loss=9.27][A
Batch 4:   0%|          | 3/17520 [00:01<2:50:24,  1.71it/s, loss=9.27][A
Batch 4:   0%|          | 3/17520 [00:02<3:25:39,  1.42it/s, loss=8.22][A
Batch 4:   0%|          | 4/17520 [00:02<2:34:26,  1.89it/s, loss=8.22][A
Batch 5:   0%|          | 4/17520 [00:

Batch 35:   0%|          | 35/17520 [00:19<2:38:19,  1.84it/s, loss=0][A
Batch 36:   0%|          | 35/17520 [00:19<2:38:26,  1.84it/s, loss=0][A
Batch 36:   0%|          | 35/17520 [00:19<2:43:46,  1.78it/s, loss=0][A
Batch 36:   0%|          | 36/17520 [00:19<2:39:14,  1.83it/s, loss=0][A
Batch 37:   0%|          | 36/17520 [00:19<2:39:21,  1.83it/s, loss=0][A
Batch 37:   0%|          | 36/17520 [00:20<2:42:29,  1.79it/s, loss=4.77e-07][A
Batch 37:   0%|          | 37/17520 [00:20<2:38:09,  1.84it/s, loss=4.77e-07][A
Batch 38:   0%|          | 37/17520 [00:20<2:38:14,  1.84it/s, loss=4.77e-07][A
Batch 38:   0%|          | 37/17520 [00:20<2:41:26,  1.80it/s, loss=0]       [A
Batch 38:   0%|          | 38/17520 [00:20<2:37:14,  1.85it/s, loss=0][A
Batch 39:   0%|          | 38/17520 [00:20<2:37:18,  1.85it/s, loss=0][A
Batch 39:   0%|          | 38/17520 [00:21<2:41:39,  1.80it/s, loss=0][A
Batch 39:   0%|          | 39/17520 [00:21<2:37:32,  1.85it/s, loss=0][A
Batch 40: 

Batch 70:   0%|          | 70/17520 [00:38<2:39:55,  1.82it/s, loss=0][A
Batch 71:   0%|          | 70/17520 [00:38<2:39:58,  1.82it/s, loss=0][A
Batch 71:   0%|          | 70/17520 [00:39<2:43:32,  1.78it/s, loss=0][A
Batch 71:   0%|          | 71/17520 [00:39<2:41:21,  1.80it/s, loss=0][A
Batch 72:   0%|          | 71/17520 [00:39<2:41:33,  1.80it/s, loss=0][A
Batch 72:   0%|          | 71/17520 [00:40<2:45:27,  1.76it/s, loss=0.00647][A
Batch 72:   0%|          | 72/17520 [00:40<2:43:10,  1.78it/s, loss=0.00647][A
Batch 73:   0%|          | 72/17520 [00:40<2:43:13,  1.78it/s, loss=0.00647][A
Batch 73:   0%|          | 72/17520 [00:40<2:44:43,  1.77it/s, loss=1.67e-07][A
Batch 73:   0%|          | 73/17520 [00:40<2:42:27,  1.79it/s, loss=1.67e-07][A
Batch 74:   0%|          | 73/17520 [00:40<2:42:30,  1.79it/s, loss=1.67e-07][A
Batch 74:   0%|          | 73/17520 [00:41<2:45:08,  1.76it/s, loss=4.77e-08][A
Batch 74:   0%|          | 74/17520 [00:41<2:42:55,  1.78it/s, los

Batch 105:   1%|          | 104/17520 [00:59<2:47:18,  1.73it/s, loss=1.91e-07][A
Batch 105:   1%|          | 104/17520 [01:00<2:48:20,  1.72it/s, loss=1.43e-06][A
Batch 105:   1%|          | 105/17520 [01:00<2:46:45,  1.74it/s, loss=1.43e-06][A
Batch 106:   1%|          | 105/17520 [01:00<2:46:46,  1.74it/s, loss=1.43e-06][A
Batch 106:   1%|          | 105/17520 [01:00<2:48:06,  1.73it/s, loss=2.86e-07][A
Batch 106:   1%|          | 106/17520 [01:00<2:46:31,  1.74it/s, loss=2.86e-07][A
Batch 107:   1%|          | 106/17520 [01:00<2:46:33,  1.74it/s, loss=2.86e-07][A
Batch 107:   1%|          | 106/17520 [01:01<2:49:12,  1.72it/s, loss=0]       [A
Batch 107:   1%|          | 107/17520 [01:01<2:47:37,  1.73it/s, loss=0][A
Batch 108:   1%|          | 107/17520 [01:01<2:47:40,  1.73it/s, loss=0][A
Batch 108:   1%|          | 107/17520 [01:02<2:48:51,  1.72it/s, loss=0][A
Batch 108:   1%|          | 108/17520 [01:02<2:47:18,  1.73it/s, loss=0][A
Batch 109:   1%|          | 108/

Batch 138:   1%|          | 137/17520 [01:19<2:48:10,  1.72it/s, loss=5.01e-07][A
Batch 138:   1%|          | 138/17520 [01:19<2:46:57,  1.74it/s, loss=5.01e-07][A
Batch 139:   1%|          | 138/17520 [01:19<2:46:58,  1.73it/s, loss=5.01e-07][A
Batch 139:   1%|          | 138/17520 [01:19<2:47:52,  1.73it/s, loss=1.1e-05] [A
Batch 139:   1%|          | 139/17520 [01:19<2:46:40,  1.74it/s, loss=1.1e-05][A
Batch 140:   1%|          | 139/17520 [01:19<2:46:41,  1.74it/s, loss=1.1e-05][A
Batch 140:   1%|          | 139/17520 [01:20<2:47:26,  1.73it/s, loss=2.67e-06][A
Batch 140:   1%|          | 140/17520 [01:20<2:46:15,  1.74it/s, loss=2.67e-06][A
Batch 141:   1%|          | 140/17520 [01:20<2:46:16,  1.74it/s, loss=2.67e-06][A
Batch 141:   1%|          | 140/17520 [01:20<2:47:28,  1.73it/s, loss=3.91e-06][A
Batch 141:   1%|          | 141/17520 [01:20<2:46:16,  1.74it/s, loss=3.91e-06][A
Batch 142:   1%|          | 141/17520 [01:20<2:46:18,  1.74it/s, loss=3.91e-06][A
Batch 

Batch 171:   1%|          | 170/17520 [01:36<2:43:18,  1.77it/s, loss=2.29e-06][A
Batch 171:   1%|          | 171/17520 [01:36<2:42:21,  1.78it/s, loss=2.29e-06][A
Batch 172:   1%|          | 171/17520 [01:36<2:42:22,  1.78it/s, loss=2.29e-06][A
Batch 172:   1%|          | 171/17520 [01:36<2:43:37,  1.77it/s, loss=7.06e-06][A
Batch 172:   1%|          | 172/17520 [01:36<2:42:39,  1.78it/s, loss=7.06e-06][A
Batch 173:   1%|          | 172/17520 [01:36<2:42:41,  1.78it/s, loss=7.06e-06][A
Batch 173:   1%|          | 172/17520 [01:37<2:43:27,  1.77it/s, loss=6.2e-06] [A
Batch 173:   1%|          | 173/17520 [01:37<2:42:30,  1.78it/s, loss=6.2e-06][A
Batch 174:   1%|          | 173/17520 [01:37<2:42:31,  1.78it/s, loss=6.2e-06][A
Batch 174:   1%|          | 173/17520 [01:37<2:43:20,  1.77it/s, loss=1.84e-06][A
Batch 174:   1%|          | 174/17520 [01:37<2:42:24,  1.78it/s, loss=1.84e-06][A
Batch 175:   1%|          | 174/17520 [01:37<2:42:25,  1.78it/s, loss=1.84e-06][A
Batch 

Batch 204:   1%|          | 203/17520 [01:54<2:42:42,  1.77it/s, loss=0.00829][A
Batch 204:   1%|          | 203/17520 [01:54<2:43:21,  1.77it/s, loss=1.97e-05][A
Batch 204:   1%|          | 204/17520 [01:54<2:42:33,  1.78it/s, loss=1.97e-05][A
Batch 205:   1%|          | 204/17520 [01:54<2:42:34,  1.78it/s, loss=1.97e-05][A
Batch 205:   1%|          | 204/17520 [01:55<2:43:09,  1.77it/s, loss=4.29e-06][A
Batch 205:   1%|          | 205/17520 [01:55<2:42:21,  1.78it/s, loss=4.29e-06][A
Batch 206:   1%|          | 205/17520 [01:55<2:42:22,  1.78it/s, loss=4.29e-06][A
Batch 206:   1%|          | 205/17520 [01:55<2:43:01,  1.77it/s, loss=8.11e-07][A
Batch 206:   1%|          | 206/17520 [01:55<2:42:13,  1.78it/s, loss=8.11e-07][A
Batch 207:   1%|          | 206/17520 [01:55<2:42:14,  1.78it/s, loss=8.11e-07][A
Batch 207:   1%|          | 206/17520 [01:56<2:43:09,  1.77it/s, loss=9.06e-07][A
Batch 207:   1%|          | 207/17520 [01:56<2:42:21,  1.78it/s, loss=9.06e-07][A
Batch

Batch 236:   1%|▏         | 236/17520 [02:11<2:40:36,  1.79it/s, loss=0.000114][A
Batch 237:   1%|▏         | 236/17520 [02:11<2:40:37,  1.79it/s, loss=0.000114][A
Batch 237:   1%|▏         | 236/17520 [02:12<2:41:11,  1.79it/s, loss=1.14e-06][A
Batch 237:   1%|▏         | 237/17520 [02:12<2:40:30,  1.79it/s, loss=1.14e-06][A
Batch 238:   1%|▏         | 237/17520 [02:12<2:40:31,  1.79it/s, loss=1.14e-06][A
Batch 238:   1%|▏         | 237/17520 [02:12<2:41:09,  1.79it/s, loss=7.63e-07][A
Batch 238:   1%|▏         | 238/17520 [02:12<2:40:28,  1.79it/s, loss=7.63e-07][A
Batch 239:   1%|▏         | 238/17520 [02:12<2:40:29,  1.79it/s, loss=7.63e-07][A
Batch 239:   1%|▏         | 238/17520 [02:12<2:40:50,  1.79it/s, loss=9.54e-07][A
Batch 239:   1%|▏         | 239/17520 [02:12<2:40:10,  1.80it/s, loss=9.54e-07][A
Batch 240:   1%|▏         | 239/17520 [02:12<2:40:10,  1.80it/s, loss=9.54e-07][A
Batch 240:   1%|▏         | 239/17520 [02:13<2:40:38,  1.79it/s, loss=1.53e-06][A
Batc

Batch 270:   2%|▏         | 269/17520 [02:28<2:39:03,  1.81it/s, loss=9.54e-07][A
Batch 270:   2%|▏         | 269/17520 [02:29<2:39:34,  1.80it/s, loss=2.84e-05][A
Batch 270:   2%|▏         | 270/17520 [02:29<2:38:58,  1.81it/s, loss=2.84e-05][A
Batch 271:   2%|▏         | 270/17520 [02:29<2:38:59,  1.81it/s, loss=2.84e-05][A
Batch 271:   2%|▏         | 270/17520 [02:29<2:39:21,  1.80it/s, loss=3.81e-07][A
Batch 271:   2%|▏         | 271/17520 [02:29<2:38:45,  1.81it/s, loss=3.81e-07][A
Batch 272:   2%|▏         | 271/17520 [02:29<2:38:46,  1.81it/s, loss=3.81e-07][A
Batch 272:   2%|▏         | 271/17520 [02:30<2:39:14,  1.81it/s, loss=7.63e-07][A
Batch 272:   2%|▏         | 272/17520 [02:30<2:38:38,  1.81it/s, loss=7.63e-07][A
Batch 273:   2%|▏         | 272/17520 [02:30<2:38:39,  1.81it/s, loss=7.63e-07][A
Batch 273:   2%|▏         | 272/17520 [02:30<2:39:09,  1.81it/s, loss=3.81e-07][A
Batch 273:   2%|▏         | 273/17520 [02:30<2:38:34,  1.81it/s, loss=3.81e-07][A
Batc

Batch 303:   2%|▏         | 303/17520 [02:45<2:36:34,  1.83it/s, loss=0][A
Batch 304:   2%|▏         | 303/17520 [02:45<2:36:35,  1.83it/s, loss=0][A
Batch 304:   2%|▏         | 303/17520 [02:45<2:37:06,  1.83it/s, loss=0][A
Batch 304:   2%|▏         | 304/17520 [02:45<2:36:35,  1.83it/s, loss=0][A
Batch 305:   2%|▏         | 304/17520 [02:45<2:36:36,  1.83it/s, loss=0][A
Batch 305:   2%|▏         | 304/17520 [02:46<2:37:11,  1.83it/s, loss=0][A
Batch 305:   2%|▏         | 305/17520 [02:46<2:36:39,  1.83it/s, loss=0][A
Batch 306:   2%|▏         | 305/17520 [02:46<2:36:40,  1.83it/s, loss=0][A
Batch 306:   2%|▏         | 305/17520 [02:46<2:37:02,  1.83it/s, loss=0][A
Batch 306:   2%|▏         | 306/17520 [02:46<2:36:31,  1.83it/s, loss=0][A
Batch 307:   2%|▏         | 306/17520 [02:46<2:36:32,  1.83it/s, loss=0][A
Batch 307:   2%|▏         | 306/17520 [02:47<2:36:54,  1.83it/s, loss=3.62e-06][A
Batch 307:   2%|▏         | 307/17520 [02:47<2:36:23,  1.83it/s, loss=3.62e-06][

Batch 338:   2%|▏         | 337/17520 [03:02<2:35:25,  1.84it/s, loss=0][A
Batch 338:   2%|▏         | 337/17520 [03:03<2:35:51,  1.84it/s, loss=0][A
Batch 338:   2%|▏         | 338/17520 [03:03<2:35:23,  1.84it/s, loss=0][A
Batch 339:   2%|▏         | 338/17520 [03:03<2:35:24,  1.84it/s, loss=0][A
Batch 339:   2%|▏         | 338/17520 [03:04<2:36:14,  1.83it/s, loss=0][A
Batch 339:   2%|▏         | 339/17520 [03:04<2:35:45,  1.84it/s, loss=0][A
Batch 340:   2%|▏         | 339/17520 [03:04<2:35:46,  1.84it/s, loss=0][A
Batch 340:   2%|▏         | 339/17520 [03:04<2:36:09,  1.83it/s, loss=1.91e-07][A
Batch 340:   2%|▏         | 340/17520 [03:04<2:35:41,  1.84it/s, loss=1.91e-07][A
Batch 341:   2%|▏         | 340/17520 [03:04<2:35:41,  1.84it/s, loss=1.91e-07][A
Batch 341:   2%|▏         | 340/17520 [03:05<2:35:59,  1.84it/s, loss=3.05e-06][A
Batch 341:   2%|▏         | 341/17520 [03:05<2:35:31,  1.84it/s, loss=3.05e-06][A
Batch 342:   2%|▏         | 341/17520 [03:05<2:35:32,