In [1]:
!pip install tokenizers
!pip install transformers

Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 6.3MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.9.4
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 6.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 21.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: fil

In [2]:
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

# Fixed sequence length: examples are padded to this many tokens, longer ones
# are skipped, and the model's input layers are declared with this width.
max_len = 384
# Default BERT configuration object; not referenced again in the visible cells.
configuration = BertConfig() 

In [4]:
# Download the pretrained tokenizer and write its files (including vocab.txt)
# to a local directory.
slow_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
save_path = 'bert_base_uncased/'

# exist_ok=True replaces the check-then-create pattern: it is safe to re-run
# the cell and it also creates any missing parent directories.
os.makedirs(save_path, exist_ok=True)

slow_tokenizer.save_pretrained(save_path)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




('bert_base_uncased/tokenizer_config.json',
 'bert_base_uncased/special_tokens_map.json',
 'bert_base_uncased/vocab.txt',
 'bert_base_uncased/added_tokens.json')

In [5]:
# Build the fast WordPiece tokenizer from the vocab file saved above. The path
# is derived from save_path so the directory name is defined in one place only.
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, 'vocab.txt'),
                                   lowercase=True)

In [6]:
# SQuAD v1.1 train and dev splits; keras.utils.get_file returns the local
# path of each downloaded file.
train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"

train_path = keras.utils.get_file("train.json", train_data_url)
eval_path = keras.utils.get_file("eval.json", eval_data_url)


Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json


In [8]:
class SquadExample:
  """One SQuAD question/answer pair and the model features derived from it.

  The constructor only stores the raw fields. Call `preprocess()` to tokenize
  them; it relies on the module-level `tokenizer` and `max_len`.

  Attributes set by the constructor:
    question, context, start_char_idx, answer_txt, all_answers: raw inputs.
    skip: True once `preprocess()` decides the example cannot be used.

  Attributes set by a successful `preprocess()` (absent when `skip` is True):
    input_ids, token_type_ids, attention_mask: lists of length `max_len`.
    start_token_idx, end_token_idx: first/last context token overlapping the
      answer span.
    context_token_to_char: the context encoding's `offsets`.
  """

  def __init__(self, question, context, start_char_idx, answer_txt,
               all_answers):
    """Store the raw example fields; no tokenization happens here."""
    self.question = question
    self.context = context
    self.start_char_idx = start_char_idx
    self.answer_txt = answer_txt
    self.all_answers = all_answers
    self.skip = False

  def preprocess(self):
    """Tokenize the example and build fixed-length model inputs.

    Sets `self.skip = True` and returns early when the answer span runs past
    the end of the context, when no context token overlaps the answer, or when
    the combined context + question is longer than `max_len` tokens.
    """
    start_char_idx = self.start_char_idx

    # Collapse every run of whitespace into a single space.
    # NOTE(review): start_char_idx is applied to the collapsed context as-is;
    # if the raw context has whitespace runs before the answer, the index may
    # no longer line up with the answer text. Confirm against the data.
    context = " ".join(str(self.context).split())
    question = " ".join(str(self.question).split())
    answer = " ".join(str(self.answer_txt).split())

    # Exclusive end of the answer span. It may equal len(context) when the
    # answer is the last thing in the context, so only a strictly larger value
    # is out of range.
    end_char_idx = start_char_idx + len(answer)
    if end_char_idx > len(context):
      self.skip = True
      return

    # Mark which context characters belong to the answer.
    is_char_in_ans = [0] * len(context)
    for idx in range(start_char_idx, end_char_idx):
      is_char_in_ans[idx] = 1

    # Tokenize the context.
    tokenized_context = tokenizer.encode(context)

    # Collect every context token whose character range touches the answer.
    ans_token_idx = []
    for idx, (start, end) in enumerate(tokenized_context.offsets):
      if sum(is_char_in_ans[start:end]) > 0:
        ans_token_idx.append(idx)

    if len(ans_token_idx) == 0:
      self.skip = True
      return

    # First and last answer tokens are the training targets.
    start_token_idx = ans_token_idx[0]
    end_token_idx = ans_token_idx[-1]

    # Tokenize the question.
    tokenized_question = tokenizer.encode(question)

    # Build the inputs: context ids followed by the question ids minus their
    # first id (presumably the leading special token, which the context
    # encoding already supplies).
    question_ids = tokenized_question.ids[1:]
    input_ids = tokenized_context.ids + question_ids
    token_type_ids = [0] * len(tokenized_context.ids) + \
                    [1] * len(question_ids)

    attention_mask = [1] * len(input_ids)

    # Number of padding positions needed to reach max_len.
    padding_length = max_len - len(input_ids)

    # Pad with zeros up to max_len; over-long examples are dropped.
    if padding_length > 0:
      input_ids = input_ids + ([0] * padding_length)
      attention_mask = attention_mask + ([0] * padding_length)
      token_type_ids = token_type_ids + ([0] * padding_length)

    elif padding_length < 0:
      self.skip = True
      return

    self.input_ids = input_ids
    self.token_type_ids = token_type_ids
    self.attention_mask = attention_mask
    self.start_token_idx = start_token_idx
    self.end_token_idx = end_token_idx
    self.context_token_to_char = tokenized_context.offsets


# Load the downloaded JSON files. The encoding is given explicitly so the
# result does not depend on the platform's default locale.
with open(train_path, encoding="utf-8") as f:
  raw_train_data = json.load(f)

with open(eval_path, encoding="utf-8") as f:
  raw_eval_data = json.load(f)




In [9]:
def create_squad_examples(raw_data):
    """Turn a parsed SQuAD JSON dict into a list of preprocessed examples.

    Walks `raw_data["data"]` -> `"paragraphs"` -> `"qas"` and builds one
    `SquadExample` per question, using the first listed answer for the text
    and start offset and keeping every answer text in `all_answers`.
    `preprocess()` is called on each example, and every example is returned,
    including those whose `skip` flag ends up set.
    """
    examples = []
    for article in raw_data["data"]:
        for paragraph in article["paragraphs"]:
            passage = paragraph["context"]
            for qa_entry in paragraph["qas"]:
                question_text = qa_entry["question"]
                answers = qa_entry["answers"]
                primary_text = answers[0]["text"]
                every_text = [ans["text"] for ans in answers]
                primary_start = answers[0]["answer_start"]

                example = SquadExample(
                    question_text, passage, primary_start, primary_text,
                    every_text
                )
                example.preprocess()
                examples.append(example)
    return examples

In [12]:
def create_inputs_targets(squad_examples):
  """Stack the features of all usable examples into model inputs and targets.

  Examples whose `skip` flag is set are ignored.

  Args:
    squad_examples: iterable of objects exposing `skip` and, when not skipped,
      `input_ids`, `token_type_ids`, `attention_mask`, `start_token_idx` and
      `end_token_idx`.

  Returns:
    (x, y) where x is [input_ids, token_type_ids, attention_mask] and y is
    [start_token_idx, end_token_idx], each a numpy array with one row/entry
    per usable example.
  """
  feature_names = (
      'input_ids',
      'token_type_ids',
      'attention_mask',
      'start_token_idx',
      'end_token_idx',
  )

  # Truthiness test instead of `== False` (PEP 8, E712).
  usable = [item for item in squad_examples if not item.skip]

  dataset_dict = {
      key: np.array([getattr(item, key) for item in usable])
      for key in feature_names
  }

  x = [
       dataset_dict['input_ids'],
       dataset_dict['token_type_ids'],
       dataset_dict['attention_mask']
  ]

  y = [dataset_dict['start_token_idx'], dataset_dict['end_token_idx']]
  return x, y

In [13]:
# Preprocess the training split, then stack the usable examples into arrays.
train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train = create_inputs_targets(train_squad_examples)

# This count covers every example created, including those flagged skip=True.
num_train_examples = len(train_squad_examples)
print(f'{num_train_examples} training points ')

87599 training points 


In [15]:
# Preprocess the evaluation split, then stack the usable examples into arrays.
eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)

# This count covers every example created, including those flagged skip=True.
num_eval_examples = len(eval_squad_examples)
print(f'{num_eval_examples} eval points ')

10570 eval points 


In [19]:
def create_model():
  """Build and compile a BERT-based span-prediction model.

  The model takes three int32 inputs of width `max_len` (input ids, token
  type ids, attention mask), feeds them to a pretrained `bert-base-uncased`
  encoder, and applies two bias-free Dense(1) heads to the encoder's first
  output (presumably the per-token hidden states, i.e. one score per
  position after flattening). Each head is followed by a softmax, giving a
  distribution over start positions and one over end positions.

  Returns:
    A compiled `keras.Model` with outputs [start_probs, end_probs], trained
    with sparse categorical cross-entropy on each output.
  """
  encoder = TFBertModel.from_pretrained('bert-base-uncased')

  # Input layers, all of fixed width max_len.
  input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
  token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
  attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)

  # Element 0 of the encoder output is used as the sequence representation.
  embedding = encoder(
      input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
  )[0]

  start_logits = layers.Dense(1, name='start_logit',use_bias=False)(embedding)
  start_logits = layers.Flatten()(start_logits)

  end_logits = layers.Dense(1, name='end_logit', use_bias=False)(embedding)
  end_logits = layers.Flatten()(end_logits)

  start_probs = layers.Activation(keras.activations.softmax)(start_logits)
  end_probs = layers.Activation(keras.activations.softmax)(end_logits)

  model = keras.Model(
      inputs = [input_ids, token_type_ids, attention_mask],
      outputs = [start_probs, end_probs]
  )

  # The outputs are already probabilities, hence from_logits=False.
  loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
  # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
  optimizer = keras.optimizers.Adam(learning_rate=5e-5)
  model.compile(optimizer=optimizer, loss= [loss, loss])
  return model


In [20]:
# Connect to the TPU, initialise it, and build the model under a TPU
# distribution strategy.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# Prefer the stable tf.distribute.TPUStrategy; fall back to the deprecated
# experimental alias on TensorFlow versions that do not have it yet.
_tpu_strategy_cls = (getattr(tf.distribute, "TPUStrategy", None)
                     or tf.distribute.experimental.TPUStrategy)
strategy = _tpu_strategy_cls(tpu)

# The model is created and compiled inside the strategy scope.
with strategy.scope():
  model = create_model()





INFO:tensorflow:Initializing the TPU system: grpc://10.45.26.186:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.45.26.186:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [21]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   input_1[0][0]                    
                                                                 input_3[0][0]         

In [None]:
# Fine-tune for a single epoch on the training arrays with batch_size=64.
# No validation data is passed, so only the training losses are reported.
model.fit(x_train, y_train, epochs=1, batch_size=64)

Instructions for updating:
Use `tf.data.Iterator.get_next_as_optional()` instead.


Instructions for updating:
Use `tf.data.Iterator.get_next_as_optional()` instead.






















 164/1346 [==>...........................] - ETA: 9:01 - loss: 4.3123 - activation_4_loss: 2.2026 - activation_5_loss: 2.1098