In [1]:
import sklearn.preprocessing as preprocess
import numpy as np
import pandas as pd
from keras import backend as K

Using TensorFlow backend.


In [2]:
# Load the headline/topic mapping CSV from the evaluation_set directory.
# The path is relative, so it resolves against the kernel's working directory.
df_hd_tp = pd.read_csv('evaluation_set/headline_topic_mapping.csv')

In [3]:
# Peek at the first few entries of the Title column as a quick sanity check.
df_hd_tp.Title.head()

0    Video game content rating system
1               Grand Theft Childhood
2                     School violence
3            Video game controversies
4               Nonviolent video game
Name: Title, dtype: object

In [8]:
import keras
from keras_bert import get_base_dict, get_model, compile_model, gen_batch_inputs


# Toy corpus: three entries, each a two-element list holding a pair of
# already-tokenised sentences (first sentence, second sentence).
sentence_pairs = [
    [
        ['all', 'work', 'and', 'no', 'play'],
        ['makes', 'jack', 'a', 'dull', 'boy'],
    ],
    [
        ['from', 'the', 'day', 'forth'],
        ['my', 'arm', 'changed'],
    ],
    [
        ['and', 'a', 'voice', 'echoed'],
        ['power', 'give', 'me', 'more', 'power'],
    ],
]


# Vocabulary construction: start from the special tokens supplied by
# keras_bert, then give every previously unseen corpus token the next free id.
token_dict = get_base_dict()
for pairs in sentence_pairs:
    for token in pairs[0] + pairs[1]:
        # setdefault leaves existing ids untouched, so repeated tokens
        # (e.g. 'and', 'a', 'power') keep the id from their first occurrence.
        token_dict.setdefault(token, len(token_dict))

# Flat list of every known token; handed to gen_batch_inputs, which uses it
# for selecting a random word.
token_list = list(token_dict)


# Build the model that will be trained.
# The hyper-parameters are collected in one mapping so they can be read at a
# glance; seq_len and pos_num are both 20, matching the seq_len passed to
# gen_batch_inputs in the batch generator.
bert_hyperparams = dict(
    token_num=len(token_dict),
    head_num=5,
    transformer_num=12,
    embed_dim=25,
    feed_forward_dim=100,
    seq_len=20,
    pos_num=20,
    dropout_rate=0.05,
)
model = get_model(**bert_hyperparams)

# Compile with keras_bert's helper, then print the layer-by-layer summary.
compile_model(model)
model.summary()

def _generator():
    """Yield an endless stream of batches built from the toy corpus.

    Every iteration calls ``gen_batch_inputs`` on the module-level
    ``sentence_pairs``, ``token_dict`` and ``token_list`` with a fixed
    sequence length of 20, a mask rate of 0.3 and a swap-sentence rate of 1.0,
    and yields whatever that call returns. The generator never terminates on
    its own, so the consumer decides how many batches to draw.
    """
    batch_options = dict(
        seq_len=20,
        mask_rate=0.3,
        swap_sentence_rate=1.0,
    )
    while True:
        batch = gen_batch_inputs(
            sentence_pairs,
            token_dict,
            token_list,
            **batch_options
        )
        yield batch

# Train for at most 100 epochs of 1000 steps, stopping early once val_loss
# has failed to improve for 5 consecutive epochs.
# NOTE: the validation batches come from a second _generator() over the same
# toy corpus, so val_loss is not measured on held-out data.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

model.fit_generator(
    generator=_generator(),
    steps_per_epoch=1000,
    epochs=100,
    validation_data=_generator(),
    validation_steps=100,
    callbacks=[early_stopping],
)


# Use the trained model
# `get_model(training=False)` returns the input layers and the output layer of
# a *newly built* graph, so on its own it carries freshly initialised weights
# rather than the ones learned by `model` above. The graph is therefore wrapped
# in a Keras model and the trained weights are copied across afterwards.
inputs, output_layer = get_model(
    token_num=len(token_dict),
    head_num=5,
    transformer_num=12,
    embed_dim=25,
    feed_forward_dim=100,
    seq_len=20,
    pos_num=20,
    dropout_rate=0.05,
    training=False,      # The input layers and output layer will be returned if `training` is `False`
    trainable=False,     # Whether the model is trainable. The default value is the same with `training`
    output_layer_num=4,  # The number of layers whose outputs will be concatenated as a single output.
                         # Only available when `training` is `False`.
)

# Feature-extraction model over the tensors returned above.
inference_model = keras.models.Model(inputs=inputs, outputs=output_layer)

# Copy weights from the trained model, matching layers by name. Layers that
# exist only on one side (training-only layers in `model`, the weightless
# concatenation layer here) are skipped.
# NOTE(review): this relies on both get_model calls giving corresponding
# layers identical names; inspect `transferred_layers` to confirm the
# embedding and encoder layers were actually matched.
trained_layers = {layer.name: layer for layer in model.layers}
transferred_layers = []
for layer in inference_model.layers:
    source_layer = trained_layers.get(layer.name)
    if source_layer is None or not layer.weights:
        continue
    layer.set_weights(source_layer.get_weights())
    transferred_layers.append(layer.name)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, 20)           0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, 20)           0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 20, 25), (28 700         Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 20, 25)       50          Input-Segment[0][0]              
__________________________________________________________________________________________________
Embedding-

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


In [9]:
# Inspect the input tensors returned by get_model(training=False).
inputs

[<tf.Tensor 'Input-Token_1:0' shape=(?, 20) dtype=float32>,
 <tf.Tensor 'Input-Segment_1:0' shape=(?, 20) dtype=float32>]

In [10]:
# Inspect the output tensor returned by get_model(training=False); it was
# requested with output_layer_num=4, i.e. four layer outputs concatenated.
output_layer

<tf.Tensor 'Encoder-Output/concat:0' shape=(?, 20, 100) dtype=float32>

In [11]:
# Draw a single batch from a fresh generator to see what the model is fed.
next(_generator())

([array([[ 2,  5,  6,  7,  8,  9,  3,  4, 11, 12, 13, 14,  3,  0,  0,  0,
           0,  0,  0,  0],
         [ 2,  4,  4, 17, 18,  3,  4, 25,  4, 27, 24,  3,  0,  0,  0,  0,
           0,  0,  0,  0],
         [ 2,  7,  4,  4, 23,  3, 19,  4, 21,  3,  0,  0,  0,  0,  0,  0,
           0,  0,  0,  0]]),
  array([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
  array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])],
 [array([[[ 2],
          [ 5],
          [ 6],
          [ 7],
          [ 8],
          [ 9],
          [ 3],
          [10],
          [11],
          [12],
          [13],
          [14],
          [ 3],
          [ 0],
          [ 0],
          [ 0],
        

In [13]:
# Show the special tokens that keras_bert puts in a fresh base dictionary.
get_base_dict()

{'': 0, '[CLS]': 2, '[MASK]': 4, '[SEP]': 3, '[UNK]': 1}

In [15]:
# Display the toy sentence pairs defined earlier.
sentence_pairs

[[['all', 'work', 'and', 'no', 'play'], ['makes', 'jack', 'a', 'dull', 'boy']],
 [['from', 'the', 'day', 'forth'], ['my', 'arm', 'changed']],
 [['and', 'a', 'voice', 'echoed'], ['power', 'give', 'me', 'more', 'power']]]

In [16]:
# Display the token -> id vocabulary (special tokens plus corpus tokens).
token_dict

{'': 0,
 '[CLS]': 2,
 '[MASK]': 4,
 '[SEP]': 3,
 '[UNK]': 1,
 'a': 12,
 'all': 5,
 'and': 7,
 'arm': 20,
 'boy': 14,
 'changed': 21,
 'day': 17,
 'dull': 13,
 'echoed': 23,
 'forth': 18,
 'from': 15,
 'give': 25,
 'jack': 11,
 'makes': 10,
 'me': 26,
 'more': 27,
 'my': 19,
 'no': 8,
 'play': 9,
 'power': 24,
 'the': 16,
 'voice': 22,
 'work': 6}

In [17]:
# Display the list of vocabulary tokens built from token_dict's keys.
token_list

['',
 'no',
 'me',
 'my',
 'power',
 'echoed',
 '[UNK]',
 '[MASK]',
 'work',
 'and',
 'voice',
 'boy',
 'more',
 'arm',
 'dull',
 '[CLS]',
 '[SEP]',
 'play',
 'the',
 'a',
 'forth',
 'jack',
 'from',
 'day',
 'makes',
 'give',
 'all',
 'changed']

In [19]:
# Rebuild the vocabulary from scratch: special tokens first, then each corpus
# token in order of first appearance.
# NOTE(review): this repeats the vocabulary construction done earlier in the
# notebook, and the recorded output of this cell lists every token, which
# suggests a print call was present when it was last executed - confirm and
# re-run or drop the cell.
token_dict = get_base_dict()
corpus_tokens = (
    token
    for pairs in sentence_pairs
    for token in pairs[0] + pairs[1]
)
for token in corpus_tokens:
    if token in token_dict:
        continue  # already has an id from an earlier occurrence
    token_dict[token] = len(token_dict)

all
work
and
no
play
makes
jack
a
dull
boy
from
the
day
forth
my
arm
changed
and
a
voice
echoed
power
give
me
more
power
