In [25]:
import logging
import os

import kashgari
from kashgari.embeddings import NumericFeaturesEmbedding, BERTEmbedding, StackedEmbedding
from kashgari.tasks.labeling import BiLSTM_Model    #can change this to any other Neural network as well

# Root logger at DEBUG level so library diagnostics are shown in the notebook.
logging.basicConfig(level=logging.DEBUG)

In [73]:
#DATASET

# Token sequences, one inner list per sample; each already carries the
# [CLS] / [SEP] markers.
# NOTE(review): the first sample has 6 tokens while every feature/label row
# below has 5 entries and SEQUENCE_LEN is 5 -- confirm the intended
# token/feature alignment before relying on the training results.
text = [
    ['[CLS]', "'", 'nl', '##p', "'", '[SEP]'],
    ['[CLS]', "'", 'projects', "'", '[SEP]'],
    ['[CLS]', "'", 'project', "'", '[SEP]'],
    ['[CLS]', "'", 'name', "'", '[SEP]'],
    ['[CLS]', "'", ':', "'", '[SEP]'],
]

# Template rows for the numeric features and the labels.
start_of_p = [1, 2, 1, 2, 2]
bold = [1, 1, 1, 1, 2]
center = [1, 1, 2, 2, 2]
label = ['B-Category', 'I-Category', 'B-Project-name', 'I-Project-name', 'I-Project-name']

# One feature/label row per sample. Each row is an independent copy: the
# previous `[row] * 5` form repeated the *same* list object, so changing one
# sample's features would silently change all of them. Deriving the count from
# `text` also keeps the lists in step if samples are added or removed.
text_list = text
start_of_p_list = [list(start_of_p) for _ in text]
bold_list = [list(bold) for _ in text]
center_list = [list(center) for _ in text]
label_list = [list(label) for _ in text]

#SEQUENCE LENGTH
SEQUENCE_LEN = 5

In [59]:
#BERT FEATURES

# Directory holding the pre-trained BERT checkpoint. Set the BERT_MODEL_PATH
# environment variable to point at a local copy; the previously hard-coded
# location is kept as the fallback so existing setups keep working.
bert_model_path = os.environ.get('BERT_MODEL_PATH',
                                 '/Users/ishitagupta/bert/uncased_L-12_H-768_A-12')
bert_embedding = BERTEmbedding(bert_model_path,
                               task=kashgari.LABELING,
                               sequence_length=SEQUENCE_LEN)

# Raw sentences could be tokenized with the embedding's own tokenizer
# (left disabled; `sentences` is not defined in this notebook):
#tokenizer = bert_embedding.tokenizer
#sentences_tokenized = []
#for sentence in sentences:
#    sentence_tokenized = tokenizer.tokenize(sentence)
#    sentences_tokenized.append(sentence_tokenized)
#
# The sentences will become tokenized into:
# [
#     ['[CLS]', 'jim', 'henson', 'was', 'a', 'puppet', '##eer', '.', '[SEP]'],
#     ['[CLS]', 'this', 'here', "'", 's', 'an', 'example', 'of', 'using', 'the', 'bert', 'token', '##izer', '.', '[SEP]'],
#     ['[CLS]', 'why', 'did', 'the', 'chicken', 'cross', 'the', 'road', '?', '[SEP]']
# ]

# Our tokenizer already added the BOS([CLS]) and EOS([SEP]) token
# so we need to disable the default add_bos_eos setting.
bert_embedding.processor.add_bos_eos = False


KeyboardInterrupt: 

In [60]:
#CUSTOM NUMERICAL FEATURES

# One embedding per numeric feature, all configured identically apart from the
# feature name. The generator is consumed in order, so the embeddings are
# constructed as start_of_p, then bold, then center.
start_of_p_embedding, bold_embedding, center_embedding = (
    NumericFeaturesEmbedding(feature_count=2,
                             feature_name=feature,
                             sequence_length=SEQUENCE_LEN)
    for feature in ('start_of_p', 'bold', 'center')
)


In [61]:
# COMBINING ALL FEATURES
# The text embedding has to come first; the numeric feature embeddings follow.
numeric_embeddings = [start_of_p_embedding, bold_embedding, center_embedding]
stack_embedding = StackedEmbedding([bert_embedding] + numeric_embeddings)


[<tf.Tensor 'Input-Token_2:0' shape=(?, 100) dtype=float32>, <tf.Tensor 'Input-Segment_2:0' shape=(?, 100) dtype=float32>]
[<tf.Tensor 'Input-Token_2:0' shape=(?, 100) dtype=float32>, <tf.Tensor 'Input-Segment_2:0' shape=(?, 100) dtype=float32>]
Tensor("input_start_of_p_6:0", shape=(?, 100), dtype=float32)
[<tf.Tensor 'input_start_of_p_6:0' shape=(?, 100) dtype=float32>]
Tensor("input_bold_6:0", shape=(?, 100), dtype=float32)
[<tf.Tensor 'input_bold_6:0' shape=(?, 100) dtype=float32>]
Tensor("input_center_6:0", shape=(?, 100), dtype=float32)
[<tf.Tensor 'input_center_6:0' shape=(?, 100) dtype=float32>]


In [74]:

# Inputs are bundled as a tuple: token sequences first, then the three
# numeric feature sequences; the labels are the targets.
x = text_list, start_of_p_list, bold_list, center_list
y = label_list

# Let the stacked embedding inspect the corpus before it is used.
stack_embedding.analyze_corpus(x, y)

print(x)

([['[CLS]', "'", 'nl', '##p', "'", '[SEP]'], ['[CLS]', "'", 'projects', "'", '[SEP]'], ['[CLS]', "'", 'project', "'", '[SEP]'], ['[CLS]', "'", 'name', "'", '[SEP]'], ['[CLS]', "'", ':', "'", '[SEP]']], [[1, 2, 1, 2, 2], [1, 2, 1, 2, 2], [1, 2, 1, 2, 2], [1, 2, 1, 2, 2], [1, 2, 1, 2, 2]], [[1, 1, 1, 1, 2], [1, 1, 1, 1, 2], [1, 1, 1, 1, 2], [1, 1, 1, 1, 2], [1, 1, 1, 1, 2]], [[1, 1, 2, 2, 2], [1, 1, 2, 2, 2], [1, 1, 2, 2, 2], [1, 1, 2, 2, 2], [1, 1, 2, 2, 2]])


In [75]:
# The stacked embedding layer can now embed the bundled inputs.
embedded = stack_embedding.embed(x)
print(embedded)

[[[-0.63078856  0.03127154 -0.66898596 ...  0.03219146  0.02711165
    0.04457002]
  [-0.48703504  0.43648356 -0.12753144 ...  0.03219146  0.02711165
    0.04457002]
  [ 1.05404    -0.99053955  0.18844646 ...  0.04742653  0.02535257
    0.02551291]
  ...
  [-0.50026405 -0.19942376  0.51704884 ...  0.00997248 -0.04567145
   -0.00486707]
  [-0.29065317 -0.25352845  0.5300425  ...  0.00997248 -0.04567145
   -0.00486707]
  [-0.17558987 -0.15362161  0.8306881  ...  0.00997248 -0.04567145
   -0.00486707]]

 [[-0.28589982 -0.09850432 -0.5538254  ...  0.03219146  0.02711165
    0.04457002]
  [ 0.16811052  0.03450568  0.05617864 ...  0.03219146  0.02711165
    0.04457002]
  [ 0.10430379 -0.39118722  0.23435825 ...  0.04742653  0.02535257
    0.02551291]
  ...
  [-0.35326642 -0.6005633   0.63343847 ...  0.00997248 -0.04567145
   -0.00486707]
  [-0.31339148 -0.5364813   0.8067658  ...  0.00997248 -0.04567145
   -0.00486707]
  [-0.17579253 -0.5047474   0.98398113 ...  0.00997248 -0.04567145
   -0.

In [45]:
# Any labeling model can be built on top of the stacked embedding.

model = BiLSTM_Model(embedding=stack_embedding)
model.fit(x, y)

# Print the tag predictions first, then the entity view of the same inputs.
for report in (model.predict, model.predict_entities):
    print(report(x))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, 100)          0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, 100)          0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 100, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 100, 768)     1536        Input-Segment[0][0]              
__________________________________________________________________________________________________
Embedding-

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']]
[{'text': "[CLS] ' nl ##p ' [SEP]", 'text_raw': ['[CLS]', "'", 'nl', '##p', "'", '[SEP]'], 'labels': [{'entity': '[PAD]', 'start': 0, 'end': 5, 'value': "[CLS] ' nl ##p ' [SEP]"}]}, {'text': "[CLS] ' projects ' [SEP]", 'text_raw': ['[CLS]', "'", 'projects', "'", '[SEP]'], 'labels': [{'entity': '[PAD]', 'start': 0, 'end': 4, 'value': "[CLS] ' projects ' [SEP]"}]}, {'text': "[CLS] ' project ' [SEP]", 'text_raw': ['[CLS]', "'", 'project', "'", '[SEP]'], 'labels': [{'entity': '[PAD]', 'start': 0, 'end': 4, 'value': "[CLS] ' project ' [SEP]"}]}, {'text': "[CLS] ' name ' [SEP]", 'text_raw': ['[CLS]', "'", 'name', "'", '[SEP]'], 'labels': [{'entity': '[PAD]', 'start': 0, 'end': 4, 'value': "[CLS] ' name