In [1]:
import kashgari
from kashgari.embeddings import NumericFeaturesEmbedding, StackedEmbedding, BERTEmbedding, BareEmbedding
import pandas as pd
import numpy as np
from tensorflow.python import keras
from kashgari.callbacks import EvalCallBack
import logging
logging.basicConfig(level='DEBUG')


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [2]:
from kashgari.tasks.labeling import BiLSTM_Model, BiGRU_Model

In [3]:
TRAIN_CSV = '/Users/ishitagupta/Desktop/kashgari_data_train.csv'
train_df = pd.read_csv(TRAIN_CSV)
train_df.head()

Unnamed: 0,text1,text2,feature1,feature2,feature3,label
0,We did not consider it necessary at Commission...,We have not found it necessary at Commission l...,724,619,603,1
1,"My second point, Commissioner, regards applica...","My second point, Commissioner, is the applicat...",778,625,429,0
2,To this we must add the water pollution proble...,To this must be added the water pollution prob...,570,332,178,0
3,The directive concerning end-of-life vehicles ...,The End-of-Life Vehicles Directive will serve ...,613,418,368,0
4,"Information Centers for Europe, which already ...","Information Centers for Europe, already playin...",930,858,816,1


In [4]:
DEV_CSV = '/Users/ishitagupta/Desktop/kashgari_data_dev.csv'
dev_df = pd.read_csv(DEV_CSV)
dev_df.head()

Unnamed: 0,text1,text2,feature1,feature2,feature3,label
0,"Madam, the question you are asking is of utmos...","Madam, the question you are asking is of utmos...",1000,1000,1000,1
1,I am therefore able to say that Felix and Char...,I can say that Felix and Charles-Louis of Habs...,799,747,741,1
2,"These measures do, in fact, entail a certain n...","In reality, these measures imply some degree o...",427,111,39,0
3,"As others have said before me, however, our pa...","As others have already said, our partners have...",550,349,218,0
4,The objectives of LIFE: this must have as its ...,The objectives of LIFE: the main objective of ...,838,750,658,0


In [5]:
#SEQUENCE LENGTH 
SEQUENCE_LEN = 128

In [6]:
#BERT MODEL 

bert_model_path= '/Users/ishitagupta/bert/uncased_L-12_H-768_A-12'
bert_embedding = BERTEmbedding(bert_model_path,
                               task=kashgari.LABELING,
                               sequence_length=SEQUENCE_LEN)

tokenizer = bert_embedding.tokenizer


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [7]:
#CUSTOM NUMERICAL FEATURES embedding definition , feature count is maximum allowed value of feature

feature1_embedding = NumericFeaturesEmbedding(feature_count=1001,
                                                feature_name='feature1',
                                                sequence_length=SEQUENCE_LEN)

feature2_embedding = NumericFeaturesEmbedding(feature_count=1001,
                                                feature_name='feature2',
                                                sequence_length=SEQUENCE_LEN)

feature3_embedding = NumericFeaturesEmbedding(feature_count=1001,
                                                feature_name='feature3',
                                                sequence_length=SEQUENCE_LEN)

In [8]:
# forming the stack embedding structure
# first one must be the text embedding
stack_embedding = StackedEmbedding([
    #text_embedding,
    bert_embedding,
    feature1_embedding,
    feature2_embedding,
    feature3_embedding
])


[<tf.Tensor 'Input-Token:0' shape=(?, 128) dtype=float32>, <tf.Tensor 'Input-Segment:0' shape=(?, 128) dtype=float32>]
[<tf.Tensor 'Input-Token:0' shape=(?, 128) dtype=float32>, <tf.Tensor 'Input-Segment:0' shape=(?, 128) dtype=float32>]
Tensor("input_feature1:0", shape=(?, 128), dtype=float32)
[<tf.Tensor 'input_feature1:0' shape=(?, 128) dtype=float32>]
Tensor("input_feature2:0", shape=(?, 128), dtype=float32)
[<tf.Tensor 'input_feature2:0' shape=(?, 128) dtype=float32>]
Tensor("input_feature3:0", shape=(?, 128), dtype=float32)
[<tf.Tensor 'input_feature3:0' shape=(?, 128) dtype=float32>]


In [16]:
# BERT TOKENIZER for train set
sentences_tokenized_train = []
i=0
for sentence in train_df['text1'].values: 
    #print(sentence)
    #print(train_df['text2'].values[i])
    sentence_tokenized_1 = tokenizer.tokenize(sentence)
    sentence_tokenized_2 = tokenizer.tokenize(train_df['text2'].values[i])
    sentence_tokenized = sentence_tokenized_1[:-1] + ['[SEP]']+ sentence_tokenized_2[1:]
    sentences_tokenized_train= sentences_tokenized_train + sentence_tokenized
    #sentences_tokenized_train.append(sentence_tokenized)
    i += 1
    #print(sentences_tokenized_train)
    
# BERT TOKENIZER for dev set
sentences_tokenized_dev = []
i=0
for sentence in dev_df['text1'].values: 
    #print(sentence)
    #print(train_df['text2'].values[i])
    sentence_tokenized_1 = tokenizer.tokenize(sentence)
    sentence_tokenized_2 = tokenizer.tokenize(dev_df['text2'].values[i])
    sentence_tokenized = sentence_tokenized_1[:-1] + ['[SEP]']+ sentence_tokenized_2[1:]
    sentences_tokenized_dev= sentences_tokenized_dev + sentence_tokenized
    #sentences_tokenized_dev.append(sentence_tokenized)
    i += 1
    #print(sentence_tokenized)
    
    
"""
The sentences will become tokenized into:
[
    ['[CLS]', 'jim', 'henson', 'was', 'a', 'puppet', '##eer', '.', '[SEP]'],
    ['[CLS]', 'this', 'here', "'", 's', 'an', 'example', 'of', 'using', 'the', 'bert', 'token', '##izer', '.', '[SEP]'],
    ['[CLS]', 'why', 'did', 'the', 'chicken', 'cross', 'the', 'road', '?', '[SEP]']
]
"""
print(sentences_tokenized_train)
#print(sentences_tokenized_dev)
# Our tokenizer already added the BOS([CLS]) and EOS([SEP]) token
# so we need to disable the default add_bos_eos setting.
bert_embedding.processor.add_bos_eos = False

['[CLS]', 'we', 'did', 'not', 'consider', 'it', 'necessary', 'at', 'commission', '.', '.', '.', '[SEP]', 'we', 'have', 'not', 'found', 'it', 'necessary', 'at', 'commission', 'l', '.', '.', '.', '[SEP]', '[CLS]', 'my', 'second', 'point', ',', 'commissioner', ',', 'regards', 'app', '##lica', '.', '.', '.', '[SEP]', 'my', 'second', 'point', ',', 'commissioner', ',', 'is', 'the', 'app', '##lica', '##t', '.', '.', '.', '[SEP]', '[CLS]', 'to', 'this', 'we', 'must', 'add', 'the', 'water', 'pollution', 'pro', '##ble', '.', '.', '.', '[SEP]', 'to', 'this', 'must', 'be', 'added', 'the', 'water', 'pollution', 'pro', '##b', '.', '.', '.', '[SEP]', '[CLS]', 'the', 'directive', 'concerning', 'end', '-', 'of', '-', 'life', 'vehicles', '.', '.', '.', '[SEP]', 'the', 'end', '-', 'of', '-', 'life', 'vehicles', 'directive', 'will', 'serve', '.', '.', '.', '[SEP]', '[CLS]', 'information', 'centers', 'for', 'europe', ',', 'which', 'already', '.', '.', '.', '[SEP]', 'information', 'centers', 'for', 'europe'

In [17]:
print(list(sentences_tokenized_train))

['[CLS]', 'we', 'did', 'not', 'consider', 'it', 'necessary', 'at', 'commission', '.', '.', '.', '[SEP]', 'we', 'have', 'not', 'found', 'it', 'necessary', 'at', 'commission', 'l', '.', '.', '.', '[SEP]', '[CLS]', 'my', 'second', 'point', ',', 'commissioner', ',', 'regards', 'app', '##lica', '.', '.', '.', '[SEP]', 'my', 'second', 'point', ',', 'commissioner', ',', 'is', 'the', 'app', '##lica', '##t', '.', '.', '.', '[SEP]', '[CLS]', 'to', 'this', 'we', 'must', 'add', 'the', 'water', 'pollution', 'pro', '##ble', '.', '.', '.', '[SEP]', 'to', 'this', 'must', 'be', 'added', 'the', 'water', 'pollution', 'pro', '##b', '.', '.', '.', '[SEP]', '[CLS]', 'the', 'directive', 'concerning', 'end', '-', 'of', '-', 'life', 'vehicles', '.', '.', '.', '[SEP]', 'the', 'end', '-', 'of', '-', 'life', 'vehicles', 'directive', 'will', 'serve', '.', '.', '.', '[SEP]', '[CLS]', 'information', 'centers', 'for', 'europe', ',', 'which', 'already', '.', '.', '.', '[SEP]', 'information', 'centers', 'for', 'europe'

In [18]:
#converting data into a list format 
#TRAIN set
tokenized_text_list_train = sentences_tokenized_train
feature1_list_train = [list(train_df['feature1'])]
feature2_list_train = [list(train_df['feature2'])]
feature3_list_train = [list(train_df['feature3'])]
label_list_train = [list(train_df['label'])]

#DEV set 
tokenized_text_list_dev = sentences_tokenized_dev
feature1_list_dev = [list(dev_df['feature1'])]
feature2_list_dev = [list(dev_df['feature2'])]
feature3_list_dev = [list(dev_df['feature3'])]
label_list_dev = [list(dev_df['label'])]

print(label_list_dev)
print(tokenized_text_list_train)

[[1, 1, 0, 0, 0]]
['[CLS]', 'we', 'did', 'not', 'consider', 'it', 'necessary', 'at', 'commission', '.', '.', '.', '[SEP]', 'we', 'have', 'not', 'found', 'it', 'necessary', 'at', 'commission', 'l', '.', '.', '.', '[SEP]', '[CLS]', 'my', 'second', 'point', ',', 'commissioner', ',', 'regards', 'app', '##lica', '.', '.', '.', '[SEP]', 'my', 'second', 'point', ',', 'commissioner', ',', 'is', 'the', 'app', '##lica', '##t', '.', '.', '.', '[SEP]', '[CLS]', 'to', 'this', 'we', 'must', 'add', 'the', 'water', 'pollution', 'pro', '##ble', '.', '.', '.', '[SEP]', 'to', 'this', 'must', 'be', 'added', 'the', 'water', 'pollution', 'pro', '##b', '.', '.', '.', '[SEP]', '[CLS]', 'the', 'directive', 'concerning', 'end', '-', 'of', '-', 'life', 'vehicles', '.', '.', '.', '[SEP]', 'the', 'end', '-', 'of', '-', 'life', 'vehicles', 'directive', 'will', 'serve', '.', '.', '.', '[SEP]', '[CLS]', 'information', 'centers', 'for', 'europe', ',', 'which', 'already', '.', '.', '.', '[SEP]', 'information', 'centers

In [19]:
#compiling features together 

train_x = (tokenized_text_list_train, feature1_list_train, feature2_list_train, feature3_list_train)
train_y = label_list_train
stack_embedding.analyze_corpus(train_x, train_y)

dev_x = (tokenized_text_list_dev, feature1_list_dev, feature2_list_dev, feature3_list_dev)
dev_y = label_list_dev
stack_embedding.analyze_corpus(dev_x, dev_y)

print(train_x)
print(train_y)

(['[CLS]', 'we', 'did', 'not', 'consider', 'it', 'necessary', 'at', 'commission', '.', '.', '.', '[SEP]', 'we', 'have', 'not', 'found', 'it', 'necessary', 'at', 'commission', 'l', '.', '.', '.', '[SEP]', '[CLS]', 'my', 'second', 'point', ',', 'commissioner', ',', 'regards', 'app', '##lica', '.', '.', '.', '[SEP]', 'my', 'second', 'point', ',', 'commissioner', ',', 'is', 'the', 'app', '##lica', '##t', '.', '.', '.', '[SEP]', '[CLS]', 'to', 'this', 'we', 'must', 'add', 'the', 'water', 'pollution', 'pro', '##ble', '.', '.', '.', '[SEP]', 'to', 'this', 'must', 'be', 'added', 'the', 'water', 'pollution', 'pro', '##b', '.', '.', '.', '[SEP]', '[CLS]', 'the', 'directive', 'concerning', 'end', '-', 'of', '-', 'life', 'vehicles', '.', '.', '.', '[SEP]', 'the', 'end', '-', 'of', '-', 'life', 'vehicles', 'directive', 'will', 'serve', '.', '.', '.', '[SEP]', '[CLS]', 'information', 'centers', 'for', 'europe', ',', 'which', 'already', '.', '.', '.', '[SEP]', 'information', 'centers', 'for', 'europe

In [20]:
# Now we can embed with this stacked embedding layer
print(stack_embedding.embed(train_x))


IndexError: index 1 is out of bounds for axis 0 with size 1

In [105]:
 len(train_x)

4

In [21]:
# We can build any labeling model with this embedding


model = BiLSTM_Model(embedding=stack_embedding)


model.fit(train_x, train_y, dev_x, dev_y)

#Evaluate the model
#model.evaluate(test_x, test_y)

# Model data will save to `saved_ner_model` folder
model.save('/Users/ishitagupta/Kashgari/model_output_3')


#print(model.predict(x))
#print(model.predict_entities(x))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, 128)          0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, 128)          0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 128, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 128, 768)     1536        Input-Segment[0][0]              
__________________________________________________________________________________________________
Embedding-

Epoch 1/5


ValueError: All input arrays (x) should have the same number of samples. Got array shapes: [(4, 128), (4, 128), (1, 128), (1, 128), (1, 128)]