In [17]:
import pandas as pd

data = pd.read_csv('data/ner_dataset.csv',encoding='latin1')
data.fillna(method="ffill",inplace=True)

max_len = 75
max_len_char = 10

In [2]:
data['Tag'].value_counts()
include_tags = [
    'B-geo',
    'I-geo',
    'B-tim',
    'I-tim',
    
]
for tag in data['Tag'].unique():
    if tag not in include_tags:
        data.loc[data['Tag']==tag,'Tag'] = 'O'
        
n_tags = data['Tag'].unique().shape[0]

tag_mapping = dict()
for tag_num,tag in enumerate(data['Tag'].unique().tolist()):
    tag_mapping[tag] = tag_num
data['Tag_num'] = data['Tag'].map(tag_mapping)

In [74]:
data['Tag_num'].unique()

array([0, 1, 2, 3, 4])

In [4]:
words = list(set(data["Word"].values))
n_words = len(words)
tags = list(set(data["Tag"].values))
n_tags = len(tags)

5

In [5]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
idx2word = {i: w for w, i in word2idx.items()}
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}

In [8]:
agg_func_words = lambda s: [w for w in s["Word"].values.tolist()]
X = data.groupby("Sentence #").apply(agg_func_words)

agg_func_labels = lambda s: [w for w in s["Tag_num"].values.tolist()]
y = data.groupby("Sentence #").apply(agg_func_labels)


In [19]:
from keras.preprocessing.sequence import pad_sequences

import numpy as np
X_word = [[word2idx[w] for w in s] for s in X.tolist()]
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')


chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print(n_chars)

char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0
X_char = []
for sentence in X.tolist():
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            try:
                word_seq.append(char2idx.get(sentence[i][0][j]))
            except:
                word_seq.append(char2idx.get("PAD"))
        sent_seq.append(word_seq)
    X_char.append(np.array(sent_seq))

98


In [99]:
X

Sentence #
Sentence: 1        [Thousands, of, demonstrators, have, marched, ...
Sentence: 10       [Iranian, officials, say, they, expect, to, ge...
Sentence: 100      [Helicopter, gunships, Saturday, pounded, mili...
Sentence: 1000     [They, left, after, a, tense, hour-long, stand...
Sentence: 10000    [U.N., relief, coordinator, Jan, Egeland, said...
Sentence: 10001    [Mr., Egeland, said, the, latest, figures, sho...
Sentence: 10002    [He, said, last, week, 's, tsunami, and, the, ...
Sentence: 10003        [Some, 1,27,000, people, are, known, dead, .]
Sentence: 10004    [Aid, is, being, rushed, to, the, region, ,, b...
Sentence: 10005    [Lebanese, politicians, are, condemning, Frida...
Sentence: 10006    [In, Beirut, ,, a, string, of, officials, voic...
Sentence: 10007    [One, person, was, killed, and, more, than, 20...
Sentence: 10008    [Lebanon, has, suffered, a, series, of, bombin...
Sentence: 10009    [Syria, is, widely, accused, of, involvement, ...
Sentence: 1001     [The

In [79]:
data['Tag'].value_counts()/data['Tag'].value_counts().sum()

O        0.931413
B-geo    0.035900
B-tim    0.019391
I-geo    0.007071
I-tim    0.006226
Name: Tag, dtype: float64

# Build a Model

In [80]:

from keras_wc_embd import WordCharEmbd,get_word_list_eng

def get_wordchar_embedding(data):
    '''
    This function takes the tokenized reports and transforms them to both
    a word and character embedding with this handy library:
    https://github.com/CyberZHG/keras-word-char-embd

    input: data, the dataframe that contains the 'reports' with the preprocessed reports
    output: wc_embd, a class that stores dictionaries of word embeddings, character embeddings,
    and methods for creating and updating the word and character embeddings
    '''
    wc_embd = WordCharEmbd(
        word_min_freq=5,
        char_min_freq=2,
        word_ignore_case=True,
        char_ignore_case=True,
    )
    [wc_embd.update_dicts(report) for report in data]
    return wc_embd

wc_embd = get_wordchar_embedding(X.tolist())

In [90]:
inputs_emb, embd_layer = wc_embd.get_embedding_layer(word_mask_zero=False,char_mask_zero=False)

In [91]:
inputs_emb

[<tf.Tensor 'Input_Word_14:0' shape=(?, ?) dtype=float32>,
 <tf.Tensor 'Input_Char_14:0' shape=(?, ?, 21) dtype=float32>]

In [89]:
from keras.layers import Bidirectional,LSTM,Conv1D,Input,Dense,Dropout,concatenate,MaxPooling1D,TimeDistributed,Flatten
from keras.models import Model

def create_cnnbilstm_model(wc_embd):
    '''
    This function assembles the word, character embedding and the keras Bi-LSTM for classification
    input: wc_embed, the word+character embedding layers of the network
    output: compiled, untrained keras model

    The network has two inputs layers:
    1) The word+char embedding layer that takes the embedded report date
    2) The body part encoding input that indicates whether the text is about knee or shoulder

    These two inputs are concatenated at the final dense perceptron layer
    '''
    inputs_emb, embd_layer = wc_embd.get_embedding_layer(word_mask_zero=False,char_mask_zero=False)

    drop_out = Dropout(0.5)(embd_layer)
    lstm_layer = Bidirectional(LSTM(units=64, name='LSTM', dropout=0.3,return_sequences=True))(drop_out)#32
    #lstm_model = Model(inputs=inputs_emb, outputs=lstm_layer)
    flatten_layer = Flatten()(lstm_layer)

    softmax_layer = TimeDistributed(Dense(units=n_tags+1, activation='softmax', name='Softmax'))(flatten_layer)
    model = Model(inputs=inputs_emb, outputs=softmax_layer)
    
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.summary()
    return model

model = create_cnnbilstm_model(wc_embd)

ValueError: The shape of the input to "Flatten" is not fully defined (got (None, 128). Make sure to pass a complete "input_shape" or "batch_input_shape" argument to the first layer in your model.

In [86]:
import numpy as np
history = model.fit(
    wc_embd.get_batch_input(X.tolist()),
    np.array(y.tolist()),
    epochs=20,
    batch_size = 10
)

ValueError: Error when checking target: expected time_distributed_4 to have 3 dimensions, but got array with shape (47959, 1)