## Hi!

Welcome to Hands-On NLP!  

# 1. Let's load the data and give it a browse

In [54]:
import pandas as pd

data = pd.read_csv('data/ner_dataset.csv',encoding='latin1')
data.fillna(method="ffill",inplace=True)

max_len = 75
max_len_char = 10

In [55]:
data['Tag'].value_counts()/data['Tag'].value_counts().sum()

O        0.846776
B-geo    0.035900
B-tim    0.019391
B-org    0.019210
I-per    0.016452
B-per    0.016203
I-org    0.016006
B-gpe    0.015135
I-geo    0.007071
I-tim    0.006226
B-art    0.000383
B-eve    0.000294
I-art    0.000283
I-eve    0.000241
B-nat    0.000192
I-gpe    0.000189
I-nat    0.000049
Name: Tag, dtype: float64

In [81]:
data['Tag'].value_counts()
include_tags = [
    'B-geo',
    'I-geo',
    'B-tim',
    'I-tim',
    
]
for tag in data['Tag'].unique():
    if tag not in include_tags:
        data.loc[data['Tag']==tag,'Tag'] = 'O'
        
n_tags = data['Tag'].unique().shape[0] #add 1 to account for PAD

In [82]:
n_tags

5

In [83]:
words = list(set(data["Word"].values))
n_words = len(words)
tags = list(set(data["Tag"].values))
n_tags = len(tags)

In [84]:
n_tags

5

In [58]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
idx2word = {i: w for w, i in word2idx.items()}
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}

In [59]:
agg_func_words = lambda s: [w for w in s["Word"].values.tolist()]
X = data.groupby("Sentence #").apply(agg_func_words)

agg_func_labels = lambda s: [w for w in s["Tag"].values.tolist()]
y = data.groupby("Sentence #").apply(agg_func_labels)


In [60]:
from keras.preprocessing.sequence import pad_sequences

import numpy as np
X_word = [[word2idx[w] for w in s] for s in X.tolist()]
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')


chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print(n_chars)

char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0
X_char = []
for sentence in X.tolist():
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            try:
                word_seq.append(char2idx.get(sentence[i][0][j]))
            except:
                word_seq.append(char2idx.get("PAD"))
        sent_seq.append(word_seq)
    X_char.append(np.array(sent_seq))

98


In [78]:
tag2idx

{'I-tim': 1, 'B-geo': 2, 'B-tim': 3, 'I-geo': 4, 'O': 5, 'PAD': 0}

In [61]:
y = [[tag2idx[w] for w in s] for s in y]
y = pad_sequences(maxlen=max_len, sequences=y, value=tag2idx["PAD"], padding='post', truncating='post')


In [62]:
from sklearn.model_selection import train_test_split

X_word_tr, X_word_te, y_tr, y_te = train_test_split(X_word, y, test_size=0.1, random_state=2018)
X_char_tr, X_char_te, _, _ = train_test_split(X_char, y, test_size=0.1, random_state=2018)

In [79]:
n_tags

5

In [85]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
def create_model():
    word_in = Input(shape=(max_len,))
    emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                         input_length=max_len, mask_zero=True)(word_in)

    # input and embeddings for characters
    char_in = Input(shape=(max_len, max_len_char,))
    emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
                               input_length=max_len_char, mask_zero=True))(char_in)
    # character LSTM to get word encodings by characters
    char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
                                    recurrent_dropout=0.5))(emb_char)

    # main LSTM
    x = concatenate([emb_word, char_enc])
    x = SpatialDropout1D(0.3)(x)
    main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
                                   recurrent_dropout=0.6))(x)
    out = TimeDistributed(Dense(n_tags+1, activation="softmax"))(main_lstm)

    model = Model([word_in, char_in], out)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
    model.summary()
    return model

model = create_model()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 75, 10)       0                                            
__________________________________________________________________________________________________
input_15 (InputLayer)           (None, 75)           0                                            
__________________________________________________________________________________________________
time_distributed_22 (TimeDistri (None, 75, 10, 10)   1000        input_16[0][0]                   
__________________________________________________________________________________________________
embedding_15 (Embedding)        (None, 75, 20)       703600      input_15[0][0]                   
__________________________________________________________________________________________________
time_distr

In [86]:
y_tr[0]

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 4, 5, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [87]:
X_word_tr.shape

(43163, 75)

(43163, 75)


AttributeError: 'list' object has no attribute 'shape'

In [None]:
history = model.fit([X_word_tr,
                     np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
                    np.array(y_tr).reshape(len(y_tr), max_len,1),
                    batch_size=32, epochs=10, validation_split=0.1, verbose=1)

Train on 38846 samples, validate on 4317 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
 1568/38846 [>.............................] - ETA: 2:40 - loss: 0.0307

# Build a Model

In [80]:

from keras_wc_embd import WordCharEmbd,get_word_list_eng

def get_wordchar_embedding(data):
    '''
    This function takes the tokenized reports and transforms them to both
    a word and character embedding with this handy library:
    https://github.com/CyberZHG/keras-word-char-embd

    input: data, the dataframe that contains the 'reports' with the preprocessed reports
    output: wc_embd, a class that stores dictionaries of word embeddings, character embeddings,
    and methods for creating and updating the word and character embeddings
    '''
    wc_embd = WordCharEmbd(
        word_min_freq=5,
        char_min_freq=2,
        word_ignore_case=True,
        char_ignore_case=True,
    )
    [wc_embd.update_dicts(report) for report in data]
    return wc_embd

wc_embd = get_wordchar_embedding(X.tolist())

In [90]:
inputs_emb, embd_layer = wc_embd.get_embedding_layer(word_mask_zero=False,char_mask_zero=False)

In [91]:
inputs_emb

[<tf.Tensor 'Input_Word_14:0' shape=(?, ?) dtype=float32>,
 <tf.Tensor 'Input_Char_14:0' shape=(?, ?, 21) dtype=float32>]

In [89]:
from keras.layers import Bidirectional,LSTM,Conv1D,Input,Dense,Dropout,concatenate,MaxPooling1D,TimeDistributed,Flatten
from keras.models import Model

def create_cnnbilstm_model(wc_embd):
    '''
    This function assembles the word, character embedding and the keras Bi-LSTM for classification
    input: wc_embed, the word+character embedding layers of the network
    output: compiled, untrained keras model

    The network has two inputs layers:
    1) The word+char embedding layer that takes the embedded report date
    2) The body part encoding input that indicates whether the text is about knee or shoulder

    These two inputs are concatenated at the final dense perceptron layer
    '''
    inputs_emb, embd_layer = wc_embd.get_embedding_layer(word_mask_zero=False,char_mask_zero=False)

    drop_out = Dropout(0.5)(embd_layer)
    lstm_layer = Bidirectional(LSTM(units=64, name='LSTM', dropout=0.3,return_sequences=True))(drop_out)#32
    #lstm_model = Model(inputs=inputs_emb, outputs=lstm_layer)
    flatten_layer = Flatten()(lstm_layer)

    softmax_layer = TimeDistributed(Dense(units=n_tags+1, activation='softmax', name='Softmax'))(flatten_layer)
    model = Model(inputs=inputs_emb, outputs=softmax_layer)
    
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.summary()
    return model

model = create_cnnbilstm_model(wc_embd)

ValueError: The shape of the input to "Flatten" is not fully defined (got (None, 128). Make sure to pass a complete "input_shape" or "batch_input_shape" argument to the first layer in your model.

In [86]:
import numpy as np
history = model.fit(
    wc_embd.get_batch_input(X.tolist()),
    np.array(y.tolist()),
    epochs=20,
    batch_size = 10
)

ValueError: Error when checking target: expected time_distributed_4 to have 3 dimensions, but got array with shape (47959, 1)