In [1]:
#all imports required


import pandas as pd
from itertools import chain
from numpy import array
from numpy.random import seed
from keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import spacy
from spacy import displacy
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

In [2]:
# function to import data and return it in a dataframe

def getRawData(path):
    """Read the Excel workbook at ``path`` into a DataFrame.

    Args:
        path: Workbook location, handed unchanged to ``pd.read_excel``.

    Returns:
        Whatever ``pd.read_excel(path)`` returns.
    """
    raw_frame = pd.read_excel(path)
    return raw_frame

In [3]:
# function to define token to id and id to token
def get_dict_map(data, token_or_tag):
    """Build forward and reverse index lookups for one vocabulary column.

    Args:
        data: DataFrame with a ``'Text'`` column (tokens) and a ``'Label'``
            column (tags).
        token_or_tag: ``'token'`` selects ``data['Text']``; any other value
            selects ``data['Label']``.

    Returns:
        ``(tok2idx, idx2tok)``: a dict from each distinct value to its integer
        id, and the inverse dict from id back to value. Ids are assigned in
        order of first appearance in the column.
    """
    column = 'Text' if token_or_tag == 'token' else 'Label'

    # dict.fromkeys de-duplicates by hash/equality just like set(), but keeps
    # first-appearance order. Iterating a set of strings can yield a different
    # order on every interpreter start (hash randomization), which would give
    # the same token a different id on each kernel restart.
    vocab = list(dict.fromkeys(data[column].to_list()))

    tok2idx = {tok: idx for idx, tok in enumerate(vocab)}
    idx2tok = dict(enumerate(vocab))
    return tok2idx, idx2tok

In [4]:
#function to get extra columns of token to id representation for text and label
def getDataTagged(data,tok2idx,tag2idx):
    """Add integer-id columns for the token and label columns.

    Args:
        data: DataFrame with ``'Text'`` and ``'Label'`` columns. It is modified
            in place: ``'Word_idx'`` and ``'Tag_idx'`` columns are added.
        tok2idx: Mapping from each ``'Text'`` value to its integer id.
        tag2idx: Mapping from each ``'Label'`` value to its integer id.

    Returns:
        The same ``data`` object, now carrying the two id columns.
    """
    # Map through the tok2idx argument. The body previously read a module-level
    # name `token2idx`, so the function only worked when a global of exactly
    # that name happened to exist and ignored the mapping it was given.
    data['Word_idx'] = data['Text'].map(tok2idx)
    data['Tag_idx'] = data['Label'].map(tag2idx)
    return data

In [5]:
def getGroupedData(data):
    """Collapse the token-level frame into one row per sentence.

    Missing values in every column are forward-filled first, so a row with a
    blank ``'Sentence'`` cell takes the value of the nearest row above it.
    Rows are then grouped by ``'Sentence'`` and each of the remaining selected
    columns is aggregated into a Python list per group.

    Args:
        data: DataFrame with ``'Sentence'``, ``'Text'``, ``'POS'``, ``'Label'``,
            ``'Word_idx'`` and ``'Tag_idx'`` columns.

    Returns:
        DataFrame with a ``'Sentence'`` column plus one list-valued column for
        each of ``'Text'``, ``'POS'``, ``'Label'``, ``'Word_idx'``, ``'Tag_idx'``.
    """
    data_fillna = data.ffill(axis=0)

    # Select columns with a list: indexing a groupby with several bare
    # comma-separated keys was removed in pandas 2.0.
    columns = ['Text', 'POS', 'Label', 'Word_idx', 'Tag_idx']

    # The aggregation does not depend on any loop variable, so it is computed
    # exactly once (it used to be recomputed, with a debug print, once per row).
    data_group = data_fillna.groupby(["Sentence"], as_index=False)[columns].agg(lambda x: list(x))
    return data_group

In [6]:
def get_pad_train_test_val(data_group, data):
    """Pad the per-sentence id sequences and split them into train/val/test.

    Args:
        data_group: Frame with one row per sentence; its ``'Word_idx'`` and
            ``'Tag_idx'`` cells are expected to be lists of integer ids.
        data: Token-level frame; its ``'Text'`` and ``'Label'`` columns are used
            only to count the distinct tokens and tags.

    Returns:
        ``(train_tokens, val_tokens, test_tokens, train_tags, val_tags,
        test_tags)``. 10% of the sentences are held out for test, then 25% of
        the remainder for validation; both splits use ``random_state=2020``.
        Each tag entry is the one-hot matrix ``to_categorical`` produces for
        one padded sentence.

    Raises:
        ValueError: If ``data_group`` has no rows (``max`` of an empty sequence).
    """
    n_token = len(set(data['Text'].to_list()))
    n_tag = len(set(data['Label'].to_list()))

    tokens = data_group['Word_idx'].tolist()
    maxlen = max(len(s) for s in tokens)
    # Pad with n_token: with ids numbered 0..n_token-1 this is the first unused
    # id, so padding never aliases a real token (n_token - 1 is a real token
    # id). An embedding fed with these sequences needs input_dim >= n_token + 1.
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token)

    tags = data_group['Tag_idx'].tolist()
    # NOTE(review): tag sequences are padded with pad_sequences' default value
    # 0, which is also whatever tag was assigned id 0 - confirm that is the
    # intended "outside"/filler tag.
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post')

    # One-hot width comes from the tag count computed above rather than from a
    # module-level `tag2idx`, so the function no longer depends on hidden state.
    pad_tags = [to_categorical(i, num_classes=n_tag) for i in pad_tags]

    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_, tags_, test_size=0.25, train_size=0.75, random_state=2020)
    print(
        'train_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

In [7]:
def get_bilstm_lstm_model():
    """Assemble and compile the Embedding -> BiLSTM -> LSTM -> softmax tagger.

    Reads the module-level names ``input_dim``, ``output_dim``,
    ``input_length`` and ``n_tags``; all four must be defined before the call.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    # Token ids -> dense vectors.
    embedding = Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length)

    # Bidirectional LSTM; forward and backward outputs are concatenated.
    bilstm = Bidirectional(
        LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        merge_mode='concat',
    )

    # Second, unidirectional LSTM over the BiLSTM outputs.
    lstm = LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)

    # Per-timestep softmax over the tag set.
    tag_scores = TimeDistributed(Dense(n_tags, activation="softmax"))

    tagger = Sequential([embedding, bilstm, lstm, tag_scores])
    tagger.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    tagger.summary()

    return tagger

In [8]:
def train_model(X, y, model, epochs=25, batch_size=1000, validation_split=0.2):
    """Fit ``model`` one epoch at a time and collect the training loss.

    Args:
        X: Training inputs, passed straight to ``model.fit``.
        y: Training targets, passed straight to ``model.fit``.
        model: Object exposing a Keras-style ``fit`` that returns a history
            object with a ``history['loss']`` list.
        epochs: Number of single-epoch ``fit`` calls to make (default 25).
        batch_size: Batch size for every ``fit`` call (default 1000).
        validation_split: Fraction forwarded to ``fit`` as ``validation_split``
            (default 0.2).

    Returns:
        List with the training loss reported by each ``fit`` call, in order.
    """
    loss = []
    for _ in range(epochs):
        # Each call trains for exactly one epoch, so history['loss'] has a
        # single entry.
        hist = model.fit(X, y, batch_size=batch_size, verbose=1, epochs=1, validation_split=validation_split)
        loss.append(hist.history['loss'][0])
    return loss

In [9]:
def saveModel(model_bilstm_lstm,save_model):
    """Export a model with ``tf.saved_model.save``.

    Args:
        model_bilstm_lstm: The object to export.
        save_model: Export directory handed to ``tf.saved_model.save``.

    Returns:
        None.
    """
    tf.saved_model.save(obj=model_bilstm_lstm, export_dir=save_model)

In [10]:
def predict(text=None, model_name='bilstm_1_dense'):
    """Run a spaCy pipeline over ``text`` and return the entities it finds.

    Args:
        text: String to analyse. When omitted, a built-in sample paragraph is
            used.
        model_name: Name or path given to ``spacy.load`` (default
            ``'bilstm_1_dense'``).

    Returns:
        List of ``(entity_text, entity_label)`` tuples, one per item in
        ``doc.ents``.
    """
    if text is None:
        text = "Dr. Ferris's research is focused on cellular immune mechanisms of natural killer (NK) cell, dendritic cells (DC) and T lymphocyte activation against head and neck cancer (HNC) tumor antigens. In addition his group pioneered studies demonstrating innate and adaptive immune responses induced by EGFR-specific mAb, cetuximab in cancer patients."
    nlp = spacy.load(model_name)
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [None]:
# Load the annotated dataset and preview only the first rows instead of
# rendering the whole frame.
data = getRawData('improved_data.xlsx')
data.head()

In [None]:
# Seed NumPy and TensorFlow so weight initialisation and dropout are
# repeatable; 2020 matches the random_state used for the data splits.
seed(2020)
tf.random.set_seed(2020)

# Vocabulary and tag lookups.
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

# Use the returned frame explicitly rather than relying on `data` having been
# mutated in place.
data_idx = getDataTagged(data, token2idx, tag2idx)
data_group = getGroupedData(data_idx)
train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data_idx)

# Module-level settings read by get_bilstm_lstm_model().
input_dim = len(set(data_idx['Text'].to_list())) + 1  # vocabulary size plus one spare id
output_dim = 64
input_length = max(len(s) for s in data_group['Word_idx'].tolist())
n_tags = len(tag2idx)

model_bilstm_lstm = get_bilstm_lstm_model()

results = pd.DataFrame()
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

# The export directory must be defined before it is passed in; saveModel()
# returns None, so its result is not kept.
save_model = 'bilstm_lstm_saved_model'
saveModel(model_bilstm_lstm, save_model)

# NOTE(review): predict() loads a spaCy pipeline by name; the TensorFlow
# SavedModel written above is not a spaCy pipeline, so that pipeline must be
# available separately - confirm this is the intended inference path.
output = predict()