In [11]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import backend
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

import re
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 

print(tf.__version__)

1.13.1


In [12]:
nlp = spacy.load("en_core_web_sm")

In [6]:
# Hyperparams if GPU is available
if tf.test.is_gpu_available():
    BATCH_SIZE = 512  # Number of examples used in each iteration
    EPOCHS = 5  # Number of passes through entire dataset
    MAX_LEN = 20  # Max length of review (in words)
    EMBEDDING = 40  # Dimension of word embedding vector
# Hyperparams for CPU training
else:
    BATCH_SIZE = 32
    EPOCHS = 20
    MAX_LEN = 75
    EMBEDDING = 32

In [106]:
data = pd.read_csv("./data/drug_bio_all.csv", encoding="utf8")
data = data.fillna(method="ffill")
print("Number of sentences: ", len(data.groupby(['Sentence #'])))
words = list(set(data["Word"].values))
n_words = len(words)
print("Number of words in the dataset: ", n_words)
tags = list(set(data["Tag"].values))
print("Tags:", tags)
n_tags = len(tags)
print("Number of Labels: ", n_tags)

Number of sentences:  1778
Number of words in the dataset:  5838
Tags: ['I-FREQUENCY', 'B-DRUG', 'B-FREQUENCY', 'I-ROUTE', 'O', 'B-STRENGTH', 'I-DRUG', 'I-DURATION', 'B-DURATION', 'B-ROUTE', 'B-FORM', 'B-DOSAGE', 'I-STRENGTH', 'I-DOSAGE']
Number of Labels:  14


In [189]:
sentence = "Slight wean of Levo"
sentences = []

doc = nlp(sentence)
for token in doc:
    sentences.append(token.text)

sentences = [sentences]
sentences

[['Slight', 'wean', 'of', 'Levo']]

In [190]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding
idx2word = {i: w for w, i in word2idx.items()}

tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}

X = [[word2idx[w] for w in s] for s in sentences]
X = pad_sequences(maxlen=MAX_LEN, sequences=X, padding="post", value=word2idx["PAD"])

In [191]:
# Define a simple sequential model
def create_model():
    # Model definition
    input = Input(shape=(MAX_LEN,))
    model = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                      input_length=MAX_LEN, mask_zero=True)(input)  
    model = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.1))(model)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
    out = crf(model)  # output
    model = Model(input, out)
    model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
    
    return model

In [196]:
# Create a new model instance
model = create_model()

# Restore the weights
model.load_weights('./training/cp-0000.ckpt')
model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        (None, 75)                0         
_________________________________________________________________
embedding_13 (Embedding)     (None, 75, 32)            186880    
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 75, 100)           33200     
_________________________________________________________________
time_distributed_13 (TimeDis (None, 75, 50)            5050      
_________________________________________________________________
crf_13 (CRF)                 (None, 75, 15)            1020      
Total params: 226,150
Trainable params: 226,150
Non-trainable params: 0
_________________________________________________________________


In [197]:
pred_cat = model.predict(X[0:1])
pred = np.argmax(pred_cat, axis=-1)
pred_tag = [[idx2tag[i] for i in row] for row in pred]
print(pred_tag[0][:10])

['I-DOSAGE', 'B-DOSAGE', 'I-DOSAGE', 'B-DOSAGE', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


In [198]:
text = [[idx2word[w] for w in s] for s in X]
print(text[0][:10])

['Slight', 'wean', 'of', 'Levo', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


In [200]:
ignore_tokens = ["UNK", "PAD", "O"]
labels = []
is_begin = False

for i in range(pred.shape[1]):
    wordDicts = {}
    bio_tag = ""
    if pred_tag[0][i] not in ignore_tokens:
        wordDicts[pred_tag[0][i].split("-")[1]] = (text[0][i])
        labels.append(wordDicts)       
labels

[{'DOSAGE': 'Slight'},
 {'DOSAGE': 'wean'},
 {'DOSAGE': 'of'},
 {'DOSAGE': 'Levo'}]