In [1]:
#all imports required


import pandas as pd
from itertools import chain
from numpy import array
from numpy.random import seed
from keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import spacy
from spacy import displacy
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

In [2]:
# function to import data and return it in a dataframe

def getRawData(path):
    return pd.read_excel(path)

In [3]:
# function to define token to id and id to token
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Text'].to_list()))
    else:
        vocab = list(set(data['Label'].to_list()))
     # Convert vocab elements to strings
    vocab = [str(x) for x in vocab]
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok

In [4]:
#function to get extra columns of token to id representation for text and label
def getDataTagged(data,tok2idx,tag2idx):
    data['Word_idx'] = data['Text'].map(token2idx)
    data['Tag_idx'] = data['Label'].map(tag2idx)
    return data

In [5]:
def getGroupedData(data):
    data_fillna = data.fillna(method='ffill', axis=0)
    for i in range(len(data)):
        print(i)
        data_group = data_fillna.groupby(["Sentence"],as_index=False)['Text', 'POS', 'Label', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))
    return data_group

In [6]:
def get_pad_train_test_val(data_group, data):
    n_token = len(list(set(data['Text'].to_list())))
    n_tag = len(list(set(data['Label'].to_list())))   
    tokens = data_group['Word_idx'].to_list()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)
    tags = data_group['Tag_idx'].to_list()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post')
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)
    print(
        'train_tokens length:', len(tokens_),
        '\ntrain_tags length:',len(tags_),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
    )
    return tokens_, test_tokens, tags_, test_tags

In [7]:
data=getRawData('improved_data.xlsx')

In [8]:
data=data.drop(data.columns[0],axis=1)

In [9]:
data_group=pd.read_excel("groupData.xlsx")

In [10]:
data_group

Unnamed: 0.1,Unnamed: 0,Sentence,Text,POS,Label,Word_idx,Tag_idx
0,0,Sentence 1,"['Dr.', 'Yue', 'Cao', 'is', 'a', 'highly', 're...","['NNP', 'NNP', 'NNP', 'VBZ', 'DT', 'RB', 'JJ',...","[nan, 'PERSON', 'PERSON', 'PERSON', 'PERSON', ...","[1970, 720, 956, 804, 1180, 244, 405, 854, 576...","[0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,1,Sentence 10,"['Biography', 'of', 'Dr.', 'John', 'Kurhanewic...","['NN', 'IN', 'NNP', 'NNP', 'NNP', ',', 'NNP', ...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","[105, 979, 1970, 1684, 1425, 1145, 9, 1136, 19...","[0, 0, 0, 12, 12, 0, 4, 0, 0, 12, 12, 0, 0, 0,..."
2,2,Sentence 100,"['Dr.', 'David', 'Bluemke', 'is', 'a', 'renown...","['NNP', 'NNP', 'NNP', 'VBZ', 'DT', 'JJ', 'NN',...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","[1970, 1438, 1802, 804, 1180, 736, 854, 308, 9...","[0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
3,3,Sentence 101,"['Dr.', 'David', 'Nascene', 'is', 'a', 'renown...","['NNP', 'NNP', 'NNP', 'VBZ', 'DT', 'JJ', 'NN',...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","[1970, 1438, 59, 804, 1180, 736, 854, 308, 976...","[0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
4,4,Sentence 11,"['Dr.', 'Anderanik', 'Tomasian', 'is', 'a', 'r...","['NNP', 'NNP', 'NNP', 'VBZ', 'DT', 'JJ', 'NN',...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","[1970, 1019, 377, 804, 1180, 736, 854, 576, 11...","[0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
...,...,...,...,...,...,...,...
96,96,Sentence 95,"['Dr.', 'James', 'Babb', 'is', 'a', 'renowned'...","['NNP', 'NNP', 'NNP', 'VBZ', 'DT', 'JJ', 'NN',...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","[1970, 1042, 1215, 804, 1180, 736, 854, 308, 9...","[0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
97,97,Sentence 96,"['Dr.', 'Mauricio', 'Castillo', 'is', 'a', 're...","['NNP', 'NNP', 'NNP', 'VBZ', 'DT', 'JJ', 'NN',...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","[1970, 1080, 713, 804, 1180, 736, 854, 576, 11...","[0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
98,98,Sentence 97,"['Dr.', 'Claude', 'Sirlin', 'is', 'a', 'highly...","['NNP', 'NNP', 'NNP', 'VBZ', 'DT', 'RB', 'JJ',...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","[1970, 1118, 1172, 804, 1180, 244, 115, 854, 3...","[0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
99,99,Sentence 98,"['Dr.', 'Martin', 'Prince', 'is', 'a', 'renown...","['NNP', 'NNP', 'NNP', 'VBZ', 'DT', 'JJ', 'NN',...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","[1970, 20, 1435, 804, 1180, 736, 854, 932, 131...","[0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."


In [11]:
data_group=data_group.drop(data_group.columns[0],axis=1)
tokens = data_group['Word_idx'].tolist()
for i in range(len(tokens)):
    tokens[i]=list(map(int,tokens[i][1:-1].split(",")))
data_group['Word_idx']=tokens
tags = data_group['Tag_idx'].tolist()
for i in range(len(tags)):
    tags[i]=list(map(int,tags[i][1:-1].split(",")))
data_group['Tag_idx']=tags

In [12]:
#tags

In [13]:
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')
data_idx=getDataTagged(data,token2idx,tag2idx)

tag2idx

In [14]:
tag2idx

{'nan': 0,
 'PERSON': 1,
 'LOC': 2,
 'LAW': 3,
 'NORP': 4,
 'CARDINAL': 5,
 'ORDINAL': 6,
 'ORG': 7,
 'WORK_OF_ART': 8,
 'FAC': 9,
 'EVENT': 10,
 'PRODUCT': 11,
 'GPE': 12,
 'DATE': 13}

In [15]:
train_tokens, test_tokens, train_tags,test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 90 
train_tags length: 90 
test_tokens length: 11 
test_tags: 11


In [16]:
import torch

train_texts = torch.tensor(train_tokens, dtype=torch.long)
train_names = torch.tensor(train_tags, dtype=torch.float)
val_texts = torch.tensor(test_tokens, dtype=torch.long)
val_names = torch.tensor(test_tags, dtype=torch.float)

  train_names = torch.tensor(train_tags, dtype=torch.float)


In [17]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = self.data[index]
        y = self.labels[index]
        return x, y

In [18]:
from torch.utils.data import DataLoader

batch_size = 16

train_dataset = CustomDataset(train_texts, train_names)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = CustomDataset(val_texts, val_names)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim

class NERModel(nn.Module):
    def __init__(self, input_dim, output_dim, input_length, n_tags):
        super(NERModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, output_dim)
        self.bidirectional_lstm = nn.LSTM(output_dim, output_dim, bidirectional=True, dropout=0.2, batch_first=True)
        self.lstm = nn.LSTM(2*output_dim, output_dim, dropout=0.5, batch_first=True)
        self.time_distributed = nn.Linear(output_dim, n_tags)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.bidirectional_lstm(x)
        x, _ = self.lstm(x)
        x = self.time_distributed(x)
        x = self.softmax(x)
        return x


In [20]:
input_dim = len(list(set(data['Text'].to_list())))+1 # Your vocabulary size
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()]) # The maximum length of your input sequences
n_tags =  len(tag2idx) # The number of unique tags in your NER dataset
model = NERModel(input_dim, output_dim, input_length, n_tags)

criterion = nn.CrossEntropyLoss()
#criterion = nn.BCEWithLogitsLoss() 
optimizer = optim.Adam(model.parameters(), lr=0.0005)

print('input_dim=',input_dim)
print('output_dim=',output_dim)
print('input_length=',input_length)
print('number of unique tags=',n_tags)

print(len(set(data['Text'].to_list())))


input_dim= 2033
output_dim= 64
input_length= 713
number of unique tags= 14
2032




In [21]:
epochs = 30
batch_size = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)

        # Reshape labels for the loss function
        labels = labels.view(-1, n_tags)  # Modify this line
        outputs = outputs.view(-1, outputs.shape[-1])

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    train_loss = running_loss / len(train_loader)

    # Validation
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for batch_idx, (data, labels) in enumerate(val_loader):
            outputs = model(data)

            # Reshape labels for the loss function
            labels = labels.view(-1, n_tags)  # Modify this line
            outputs = outputs.view(-1, outputs.shape[-1])

            loss = criterion(outputs, labels)
            running_loss += loss.item()
    val_loss = running_loss / len(val_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}")
print("Training Finished")

Epoch 1/30, Train Loss: 2.636612137158712, Val Loss: 2.628167152404785
Epoch 2/30, Train Loss: 2.6215038696924844, Val Loss: 2.6056041717529297
Epoch 3/30, Train Loss: 2.596214691797892, Val Loss: 2.5611095428466797
Epoch 4/30, Train Loss: 2.545430580774943, Val Loss: 2.480551242828369
Epoch 5/30, Train Loss: 2.4374778270721436, Val Loss: 2.3076555728912354
Epoch 6/30, Train Loss: 2.2041214307149253, Val Loss: 2.046205997467041
Epoch 7/30, Train Loss: 1.9858741164207458, Val Loss: 1.9238417148590088
Epoch 8/30, Train Loss: 1.9095744291941326, Val Loss: 1.8902039527893066
Epoch 9/30, Train Loss: 1.888508677482605, Val Loss: 1.8791836500167847
Epoch 10/30, Train Loss: 1.8787593841552734, Val Loss: 1.8745659589767456
Epoch 11/30, Train Loss: 1.8768525123596191, Val Loss: 1.8723028898239136
Epoch 12/30, Train Loss: 1.874928851922353, Val Loss: 1.8710343837738037
Epoch 13/30, Train Loss: 1.872561017672221, Val Loss: 1.8702316284179688
Epoch 14/30, Train Loss: 1.8713339567184448, Val Loss: 1

In [22]:
import spacy

nlp = spacy.load('en_core_web_sm')

def predict_named_entities(text):
    # Tokenize the input text using spacy
    tokenized_text = nlp(text)
    
    # Extract tokens as strings
    tokens = [token.text for token in tokenized_text]

    # Convert tokens to indices using token2idx dictionary
    #input_tokens = [token2idx.get(token, token2idx['<unknown>']) for token in tokens] # Replace '<unknown>' with the correct unknown token representation in your token2idx dictionary
    input_tokens = [token2idx.get(token, len(token2idx) - 1) for token in tokens]
    
    # Convert the input tokens to PyTorch tensor
    input_tensor = torch.tensor(input_tokens, dtype=torch.long).unsqueeze(0)
    
    # Run the model on the input tensor
    outputs = model(input_tensor)
    
    # Get the index of the maximum value for each token
    predictions = torch.argmax(outputs, dim=-1)
    
    return predictions.squeeze(dim=0), tokens

def post_process(predictions, tokens):
    entities = []
    labels = [idx2tag[pred.item()] for pred in predictions]
    for i, token_label in enumerate(labels):
        if token_label.startswith("B") or token_label.startswith("I"):
            entities.append((tokens[i], token_label))
    return entities


In [23]:
text = "John Doe works at Google in Mountain View, California."
predictions, tokens = predict_named_entities(text)
entities = post_process(predictions, tokens)
print(entities)


[]


In [24]:
from spacy.tokens import Token
# Register the extension
#spacy.tokens.Token.set_extension("predicted_entity", default=None)

# Preprocessing function
def preprocess_text(text):
    nlp = spacy.blank("en")
    doc = nlp(text.lower())
    tokens = [token.text for token in doc]
    return tokens

# Preparing input tensor function
def prepare_input(tokens):
    token_indices = [token2idx.get(token, len(token2idx) - 1) for token in tokens]
    maxlen = max([len(s) for s in data_group['Word_idx'].tolist()])
    pad_tokens = pad_sequences([token_indices], maxlen=maxlen, dtype='int32', padding='post', value=len(token2idx) - 1)
    input_tensor = torch.tensor(pad_tokens, dtype=torch.long)
    return input_tensor

# Prediction function
def predict_entities(input_tensor):
    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
    return output
#Token.set_extension("predicted_entity", default="")

# Post-processing function
def post_process(output, tokens):
    _, predicted_indices = torch.max(output, dim=-1)
    predicted_indices = predicted_indices.squeeze(0).numpy()
    predicted_tags = [idx2tag[index] for index in predicted_indices]
   
    # Replace 'nan' tags with 'INTEREST'
   # predicted_tags = ['INTEREST' if tag == 'nan' else tag for tag in predicted_tags]

    nlp = spacy.blank("en")
    doc = nlp(" ".join(tokens))
    for token, pred_tag in zip(doc, predicted_tags):
        token._.predicted_entity = pred_tag

    return doc
    #result = list(zip(tokens, predicted_tags))
    #return result

# NER pipeline function
def ner_pipeline(text):
    tokens = preprocess_text(text)
    input_tensor = prepare_input(tokens)
    output = predict_entities(input_tensor)
    doc = post_process(output, tokens)
    displacy.render(doc, style="ent", options={"ents": list(tag2idx.keys()) , "colors": {"PER": "lightblue", "LOC": "yellow", "ORG": "purple", "INTEREST": "orange"}})

    return doc
   
   
    #return result

# Example usage
text = "Dr. Yue Cao is a highly respected radiologist with a wealth of experience in the field of medical imaging. Born on July 12, 1975, in Shanghai, China, Dr. Cao showed an early aptitude for science and a deep curiosity about the human body, which eventually led him to pursue a career in medicine. Dr. Cao completed his undergraduate studies in Medicine at Fudan University, one of the most prestigious universities in China. He then obtained his medical degree from Shanghai Medical College of Fudan University, where he graduated with top honors. He went on to complete his residency in Radiology at Huashan Hospital, also affiliated with Fudan University, where he developed a keen interest in diagnostic and interventional radiology.   After completing his residency, Dr. Cao embarked on a successful career in radiology, working at several renowned hospitals in China. He later moved to the United States to further his education and professional development. He completed a Fellowship in Radiology at Harvard Medical School and Massachusetts General Hospital, where he gained expertise in advanced imaging techniques and interventional radiology procedures. Dr. Cao's medical career has spanned over two decades, during which he has held various positions at leading medical institutions. Currently, he serves as the Chief of Radiology at a prominent hospital in New York City, where he leads a team of radiologists and oversees the diagnostic imaging services. Dr. Cao is known for his strong interest in medical research and has made significant contributions to the field of radiology. He has published numerous research papers in renowned medical journals and has been invited to speak at national and international conferences on topics such as advanced imaging techniques, minimally invasive procedures, and emerging technologies in radiology. His special interests include oncologic imaging, cardiovascular imaging, and image-guided interventions. Dr. Cao has also been involved in clinical trials and collaborative research projects with other medical institutions, aiming to advance the field of radiology and improve patient care. Dr. Cao has authored several books on radiology, which are widely recognized as valuable resources for medical professionals. His books cover various aspects of radiology, including diagnostic imaging, interventional radiology, and emerging trends in the field. His publications are known for their comprehensive and practical approach, providing insights into the latest advancements in radiology. Dr. Cao's contributions to the field of radiology have been recognized with numerous awards and honors. He has received prestigious awards from professional societies for his outstanding contributions to the field of radiology, including the American College of Radiology (ACR) and the Radiological Society of North America (RSNA). He is also an active member of several professional organizations, including the Society of Interventional Radiology (SIR) and the American Roentgen Ray Society (ARRS). In addition to his professional achievements, Dr. Cao is known for his compassionate and patient-centric approach to healthcare. He is committed to providing the highest quality of care to his patients and is dedicated to advancing the field of radiology through his research, clinical practice, and teaching. Dr. Cao's passion for radiology, his dedication to patient care, and his significant contributions to the field make him a respected and renowned figure in the medical community. His expertise and commitment to excellence continue to impact the field of radiology, improving patient outcomes and advancing the practice of medical imaging. "
doc = ner_pipeline(text)
print(doc)


AttributeError: [E047] Can't assign a value to unregistered extension attribute 'predicted_entity'. Did you forget to call the `set_extension` method?

In [None]:
displacy.render(result, style='ent')
