In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
import spacy
from spacy.tokens import Doc
from spacy.pipeline import EntityRecognizer
from thinc.layers import PyTorchWrapper
import torch
from spacy.tokens import Span
from spacy.language import Language
from spacy.tokens import Token
from spacy import displacy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense


In [2]:
# Load the Excel sheet
file_path = "MasterList (3).xlsx"
data = pd.read_excel(file_path)



In [16]:
data

Unnamed: 0,Text,Name,Interest
0,Dr. Yue Cao is a highly respected radiologist ...,Yue Cao,tumor ;; tissue therapy;;
1,Dr. Bensheng Qiu is a renowned radiologist kno...,Bensheng Qiu,radiology;;cancers;;
2,Dr. Robert Fleck began his academic journey wi...,Robert Fleck J,cancer ;;diagnosis;;
3,Dr. Holden Wu is a renowned radiologist who ha...,Holden Wu,novel imaging modalities;;cardiac magnetic res...
4,Dr. William Hyslop is a renowned radiologist w...,William Hyslop,MRI;;PET;;CT;;radiology;;diagnosis;;
...,...,...,...
96,Dr. Claude Sirlin is a highly accomplished rad...,Claude Sirlin,MRI imaging;;liver cancer;; liver disease;;
97,Dr. Martin Prince is a renowned radiologist wh...,Martin Prince,gadolinium-enhanced MR Angiography;; Investig...
98,Dr. Scott Reeder is a renowned radiologist kno...,Scott Reeder,abdominal adiposity;; liver fat;; liver iron;;...
99,Dr. David Bluemke is a renowned radiologist kn...,David Bluemke,diagnosis;;cardiovascular diseases;; coronary...


In [3]:
# Separate the text and the names
texts = data['Text'].tolist()
names = data['Name'].tolist()

In [4]:
# Tokenize the texts and the names
text_tokenizer = Tokenizer(char_level=True)
text_tokenizer.fit_on_texts(texts)
text_sequences = text_tokenizer.texts_to_sequences(texts)

name_tokenizer = Tokenizer(char_level=True)
name_tokenizer.fit_on_texts(names)
name_sequences = name_tokenizer.texts_to_sequences(names)

In [5]:
# Pad the sequences
max_length = max([len(seq) for seq in text_sequences])
text_sequences_padded = pad_sequences(text_sequences, maxlen=max_length, padding='post')
name_sequences_padded = pad_sequences(name_sequences, maxlen=max_length, padding='post')

In [6]:
# Convert the names to one-hot encoding
name_sequences_onehot = [to_categorical(seq, num_classes=len(name_tokenizer.word_index) + 1) for seq in name_sequences_padded]

In [7]:
# Split the data into training and validation sets
train_texts, val_texts, train_names, val_names = train_test_split(text_sequences_padded, np.array(name_sequences_onehot), test_size=0.2)

In [8]:
input_dim = len(text_tokenizer.word_index) + 1
output_dim = len(name_tokenizer.word_index) + 1

model = Sequential([
    Input(shape=(max_length,)),
    Embedding(input_dim=input_dim, output_dim=32, input_length=max_length),
    Bidirectional(LSTM(units=32, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    LSTM(units=32, return_sequences=True, dropout=0.5, recurrent_dropout=0.5),
    TimeDistributed(Dense(output_dim, activation="softmax"))
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [9]:
history = model.fit(train_texts, train_names, epochs=5, batch_size=16, validation_data=(val_texts, val_names))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
# Convert Keras model to PyTorch model
torch_model = torch.nn.Sequential(*[PyTorchWrapper(layer) for layer in model.layers])

# Create a custom spaCy component
class KerasEntityRecognizer(EntityRecognizer):
    def predict(self, docs):
        X = [doc.tensor for doc in docs]
        X_padded = pad_sequences(X, maxlen=max_length, padding='post')
        y_pred = self.model(torch.tensor(X_padded, dtype=torch.float32))
        return y_pred.detach().numpy()


AttributeError: 'Embedding' object has no attribute 'to'

In [11]:
@Language.component("keras_entity_recognizer")
def keras_entity_recognizer(doc):
    # Convert the document text into input for the model
    text_sequence = text_tokenizer.texts_to_sequences([doc.text])
    text_sequence_padded = pad_sequences(text_sequence, maxlen=max_length, padding='post')

    # Get the model predictions
    predictions = model.predict(text_sequence_padded)[0]

    # Extract the name from the predictions
    name_indices = np.argmax(predictions, axis=-1)
    name_tokens = [name_tokenizer.index_word[idx] if idx > 0 else '' for idx in name_indices]

    # Assign the name tokens to the doc
    for i, token in enumerate(doc):
        if name_tokens[i]:
            token._.name = name_tokens[i]
        else:
            token._.name = ""

    # Add entities to the doc
    for token in doc:
        if token._.name:
            doc.ents += (Span(doc, token.i, token.i + 1, label='PERSON'),)

    return doc

In [12]:
# Register the custom attribute
Token.set_extension('name', default='')

# Load a spaCy model and add the custom component
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("keras_entity_recognizer")

# Process a text using the spaCy pipeline
text = " Dr. Sharath Chandra Mouli is a medical doctor who specializes in the field of Gastroenterology. He is associated with the Krishna Institute of Medical Sciences (KIMS) Hospital in Secunderabad, Telangana, India.Dr. Mouli completed his MBBS degree from the Rajiv Gandhi University of Health Sciences in Bangalore, India, and then went on to pursue a Doctorate of Medicine (DM) in Gastroenterology from the Nizam's Institute of Medical Sciences in Hyderabad, India. He has several years of experience in the field of Gastroenterology and has worked at various prestigious institutions across India.Dr. Mouli's areas of expertise include the diagnosis and treatment of various gastrointestinal disorders such as inflammatory bowel disease, liver diseases, pancreatic disorders, and motility disorders. He is also trained in performing advanced endoscopic procedures such as endoscopic ultrasound (EUS), endoscopic retrograde cholangiopancreatography (ERCP), and endoscopic mucosal resection (EMR).Apart from his clinical work, Dr. Mouli is also involved in research activities in the field of Gastroenterology and has published several articles in peer-reviewed medical journals."
doc = nlp(text)

# Print the extracted entities
print("Entities found:", [(ent.text, ent.label_) for ent in doc.ents])

Entities found: [('Sharath Chandra Mouli', 'PERSON'), ('the Krishna Institute of Medical Sciences', 'ORG'), ('Secunderabad', 'GPE'), ('Telangana', 'GPE'), ('India', 'GPE'), ('Mouli', 'PERSON'), ('the Rajiv Gandhi University of Health Sciences', 'ORG'), ('Bangalore', 'GPE'), ('India', 'GPE'), ("Nizam's Institute of Medical Sciences", 'ORG'), ('Hyderabad', 'GPE'), ('India', 'GPE'), ('several years', 'DATE'), ('Gastroenterology', 'ORG'), ('India', 'GPE'), ('Mouli', 'PERSON'), ('EUS', 'ORG'), ('ERCP', 'ORG'), ('Mouli', 'PERSON'), ('Gastroenterology', 'ORG')]


In [15]:
# Visualize the named entities using displaCy
#displacy.render(doc, style='ent', options={'ents': ['PERSON']})
displacy.render(doc, style='ent')