In [1]:
import os
import pandas as pd
import numpy as np
import spacy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense



In [2]:
# Path of the Excel workbook that holds the training data (hard-coded, not prompted)
file_path = "MasterList (3).xlsx"

# Fail fast with a readable message when the workbook is missing. An exception is
# raised rather than calling exit(), so the notebook session is not terminated.
if not os.path.isfile(file_path):
    raise FileNotFoundError(f"File not found: {file_path!r}")

# Load the data from the Excel sheet
df = pd.read_excel(file_path)


In [3]:
# Keep only rows that have both a text and a name. Empty Excel cells are loaded as
# NaN (a float), which is not a string and cannot be tokenized character by character.
labelled_rows = df.dropna(subset=['Text', 'Name'])

# Extract the texts and the names as plain strings; both lists stay row-aligned
texts = labelled_rows['Text'].astype(str).tolist()
names = labelled_rows['Name'].astype(str).tolist()

# Create one character-level tokenizer shared by the texts and the names
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(texts + names)

# Convert the texts and the names to sequences of integers
text_sequences = tokenizer.texts_to_sequences(texts)
name_sequences = tokenizer.texts_to_sequences(names)

In [4]:
# Right-pad every sequence with zeros up to the length of the longest one
max_length = max(map(len, text_sequences + name_sequences))
text_sequences_padded = pad_sequences(
    text_sequences,
    maxlen=max_length,
    padding='post',
)
name_sequences_padded = pad_sequences(
    name_sequences,
    maxlen=max_length,
    padding='post',
)


In [5]:

# One-hot encode every position of the padded name sequences. The number of
# classes is the tokenizer vocabulary size plus one extra class for the value 0
# that the padding step fills in.
name_sequences_onehot = to_categorical(
    name_sequences_padded,
    num_classes=1 + len(tokenizer.word_index),
)


In [6]:
# Hold out 20% of the samples for testing, then 20% of the remainder for validation
(train_texts, test_texts,
 train_names, test_names) = train_test_split(
    text_sequences_padded,
    name_sequences_onehot,
    test_size=0.2,
    random_state=42,
)
(train_texts, val_texts,
 train_names, val_names) = train_test_split(
    train_texts,
    train_names,
    test_size=0.2,
    random_state=42,
)


In [7]:
# Define a function to create the Keras model
def create_model(input_dim, output_dim, max_length, embedding_dim=32, lstm_units=32):
    """Build and compile the per-position sequence classification network.

    The network embeds each input id, runs a bidirectional LSTM followed by a
    second LSTM (both returning the full sequence), and applies a softmax
    classifier to every position.

    Args:
        input_dim: Number of distinct input ids (size of the embedding table).
        output_dim: Number of classes predicted at every position.
        max_length: Fixed length of the input sequences.
        embedding_dim: Width of the embedding vectors. Defaults to 32, the value
            that used to be hard-coded.
        lstm_units: Units in each LSTM layer. Defaults to 32, the value that used
            to be hard-coded.

    Returns:
        A compiled Keras ``Sequential`` model using categorical cross-entropy,
        the Adam optimizer and the accuracy metric.
    """
    model = Sequential([
        # The Input layer already fixes the sequence length, so Embedding no longer
        # receives `input_length` (redundant here and deprecated in recent Keras).
        Input(shape=(max_length,)),
        Embedding(input_dim=input_dim, output_dim=embedding_dim),
        Bidirectional(LSTM(units=lstm_units, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
        LSTM(units=lstm_units, return_sequences=True, dropout=0.5, recurrent_dropout=0.5),
        # One softmax over `output_dim` classes for every position of the sequence
        TimeDistributed(Dense(output_dim, activation="softmax")),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [8]:

# Inputs and targets are encoded with the same tokenizer, so both dimensions are
# the vocabulary size plus one
input_dim = output_dim = len(tokenizer.word_index) + 1

# Create the Keras model
model = create_model(
    input_dim=input_dim,
    output_dim=output_dim,
    max_length=max_length,
)



In [9]:
# Fit on the training split for 5 epochs while monitoring the validation split
history = model.fit(
    train_texts,
    train_names,
    validation_data=(val_texts, val_names),
    batch_size=16,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [27]:
# Measure loss and accuracy on the held-out test split.
# NOTE(review): the accuracy is computed per sequence position; padded positions
# appear to be counted too, since the Embedding layer is created without masking.
# Confirm before reading this number as name-level accuracy.
loss, accuracy = model.evaluate(x=test_texts, y=test_names)
print("Test loss:", loss)
print("Test accuracy:", accuracy)


Test loss: 1.8146940469741821
Test accuracy: 0.9967508912086487


In [28]:
def extract_named_entities(text, model, tokenizer, doc=None):
    """Predict name spans in ``text`` and return them as spaCy ``PERSON`` spans.

    The text is encoded with ``tokenizer``, padded or truncated to the
    notebook-level ``max_length`` and passed through ``model``. Every maximal run
    of positions whose predicted class is non-zero becomes one candidate span, and
    the run's start/end are used as character offsets into ``doc``.

    Args:
        text: Raw input string.
        model: Trained model whose ``predict`` returns per-position class scores.
        tokenizer: Fitted tokenizer used to encode ``text``.
        doc: spaCy ``Doc`` the spans are attached to. When omitted, the
            notebook-level ``doc`` variable is used (the previous behaviour).

    Returns:
        A list of ``PERSON``-labelled spans of ``doc``. Runs whose offsets do not
        line up with token boundaries of ``doc`` are skipped.

    Raises:
        ValueError: If no ``doc`` is passed and no notebook-level ``doc`` exists.
    """
    if doc is None:
        # Backward compatibility: earlier callers relied on a global `doc`.
        doc = globals().get("doc")
        if doc is None:
            raise ValueError("extract_named_entities needs a spaCy Doc; pass doc=...")

    # Encode the text. Truncation happens at the end (not the default start) so
    # that a position in the model input keeps pointing at the same character.
    # NOTE(review): characters unknown to the tokenizer are presumably dropped by
    # texts_to_sequences, which would shift later offsets - confirm.
    text_sequence = tokenizer.texts_to_sequences([text])
    text_sequence_padded = pad_sequences(
        text_sequence, maxlen=max_length, padding='post', truncating='post'
    )

    # Per-position class scores for the single input sample, reduced to class ids
    predictions = model.predict(text_sequence_padded)[0]
    name_indices = np.argmax(predictions, axis=-1)

    # Collect [start, end) runs of non-zero predictions. `start is None` is tested
    # explicitly because a run may begin at index 0, which is falsy.
    spans = []
    start = None
    for i, idx in enumerate(name_indices):
        if idx > 0:
            if start is None:
                start = i
        elif start is not None:
            spans.append((start, i))
            start = None
    if start is not None:
        spans.append((start, len(name_indices)))

    # Turn the character runs into spaCy spans. Doc.char_span is used because the
    # offsets are character positions; Span(doc, start, end) expects token indices.
    text_end = len(doc.text)
    entities = []
    for start, end in spans:
        end = min(end, text_end)  # predictions may extend into the padded tail
        if start >= end:
            continue
        span = doc.char_span(start, end, label='PERSON')
        if span is not None:  # None means the offsets are not on token boundaries
            entities.append(span)

    return entities


In [29]:

# Define a function to add the named entities to a spaCy doc
def add_named_entities(doc, entities):
    """Merge ``entities`` into ``doc.ents`` and return ``doc``.

    spaCy does not accept overlapping entity spans, so a new span is added only
    when none of its tokens belongs to an entity that is already present
    (existing entities and earlier items of ``entities`` take precedence).
    ``doc.ents`` is assigned once instead of being rebuilt for every entity.

    Args:
        doc: Document whose ``ents`` are extended in place.
        entities: Iterable of spans exposing token-level ``start``/``end``.

    Returns:
        The same ``doc`` object, for call chaining.
    """
    merged = list(doc.ents)

    # Token indices that are already covered by an entity
    occupied = set()
    for ent in merged:
        occupied.update(range(ent.start, ent.end))

    for ent in entities:
        covered = range(ent.start, ent.end)
        if occupied.isdisjoint(covered):
            merged.append(ent)
            occupied.update(covered)

    # Keep the entities in document order before the single assignment
    merged.sort(key=lambda span: span.start)
    doc.ents = tuple(merged)
    return doc



In [30]:
# Load the small English spaCy pipeline and keep a handle on its NER component
nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")

# Sample biography used to try out the entity extraction
text = " Dr. Sharath Chandra Mouli is a medical doctor who specializes in the field of Gastroenterology. He is associated with the Krishna Institute of Medical Sciences (KIMS) Hospital in Secunderabad, Telangana, India.Dr. Mouli completed his MBBS degree from the Rajiv Gandhi University of Health Sciences in Bangalore, India, and then went on to pursue a Doctorate of Medicine (DM) in Gastroenterology from the Nizam's Institute of Medical Sciences in Hyderabad, India. He has several years of experience in the field of Gastroenterology and has worked at various prestigious institutions across India.Dr. Mouli's areas of expertise include the diagnosis and treatment of various gastrointestinal disorders such as inflammatory bowel disease, liver diseases, pancreatic disorders, and motility disorders. He is also trained in performing advanced endoscopic procedures such as endoscopic ultrasound (EUS), endoscopic retrograde cholangiopancreatography (ERCP), and endoscopic mucosal resection (EMR).Apart from his clinical work, Dr. Mouli is also involved in research activities in the field of Gastroenterology and has published several articles in peer-reviewed medical journals."

# Run spaCy on the text first, then merge in the PERSON spans from the trained model
doc = nlp(text)
entities = extract_named_entities(text, model, tokenizer)
person_entities = list(filter(lambda ent: ent.label_ == 'PERSON', entities))
doc = add_named_entities(doc, person_entities)

# Print the named entities
print("Named entities found:")
for ent in doc.ents:
    print(ent.text, ent.label_)


Named entities found:
Sharath Chandra Mouli PERSON
the Krishna Institute of Medical Sciences ORG
Secunderabad GPE
Telangana GPE
India GPE
Mouli PERSON
the Rajiv Gandhi University of Health Sciences ORG
Bangalore GPE
India GPE
Nizam's Institute of Medical Sciences ORG
Hyderabad GPE
India GPE
several years DATE
Gastroenterology ORG
India GPE
Mouli PERSON
EUS ORG
ERCP ORG
Mouli PERSON
Gastroenterology ORG


In [19]:

# Render the document with its entities highlighted, using displaCy's entity view
spacy.displacy.render(doc, style="ent")


In [None]:

# Define a function to extract the named entities from a text using the trained model
def extract_named_entities(text, model, tokenizer, doc=None):
    """Extract ``PERSON`` spans from ``text`` using the trained model.

    NOTE: this notebook defines a function with this name more than once; the
    definition executed last is the one in effect.

    The text is encoded with ``tokenizer``, padded or truncated to the
    notebook-level ``max_length`` and passed through ``model``. Each maximal run
    of positions with a non-zero predicted class becomes one candidate span whose
    start/end are used as character offsets into ``doc``.

    Args:
        text: Raw input string.
        model: Trained model whose ``predict`` returns per-position class scores.
        tokenizer: Fitted tokenizer used to encode ``text``.
        doc: spaCy ``Doc`` the spans are attached to. When omitted, the
            notebook-level ``doc`` variable is used (the previous behaviour).

    Returns:
        A list of ``PERSON``-labelled spans of ``doc``. Runs whose offsets do not
        line up with token boundaries of ``doc`` are skipped.

    Raises:
        ValueError: If no ``doc`` is passed and no notebook-level ``doc`` exists.
    """
    if doc is None:
        # Backward compatibility: earlier callers relied on a global `doc`.
        doc = globals().get("doc")
        if doc is None:
            raise ValueError("extract_named_entities needs a spaCy Doc; pass doc=...")

    # Tokenize the text. Truncate at the end (not the default start) so that a
    # position in the model input keeps pointing at the same character.
    # NOTE(review): characters unknown to the tokenizer are presumably dropped by
    # texts_to_sequences, which would shift later offsets - confirm.
    text_sequence = tokenizer.texts_to_sequences([text])
    text_sequence_padded = pad_sequences(
        text_sequence, maxlen=max_length, padding='post', truncating='post'
    )

    # Make predictions using the model and reduce them to one class id per position
    predictions = model.predict(text_sequence_padded)[0]
    name_indices = np.argmax(predictions, axis=-1)

    # Create a list of (start, end, label) runs of non-zero predictions. The
    # explicit `is None` tests matter because a run may start at index 0.
    label = 'PERSON'
    spans = []
    start = None
    for i, idx in enumerate(name_indices):
        if idx > 0:
            if start is None:
                start = i
        elif start is not None:
            spans.append((start, i, label))
            start = None
    if start is not None:
        spans.append((start, len(name_indices), label))

    # Create spaCy spans from the runs. The offsets are character positions, so
    # Doc.char_span is used; Span(doc, start, end) expects token indices instead.
    text_end = len(doc.text)
    entities = []
    for start, end, label in spans:
        end = min(end, text_end)  # predictions may extend into the padded tail
        if start >= end:
            continue
        span = doc.char_span(start, end, label=label)
        if span is not None:  # None means the offsets are not on token boundaries
            entities.append(span)

    return entities

# Define a function to add the named entities to a spaCy doc
def add_named_entities(doc, entities):
    """Add ``entities`` to ``doc.ents`` without creating overlaps; return ``doc``.

    NOTE: this notebook defines a function with this name more than once; the
    definition executed last is the one in effect.

    spaCy does not accept overlapping entity spans, so a span is added only when
    all of its tokens are still free. Entities already on ``doc`` and earlier
    items of ``entities`` win over later ones. ``doc.ents`` is written once
    rather than being rebuilt for every entity.

    Args:
        doc: Document whose ``ents`` are extended in place.
        entities: Iterable of spans exposing token-level ``start``/``end``.

    Returns:
        The same ``doc`` object.
    """
    kept = list(doc.ents)

    # Token indices already claimed by an entity
    claimed = {i for ent in kept for i in range(ent.start, ent.end)}

    for ent in entities:
        tokens = range(ent.start, ent.end)
        if any(i in claimed for i in tokens):
            continue  # overlaps an entity that is already present
        kept.append(ent)
        claimed.update(tokens)

    # Restore document order, then assign once
    doc.ents = tuple(sorted(kept, key=lambda span: span.start))
    return doc

