In [None]:
#!pip install tensorflow-mkl

In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
import spacy
from spacy.tokens import Doc
from spacy.pipeline import EntityRecognizer
from thinc.layers import PyTorchWrapper
import torch
from spacy.tokens import Span
from spacy.language import Language
from spacy.tokens import Token
from spacy import displacy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense


In [None]:
#!pip install tensorflow-mkl

In [None]:
#!pip install -r requirements.txt
#!pip install package-name
#print(torch.__version__)

In [2]:
# Load the Excel sheet
file_path = "MasterList (3).xlsx"
df = pd.read_excel(file_path)

In [3]:
df

Unnamed: 0,Text,Name,Interest
0,Dr. Yue Cao is a highly respected radiologist ...,Yue Cao,tumor ;; tissue therapy;;
1,Dr. Bensheng Qiu is a renowned radiologist kno...,Bensheng Qiu,radiology;;cancers;;
2,Dr. Robert Fleck began his academic journey wi...,Robert Fleck J,cancer ;;diagnosis;;
3,Dr. Holden Wu is a renowned radiologist who ha...,Holden Wu,novel imaging modalities;;cardiac magnetic res...
4,Dr. William Hyslop is a renowned radiologist w...,William Hyslop,MRI;;PET;;CT;;radiology;;diagnosis;;
...,...,...,...
96,Dr. Claude Sirlin is a highly accomplished rad...,Claude Sirlin,MRI imaging;;liver cancer;; liver disease;;
97,Dr. Martin Prince is a renowned radiologist wh...,Martin Prince,gadolinium-enhanced MR Angiography;; Investig...
98,Dr. Scott Reeder is a renowned radiologist kno...,Scott Reeder,abdominal adiposity;; liver fat;; liver iron;;...
99,Dr. David Bluemke is a renowned radiologist kn...,David Bluemke,diagnosis;;cardiovascular diseases;; coronary...


In [4]:
# Separate the text and the names
texts = df['Text'].tolist()
names = df['Name'].tolist()

In [5]:
# Tokenize the texts and the names
text_tokenizer = Tokenizer(char_level=True)
text_tokenizer.fit_on_texts(texts)
text_sequences = text_tokenizer.texts_to_sequences(texts)

name_tokenizer = Tokenizer(char_level=True)
name_tokenizer.fit_on_texts(names)
name_sequences = name_tokenizer.texts_to_sequences(names)

In [6]:
# Pad the sequences
max_length = max([len(seq) for seq in text_sequences])
text_sequences_padded = pad_sequences(text_sequences, maxlen=max_length, padding='post')
name_sequences_padded = pad_sequences(name_sequences, maxlen=max_length, padding='post')

In [7]:
# Convert the names to one-hot encoding
name_sequences_onehot = [to_categorical(seq, num_classes=len(name_tokenizer.word_index) + 1) for seq in name_sequences_padded]

In [8]:
# Split the data into training and validation sets
train_texts, val_texts, train_names, val_names = train_test_split(text_sequences_padded, np.array(name_sequences_onehot), test_size=0.2)

In [9]:
input_dim = len(text_tokenizer.word_index) + 1
#output_dim = len(name_tokenizer.word_index) + 1
output_dim = len(text_tokenizer.word_index) + 1
model = Sequential([
    Input(shape=(max_length,)),
    Embedding(input_dim=input_dim, output_dim=32, input_length=max_length),
    Bidirectional(LSTM(units=32, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    LSTM(units=32, return_sequences=True, dropout=0.5, recurrent_dropout=0.5),
    TimeDistributed(Dense(output_dim, activation="softmax"))
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [11]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self, input_dim, max_length, output_dim):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, 64, max_norm=max_length)
        self.bi_lstm = nn.LSTM(64, 64, batch_first=True, bidirectional=True, dropout=0.2)
        self.lstm = nn.LSTM(128, 64, batch_first=True, dropout=0.5)
        self.time_distributed = nn.Linear(64, output_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.bi_lstm(x)
        x, _ = self.lstm(x)
        x = self.time_distributed(x)
        #x = self.softmax(x)
        return x

# Instantiate the model
input_dim = len(text_tokenizer.word_index) + 1 # Set the input dimension
max_length = len(text_tokenizer.word_index) + 1  # Set the maximum sequence length
output_dim = len(text_tokenizer.word_index) + 1 # Set the output dimension
model = MyModel(input_dim, max_length, output_dim)

In [12]:
# Convert your data to PyTorch tensors, if not already in that format
train_texts = torch.tensor(train_texts, dtype=torch.long)
train_names = torch.tensor(train_names, dtype=torch.long)
val_texts = torch.tensor(val_texts, dtype=torch.long)
val_names = torch.tensor(val_names, dtype=torch.long)

# Define the loss function and the optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Training parameters
epochs = 30
batch_size = 16

# Create the DataLoader for training and validation data
train_dataset = torch.utils.data.TensorDataset(train_texts, train_names)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = torch.utils.data.TensorDataset(val_texts, val_names)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Training loop
for epoch in range(epochs):
    # Training
    model.train()
    running_loss = 0.0
    for batch_idx, (data, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)

        # Reshape the labels for the loss function
        labels = labels.view(-1)

        outputs = outputs.view(-1, outputs.shape[-1])
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    train_loss = running_loss / len(train_loader)

    # Validation
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for batch_idx, (data, labels) in enumerate(val_loader):
            outputs = model(data)

            # Reshape the labels for the loss function
            labels = labels.view(-1)

            outputs = outputs.view(-1, outputs.shape[-1])
            loss = criterion(outputs, labels)
            running_loss += loss.item()
    val_loss = running_loss / len(val_loader)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}")

print("Training finished.")


  train_texts = torch.tensor(train_texts, dtype=torch.long)
  train_names = torch.tensor(train_names, dtype=torch.long)
  val_texts = torch.tensor(val_texts, dtype=torch.long)
  val_names = torch.tensor(val_names, dtype=torch.long)


ValueError: Expected input batch_size (66832) to match target batch_size (1938128).

In [13]:
#history = model.fit(train_texts, train_names, epochs=30, batch_size=16, validation_data=(val_texts, val_names))
#import torch.optim as optim

In [14]:
# Convert Keras model to PyTorch model
#torch_model = torch.nn.Sequential(*[PyTorchWrapper(layer) for layer in model.layers])

# Create a custom spaCy component
class KerasEntityRecognizer(EntityRecognizer):
    def predict(self, docs):
        X = [doc.tensor for doc in docs]
        X_padded = pad_sequences(X, maxlen=max_length, padding='post')
        y_pred = self.model(torch.tensor(X_padded, dtype=torch.float32))
        return y_pred.detach().numpy()


In [15]:
@Language.component("keras_entity_recognizer")
def keras_entity_recognizer(doc):
    # Convert the document text into input for the model
    text_sequence = text_tokenizer.texts_to_sequences([doc.text])
    text_sequence_padded = pad_sequences(text_sequence, maxlen=max_length, padding='post')

    # Get the model predictions
    predictions = model.predict(text_sequence_padded)[0]

    # Extract the name from the predictions
    name_indices = np.argmax(predictions, axis=-1)
    name_tokens = [name_tokenizer.index_word[idx] if idx > 0 else '' for idx in name_indices]

    # Assign the name tokens to the doc
    for i, token in enumerate(doc):
        if name_tokens[i]:
            token._.name = name_tokens[i]
        else:
            token._.name = ""

    # Add entities to the doc
    for token in doc:
        if token._.name:
            doc.ents += (Span(doc, token.i, token.i + 1, label='PERSON'),)

    return doc

In [16]:
# Register the custom attribute
Token.set_extension('name', default='')

# Load a spaCy model and add the custom component
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("keras_entity_recognizer")

# Process a text using the spaCy pipeline
text = " Dr. Sharath Chandra Mouli is a medical doctor who specializes in the field of Gastroenterology. He is associated with the Krishna Institute of Medical Sciences (KIMS) Hospital in Secunderabad, Telangana, India.Dr. Mouli completed his MBBS degree from the Rajiv Gandhi University of Health Sciences in Bangalore, India, and then went on to pursue a Doctorate of Medicine (DM) in Gastroenterology from the Nizam's Institute of Medical Sciences in Hyderabad, India. He has several years of experience in the field of Gastroenterology and has worked at various prestigious institutions across India.Dr. Mouli's areas of expertise include the diagnosis and treatment of various gastrointestinal disorders such as inflammatory bowel disease, liver diseases, pancreatic disorders, and motility disorders. He is also trained in performing advanced endoscopic procedures such as endoscopic ultrasound (EUS), endoscopic retrograde cholangiopancreatography (ERCP), and endoscopic mucosal resection (EMR).Apart from his clinical work, Dr. Mouli is also involved in research activities in the field of Gastroenterology and has published several articles in peer-reviewed medical journals."
doc = nlp(text)

# Print the extracted entities
print("Entities found:", [(ent.text, ent.label_) for ent in doc.ents])

AttributeError: 'MyModel' object has no attribute 'predict'

In [None]:
# Visualize the named entities using displaCy
#displacy.render(doc, style='ent', options={'ents': ['PERSON']})
displacy.render(doc, style='ent')