In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Embedding, Bidirectional, TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# Load the data from the Excel sheet
df = pd.read_excel("data.xlsx")

In [3]:
df

Unnamed: 0,auth_name,fn,interest
0,Dr. Yue Cao is a highly respected radiologist ...,Yue Cao,Quantitative imaging for tumor and normal tiss...
1,"Biography of Dr. Bensheng Qiu, Radiologist\n\n...",Bensheng Qiu,"advancing the field of radiology,developing in..."
2,"Biography of Dr. Robert Fleck, Radiologist\n\n...",Robert Fleck J,"imaging in early cancer detection, and his wor..."
3,Dr. Holden Wu is a renowned radiologist who ha...,Holden Wu,"novel imaging modalities, such as cardiac magn..."
4,Biography of Dr. William Hyslop: Radiologist E...,William Hyslop,"advanced imaging techniques, such as functiona..."
...,...,...,...
96,Dr. Claude Sirlin is a highly accomplished rad...,Claude Sirlin,MRI imaging of liver cancer and liver disease;;
97,Dr. Martin Prince is a renowned radiologist wh...,Martin Prince,"Developed high-dose, gadolinium-enhanced MR An..."
98,Dr. Scott Reeder is a renowned radiologist kno...,Scott Reeder,Development of new MRI methods for quantificat...
99,Dr. David Bluemke is a renowned radiologist kn...,David Bluemke,diagnosis and management of cardiovascular di...


In [4]:
# Define the training data
train_data = []
for i, row in df.iterrows():
    text = row["auth_name"]
    name = row["fn"]
    train_data.append((text, name))

In [6]:
# Define the input and output data
X = [text for text, _ in train_data]
y = [[("B-PERSON" if i == 0 else "I-PERSON") for i in range(len(name.split()))] for _, name in train_data]

In [7]:
# Define the vocabulary
vocab = set([word.lower() for text in X for word in text.split()])
vocab_size = len(vocab) + 1

In [8]:
# Define the word-to-index and index-to-word dictionaries
word_to_idx = dict([(word, i+1) for i, word in enumerate(vocab)])
idx_to_word = dict([(i+1, word) for i, word in enumerate(vocab)])

In [9]:
# Convert the input data to sequences of word indices
X_seq = [[word_to_idx[word.lower()] for word in text.split()] for text in X]

In [10]:
# Pad the input sequences to the maximum length
maxlen = max([len(seq) for seq in X_seq])
X_pad = pad_sequences(X_seq, padding='post', maxlen=maxlen)

In [11]:
# Convert the output data to sequences of tag indices
tag_to_idx = {"B-PERSON": 0, "I-PERSON": 1}
y_seq = [[tag_to_idx[tag] for tag in tags] for tags in y]

In [12]:
# Pad the output sequences to the maximum length
y_pad = pad_sequences(y_seq, padding='post', maxlen=maxlen)
y_pad_cat = to_categorical(y_pad)

In [13]:
# Define the model architecture
input_layer = Input(shape=(maxlen,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen)(input_layer)
lstm_layer = Bidirectional(LSTM(units=128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(embedding_layer)
dense_layer = TimeDistributed(Dense(units=len(tag_to_idx), activation='softmax'))(lstm_layer)
model = Model(inputs=input_layer, outputs=dense_layer)

In [14]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
# Train the model
model.fit(X_pad, y_pad_cat, batch_size=32, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c0652fd6d0>

In [16]:
model.save("ner_model.h5")

In [17]:
model = tf.keras.models.load_model("ner_model.h5")

In [18]:
# Define a sample text
text = "John Smith is a software engineer at Google."

In [24]:
text_seq = [word_to_idx.get(word.lower(), 0) for word in text.split()]

In [27]:
# Pad the sequence to the maximum length
text_pad = pad_sequences([text_seq], padding='post', maxlen=614)

In [28]:
# Predict the output labels
y_pred = model.predict(text_pad)



In [30]:
# Convert the predicted tag indices to tag labels
idx_to_tag = {0: "B-PERSON", 1: "I-PERSON"}
y_pred_labels = [[idx_to_tag[np.argmax(tag)] for tag in tags] for tags in y_pred][0]

In [33]:
y_pred_labels

['B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',

In [31]:
# Extract the name from the text using the predicted labels
name = ""
for i in range(len(y_pred_labels)):
    if y_pred_labels[i] == "B-PERSON" or y_pred_labels[i] == "I-PERSON":
        name += " " + text.split()[i]
print(name.strip())

IndexError: list index out of range