In [21]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [22]:
# Load the Excel file
data = pd.read_excel('data.xlsx')

In [23]:
data

Unnamed: 0,auth_name,fn,interest
0,Dr. Yue Cao is a highly respected radiologist ...,Yue Cao,Quantitative imaging for tumor and normal tiss...
1,"Biography of Dr. Bensheng Qiu, Radiologist\n\n...",Bensheng Qiu,"advancing the field of radiology,developing in..."
2,"Biography of Dr. Robert Fleck, Radiologist\n\n...",Robert Fleck J,"imaging in early cancer detection, and his wor..."
3,Dr. Holden Wu is a renowned radiologist who ha...,Holden Wu,"novel imaging modalities, such as cardiac magn..."
4,Biography of Dr. William Hyslop: Radiologist E...,William Hyslop,"advanced imaging techniques, such as functiona..."
...,...,...,...
96,Dr. Claude Sirlin is a highly accomplished rad...,Claude Sirlin,MRI imaging of liver cancer and liver disease;;
97,Dr. Martin Prince is a renowned radiologist wh...,Martin Prince,"Developed high-dose, gadolinium-enhanced MR An..."
98,Dr. Scott Reeder is a renowned radiologist kno...,Scott Reeder,Development of new MRI methods for quantificat...
99,Dr. David Bluemke is a renowned radiologist kn...,David Bluemke,diagnosis and management of cardiovascular di...


In [24]:
# Convert the second column to a categorical variable
data['Label'] = pd.Categorical(data['fn'])
data['Label'] = data['Label'].cat.codes

In [25]:
data['Label']

0      99
1      11
2      79
3      38
4      95
       ..
96     16
97     60
98     85
99     17
100    18
Name: Label, Length: 101, dtype: int8

In [26]:
# Split the data into training and testing sets
train_size = int(len(data) * 0.8)
train_data = data[:train_size]
test_data = data[train_size:]

In [27]:
# Create a tokenizer to convert words to integers
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['auth_name'])

In [28]:
# Convert the text data to sequences of integers
X_train = tokenizer.texts_to_sequences(train_data['auth_name'])
X_test = tokenizer.texts_to_sequences(test_data['auth_name'])

In [29]:
# Pad the sequences to make them the same length
max_length = 5000
X_train = pad_sequences(X_train, maxlen=max_length, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post', truncating='post')

In [30]:
# Convert the labels to one-hot encoding
y_train = to_categorical(train_data['Label'])
y_test = to_categorical(test_data['Label'])

In [31]:
# Define the model architecture
model = Sequential()
model.add(Embedding(5000, 64, input_length=max_length))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y_train.shape[1], activation='softmax'))

In [32]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [33]:
# Set up a callback to save the best model during training
filepath="best_model.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

21

In [35]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), callbacks=[checkpoint])

Epoch 1/10

ValueError: in user code:

    File "C:\Users\SlideSciences\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1820, in test_function  *
        return step_function(self, iterator)
    File "C:\Users\SlideSciences\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1804, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\SlideSciences\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1792, in run_step  **
        outputs = model.test_step(data)
    File "C:\Users\SlideSciences\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1758, in test_step
        self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\SlideSciences\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1082, in compute_loss
        return self.compiled_loss(
    File "C:\Users\SlideSciences\AppData\Roaming\Python\Python39\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\SlideSciences\AppData\Roaming\Python\Python39\site-packages\keras\losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\SlideSciences\AppData\Roaming\Python\Python39\site-packages\keras\losses.py", line 284, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\SlideSciences\AppData\Roaming\Python\Python39\site-packages\keras\losses.py", line 2004, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\SlideSciences\AppData\Roaming\Python\Python39\site-packages\keras\backend.py", line 5532, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 101) and (None, 100) are incompatible


In [None]:
# Load the best model
model.load_weights("best_model.hdf5")

In [None]:
# Test the model on a new text
new_text = "John Smith is a software engineer at XYZ company."
new_text = tokenizer.texts_to_sequences([new_text])
new_text = pad_sequences(new_text, maxlen=max_length, padding='post', truncating='post')
pred = model.predict(new_text)
pred_label = np.argmax(pred, axis=1)
print(pred_label)