In [1]:
import pandas as pd
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.training import Example
import random
import torch
import numpy as np
from spacy.util import minibatch, compounding
from spacy import displacy
from spacy import __version__ as spacy_version


In [2]:
# Load data from Excel sheet
df = pd.read_excel('Copy of MasterList (5).xlsx')
texts = df['Text'].tolist()
labels = df['Name'].tolist()
texts = [text for text in texts if text.strip()]
labels = [label for label in labels if label.strip()]

In [3]:
df

Unnamed: 0,Text,Name,Interest
0,Dr. Yue Cao is a highly respected radiologist ...,Yue Cao,tumor ;; tissue therapy;;
1,Dr. Bensheng Qiu is a renowned radiologist kno...,Bensheng Qiu,radiology;;cancers;;
2,Dr. Robert Fleck began his academic journey wi...,Robert Fleck J,cancer ;;diagnosis;;
3,Dr. Holden Wu is a renowned radiologist who ha...,Holden Wu,novel imaging modalities;;cardiac magnetic res...
4,Dr. William Hyslop is a renowned radiologist w...,William Hyslop,MRI;;PET;;CT;;radiology;;diagnosis;;
...,...,...,...
96,Dr. Claude Sirlin is a highly accomplished rad...,Claude Sirlin,MRI imaging;;liver cancer;; liver disease;;
97,Dr. Martin Prince is a renowned radiologist wh...,Martin Prince,gadolinium-enhanced MR Angiography;; Investig...
98,Dr. Scott Reeder is a renowned radiologist kno...,Scott Reeder,abdominal adiposity;; liver fat;; liver iron;;...
99,Dr. David Bluemke is a renowned radiologist kn...,David Bluemke,diagnosis;;cardiovascular diseases;; coronary...


In [4]:
# Set up blank Spacy model
nlp = spacy.blank('en')
vocab = Vocab()
nlp.vocab = vocab

In [5]:
# Define your BiLSTM model architecture
class BiLSTMModel(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiLSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = torch.nn.LSTM(input_dim, hidden_dim, bidirectional=True)
        self.fc = torch.nn.Linear(hidden_dim * 2, output_dim)
        
    def forward(self, input):
        lstm_out, _ = self.lstm(input.view(len(input), 1, -1))
        out = self.fc(torch.cat((lstm_out[-1,:, :self.hidden_dim], lstm_out[0,:, self.hidden_dim:]), dim=1))
        return out


In [6]:

# Set up your training hyperparameters
epochs = 10
learning_rate = 0.001
batch_size = 8
dropout = 0.2
input_dim = 300
hidden_dim = 128
output_dim = nlp.vocab.vectors_length

# Initialize your model
model = BiLSTMModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()



In [7]:
# Train your model
for epoch in range(epochs):
    random.seed(epoch)
    losses = {}
    batches = minibatch(zip(texts, labels), size=compounding(batch_size, len(texts), 1.001))
    for batch in batches:
        texts, labels = zip(*batch)
        docs = [Doc(nlp.vocab, words=text.split(' ')) for text in texts]
        X = torch.FloatTensor([doc.vector for doc in docs])
        y = torch.FloatTensor([np.asarray(label) for label in labels])
        model.zero_grad()
        preds = model(X)
        loss = loss_fn(preds, y)
        loss.backward()
        optimizer.step()
        losses[epoch] = losses.get(epoch, 0) + loss.item()
    print("Epoch: ", epoch, " Loss: ", losses[epoch])


ValueError: [E031] Invalid token: empty string ('') at position 119.

In [None]:
# Test your model
text = "John lives in New York City."
words = text.split(' ')
doc = Doc(nlp.vocab, words=words)
X = torch.FloatTensor([doc.vector])
preds = model(X)
print(preds)


In [None]:
# Save your model
nlp.to_disk("my_model")