In [1]:
#all imports required


import pandas as pd
from itertools import chain
from numpy import array
from numpy.random import seed
from keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import spacy
from spacy import displacy
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

In [2]:
# function to import data and return it in a dataframe

def getRawData(path):
    return pd.read_excel(path)

In [3]:
# function to define token to id and id to token
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Text'].to_list()))
    else:
        vocab = list(set(data['Label'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok

In [4]:
#function to get extra columns of token to id representation for text and label
def getDataTagged(data,tok2idx,tag2idx):
    data['Word_idx'] = data['Text'].map(token2idx)
    data['Tag_idx'] = data['Label'].map(tag2idx)
    return data

In [5]:
def getGroupedData(data):
    data_fillna = data.fillna(method='ffill', axis=0)
    for i in range(len(data)):
        print(i)
        data_group = data_fillna.groupby(["Sentence"],as_index=False)['Text', 'POS', 'Label', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))
    return data_group

In [6]:
def get_pad_train_test_val(data_group, data):
    n_token = len(list(set(data['Text'].to_list())))
    n_tag = len(list(set(data['Label'].to_list())))   
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post')
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)
    print(
        'train_tokens length:', len(tokens_),
        '\ntrain_tags length:',len(tags_),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
    )
    return tokens_, test_tokens, tags_, test_tags

In [7]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

    #Optimiser 
    # adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [8]:
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [9]:
data=getRawData('improved_data.xlsx')
data

Unnamed: 0.1,Unnamed: 0,Sentence,Text,POS,Label
0,0,Sentence 1,Dr.,NNP,
1,1,Sentence 1,Yue,NNP,PERSON
2,2,Sentence 1,Cao,NNP,PERSON
3,3,Sentence 1,is,VBZ,
4,4,Sentence 1,a,DT,
...,...,...,...,...,...
58790,601,Sentence 101,of,IN,
58791,602,Sentence 101,medical,JJ,
58792,603,Sentence 101,imaging,NN,
58793,604,Sentence 101,.,.,


In [10]:
#data_group=pd.read_excel("groupData.xlsx")

In [None]:
#data=getRawData('improved_data.xlsx')
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')
data_idx=getDataTagged(data,token2idx,tag2idx)

In [None]:
data_group=data_group.drop(data_group.columns[0],axis=1)

In [None]:
train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)
input_dim = len(list(set(data['Text'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)

data_group=getGroupedData(data)

model_bilstm_lstm = get_bilstm_lstm_model()
#plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)
save_model=saveModel(model_bilstm_lstm,save_model)
output=save_model.predict()

In [None]:
model = get_bilstm_lstm_model()
print(train_tokens)
loss = train_model(train_tokens,train_tags,model)

In [None]:
print(train_tokens[0][3],train_tags[0][3])

In [11]:
data=getRawData('improved_data.xlsx')

In [12]:
data=data.drop(data.columns[0],axis=1)

In [13]:
data_group=pd.read_excel("groupData.xlsx")

In [14]:
data_group=data_group.drop(data_group.columns[0],axis=1)
tokens = data_group['Word_idx'].tolist()
for i in range(len(tokens)):
    tokens[i]=list(map(int,tokens[i][1:-1].split(",")))
data_group['Word_idx']=tokens
tags = data_group['Tag_idx'].tolist()
for i in range(len(tags)):
    tags[i]=list(map(int,tags[i][1:-1].split(",")))
data_group['Tag_idx']=tags

In [15]:
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')
data_idx=getDataTagged(data,token2idx,tag2idx)

In [16]:
train_tokens, test_tokens, train_tags,test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 90 
train_tags length: 90 
test_tokens length: 11 
test_tags: 11


import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self, input_dim, max_length, output_dim):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, 32, max_norm=max_length)
        self.bi_lstm = nn.LSTM(32, 32, batch_first=True, bidirectional=True, dropout=0.2)
        self.lstm = nn.LSTM(64, 32, batch_first=True, dropout=0.5)
        self.time_distributed = nn.Linear(32, output_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.bi_lstm(x)
        x, _ = self.lstm(x)
        x = self.time_distributed(x)
        x = self.softmax(x)
        return x

In [17]:
import torch

train_texts = torch.tensor(train_tokens, dtype=torch.long)
train_names = torch.tensor(train_tags, dtype=torch.long)
val_texts = torch.tensor(test_tokens, dtype=torch.long)
val_names = torch.tensor(test_tags, dtype=torch.long)

  train_names = torch.tensor(train_tags, dtype=torch.long)
  train_names = torch.tensor(train_tags, dtype=torch.long)
  val_names = torch.tensor(test_tags, dtype=torch.long)


In [18]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = self.data[index]
        y = self.labels[index]
        return x, y

In [19]:
from torch.utils.data import DataLoader

batch_size = 16

train_dataset = CustomDataset(train_texts, train_names)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = CustomDataset(val_texts, val_names)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim

class NERModel(nn.Module):
    def __init__(self, input_dim, output_dim, input_length, n_tags):
        super(NERModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, output_dim)
        self.bidirectional_lstm = nn.LSTM(output_dim, output_dim, bidirectional=True, dropout=0.2, batch_first=True)
        self.lstm = nn.LSTM(2*output_dim, output_dim, dropout=0.5, batch_first=True)
        self.time_distributed = nn.Linear(output_dim, n_tags)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.bidirectional_lstm(x)
        x, _ = self.lstm(x)
        x = self.time_distributed(x)
        x = self.softmax(x)
        return x


In [21]:
input_dim = len(list(set(data['Text'].to_list())))+1 # Your vocabulary size
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()]) # The maximum length of your input sequences
n_tags =  len(tag2idx) # The number of unique tags in your NER dataset
model = NERModel(input_dim, output_dim, input_length, n_tags)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

print('input_dim=',input_dim)
print('output_dim=',output_dim)
print('input_length=',input_length)
print('number of unique tags=',n_tags)

print(len(set(data['Text'].to_list())))


input_dim= 2033
output_dim= 64
input_length= 713
number of unique tags= 14
2032




In [22]:
epochs = 30
batch_size = 10

# Assuming you have already created train_loader and val_loader

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)

        # Reshape labels for the loss function
        labels = labels.view(-1)
        outputs = outputs.view(-1, outputs.shape[-1])

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    train_loss = running_loss / len(train_loader)

    # Validation
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for batch_idx, (data, labels) in enumerate(val_loader):
            outputs = model(data)

            # Reshape labels for the loss function
            labels = labels.view(-1)
            outputs = outputs.view(-1, outputs.shape[-1])

            loss = criterion(outputs, labels)
            running_loss += loss.item()
    val_loss = running_loss / len(val_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}")
print("Training finished.")

ValueError: Expected input batch_size (11408) to match target batch_size (159712).

In [None]:
import spacy
import random
from spacy.training import Example
from spacy.util import minibatch, compounding

# Load your preprocessed data (words and tags)
data = train_tokens,train_tags

# Function to convert your data into SpaCy's format
def convert_data_to_spacy_format(data):
    spacy_data = []
    for text, tags in data:
        entities = []
        for tag in tags:
            entities.append((tag[0], tag[1], tag[2]))
        spacy_data.append((text, {"entities": entities}))
    return spacy_data

# Convert your data to SpaCy format
spacy_data = convert_data_to_spacy_format(data)

# Create a blank SpaCy model
nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe("ner")

# Add custom NER labels to the model
for _, annotations in spacy_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Train the custom NER model
optimizer = nlp.begin_training()
for i in range(10): # Number of epochs
    random.shuffle(spacy_data)
    losses = {}
    
    batches = minibatch(spacy_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        examples = [Example.from_dict(nlp.make_doc(text), entities) for text, entities in batch]
        nlp.update(examples, drop=0.1, sgd=optimizer, losses=losses)

    print(f"Losses at iteration {i}: {losses}")

# Save the model
nlp.to_disk("custom_ner_model")




# Instantiate the model
input_dim = len(list(set(data['Text'].to_list())))+1 # Set the input dimension
max_length = max([len(s) for s in data_group['Word_idx'].tolist()]) # Set the maximum sequence length
output_dim = 32 # Set the output dimension
model = MyModel(input_dim, max_length, output_dim)