# Bidirectional LSTM-CRF

Implemented by following: 
- https://github.com/Akshayc1/named-entity-recognition/blob/master/NER%20using%20Bidirectional%20LSTM%20-%20CRF%20.ipynb

- https://github.com/xuxingya/tf2crf/tree/master

In [None]:
from utils.NERcorpus import NERCorpus
from utils.prediction_vis import print_labeled_tag_pred_example
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Embedding, Bidirectional, Input
from tensorflow.keras.models import Model
from tf2crf import CRF, ModelWithCRFLoss

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import f1_score
import pickle

In [71]:
data_path = "../nlp_d2_data/"
corpus = NERCorpus()
train_seq = corpus.read_sequence_list_csv(f"{data_path}train_data_ner.csv")
test_seq = corpus.read_sequence_list_csv(f"{data_path}test_data_ner.csv")

In [64]:
print("PAD in word_dict?", "PAD" in corpus.word_dict)

print("PAD0 in word_dict?", "PAD0" in corpus.word_dict)

PAD in word_dict? True
PAD0 in word_dict? False


In [72]:
# Add PAD0 tag to corpus.word_dict and .tag_dict
corpus.word_dict.add("PAD0")
corpus.tag_dict.add("PAD0")

words = list(corpus.word_dict.keys())
tags = list(corpus.tag_dict.keys())

In [None]:
# Create reverse dictionaries inside corpus class
corpus.reverse_dictionaries()

In [7]:
def tag_onehot(tag_seqs, corpus, num_tag, max_len):
    # Find the max length to pad sequences
    max_len = max(len(seq) for seq in tag_seqs)

    # Pad the tag sequences with the padding index
    padded_tag_seqs = pad_sequences(tag_seqs, maxlen=max_len, padding='post', value=corpus.tag_dict["PAD0"])

    # Convert to one-hot encoded format (shape: num_sequences x max_len x num_tag)
    y_one_hot = to_categorical(padded_tag_seqs, num_classes=num_tag)

    return y_one_hot

In [8]:
def format_seq(seq, corpus, num_tag):
    # Extract word and tag sequences from train_seq.seq_list
    word_seqs = [seqi.x for seqi in seq.seq_list]
    tag_seqs = [seqi.y for seqi in seq.seq_list]

    # Pad sequences to max_len
    sequence_lengths = [len(seq.x) for seq in train_seq.seq_list]
    max_len = max(sequence_lengths)

    X = pad_sequences(word_seqs, maxlen=max_len, padding='post', value=corpus.word_dict["PAD0"])

    # One hot encoding of tags
    y = tag_onehot(tag_seqs, corpus, num_tag, max_len)

    return X, y, max_len

In [9]:
num_tag = len(corpus.tag_dict)
X_train, y_train, max_len = format_seq(train_seq, corpus, num_tag)
X_test, y_test, _ = format_seq(test_seq, corpus, num_tag)

In [10]:
# Number of data points passed in each iteration
batch_size = 64 
# Passes through entire dataset
epochs = 8
# Dimension of embedding vector
embedding = 40 

In [11]:
print("Size of training input data : ", X_train.shape)
print("Size of training output data : ", np.array(y_train).shape)
print("Size of testing input data : ", X_test.shape)
print("Size of testing output data : ", np.array(y_test).shape)

Size of training input data :  (38366, 104)
Size of training output data :  (38366, 104, 18)
Size of testing input data :  (38367, 104)
Size of testing output data :  (38367, 104, 18)


In [12]:
# Model architecture
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=len(words), output_dim=embedding, input_length=max_len, mask_zero=True)(input_layer)
bilstm_layer = Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout=0.1))(embedding_layer)
td_dense = TimeDistributed(Dense(50, activation="relu"))(bilstm_layer)

crf = CRF(units=num_tag +1)
output_layer = crf(td_dense)

base_model = Model(inputs=input_layer, outputs=output_layer)
model = ModelWithCRFLoss(base_model, sparse_target=False, metric='accuracy')
model.compile(optimizer='adam')

In [13]:
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

2025-06-13 22:54:20.147162: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2025-06-13 22:54:20.147407: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x30127c160>

In [None]:
# Predict on test set
y_pred_tag_idxs = model.predict(X_test)
y_test_tag_idxs = np.argmax(y_test, -1)

In [53]:
# Flatten lists of lists into one long list (preserving order)
y_test_flat = [tag for seq in y_pred_tag_idxs for tag in seq]
y_pred_flat = [tag for seq in y_test_tag_idxs for tag in seq]

f1_test = f1_score(y_test_flat, y_pred_flat, average='weighted')
print(f"F1-score is : {f1_test:.1%}")

F1-score is : 98.1%


In [None]:
print_labeled_tag_pred_example(X_test, y_test_tag_idxs, y_pred_tag_idxs, corpus)

True tags:
Iranian/B-gpe officials/O say/O they/O expect/O to/O get/O access/O to/O sealed/O sensitive/O parts/O of/O the/O plant/O Wednesday/B-tim ,/O after/O an/O IAEA/B-org surveillance/O system/O begins/O functioning/O ./O

Predicted tags:
Iranian/B-gpe officials/O say/O they/O expect/O to/O get/O access/O to/O sealed/O sensitive/O parts/O of/O the/O plant/O Wednesday/B-tim ,/O after/O an/O IAEA/B-org surveillance/O system/O begins/O functioning/O ./O


In [86]:
# Save base model
model.save('models/bilstm_crf_model')

# Save the corpus object
with open("models/bilstm_crf_model_corpus.pkl", "wb") as f:
    pickle.dump(corpus, f)

2025-06-13 23:54:37.500765: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: models/bilstm_crf_model/assets


INFO:tensorflow:Assets written to: models/bilstm_crf_model/assets


In [None]:
# To load the model back:

# import tensorflow as tf
# lo = tf.keras.models.load_model('models/bilstm_crf_model')