In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Dense
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

In [None]:
# Toy NER corpus in BIO format.
# Every sentence is written as one whitespace-separated string and split into
# tokens; the tag strings below are split the same way, so the i-th tag
# labels the i-th token of the matching sentence.
sentences = [
    "Barack Obama was born in Hawaii .".split(),
    "Apple is based in Cupertino .".split(),
    "Elon Musk founded SpaceX in 2002 .".split(),
]

# B-/I- prefixes open/continue an entity span; "O" marks tokens outside any
# entity.
tags = [
    "B-PER I-PER O O O B-LOC O".split(),
    "B-ORG O O O B-LOC O".split(),
    "B-PER I-PER O B-ORG O B-DATE O".split(),
]

In [None]:
# Build vocab
# The unique words/tags are sorted before being numbered. Iterating a set of
# strings follows hash order, which changes between interpreter sessions, so
# numbering the raw set would give a different word->id and tag->id mapping
# after every kernel restart.
words = sorted({w for s in sentences for w in s})
tags_list = sorted({t for ts in tags for t in ts})

# Real words are numbered from 2; ids 0 and 1 are reserved for the two
# special tokens assigned below.
word2idx = {w: i for i, w in enumerate(words, start=2)}
word2idx["PAD"] = 0  # id 0 is the filler value used when sequences are padded
word2idx["UNK"] = 1  # id 1 is the fallback for words missing from the vocab

# Tags are numbered from 0 in sorted order.
tag2idx = {t: i for i, t in enumerate(tags_list)}

In [None]:
# Convert to sequences
# Words absent from the vocabulary fall back to the UNK id; every tag is
# expected to be present in tag2idx (a missing tag raises KeyError).
X_ids = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
y_ids = [[tag2idx[t] for t in ts] for ts in tags]

# Pad sequences
# All sequences are right-padded to the length of the longest sentence.
max_len = max(len(seq) for seq in X_ids)
X = pad_sequences(X_ids, maxlen=max_len, padding='post', value=word2idx["PAD"])
# Label padding must use the "O" (outside) tag explicitly: pad_sequences pads
# with 0 by default, and index 0 of tag2idx is an arbitrary tag, so the
# default would label padded positions as a real entity class.
y_padded = pad_sequences(y_ids, maxlen=max_len, padding='post', value=tag2idx["O"])

# Convert labels to categorical
# One one-hot matrix of shape (max_len, number_of_tags) per sentence.
y = [to_categorical(row, num_classes=len(tag2idx)) for row in y_padded]

In [None]:
# Train-test split
# A fixed random_state makes the shuffle repeatable, so the same sentence is
# held out for testing on every run. The one-hot labels are stacked into a
# single array so they are split along the same first axis as X.
X_train, X_test, y_train, y_test = train_test_split(
    X, np.array(y), test_size=0.2, random_state=42
)

In [None]:
# Build LSTM model
# The sequence length is declared with an explicit Input instead of the
# Embedding `input_length` argument, which newer Keras releases deprecate.
# Declaring the input also builds the model up front, so summary() can show
# the output shape and parameter count of every layer.
model = Sequential([
    tf.keras.Input(shape=(max_len,)),
    # Maps each word id to a 64-dimensional vector.
    Embedding(input_dim=len(word2idx), output_dim=64),
    # return_sequences=True keeps one output per token rather than only the
    # final state, which per-token tagging requires.
    LSTM(64, return_sequences=True),
    # Applies the same softmax classifier over the tag set at every timestep.
    TimeDistributed(Dense(len(tag2idx), activation="softmax"))
])
# Labels are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

In [None]:
# Train the model
print("\nTraining the NER model, please wait...\n")
# Hyperparameters are gathered in one place so they are easy to spot and tune.
fit_options = {"batch_size": 2, "epochs": 15, "validation_split": 0.1}
model.fit(X_train, y_train, **fit_options)

In [None]:
# Test the model
# Pick one held-out sentence, add a leading batch axis for predict(), then
# reduce the per-tag scores to a single tag index per token.
i = 0
p = model.predict(np.expand_dims(X_test[i], axis=0))
p = np.argmax(p, axis=-1)
y_true = np.argmax(y_test[i], axis=-1)

# Decode predictions
# Reverse lookups turn integer ids back into the original words and tags.
idx2word = {idx: word for word, idx in word2idx.items()}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# One shared layout for the header and every data row keeps columns aligned.
row_fmt = "{:15} | {:10} | {}"

print("\n--- Sample Prediction ---")
print(row_fmt.format("Word", "True", "Predicted"))
print("-" * 40)
for w, t, pred in zip(X_test[i], y_true, p[0]):
    # Word id 0 is padding, not a real token: leave it out of the table.
    if w == 0:
        continue
    print(row_fmt.format(idx2word[w], idx2tag[t], idx2tag[pred]))