Named Entity Recognition (NER) is a task in Natural Language Processing (NLP) that involves identifying and classifying important entities in a text.

NER models are designed to take a sequence of text (such as sentences or paragraphs) and label each word or token with a tag representing its entity type. In this partiular project, we are working on a movies dataset with tags such as Actor, director, genre, character, etc.

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from collections import Counter

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.preprocessing.sequence import pad_sequences
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

We are using two different datasets for our model. Both datasets have the same tags with different names. So in order to merge the datasets and use it in the model, we must generalize the tag names. 

In [None]:
replace_dict = {
    'Actor': "ACTOR",
    'Character_Name': "CHARACTER",
    'Director': "DIRECTOR",
    'Genre': "GENRE",
    'Plot': "PLOT",
    'Opinion': "REVIEW",
    'Soundtrack': "SONG",
    'Year': "YEAR",
    'Award': "AWARD",
    'Relationship': "RELATIONSHIP",
    'Origin': "ORIGIN",
    'Quote': "QUOTE"
}

# we will only use a few tags for this project and remove all else. You can use these tags if you want.
remove_tags = ['B-TITLE','I-TITLE','B-PLOT','I-PLOT', 'B-ORIGIN', 'I-ORIGIN','B-RELATIONSHIP', 'I-RELATIONSHIP', 'I-TRAILER', 'B-TRAILER', 'I-RATINGS_AVERAGE', 'B-RATINGS_AVERAGE', 'I-SONG', 'B-SONG', 'I-REVIEW', 'B-REVIEW', 'B-QUOTE', 'I-QUOTE']

def replace_tags(label):
    label_ = label.split("-")
    new_label = ""

    if len(label_) > 1:
        if label_[1] in replace_dict:
            new_label = f"{label_[0]}-{replace_dict[label_[1]]}"
        else:
            new_label = label
    else:
        new_label = label

    if new_label in remove_tags:
        return "O"

    return new_label

In [None]:
lemmatizer = WordNetLemmatizer()

# Preprocess words by removing some contractions and splitting into individual tokens. 
# As our datasets are mostly clean, there is no need to preprocess a lot.

def preprocess_words(text):
    # text = re.sub(r'\bs\b', 'is', text)
    text = re.sub(r'\bm\b', 'am', text)
    text = re.sub(r'\bcan t\b', 'can not', text)

    # text = [lemmatizer.lemmatize(word) for word in text.split()]
    return text.split()

In [None]:
# load the data from the text file and apply preprocessing
def load_data(path):
    sentence = []
    sentences = []

    with open(path, 'r') as file:

        for line in file:
            if line.strip():
                tag, word = line.split()
                sentence.append((word, replace_tags(tag)))
            else:
                if sentence:

                  word_sentence = ' '.join([word for word, tag in sentence])
                  tags = [tag for word, tag in sentence]
                  sentence = preprocess_words(word_sentence)
                  sentences.append((sentence, tags))
                  sentence = []

    return sentences

In [None]:
train = load_data('/content/drive/MyDrive/NER/Data/train.txt')
engtrain = load_data('/content/drive/MyDrive/NER/Data/engtrain.bio.txt')
test = load_data('/content/drive/MyDrive/NER/Data/test.txt')
engtest = load_data('/content/drive/MyDrive/NER/Data/engtest.bio.txt')

In [None]:
# Merge the two datasets
train = train + engtrain
test = test + engtest

In [None]:
# seperate the sentences and tags from the data tuples
def seperate_data(data):
  sentences = []
  tags = []

  for i in data:
    assert len(i[0]) == len(i[1])
    sentences.append(i[0])
    tags.append(i[1])

  return sentences, tags

train_sentences, train_tags = seperate_data(train)
test_sentences, test_tags = seperate_data(test)

In [None]:
# Function to map tokens or tags to their respective index (token2idx, tag2idx) and reverse (idx2token, idx2tag)
# adding a 'PAD' token for padding and 'UNK' token for unknown tokens 

def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    if token_or_tag == 'token':
        vocab = list(set([word for sentence in data for word in sentence]))
        vocab = ['PAD', 'UNK'] + vocab
    else:
        vocab = list(set([tag for sentence in data for tag in sentence]))
        vocab = ['PAD'] + vocab

    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}

    return tok2idx, idx2tok

token2idx, idx2token = get_dict_map(train_sentences, 'token')
tag2idx, idx2tag = get_dict_map(train_tags, 'tag')

In [None]:
tag2idx

{'PAD': 0,
 'I-GENRE': 1,
 'O': 2,
 'B-YEAR': 3,
 'I-RATING': 4,
 'B-CHARACTER': 5,
 'B-DIRECTOR': 6,
 'B-GENRE': 7,
 'I-AWARD': 8,
 'I-YEAR': 9,
 'B-AWARD': 10,
 'B-RATING': 11,
 'I-DIRECTOR': 12,
 'I-ACTOR': 13,
 'B-ACTOR': 14,
 'I-CHARACTER': 15}

In [None]:
def prepare_data(sentences, tags, token2idx, tag2idx, max_len):

    X = [[token2idx.get(token, token2idx['UNK']) for token in sentence] for sentence in sentences]
    y = [[tag2idx[tag] for tag in sentence] for sentence in tags]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=token2idx["PAD"])
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["PAD"])

    return X, y

X_train, y_train = prepare_data(train_sentences, train_tags, token2idx, tag2idx, 50)
X_test, y_test = prepare_data(test_sentences, test_tags, token2idx, tag2idx, 50)

In [None]:
# One-hot encoding
y_tr = [to_categorical(i, num_classes=len(tag2idx)) for i in y_train]
y_te = [to_categorical(i, num_classes=len(tag2idx)) for i in y_test]

In [None]:
# Create an embedding matrix where each token is mapped to its corresponding Word2Vec vector
from gensim.models import KeyedVectors
word2vec = KeyedVectors.load('/path/to/word2vec-google-news-300.model')

vocab_size = len(token2idx)
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in token2idx.items():
    if word in word2vec:
        embedding_matrix[i] = word2vec.get_vector(word)

In [None]:
# Model architecture
model = Sequential()

model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=50,
                    trainable=False))

model.add(Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout=0.1))) # As NER is a token-level task, we need to keep 'return-sequence' true to get a sequence of output instead of just one final output/
model.add(LSTM(units=50, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
model.add(TimeDistributed(Dense(len(tag2idx), activation="softmax")))



In [None]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy", "precision", "recall"])

# Train the model
history = model.fit(
    X_train,
    np.array(y_tr),
    batch_size=64,
    epochs=10,
    validation_data=(X_test, np.array(y_te)),
    verbose=1
)

Epoch 1/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 193ms/step - accuracy: 0.8857 - loss: 0.5950 - precision: 0.9410 - recall: 0.7707 - val_accuracy: 0.9431 - val_loss: 0.2219 - val_precision: 0.9639 - val_recall: 0.9371
Epoch 2/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 184ms/step - accuracy: 0.9482 - loss: 0.2045 - precision: 0.9704 - recall: 0.9363 - val_accuracy: 0.9613 - val_loss: 0.1320 - val_precision: 0.9823 - val_recall: 0.9471
Epoch 3/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 209ms/step - accuracy: 0.9630 - loss: 0.1318 - precision: 0.9815 - recall: 0.9483 - val_accuracy: 0.9741 - val_loss: 0.0924 - val_precision: 0.9838 - val_recall: 0.9647
Epoch 4/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 183ms/step - accuracy: 0.9737 - loss: 0.0975 - precision: 0.9838 - recall: 0.9637 - val_accuracy: 0.9801 - val_loss: 0.0721 - val_precision: 0.9852 - val_recall: 0.9756
Epoch 5/10


In [None]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

y_preds = model.predict(X_test)

y_test_flat = np.array(y_test).flatten()
y_preds_flat = np.argmax(y_preds, axis=-1).flatten()

precision = precision_score(y_preds_flat, y_test_flat, average='macro')
recall = recall_score(y_preds_flat, y_test_flat, average='macro', zero_division=0)
f1 = f1_score(y_preds_flat, y_test_flat, average='macro', zero_division=0)

# Print classification report for detailed results
report = classification_report(y_preds_flat, y_test_flat, zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("\nDetailed Report:\n", report)

[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 55ms/step
Precision: 0.7472859566359298
Recall: 0.8345525883025926
F1-Score: 0.7699618790547424

Detailed Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    156120
           1       0.73      0.82      0.77       680
           2       0.99      0.97      0.98     51709
           3       0.89      0.90      0.90      1355
           4       0.87      0.94      0.90       209
           5       0.23      0.52      0.31       161
           6       0.78      0.88      0.83       780
           7       0.89      0.86      0.87      1990
           8       0.69      0.59      0.64       174
           9       0.94      0.93      0.94       663
          10       0.14      0.75      0.23        12
          11       0.96      0.96      0.96       503
          12       0.80      0.90      0.85       806
          13       0.89      0.90      0.89      2377
        

In [None]:
def predict(text):
  sentences = sent_tokenize(text)
  all_results = []

  for sentence in sentences:

      sentence = sentence.lower()
      sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence)
      sentence = preprocess_words(sentence)
      sentence = [token2idx.get(token, token2idx['UNK']) for token in sentence]
      sentence = pad_sequences(maxlen=50, sequences=[sentence], padding="post", value=token2idx["PAD"])
      y_pred = model.predict(sentence.reshape(1, sentence.shape[1]))
      y_pred = [idx2tag[id] for id in np.argmax(y_pred, axis=-1)[0]]

      result = []
      for token, tag in zip(sentence[0], y_pred):
          if token == token2idx["PAD"]:
              break
          result.append((idx2token[token], tag))
      all_results.append(result)

  return all_results

In [None]:
text = "Leonardo DiCaprio's performance in Inception was mesmerizing. Christopher Nolan's direction brought out the best in the cast. The movie, set in Paris and other global locations, was a masterpiece of visual effects and storytelling. I can’t wait to see what Nolan does next!"
predict(text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step


[[('leonardo', 'B-ACTOR'),
  ('dicaprio', 'I-ACTOR'),
  ('s', 'O'),
  ('performance', 'O'),
  ('in', 'O'),
  ('inception', 'O'),
  ('was', 'O'),
  ('UNK', 'O')],
 [('christopher', 'B-ACTOR'),
  ('nolan', 'I-DIRECTOR'),
  ('s', 'O'),
  ('direction', 'O'),
  ('brought', 'O'),
  ('out', 'O'),
  ('the', 'O'),
  ('best', 'O'),
  ('in', 'O'),
  ('the', 'O'),
  ('cast', 'O')],
 [('the', 'O'),
  ('movie', 'O'),
  ('set', 'O'),
  ('in', 'O'),
  ('paris', 'O'),
  ('and', 'O'),
  ('other', 'O'),
  ('global', 'O'),
  ('locations', 'O'),
  ('was', 'O'),
  ('a', 'O'),
  ('masterpiece', 'O'),
  ('of', 'O'),
  ('visual', 'O'),
  ('effects', 'O'),
  ('and', 'O'),
  ('storytelling', 'O')],
 [('i', 'O'),
  ('can', 'O'),
  ('not', 'O'),
  ('wait', 'O'),
  ('to', 'O'),
  ('see', 'O'),
  ('what', 'O'),
  ('nolan', 'B-ACTOR'),
  ('does', 'O'),
  ('next', 'O')]]