<a href="https://colab.research.google.com/github/as3091/IITJ/blob/NER/ML/Assign_2/NER/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Title: Named Entity Recognition

Description:
In this NER-focused project, you will design and develop a custom Named Entity Recognition (NER) system for text analysis. Named Entity Recognition involves identifying and classifying specific entities, such as names, dates, locations, and more, within unstructured text data. Your project will offer a versatile NER solution that will work well on the provided dataset.

Dataset: Named Entity Recognition (NER) Corpus (kaggle.com)

https://www.kaggle.com/datasets/naseralqaydeh/named-entity-recognition-ner-corpus

In [1]:
# !pip install tensorflow --quiet
# !pip install keras --quiet

In [2]:
import warnings,sys, ast, pickle
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import tensorflow as tf
from IPython.display import display, HTML
# import matplotlib.pyplot as plt
from pprint import pprint

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Input, Bidirectional, LSTM, Embedding, Dropout
from keras.models import Model
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
# from keras.random import SeedGenerator

# seed_gen = SeedGenerator(seed=42)
tf.random.set_seed(42)
np.random.seed(42)

import datetime as dt

2025-04-13 12:52:30.792704: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-13 12:52:30.812073: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-13 12:52:30.960820: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-13 12:52:31.106999: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744548751.294798   11599 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744548751.33

In [3]:
# !pip install kagglehub --quiet

In [4]:
# while True:
try:
    NER_df = pd.read_csv("ner.csv")
except FileNotFoundError:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter

    # Set the path to the file you'd like to load
    file_path = "ner.csv"

    # Load the latest version
    NER_df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "naseralqaydeh/named-entity-recognition-ner-corpus",
    file_path,

    )

In [5]:
display(NER_df.head(2).T)

Unnamed: 0,0,1
Sentence #,Sentence: 1,Sentence: 2
Sentence,Thousands of demonstrators have marched throug...,Families of soldiers killed in the conflict jo...
POS,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ..."
Tag,"['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


Essential info about entities:

- geo = Geographical Entity
- org = Organization
- per = Person
- gpe = Geopolitical Entity
- tim = Time indicator
- art = Artifact
- eve = Event
- nat = Natural Phenomenon

### Class it

In [6]:
class The_Neural_Net:
    def __init__(self):
        self.max_len = 0

1. Read data

In [7]:
def read_data(self):
    try:
        NER_df = pd.read_csv("ner.csv")
    except FileNotFoundError:
        import kagglehub
        from kagglehub import KaggleDatasetAdapter

        file_path = "ner.csv"

        NER_df = kagglehub.load_dataset(
        KaggleDatasetAdapter.PANDAS,
        "naseralqaydeh/named-entity-recognition-ner-corpus",
        file_path,

        )
    print(NER_df.shape,"\n")
    print("\n",NER_df.info())
    return NER_df
The_Neural_Net.read_data = read_data

2. Preprocess

In [None]:
def PreProcess(self):
    NER_df = self.read_data()
    NER_df.dropna(inplace=True)
    NER_df.drop(columns=["Sentence #","POS"],inplace=True)
    NER_df["Tag"] = NER_df["Tag"].apply(lambda x: ast.literal_eval(x))

    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(NER_df["Sentence"], NER_df["Tag"], shuffle=True,test_size=0.20, random_state=42)
    del NER_df

    self.X_tokenizer = Tokenizer(lower=False,oov_token="UNK")
    self.X_tokenizer.fit_on_texts(self.X_train)

    self.X_train = self.X_tokenizer.texts_to_sequences(self.X_train)
    self.X_test = self.X_tokenizer.texts_to_sequences(self.X_test)

    self.vocab_len = len(self.X_tokenizer.word_index)
    print(f"Number of unique tokens:\t{self.vocab_len}")

    self.y_tokenizer = Tokenizer(lower=False,oov_token="UNK")
    self.y_tokenizer.fit_on_texts(self.y_train)

    self.y_train = self.y_tokenizer.texts_to_sequences(self.y_train)
    self.y_test = self.y_tokenizer.texts_to_sequences(self.y_test)

    for dataset in [self.X_train,self.X_test]:
        for i in range(len(dataset)):
            self.max_len = max(self.max_len,len(dataset[i]))

    self.X_train = pad_sequences(self.X_train, maxlen=self.max_len, padding="post", value=0)
    self.X_test = pad_sequences(self.X_test, maxlen=self.max_len, padding="post", value=0)

    self.y_train = pad_sequences(self.y_train, maxlen=self.max_len, padding="post", value=0)
    self.y_test = pad_sequences(self.y_test, maxlen=self.max_len, padding="post", value=0)

    self.Number_of_classes_K = len(self.y_tokenizer.word_index) + 1

The_Neural_Net.PreProcess = PreProcess

In [75]:
def model_arch(self):
    vector_size = 4

    input_layer = Input(shape=(self.max_len,))
    embedding_layer = Embedding(input_dim=self.vocab_len + 1, output_dim=vector_size, mask_zero=True, trainable=True)(input_layer)
    dropout_layer_1 = Dropout(0.075)(embedding_layer)
    bidirectional_LSTM_Layer = Bidirectional(LSTM(vector_size * 2, return_sequences=True))(dropout_layer_1)
    # bidirectional_LSTM_Layer = Bidirectional(LSTM(vector_size * 2, return_sequences=True))(embedding_layer)
    output_layer = Dense(self.Number_of_classes_K)(bidirectional_LSTM_Layer)

    self.model = Model(input_layer, output_layer)
    print(self.model.summary())

    self.model.compile(optimizer="adam",loss=SparseCategoricalCrossentropy(from_logits=True),metrics=["accuracy"])
The_Neural_Net.model_arch = model_arch

In [76]:
def model_fit(self,num_of_epochs):
    early_stopping = EarlyStopping(
    monitor='val_loss',  # Metric to monitor (e.g., validation loss)
    patience=3,          # Number of epochs with no improvement after which training will stop
    restore_best_weights=True  # Restore the weights of the best epoch
    )
    self.num_of_epochs = num_of_epochs
    self.model.fit(
            self.X_train,
            self.y_train,
            epochs=num_of_epochs,
            validation_data=(self.X_test, self.y_test),
           callbacks=[early_stopping]  # Include EarlyStopping in callbacks
        )
The_Neural_Net.model_fit = model_fit


In [77]:
def save_to_file(self):
    with open('X_tokenizer.pkl', 'wb') as file:
        pickle.dump(self.X_tokenizer, file)
    print("X_tokenizer saved to X_tokenizer.pkl")

    with open('y_tokenizer.pkl', 'wb') as file:
        pickle.dump(self.y_tokenizer, file)
    print("y_tokenizer saved to y_tokenizer.pkl")

    model_save_path = f"ner_model_{self.num_of_epochs}.keras"
    # print(model_save_path)
    self.model.save(model_save_path)
    print(f"Model saved to {model_save_path}")
The_Neural_Net.save_to_file = save_to_file

In [78]:
def load_from_file(self,num_of_epochs):
    # self.num_of_epochs = num_of_epochs
    with open('X_tokenizer.pkl', 'rb') as file:
        self.X_tokenizer = pickle.load(file)
        print("Tokenizer loaded from X_tokenizer.pkl")

    with open('y_tokenizer.pkl', 'rb') as file:
        self.y_tokenizer = pickle.load(file)
        print("Tokenizer loaded from y_tokenizer.pkl")

    model_save_path = f"ner_model_{num_of_epochs}.keras"
    self.model = load_model(model_save_path)
    print(f"Model loaded from {model_save_path}")


The_Neural_Net.load_from_file = load_from_file

In [79]:
def predict(self,model,sentence):
    sentence_tokens = self.X_tokenizer.texts_to_sequences([sentence])
    # print(len(sentence.split()))
    # print(len(sentence_tokens[0]))
    # tokens_to_words = [word for word, index in self.X_tokenizer.word_index.items() if index in sentence_tokens[0]]

    predictions = model.predict(pad_sequences(sentence_tokens,
                                            maxlen=self.max_len,
                                            padding="post"))
    # print(predictions)
    prediction_ner = np.argmax(predictions,axis=-1)
    # print(prediction_ner)

    NER_tags = [self.y_tokenizer.index_word[num] for num in list(prediction_ner.flatten())]
    final_pred = {"Word":[],"Tag":[]}
    sentence_split = sentence.split()
    for Word,Tag in zip(sentence_split,NER_tags):
        # final_pred[tokens_to_words[i]] = NER_tags[i]
        final_pred["Word"].append(Word)
        final_pred["Tag"].append(Tag)
    return pd.DataFrame(final_pred)
The_Neural_Net.predict = predict

In [80]:
NN_obj = The_Neural_Net()
NN_obj.PreProcess()

(47959, 4) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47959 entries, 0 to 47958
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Sentence #  47959 non-null  object
 1   Sentence    47959 non-null  object
 2   POS         47959 non-null  object
 3   Tag         47959 non-null  object
dtypes: object(4)
memory usage: 1.5+ MB

 None
Number of unique tokens:	28761


In [81]:
num_of_epochs = 51
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local') # Detect TPU
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.TPUStrategy(tpu)

    num_of_epochs = 100
    with tpu_strategy.scope():
        NN_obj.model_arch()
        NN_obj.model_fit(num_of_epochs)

except Exception as e:
    NN_obj.model_arch()
    NN_obj.model_fit(num_of_epochs)


INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: local


None
Epoch 1/51
[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 59ms/step - accuracy: 0.2107 - loss: 1.2209 - val_accuracy: 0.2081 - val_loss: 0.6066
Epoch 2/51
[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 59ms/step - accuracy: 0.2076 - loss: 0.5887 - val_accuracy: 0.1947 - val_loss: 0.5364
Epoch 3/51
[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 57ms/step - accuracy: 0.1965 - loss: 0.5164 - val_accuracy: 0.1982 - val_loss: 0.4659
Epoch 4/51
[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 56ms/step - accuracy: 0.1996 - loss: 0.4496 - val_accuracy: 0.2009 - val_loss: 0.4220
Epoch 5/51
[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 56ms/step - accuracy: 0.2022 - loss: 0.4053 - val_accuracy: 0.2019 - val_loss: 0.3925
Epoch 6/51
[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 56ms/step - accuracy: 0.2034 - loss: 0.3759 - val_accuracy: 0.2025 - val_loss: 0.3769

In [82]:
# num_of_epochs = 5
# if tf.test.is_gpu_available():
#     num_of_epochs = 100
#     with tf.device('/device:GPU:0'):
#         NN_obj.model_fit(num_of_epochs)
# else:
#     NN_obj.model_fit(num_of_epochs)


In [83]:
NN_obj.save_to_file()

X_tokenizer saved to X_tokenizer.pkl
y_tokenizer saved to y_tokenizer.pkl
Model saved to ner_model_51.keras


In [84]:
NN_obj.load_from_file(num_of_epochs=num_of_epochs)

Tokenizer loaded from X_tokenizer.pkl
Tokenizer loaded from y_tokenizer.pkl
Model loaded from ner_model_51.keras


In [85]:
sentence = """Is this the real life? Is this just fantasy? Caught in a landslide, no escape from reality"""
model = NN_obj.model
prediction_df = NN_obj.predict(model=NN_obj.model,sentence=sentence)
display(prediction_df)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 527ms/step


Unnamed: 0,Word,Tag
0,Is,O
1,this,O
2,the,O
3,real,O
4,life?,O
5,Is,O
6,this,O
7,just,O
8,fantasy?,O
9,Caught,O


In [86]:
sentence = """Apoorv Code, Ankur question one, Alok, zoom meeting"""
model = NN_obj.model
prediction_df = NN_obj.predict(model=NN_obj.model,sentence=sentence)
display(prediction_df)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step


Unnamed: 0,Word,Tag
0,Apoorv,B-org
1,"Code,",B-org
2,Ankur,O
3,question,O
4,"one,",O
5,"Alok,",O
6,zoom,O
7,meeting,O


In [47]:
sentence = """Apoorv Code, Ankur question one, Alok, zoom meeting"""
model = NN_obj.model
prediction_df = NN_obj.predict(model=NN_obj.model,sentence=sentence)
display(prediction_df)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step


Unnamed: 0,Word,Tag
0,Apoorv,B-per
1,"Code,",B-per
2,Ankur,O
3,question,O
4,"one,",O
5,"Alok,",O
6,zoom,O
7,meeting,O
