In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import ast
import re
import spacy

DATA_PATH = "/content/drive/MyDrive/news_project/data/raw/news.tsv"
df = pd.read_csv(DATA_PATH,sep="\t")

nlp = spacy.load("en_core_web_sm")

df["Headline"] = df["Headline"].astype(str)
df["Title entity"] = df["Title entity"].astype(str)

COUNTRIES = ["United States", "India", "Brazil", "China", "Mexico", "Canada"]
PERSON_PATTERN = r"^[A-Z][a-z]+(\s[A-Z][a-z]+)+$"
ORG_KEYWORDS = ["Corporation", "Authority", "Committee", "Association", "University", "Agency", "Company", "FC", "Ltd"]


def infer_entity_type(expanded):
    expanded = expanded.strip()

    if re.match(PERSON_PATTERN, expanded):
        return "PERSON"

    if expanded in COUNTRIES:
        return "LOCATION"

    if any(k in expanded for k in ORG_KEYWORDS):
        return "ORG"

    return "MISC"

def convert_to_bio(text, entity_string):
    tokens = text.split()
    tags = ["O"] * len(tokens)

    if entity_string == "{}":
        return tokens, tags

    try:
        ent_dict = ast.literal_eval(entity_string)
    except:
        return tokens, tags

    lower_tokens = [w.lower().strip(".,!?") for w in tokens]

    for surface, expanded in ent_dict.items():
        clean_surface = surface.replace("'s", "").strip()
        stoks = clean_surface.split()
        stoks = [w.lower().strip(".,!?") for w in stoks]
        n = len(stoks)

        ent_type = infer_entity_type(expanded)

        # Search entity span safely
        for i in range(len(tokens)):
            try:
                if lower_tokens[i:i+n] == stoks:
                    tags[i] = f"B-{ent_type}"
                    for j in range(i+1, i+n):
                        if j < len(tags):   # SAFETY CHECK
                            tags[j] = f"I-{ent_type}"
            except:
                continue

    return tokens, tags



sentences = []
labels = []

for _, row in df.iterrows():
    s, t = convert_to_bio(row["Headline"], row["Title entity"])
    sentences.append(s)
    labels.append(t)

print("DATA READY — Samples:", len(sentences))


DATA READY — Samples: 113762


In [4]:
!pip install keras tensorflow

Collecting keras
  Downloading keras-3.12.0-py3-none-any.whl.metadata (5.9 kB)
Collecting tensorflow
  Using cached tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Downloading keras-3.12.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (620.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m134.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboard, keras, tensorflow
  Attempting uninstall: tenso

In [5]:
# ENCODING & VOCAB GENERATION

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

word_set = set(w for s in sentences for w in s)
tag_set = set(t for seq in labels for t in seq)

word2idx = {w: i+2 for i, w in enumerate(sorted(word_set))}
word2idx["<PAD>"] = 0
word2idx["<OOV>"] = 1

tag2idx = {t: i for i, t in enumerate(sorted(tag_set))}
idx2tag = {v: k for k, v in tag2idx.items()}

MAX_LEN = 60

X = [[word2idx.get(w, 1) for w in seq] for seq in sentences]
X = pad_sequences(X, maxlen=MAX_LEN, padding="post")

y = [[tag2idx[t] for t in seq] for seq in labels]
y = pad_sequences(y, maxlen=MAX_LEN, padding="post")
y_cat = to_categorical(y, num_classes=len(tag2idx))

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y_cat, test_size=0.2, random_state=42
)

VOCAB_SIZE = len(word2idx)
NUM_TAGS = len(tag2idx)

print("VOCAB SIZE:", VOCAB_SIZE)
print("NUM TAGS:", NUM_TAGS)


VOCAB SIZE: 104705
NUM TAGS: 8


In [6]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


--2025-12-06 12:47:22--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-12-06 12:47:22--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-12-06 12:47:22--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2

In [7]:
!pip install gensim



In [8]:
# BUILD EMBEDDINGS

#  Word2vec

import numpy as np
from gensim.models import Word2Vec
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

embedding_w2v = np.zeros((VOCAB_SIZE, 100))
for w, i in word2idx.items():
    embedding_w2v[i] = w2v_model.wv[w] if w in w2v_model.wv else np.random.normal(0,0.6,100)


In [9]:
# GloVe

GLOVE_PATH = "/content/glove.6B.100d.txt"
import numpy as np

glove_index = {}
with open(GLOVE_PATH, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        glove_index[word] = vector

print("Total embeddings found:", len(glove_index))

# Matrix

embedding_glove = np.zeros((VOCAB_SIZE, 100))

for w, i in word2idx.items():
    embedding_glove[i] = glove_index.get(
        w, np.random.normal(scale=0.6, size=(100,))
    )


Total embeddings found: 400000


In [None]:
# !pip uninstall -y tensorflow keras
# !pip install tensorflow==2.11 keras==2.11 tensorflow-addons==0.20.0
# !pip install keras-crf


In [None]:
# !pip install tensorflow-addons==0.22.0


In [11]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

# ============================================
# Regular BiLSTM Model
# ============================================

def build_bilstm(embedding_matrix):
    inp = Input(shape=(MAX_LEN,))
    emb = Embedding(
        VOCAB_SIZE, 100,
        weights=[embedding_matrix],
        mask_zero=True,
        trainable=False
    )(inp)

    x = Bidirectional(LSTM(128, return_sequences=True))(emb)
    out = TimeDistributed(Dense(NUM_TAGS, activation="softmax"))(x)

    model = Model(inp, out)
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model


# ============================================
# BiLSTM + CRF-like loss
# (NO tensorflow_addons needed)
# ============================================

def build_bilstm_crf(embedding_matrix):
    inp = Input(shape=(MAX_LEN,))
    emb = Embedding(
        VOCAB_SIZE, 100,
        weights=[embedding_matrix],
        mask_zero=True,
        trainable=False
    )(inp)

    x = Bidirectional(LSTM(128, return_sequences=True))(emb)
    logits = TimeDistributed(Dense(NUM_TAGS))(x)

    # Learnable CRF transition matrix
    transitions = tf.Variable(
        tf.random.uniform(shape=(NUM_TAGS, NUM_TAGS)),
        name="transition_matrix"
    )

    def crf_loss(y_true, y_pred):
        """
        y_true → one-hot target
        y_pred → logits
        """
        y_true_idx = tf.argmax(y_true, axis=-1)  # (batch, seq)

        # emission log probabilities
        log_softmax = tf.nn.log_softmax(y_pred, axis=-1)

        # likelihood of correct token prediction
        token_ll = tf.reduce_sum(
            tf.reduce_sum(
                tf.one_hot(y_true_idx, NUM_TAGS) * log_softmax,
                axis=-1
            ),
            axis=-1
        )

        # transition score
        seq_score = 0.0

        for t in range(MAX_LEN - 1):
            curr = y_true_idx[:, t]
            nxt = y_true_idx[:, t + 1]

            seq_score += tf.gather_nd(
                transitions,
                tf.stack([curr, nxt], axis=1)
            )

        loss = -(token_ll + seq_score)

        return tf.reduce_mean(loss)

    model = Model(inp, logits)
    model.compile(optimizer="adam", loss=crf_loss)

    return model


In [25]:
results = []
models_dict = {}

print("\nTraining MODEL-1: BiLSTM + Word2Vec")
model_1 = build_bilstm(embedding_w2v)
model_1.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32, callbacks=[early_stop])
models_dict["1"] = model_1


Training MODEL-1: BiLSTM + Word2Vec
Epoch 1/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 21ms/step - accuracy: 0.9161 - loss: 0.2971 - val_accuracy: 0.9205 - val_loss: 0.2663
Epoch 2/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 24ms/step - accuracy: 0.9242 - loss: 0.2495 - val_accuracy: 0.9251 - val_loss: 0.2475


In [13]:

print("\nTraining MODEL-2: BiLSTM + GloVe")
model_2 = build_bilstm(embedding_glove)
model_2.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32, callbacks=[early_stop])
models_dict["2"] = model_2


Training MODEL-2: BiLSTM + GloVe
Epoch 1/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 20ms/step - accuracy: 0.9159 - loss: 0.3110 - val_accuracy: 0.9251 - val_loss: 0.2510
Epoch 2/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 20ms/step - accuracy: 0.9349 - loss: 0.2131 - val_accuracy: 0.9363 - val_loss: 0.2084
Epoch 3/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 20ms/step - accuracy: 0.9440 - loss: 0.1773 - val_accuracy: 0.9394 - val_loss: 0.1966
Epoch 4/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 20ms/step - accuracy: 0.9507 - loss: 0.1525 - val_accuracy: 0.9413 - val_loss: 0.1921
Epoch 5/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 25ms/step - accuracy: 0.9565 - loss: 0.1323 - val_accuracy: 0.9411 - val_loss: 0.1937


In [14]:
print("\nTraining MODEL-3: BiLSTM-CRF + Word2Vec")
model_3 = build_bilstm_crf(embedding_w2v)
model_3.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32, callbacks=[early_stop])
models_dict["3"] = model_3



Training MODEL-3: BiLSTM-CRF + Word2Vec
Epoch 1/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 20ms/step - loss: 25.8660 - val_loss: -2.2927
Epoch 2/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 18ms/step - loss: -6.2627 - val_loss: -8.2341
Epoch 3/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 19ms/step - loss: -8.9154 - val_loss: -9.2397
Epoch 4/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 19ms/step - loss: -9.4977 - val_loss: -9.5105
Epoch 5/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 19ms/step - loss: -9.7082 - val_loss: -9.5911


In [15]:
print("\nTraining MODEL-4: BiLSTM-CRF + GloVe")
model_4 = build_bilstm_crf(embedding_glove)
model_4.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32, callbacks=[early_stop])
models_dict["4"] = model_4



Training MODEL-4: BiLSTM-CRF + GloVe
Epoch 1/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 21ms/step - loss: 33.4681 - val_loss: 4.9757
Epoch 2/5
[1m2845/2845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 19ms/step - loss: 0.8262 - val_loss: -1.1629


In [26]:
# Create folder before saving
!mkdir -p "/content/drive/MyDrive/news_project/NER_eval"

from sklearn.metrics import classification_report

def evaluate(model, model_name, embed_name):
    preds = model.predict(X_val).argmax(axis=-1)
    true_tags = y_val.argmax(axis=-1)

    y_true = []
    y_pred = []

    for t_seq, p_seq in zip(true_tags, preds):
        for t, p in zip(t_seq, p_seq):
            if idx2tag[t] != "O":
                y_true.append(idx2tag[t])
                y_pred.append(idx2tag[p])

    report = classification_report(
        y_true,
        y_pred,
        output_dict=True,
        zero_division=0
    )

    weighted = report["weighted avg"]

    return {
        "Model": model_name,
        "Embedding": embed_name,
        "Precision": weighted["precision"],
        "Recall": weighted["recall"],
        "F1 Score": weighted["f1-score"]
    }

results.append(evaluate(model_1, "BiLSTM", "Word2Vec"))
results.append(evaluate(model_2, "BiLSTM", "GloVe"))
results.append(evaluate(model_3, "BiLSTM-CRF", "Word2Vec"))
results.append(evaluate(model_4, "BiLSTM-CRF", "GloVe"))

df_result = pd.DataFrame(results)
df_result.to_csv("/content/drive/MyDrive/news_project/NER_eval/model_comparison_NER.csv", index=False)
print("Evaluation Completed and File Saved")


[1m712/712[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step
[1m712/712[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step
[1m712/712[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step
[1m712/712[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step
Evaluation Completed and File Saved


In [27]:
best_row = df_result.loc[df_result["F1 Score"].idxmax()]
print("\nBEST MODEL SELECTED:\n", best_row)

best_tuple = (best_row["Model"], best_row["Embedding"])

best_model = {
    ("BiLSTM", "Word2Vec"): model_1,
    ("BiLSTM", "GloVe"): model_2,
    ("BiLSTM-CRF", "Word2Vec"): model_3,
    ("BiLSTM-CRF", "GloVe"): model_4,
}[best_tuple]

best_model.save("/content/drive/MyDrive/news_project/NER_eval/final_best_model.keras")
print("\nSAVED BEST MODEL!")



BEST MODEL SELECTED:
 Model        BiLSTM-CRF
Embedding      Word2Vec
Precision      0.998365
Recall         0.987914
F1 Score        0.99054
Name: 2, dtype: object

SAVED BEST MODEL!


In [30]:
import pickle

with open("/content/drive/MyDrive/news_project/NER_eval/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)


In [31]:
with open("/content/drive/MyDrive/news_project/NER_eval/idx2tag.pkl", "wb") as f:
    pickle.dump(idx2tag, f)


In [32]:
with open("/content/drive/MyDrive/news_project/NER_eval/max_len.txt", "w") as f:
    f.write(str(MAX_LEN))


Testing

In [34]:
import numpy as np
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load model
model = load_model("/content/drive/MyDrive/news_project/NER_eval/final_best_model.keras", compile=False)
print("Model Loaded Successfully")

# Load vocabulary mappings
with open("/content/drive/MyDrive/news_project/NER_eval/word2idx.pkl", "rb") as f:
    word2idx = pickle.load(f)

with open("/content/drive/MyDrive/news_project/NER_eval/idx2tag.pkl", "rb") as f:
    idx2tag = pickle.load(f)

# Load max sequence length
with open("/content/drive/MyDrive/news_project/NER_eval/max_len.txt") as f:
    MAX_LEN = int(f.read())

print("Vocab & config loaded")


Model Loaded Successfully
Vocab & config loaded


In [35]:
def predict_ner(text):
    tokens = text.split()

    encoded = [word2idx.get(word, word2idx["<OOV>"]) for word in tokens]

    padded = pad_sequences([encoded], maxlen=MAX_LEN, padding="post")

    pred = model.predict(padded)[0].argmax(axis=-1)

    tags = [idx2tag[idx] for idx in pred][:len(tokens)]

    return list(zip(tokens, tags))


In [36]:
print(predict_ner("Apple invested 3 billion dollars in India"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[('Apple', 'O'), ('invested', 'O'), ('3', 'O'), ('billion', 'O'), ('dollars', 'O'), ('in', 'O'), ('India', 'B-MISC')]


In [37]:
def extract_entities(tokens_with_tags):
    entities = {}
    current_entity = ""
    current_tag = ""

    for token, tag in tokens_with_tags:
        if tag.startswith("B-"):
            ent_type = tag[2:]
            if current_entity:
                entities.setdefault(current_tag, []).append(current_entity)

            current_entity = token
            current_tag = ent_type

        elif tag.startswith("I-") and current_tag == tag[2:]:
            current_entity += " " + token

        else:
            if current_entity:
                entities.setdefault(current_tag, []).append(current_entity)

            current_entity = ""
            current_tag = ""

    if current_entity:
        entities.setdefault(current_tag, []).append(current_entity)

    return entities


In [38]:
text = "Apple invested 3 billion dollars in India and met Joe Biden in Washington"
print("INPUT TEXT:", text)

token_tags = predict_ner(text)

print("\nToken Prediction:")
for tok, tag in token_tags:
    print(f"{tok:12} --> {tag}")

final_entities = extract_entities(token_tags)

print("\nExtracted Entities:")
print(final_entities)


INPUT TEXT: Apple invested 3 billion dollars in India and met Joe Biden in Washington
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step

Token Prediction:
Apple        --> O
invested     --> O
3            --> O
billion      --> O
dollars      --> O
in           --> O
India        --> O
and          --> O
met          --> O
Joe          --> B-PERSON
Biden        --> I-PERSON
in           --> O
Washington   --> B-MISC

Extracted Entities:
{'PERSON': ['Joe Biden'], 'MISC': ['Washington']}
