# Part 1. Sequence Tagging: NER

In [1]:
!pip show gensim

Name: gensim
Version: 4.3.2
Summary: Python framework for fast Vector Space Modelling
Home-page: https://radimrehurek.com/gensim/
Author: Radim Rehurek
Author-email: me@radimrehurek.com
License: LGPL-2.1-only
Location: /Users/raghavrnair/opt/anaconda3/envs/CZ4045/lib/python3.9/site-packages
Requires: numpy, scipy, smart-open
Required-by: 


In [32]:
import pandas as pd
import os
import gensim.downloader
from gensim.models import Word2Vec
import tensorflow as tf
import numpy as np
import gc
from sklearn import metrics
from sklearn.preprocessing import normalize

# Part 1.1

In [3]:
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [4]:
embeddings = gensim.downloader.load("word2vec-google-news-300")

## Question 1.1
### Based on word2vec embeddings you have downloaded, use cosine similarity to find the most similar word to each of these words: (a) “student”; (b) “Apple”; (c) “apple”. Report the most similar word and its cosine similarity.


In [5]:
words = ["student", "Apple", "apple"]
print("-----------------------------------------------------------------------")
print("Word\t\tMost similar word\tCosine similarity")
print("-----------------------------------------------------------------------")
for word in words:
    most_similar = embeddings.most_similar(positive=[word])
    print(f"{word}\t\t{most_similar[0][0]}  \t\t{most_similar[0][1]}")
print("-----------------------------------------------------------------------")

-----------------------------------------------------------------------
Word		Most similar word	Cosine similarity
-----------------------------------------------------------------------
student		students  		0.7294867038726807
Apple		Apple_AAPL  		0.7456986308097839
apple		apples  		0.720359742641449
-----------------------------------------------------------------------


# Part 1.2

In [6]:
train_dir = 'eng.train'
dev_dir = 'eng.testa'
test_dir = 'eng.testb'

def import_content(path):
    try:
        with open(path, 'r') as file:
            content = file.readlines()
        file.close()
    except Exception as e:
        content = None
        print(e)
    
    return content

def print_items(item):
    for s in item: print(s)

In [7]:
train_content = import_content(train_dir)
dev_content = import_content(dev_dir)
test_content = import_content(test_dir)

In [8]:
def split_sentences(content):
    split_data = [c.split(' ') for c in content] if content != None else []
    sentences = []
    sentence = []
    words = []

    for line in split_data:
        # if end of a sentence
        if line == ['\n']:
            sentences.append(sentence)
            sentence = []
        else:
            s_text  = line[0]
            s_tag = line[-1].replace('\n','')

            sentence.append([s_text, s_tag]) 
            words.append([s_text, s_tag])
    
    sentences.append(sentence)         

    return sentences, words

In [9]:
def split_text_tag(sentences):
    text = []
    tag = []
    combined = []
    sentence_count = 1

    for s in sentences:
        for w in s:
            w_text  = w[0]
            w_tag = w[-1].replace('\n','')

            text.append(w_text)
            tag.append(w_tag)        
            combined.append({
                'sentence': sentence_count,
                'text' : w_text,
                'tag' : w_tag
            })   
        sentence_count+=1       
    return text, tag, combined

In [10]:
train_sentences, train_words = split_sentences(train_content)
dev_sentences, dev_words = split_sentences(dev_content)
test_sentences, test_words = split_sentences(test_content)

train_text, train_tag, train_combined = split_text_tag(train_sentences)
dev_text, dev_tag, dev_combined = split_text_tag(dev_sentences)
test_text, test_tag, test_combined = split_text_tag(test_sentences)

train_voc = np.unique(np.array(train_text))
dev_voc = np.unique(np.array(dev_text))
tag_set = np.unique(np.array(train_tag))

# Question 1.2 a)
### Describe the size (number of sentences) of the training, development and test file for CoNLL2003.

In [11]:
print("Number of sentences (training):", len(train_sentences))
print("Number of sentences (dev):", len(dev_sentences))
print("Number of sentences (test):", len(test_sentences))

Number of sentences (training): 14987
Number of sentences (dev): 3466
Number of sentences (test): 3684


### Specify the complete set of all possible word labels based on the tagging scheme (IO, BIO, etc.) you chose

In [12]:
print("Tag set (BIO):", tag_set)

Tag set (BIO): ['B-LOC' 'B-MISC' 'B-ORG' 'I-LOC' 'I-MISC' 'I-ORG' 'I-PER' 'O']


# Question 1.2 b)
###  Choose an example sentence from the training set of CoNLL2003 that has at least two named entities with more than one word. Explain how to form complete named entities from the label for each word, and list all the named entities in this sentence.

In [13]:
def get_multiple_ne_sentence(sentences):
    for sentence in sentences:
        ne_count = 0
        for word_info in sentence:
            if "B-" in word_info[-1]:
                ne_count+=1
        if ne_count == 2:
            return sentence
    return None   

In [14]:
sentence = get_multiple_ne_sentence(train_sentences)
sentence

[['Swiss', 'I-MISC'],
 ['Grand', 'B-MISC'],
 ['Prix', 'I-MISC'],
 ['World', 'B-MISC'],
 ['Cup', 'I-MISC'],
 ['cycling', 'O'],
 ['race', 'O'],
 ['on', 'O'],
 ['Sunday', 'O'],
 [':', 'O']]

In [15]:
def get_named_entities(sentence):
    inside_tags = ['I-ORG', 'I-LOC', 'I-PER', 'I-MISC'] # Tags that require multiple words to form an entity
    begin_tags = ['B-LOC', 'B-ORG', 'B-MISC'] # Tags that are single word entities
    outside_tags = ['O']
    entities = [] # all entities gotten from search
    entity = [] # word group of current entity if any group tags encountered
    
    for c in sentence:
        if (c['tag'] in begin_tags or c['tag'] in outside_tags or c['tag'] == '\n') and len(entity) != 0:
            entities.append(' '.join(entity))
            entity = []
        if c['tag'] in begin_tags or c['tag'] in inside_tags: 
            entity.append(c['text'])

    return entities

In [16]:
_,_,sentence_text_tag = split_text_tag([sentence])
print("Complete named entities in the sentence:", get_named_entities(sentence_text_tag))

Complete named entities in the sentence: ['Swiss', 'Grand Prix', 'World Cup']


# Part 1.3

In [40]:
header_names = ['word', 'useless1', 'useless2', 'tag']
train_raw = pd.read_csv("eng.train", header=None, sep=' ', names=header_names, skip_blank_lines=False, quotechar="|")
validation_raw = pd.read_csv("eng.testa", header=None, sep=' ', names=header_names, skip_blank_lines=False, quotechar="|")
test_raw = pd.read_csv("eng.testb", header=None, sep=' ', names=header_names, skip_blank_lines=False, quotechar="|")

In [41]:
null_indices = train_raw[train_raw['word'].isnull()].reset_index()['index']

In [42]:
# create average embedding to replace Out-Of-Vocabulary words
average_embedding = np.mean(embeddings.vectors, axis=0)

In [43]:
def drop_na_labels(data):
  temp = data[['word', 'tag']]
  #without_na = temp.dropna(subset=['tag']).reset_index().drop(columns='index')
  return temp

In [44]:
def get_tag_to_integer_dict(train_without_na):
  count = 0
  tag_to_integer_dictionary = {}
  for tag in train_without_na['tag'].unique():
    tag_to_integer_dictionary[tag] = count
    count += 1

  return tag_to_integer_dictionary

In [45]:
def get_tag_to_integer_dict_from_raw(train_raw):
  temp = train_raw[['word', 'tag']]
  without_na = temp.dropna(subset=['tag']).reset_index().drop(columns='index')
  tag_to_integer_dictionary = get_tag_to_integer_dict(without_na)
  return tag_to_integer_dictionary

In [46]:
tag_to_integer_dictionary = get_tag_to_integer_dict_from_raw(train_raw)

In [47]:
def get_sentences_wordstags_array(train_without_na):
  sentences_train = []
  new_sentence = []

  for i, row in train_without_na.iterrows():

    if row.isna().all():
      sentences_train.append(new_sentence)
      new_sentence = []

    else:
      if len(new_sentence) >= 50:
        # if sentence length is too long, break it up every 30 words, to prevent timesteps from being too large
        new_sentence.append([row[0], row[1]])
        sentences_train.append(new_sentence)
        new_sentence = []

      else:
        new_sentence.append([row[0], row[1]])


#    if i > 10000:
#      break

  return sentences_train

In [48]:
def get_sentences_embeddingstags_array(sentences_wordstags_array, tag_to_integer_dictionary):
  # replace words with embeddings and tags with integers
  embeddings_in_sentences = []
  new_sentence = []

  count = 0
  for sentence in sentences_wordstags_array:
    for word, tag in sentence:
      if tag not in tag_to_integer_dictionary:
        #print(f"{tag} not found with {word}, skipping")
        continue
      if word not in embeddings:
        #new_embedding = np.zeros(300)
        new_embedding = average_embedding

      else:
        unnormalized_embedding = embeddings[word].astype(np.float32)
        # Reshape the embedding to be a 2D array with a single row
        embedding_reshaped = unnormalized_embedding.reshape(1, -1)
        # Normalize the embedding
        embedding_normalized = normalize(embedding_reshaped, axis=1, norm='l2')
        # Flatten the normalized embedding back into a 1D array
        new_embedding = embedding_normalized.flatten()

      new_sentence.append([new_embedding, tag_to_integer_dictionary[tag]])
      count += 1

    embeddings_in_sentences.append(new_sentence)
    new_sentence = []

  return embeddings_in_sentences

In [49]:
def get_without_labels(sentences_embeddingstags):
  # create copy that doesnt have the NER tag
  without_label = []
  new_sentence = []

  count = 0
  for sentence in sentences_embeddingstags:
    for embedding, tag in sentence:
      new_sentence.append(embedding)
      count += 1

    without_label.append(new_sentence)
    new_sentence = []

  return without_label

In [50]:
def get_labels(sentences_embeddingstags):
  # create labels
  labels = []
  new_sentence = []

  count = 0
  for sentence in sentences_embeddingstags:
    for embedding, tag in sentence:
      new_sentence.append(tag)
      count += 1

    labels.append(np.array(new_sentence))
    new_sentence = []

  return labels

In [51]:
def pad_without_labels_and_labels(without_labels, labels, max_sentence_length):
  # padding
  print("padding: ", max_sentence_length)
  padded = tf.keras.utils.pad_sequences(without_labels, padding="post", dtype="float32", maxlen=max_sentence_length, value=0)
  padded_labels = tf.keras.utils.pad_sequences(labels, padding="post", maxlen=max_sentence_length, value=999)

  return padded, padded_labels

In [52]:
def convert_padded_and_padded_labels_to_np_arrays(padded, padded_labels):
  # input: [batch, timestep, feature]
  padded_np = np.array(padded)
  padded_labels_np = np.array(padded_labels)
  """
  print(type(train_labels))
  print(type(train_labels[0]))
  print(type(train_labels[0][0]))
  print(type(train))
  print(type(train[0]))
  print(type(train[0][0]))
  print(type(train[0][0][0]))
  """
  return padded_np, padded_labels_np

In [53]:
def process_raw_to_input_and_labels(raw, tag_to_integer_dictionary):
  without_na = drop_na_labels(raw)
  num_classes = len(tag_to_integer_dictionary)
  sentences_wordstags = get_sentences_wordstags_array(without_na)
  sentences_embeddingstags = get_sentences_embeddingstags_array(sentences_wordstags, tag_to_integer_dictionary)
  without_labels = get_without_labels(sentences_embeddingstags)
  labels = get_labels(sentences_embeddingstags)
  """
  del without_na
  del sentences_wordstags
  del sentences_embeddingstags

  gc.collect()
  """

  return without_labels, labels, tag_to_integer_dictionary, num_classes

In [54]:
def pad_input_and_labels(without_labels, labels, max_sentence_length):
  padded, padded_labels = pad_without_labels_and_labels(without_labels, labels, max_sentence_length)
  padded_np, padded_labels_np = convert_padded_and_padded_labels_to_np_arrays(padded, padded_labels)

  return padded_np, padded_labels_np

In [55]:
unpadded_train, unpadded_train_labels, tag_to_integer_dictionary, num_classes = process_raw_to_input_and_labels(train_raw, tag_to_integer_dictionary)
unpadded_val, unpadded_val_labels, unused_val_dictionary, unused_val_num_classes = process_raw_to_input_and_labels(validation_raw, tag_to_integer_dictionary)
unpadded_test, unpadded_test_labels, unused_test_dictionary, unused_test_num_classes = process_raw_to_input_and_labels(test_raw, tag_to_integer_dictionary)

max_sentence_length = 0
for sentence in unpadded_train:
  if len(sentence) > max_sentence_length:
    max_sentence_length = len(sentence)
for sentence in unpadded_val:
  if len(sentence) > max_sentence_length:
    max_sentence_length = len(sentence)
for sentence in unpadded_test:
  if len(sentence) > max_sentence_length:
    max_sentence_length = len(sentence)

train, train_labels = pad_input_and_labels(unpadded_train, unpadded_train_labels, max_sentence_length)
val, val_labels = pad_input_and_labels(unpadded_val, unpadded_val_labels, max_sentence_length)
test, test_labels = pad_input_and_labels(unpadded_test, unpadded_test_labels, max_sentence_length)

  new_sentence.append([row[0], row[1]])
  new_sentence.append([row[0], row[1]])
  new_sentence.append([row[0], row[1]])
  new_sentence.append([row[0], row[1]])
  new_sentence.append([row[0], row[1]])
  new_sentence.append([row[0], row[1]])


padding:  51
padding:  51
padding:  51


In [56]:
np.isnan(train).any()

False

In [57]:
print(train.shape)
print(val.shape)
print(test.shape)

(15035, 51, 300)
(3494, 51, 300)
(3704, 51, 300)


In [58]:
import keras
from keras import layers

In [59]:
inputs = keras.Input(shape = (train.shape[1], train.shape[2]))
x = layers.Masking(mask_value=0.0)(inputs)
x = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 51, 300)]         0         
                                                                 
 masking_1 (Masking)         (None, 51, 300)           0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 51, 32)            40576     
 onal)                                                           
                                                                 
 dense_1 (Dense)             (None, 51, 8)             264       
                                                                 
Total params: 40840 (159.53 KB)
Trainable params: 40840 (159.53 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [60]:
from keras.callbacks import Callback
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

from seqeval.metrics import f1_score, precision_score, recall_score
from seqeval.scheme import IOB1

idx_to_label = {idx: label for label, idx in tag_to_integer_dictionary.items()}

class F1ScoreCallback(Callback):
    def __init__(self, train_data, train_labels, val_data, val_labels, mask_value=999):
        super(F1ScoreCallback, self).__init__()
        self.train_data = train_data
        self.train_labels = train_labels
        self.val_data = val_data
        self.val_labels = val_labels
        self.mask_value = mask_value
#         self.best_f1 = -1
#         self.patience = 5
#         self.num_epochs_without_improvement = 0

    def on_epoch_end(self, epoch, logs={}):
        # Predictions for the training set
        train_softmaxed_outputs = self.model.predict(self.train_data)
        train_predicted_indices = np.argmax(train_softmaxed_outputs, axis=-1)

        # Mask the training predictions and labels
        train_mask = (self.train_labels != self.mask_value)
        filtered_train_pred = train_predicted_indices[train_mask]
        filtered_train_pred_labels = [[idx_to_label[index] for index in filtered_train_pred]]
        filtered_train_true = self.train_labels[train_mask]
        filtered_train_true_labels = [[idx_to_label[index] for index in filtered_train_true]]
        

        # Predictions for the validation set
        val_softmaxed_outputs = self.model.predict(self.val_data)
        val_predicted_indices = np.argmax(val_softmaxed_outputs, axis=-1)

        # Mask the validation predictions and labels
        val_mask = (self.val_labels != self.mask_value)
        filtered_val_pred = val_predicted_indices[val_mask]
        filtered_val_pred_labels = [[idx_to_label[index] for index in filtered_val_pred]]
        filtered_val_true = self.val_labels[val_mask]
        filtered_val_true_labels = [[idx_to_label[index] for index in filtered_val_true]]

        # Calculate the metrics
        train_precision = precision_score(filtered_train_true_labels, filtered_train_pred_labels, average='weighted', scheme=IOB1)
        train_recall = recall_score(filtered_train_true_labels, filtered_train_pred_labels, average='weighted', scheme=IOB1)
        train_f1 = f1_score(filtered_train_true_labels, filtered_train_pred_labels, average='weighted', scheme=IOB1)

        val_precision = precision_score(filtered_val_true_labels, filtered_val_pred_labels, average='weighted', scheme=IOB1)
        val_recall = recall_score(filtered_val_true_labels, filtered_val_pred_labels, average='weighted', scheme=IOB1)
        val_f1 = f1_score(filtered_val_true_labels, filtered_val_pred_labels, average='weighted', scheme=IOB1)

        # Print the metrics
        print(f'\nEpoch {epoch + 1}')
        print(f'Training Precision: {train_precision:.4f} | Training Recall: {train_recall:.4f} | Training F1: {train_f1:.4f}')
        print(f'Validation Precision: {val_precision:.4f} | Validation Recall: {val_recall:.4f} | Validation F1: {val_f1:.4f}')
        
#         # Update the best F1 score
#         if val_f1 > self.best_f1:
#             self.best_f1 = val_f1
#             self.num_epochs_without_improvement = 0
#         else:
#             self.num_epochs_without_improvement += 1

#         # Check if early stopping condition is met
#         if self.num_epochs_without_improvement >= self.patience:
#             self.model.stop_training = True
#             print("Early stopping is triggered.")

        # Update the logs dictionary with the F1 scores
        logs['train_f1'] = train_f1
        logs['val_f1'] = val_f1
        
# Instantiate EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_f1', patience=5, mode='max', restore_best_weights=True)

# Then, create an instance of the F1ScoreCallback
f1_score_callback = F1ScoreCallback(train_data=train, train_labels=train_labels, val_data=val, val_labels=val_labels)

In [61]:
def masked_loss_function(y_true, y_pred):
  """
  mask = tf.cast(tf.not_equal(y_true, 999), tf.float32)
  tf.print("y_true: ", y_true)
  tf.print("mask: ", mask)
  #tf.print(y_true)
  tf.print(y_pred[0][0][:])
  tf.print(len(y_pred))
  tf.print(len(y_pred[0]))
  tf.print(len(y_pred[0][0]))
  loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
  loss *= mask
  #tf.print(tf.reduce_sum(loss) / tf.reduce_sum(mask))
  return tf.reduce_sum(loss) / tf.reduce_sum(mask)
  """
  # Create a mask to ignore the loss for 999 values in y_true
  mask = tf.cast(tf.not_equal(y_true, 999), tf.float32)

  # Replace the 999 values with a valid class index (e.g., 0)
  y_true_masked = tf.where(tf.not_equal(y_true, 999), y_true, 0)

  # Calculate the loss using the modified y_true
  loss = tf.keras.losses.sparse_categorical_crossentropy(y_true_masked, y_pred)

  # Apply the mask to zero-out the loss for originally masked values
  loss *= mask

  # Return the mean loss only for the unmasked elements
  return tf.reduce_sum(loss) / tf.reduce_sum(mask)

In [62]:
learning_rate = 0.001
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [63]:
model.compile(optimizer=adam_optimizer, loss=masked_loss_function, metrics=["accuracy"])
#model.fit(train, train_labels, batch_size=32, epochs=2)
model.fit(train, train_labels, batch_size=32, epochs=50, validation_data=(val, val_labels), callbacks=[f1_score_callback, early_stopping])

Epoch 1/50


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 1
Training Precision: 0.5772 | Training Recall: 0.4460 | Training F1: 0.4958
Validation Precision: 0.5814 | Validation Recall: 0.4558 | Validation F1: 0.5030
Epoch 2/50

Epoch 2
Training Precision: 0.7269 | Training Recall: 0.7146 | Training F1: 0.7178
Validation Precision: 0.7269 | Validation Recall: 0.7201 | Validation F1: 0.7221
Epoch 3/50

Epoch 3
Training Precision: 0.7661 | Training Recall: 0.7640 | Training F1: 0.7642
Validation Precision: 0.7681 | Validation Recall: 0.7717 | Validation F1: 0.7696
Epoch 4/50

Epoch 4
Training Precision: 0.7904 | Training Recall: 0.7909 | Training F1: 0.7903
Validation Precision: 0.7892 | Validation Recall: 0.7980 | Validation F1: 0.7934
Epoch 5/50

Epoch 5
Training Precision: 0.8105 | Training Recall: 0.8136 | Training F1: 0.8116
Validation Precision: 0.8055 | Validation Recall: 0.8151 | Validation F1: 0.8102
Epoch 6/50

Epoch 6
Training Precision: 0.8256 | Training Recall: 0.8198 | Training F1: 0.8226
Validation Precision: 0.8142 | Valid


Epoch 19
Training Precision: 0.8982 | Training Recall: 0.8859 | Training F1: 0.8915
Validation Precision: 0.8601 | Validation Recall: 0.8541 | Validation F1: 0.8565
Epoch 20/50

Epoch 20
Training Precision: 0.9083 | Training Recall: 0.8948 | Training F1: 0.9014
Validation Precision: 0.8667 | Validation Recall: 0.8621 | Validation F1: 0.8643
Epoch 21/50

Epoch 21
Training Precision: 0.9128 | Training Recall: 0.9011 | Training F1: 0.9069
Validation Precision: 0.8655 | Validation Recall: 0.8624 | Validation F1: 0.8639
Epoch 22/50

Epoch 22
Training Precision: 0.9069 | Training Recall: 0.9010 | Training F1: 0.9038
Validation Precision: 0.8647 | Validation Recall: 0.8655 | Validation F1: 0.8650
Epoch 23/50

Epoch 23
Training Precision: 0.9181 | Training Recall: 0.9036 | Training F1: 0.9107
Validation Precision: 0.8663 | Validation Recall: 0.8614 | Validation F1: 0.8638
Epoch 24/50

Epoch 24
Training Precision: 0.9157 | Training Recall: 0.9027 | Training F1: 0.9090
Validation Precision: 0.8

<keras.src.callbacks.History at 0x7fcd95af7610>

In [64]:
# Testing layer outputs
from tensorflow import keras
from keras import layers
import numpy as np

# Create a new model that will return the outputs from all layers:
layer_outputs = [layer.output for layer in model.layers]  # Exclude the Input layer if necessary
activation_model = keras.Model(inputs=model.input, outputs=layer_outputs)

# Get the outputs for an input:
all_layer_activations = activation_model.predict(train)

# Now iterate over the outputs and check for NaNs:
for layer_activation in all_layer_activations:
    # Check if the activation contains NaNs
    if np.isnan(layer_activation).any():
        print("NaNs detected")

# If you want to check a particular layer by name, you can do:
for layer, activation in zip(model.layers, all_layer_activations):
    if np.isnan(activation).any():
        print(f"NaN detected in layer: {layer.name}")



In [65]:
test_softmaxed_outputs = model.predict(test)

