# **BiLSTM with character level information and extra CRF Layer**

In [1]:
import os
print(f'Current working directory: {os.getcwd()}')
parent_dir = os.path.dirname(os.getcwd())
print(f'Parent directory: {parent_dir}')
os.chdir(parent_dir)
print(f'Current working directory: {os.getcwd()}')
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload

# notebook will reload external python modules;
%autoreload 2 

Current working directory: d:\azeem\Documents\UNIGE\MSc CS\Semester IV\METL\ner_seq2seq_project\notebooks
Parent directory: d:\azeem\Documents\UNIGE\MSc CS\Semester IV\METL\ner_seq2seq_project
Current working directory: d:\azeem\Documents\UNIGE\MSc CS\Semester IV\METL\ner_seq2seq_project


In [2]:
import multiprocessing

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tf2crf import CRF, ModelWithCRFLoss
from transformers import BertTokenizerFast, TFAutoModel

from src.bilstm_models import *


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [4]:
dataset = load_dataset('conll2003')

# Define label2token and token2label mappings
label2token = {
    'O': 0,
    'B-PER': 1,
    'I-PER': 2,
    'B-ORG': 3,
    'I-ORG': 4,
    'B-LOC': 5,
    'I-LOC': 6,
    'B-MISC': 7,
    'I-MISC': 8,
    '[PAD]': 10
}
token2label = {token: label for label, token in label2token.items()}

# Single source of truth for the padding label id, so the filter below cannot
# drift away from the mapping above.
PAD_LABEL_ID = label2token['[PAD]']

# Process the dataset.
# `tokenize_and_align_labels` and `tokenizer` are not defined in this notebook;
# presumably they come from the `src.bilstm_models` star import.
tokenized_datasets = dataset.map(tokenize_and_align_labels)
tokenized_datasets.set_format(
    type='tensorflow',
    columns=['input_ids', 'attention_mask', 'labels', 'char_encoded_tokens'],
)

# Sanity check on the first training example: decode the ids back to text and
# map the numeric labels back to their string tags.
example = tokenized_datasets['train'][0]

input_ids = example['input_ids'].numpy()
sentence = tokenizer.decode(input_ids)

print("Original Sentence:")
print(sentence)

labels = example['labels'].numpy()

print("\nLabels Before Transformation:")
print(labels)

# Drop padding positions before converting ids to string labels
# ('[PAD]' is not a real NER tag).
labels = labels[labels != PAD_LABEL_ID]
reconstructed_labels = [token2label[token] for token in labels]

print("\nLabels After Transformation:")
print(reconstructed_labels)

Found cached dataset conll2003 (C:/Users/azeem/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\azeem\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-a055f85b27121498.arrow
Loading cached processed dataset at C:\Users\azeem\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-debebc254be816c8.arrow
Loading cached processed dataset at C:\Users\azeem\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-be014cd55b1485b8.arrow


Original Sentence:
[CLS] EU rejects German call to boycott British lamb. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

Labels Before Transformation:
[10  3  0  7  0  0  0  7  0  0  0 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10

In [5]:
def prepare_dataset(tokenized_data, shuffle=False, cache=True,
                    batch_size=32, shuffle_buffer=1000, drop_remainder=True):
    """Build a batched ``tf.data.Dataset`` from one split of ``tokenized_datasets``.

    Reads the module-level ``tokenized_datasets`` mapping; ``tokenized_data`` is
    the key of the split to use (the notebook calls it with ``'train'``,
    ``'validation'`` and ``'test'``).

    Args:
        tokenized_data: Key of the split inside ``tokenized_datasets``.
        shuffle: If True, shuffle examples with a buffer of ``shuffle_buffer``.
        cache: If True, cache the un-batched dataset before shuffling.
        batch_size: Number of examples per batch (default 32, as before).
        shuffle_buffer: Shuffle buffer size (default 1000, as before).
        drop_remainder: If True (default, as before) the last incomplete batch
            is dropped. NOTE(review): for validation/test this silently
            discards up to ``batch_size - 1`` examples; pass False if every
            example must be kept.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches, where
        ``features`` is a dict with the keys ``input_ids``, ``attention_mask``,
        ``labels`` and ``char_encoded_tokens``.
    """
    column_names = ['input_ids', 'attention_mask', 'labels', 'char_encoded_tokens']
    print(f"\t preparing {tokenized_data} dataset ... ", flush=True)

    split = tokenized_datasets[tokenized_data]
    features = {key: np.array(split[key]) for key in column_names}

    # 'labels' is both one of the feature columns and the target; reuse the
    # already converted array instead of converting the column twice.
    ds = tf.data.Dataset.from_tensor_slices((features, features['labels']))

    if cache:
        ds = ds.cache()
    if shuffle:
        # Shuffling after cache() keeps the cached data in its original order.
        ds = ds.shuffle(shuffle_buffer)
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    return ds.prefetch(tf.data.AUTOTUNE)

In [None]:

# Build the batched tf.data pipelines; only the training split is shuffled.
train_dataset = prepare_dataset('train', shuffle=True)
validation_dataset = prepare_dataset('validation')
test_dataset = prepare_dataset('test')

# Number of examples per split (before batching).
for split_name in ('train', 'validation', 'test'):
    print(f"{split_name} dataset size: {len(tokenized_datasets[split_name])}")

# Inspect the columns and per-example shapes of the first training example.
first_train_example = tokenized_datasets['train'][0]
print(f"{first_train_example.keys()}")

print(f"input_ids shape: {first_train_example['input_ids'].shape}")
print(f"attention mask shape: {first_train_example['attention_mask'].shape}")
print(f"labels shape: {first_train_example['labels'].shape}")
print(f"char_encoded_tokens shape: {first_train_example['char_encoded_tokens'].shape}")

### Character embedding

In [None]:
# Prepare the model inputs once per split.
# `prepare_inputs` is not defined in this notebook (presumably provided by the
# `src.bilstm_models` star import); from the unpacking below it returns
# (inputs, labels, sample_weights) with inputs indexable as
# [input_ids, attention_mask, char_encoded_tokens].
x_train, y_train, sample_weights_train = prepare_inputs(train_dataset)
x_test, y_test, sample_weights_test = prepare_inputs(test_dataset)
x_val, y_val, sample_weights_val = prepare_inputs(validation_dataset)

# Reuse the training split for the showcase instead of running
# prepare_inputs(train_dataset) a second time: train_dataset is shuffled, so a
# second pass could yield a different order than x_train, and it doubles the
# preparation cost.
input_data, labels, sample_weights = x_train, y_train, sample_weights_train

# Take one example from the input_data to showcase
input_ids_example, attention_mask_example, char_embed_example = (
    input_data[0][0], input_data[1][0], input_data[2][0]
)
labels_example = labels[0]
weights_example = sample_weights[0]

# Decode the input_ids_example back to words
words = tokenizer.convert_ids_to_tokens(input_ids_example)

# Display the first word, its input_ids, attention_mask, char_embed, label and sample_weight
print(f"Word: {words[0]}")
print(f"Input IDs: {input_ids_example[0]}")
print(f"Attention Mask: {attention_mask_example[0]}")
print(f"Char Embedding: {char_embed_example[0]}")
print(f"Label: {labels_example[0]}")
print(f"Sample Weight: {weights_example[0]}")

In [None]:

# ----------------- configuration
LOAD_MODEL = False
EPOCHS = 20
BATCH_SIZE = 75
num_chars = len(char2idx)  # The number of unique characters
hidden_units = 96  # The number of hidden units for LSTM layers
unique_labels = set(tag for sentence in dataset["train"]["ner_tags"] for tag in sentence)
num_classes = len(unique_labels) + 1
word_embedding_dim = 64  # The dimension of word embedding
char_embed_dim = 20  # The dimension of character embedding
vocab_size = len(tokenizer.get_vocab())  # Size of the vocabulary

max_seq_len = 128  # Maximum number of words in a sequence
max_word_len = 128  # Maximum number of characters in a word

# ----------------- padding
# Padding value per model input, in the order [input_ids, attention_mask,
# char_encoded_tokens] (the order in which the inputs are unpacked earlier in
# this notebook). Previously all three were padded with `vocab_size`, which is
# one past the largest valid token id and is not a valid attention-mask or
# character value either.
#   - input_ids: the tokenizer's own [PAD] id
#   - attention_mask: 0 (position is not attended to)
#   - char ids: 0 (the char Embedding is built with mask_zero=True)
input_pad_values = (tokenizer.pad_token_id, 0, 0)
# Labels are padded with the '[PAD]' label id (10), not with 0.
label_pad_id = label2token['[PAD]']

assert len(x_train) == len(input_pad_values), "unexpected number of model inputs"
assert len(x_val) == len(input_pad_values), "unexpected number of model inputs"

# prepare training data
x_train_padded = [
    pad_sequences(x, maxlen=max_seq_len, padding="post", value=pad_value)
    for x, pad_value in zip(x_train, input_pad_values)
]
y_train_new = pad_sequences(y_train, maxlen=max_seq_len, padding="post", value=label_pad_id)

# prepare validation data
x_val_padded = [
    pad_sequences(x, maxlen=max_seq_len, padding="post", value=pad_value)
    for x, pad_value in zip(x_val, input_pad_values)
]
y_val_new = pad_sequences(y_val, maxlen=max_seq_len, padding="post", value=label_pad_id)

# only word (input_ids + attention_mask, no character input)
x_train_nochar = [x_train_padded[0], x_train_padded[1]]
x_val_nochar = [x_val_padded[0], x_val_padded[1]]


# Build the tagging model: frozen DistilBERT word representations are
# concatenated with a per-token character BiLSTM encoding, fed through a
# sequence-level BiLSTM + Dropout, and decoded by a CRF layer.
# NOTE: the order in which layers are created below is kept as-is on purpose;
# it determines auto-generated layer names and the weight order used when
# loading/saving HDF5 weight files.
print(f"\n\n \t\t\t creating and training model ... \n\n")

# ----------------- pre trained embedding
# Load DistilBERT and freeze it (trainable = False), so only the layers added
# below are updated during training.
distilbert_model = TFAutoModel.from_pretrained('distilbert-base-uncased')
distilbert_model.trainable = False

# Character embedding
print(f"creating character embedding ... ")
# One row of max_word_len character ids for each of the max_seq_len positions.
char_input = layers.Input(shape=(max_seq_len, max_word_len), dtype='int32')

# Embed every character id; mask_zero=True makes id 0 act as padding.
char_embed = layers.TimeDistributed(layers.Embedding(num_chars, char_embed_dim, mask_zero=True))(char_input)

# The inner LSTM has no return_sequences, so each position is summarised by a
# single vector (forward and backward states combined by Bidirectional).
char_bilstm = layers.TimeDistributed(layers.Bidirectional(layers.LSTM(hidden_units)))(char_embed)

# Word-level input
print(f"creating word-level input ... ")
word_input = layers.Input(shape=(max_seq_len,), dtype='int32')
attention_mask_input = layers.Input(shape=(max_seq_len,), dtype='int32')

# Word Embedding
print(f"creating word embedding ... ")
# Alternative kept for reference: a trainable Embedding instead of DistilBERT.
# word_embedding = layers.Embedding(input_dim=vocab_size, output_dim=word_embedding_dim)(word_input)
# [0] takes the first element of the DistilBERT output
# (presumably the last hidden state per token — TODO confirm).
word_embedding = distilbert_model([word_input, attention_mask_input])[0] # with distilbert

# Concatenate word and char-level information
print(f"concatenating word and char-level information ... ")
combined = layers.Concatenate()([word_embedding, char_bilstm])

# Final BiLSTM layer for sequence tagging
print(f"final BiLSTM layer for sequence tagging ... ")
# return_sequences=True keeps one output vector per position for the tagger.
bilstm = layers.Bidirectional(layers.LSTM(hidden_units, return_sequences=True))(combined)

# for word embedding only
# bilstm = layers.Bidirectional(layers.LSTM(hidden_units, return_sequences=True,
#                                           kernel_initializer='he_normal'))(word_embedding)


# Dropout on the BiLSTM outputs before the CRF layer
dropout = layers.Dropout(0.5)(bilstm)

# Add CRF layer
# num_classes is already len(unique_labels) + 1; the extra + 1 here makes the
# CRF cover label ids up to and including the padding id 10 used for the labels.
crf = CRF(num_classes+1)  # Plus 1 for the padding class
output = crf(dropout)                                         


# Functional model over the three inputs; ModelWithCRFLoss (from tf2crf) wraps
# it and is the object that gets compiled and trained.
base_model = tf.keras.Model(inputs=[word_input, attention_mask_input, char_input], outputs=output) # char info
# base_model = tf.keras.Model(inputs=[word_input, attention_mask_input], outputs=output) #no char info
model = ModelWithCRFLoss(base_model)


# Per-class weights with the padding label weighted 0.
# NOTE(review): this dict is not passed to model.fit anywhere in this notebook.
class_weights = {i: 1 for i in range(num_classes)}
class_weights[label2token['[PAD]']] = 0  # Set the weight for the padding label to 0


# Compile model
print("COMPILING MODEL ... ")

model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001))

if LOAD_MODEL:
    # Start from previously saved weights.
    # NOTE(review): this path differs from the path the checkpoint callback
    # below writes to ('model_checkpoint.h5') — confirm which one is intended.
    model.load_weights('./models/checkpoints/bilstm_crf_model_checkpoint.h5')

# The checkpoint callback is created unconditionally: the training cell always
# passes it to model.fit, so defining it only when LOAD_MODEL is False raised a
# NameError whenever LOAD_MODEL was True.
# `monitor` is set explicitly to the same metric EarlyStopping watches in this
# notebook ('val_loss_val'); the Keras default 'val_loss' is a different name,
# and with save_best_only=True a missing metric means nothing gets saved.
checkpoint = ModelCheckpoint('model_checkpoint.h5', monitor='val_loss_val',
                             verbose=1, save_best_only=True, mode='min',
                             save_weights_only=True)


# Training
print("TRAINING MODEL ... ")
# Stop when 'val_loss_val' has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(monitor='val_loss_val', patience=3, restore_best_weights=True)

# get maximum workers from cpu
USE_MP = False
workers = multiprocessing.cpu_count()

# Optional extra arguments for model.fit. Previously the USE_MP branch called
# model.fit and then fell through to a second, unconditional model.fit, so the
# model was trained twice when USE_MP was True.
fit_kwargs = {}
if USE_MP:
    # Leave one core free, but never request fewer than one worker.
    # NOTE(review): per the Keras docs `workers` only applies to generator /
    # Sequence inputs; the inputs here are in-memory arrays.
    fit_kwargs['workers'] = max(1, workers - 1)

history = model.fit(x_train_padded, y_train_new,
                    validation_data=(x_val_padded, y_val_new),
                    epochs=EPOCHS, batch_size=BATCH_SIZE,
                    callbacks=[checkpoint, early_stopping],
                    **fit_kwargs)

# Variant without character input (requires the no-char base_model):
# history = model.fit(x_train_nochar, y_train,
#                     validation_data = (x_val_nochar,y_val),
#                     epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks = [checkpoint, early_stopping])


In [None]:

# Save the weights of the trained model to an HDF5 file in the current working
# directory (weights only, not the full model).
model.save_weights('bilstm_crf_final_model.h5')


### Testing

In [None]:
# Pad each test input with its own padding value, in the order
# [input_ids, attention_mask, char_encoded_tokens]. Previously all three were
# padded with `vocab_size`, which is not a valid token id, attention-mask value
# or character id.
test_input_pad_values = (tokenizer.pad_token_id, 0, 0)
assert len(x_test) == len(test_input_pad_values), "unexpected number of model inputs"

x_test_padded = [
    pad_sequences(x, maxlen=max_seq_len, padding="post", value=pad_value)
    for x, pad_value in zip(x_test, test_input_pad_values)
]
# Labels are padded with the '[PAD]' label id (10).
y_test_new = pad_sequences(y_test, maxlen=max_seq_len, padding="post", value=label2token['[PAD]'])
# Word-level inputs only (input_ids + attention_mask), for the no-char variant.
x_test_nochar = [x_test_padded[0], x_test_padded[1]]


In [None]:
# Evaluate the trained CRF model on the padded test inputs and labels.
# `evaluate_model` is not defined in this notebook (presumably provided by the
# `src.bilstm_models` star import); crf_model=True selects its CRF code path.
evaluate_model(model, x_test_padded, y_test_new, crf_model=True)

### Plotting

In [None]:
# Plot the training curves from the `history` object returned by model.fit.
# `plot_training` is not defined in this notebook (presumably provided by the
# `src.bilstm_models` star import); crf_model=True selects its CRF code path.
plot_training(history, crf_model=True)