In [1]:
# Install pyspark
# ! pip install --ignore-installed pyspark==2.4.4
# Install Spark NLP
# ! pip install --ignore-installed spark-nlp
# Install tensorflow
!pip install tensorflow

You should consider upgrading via the '/home/fipulab/.pyenv/versions/3.8.10/bin/python -m pip install --upgrade pip' command.[0m


In [37]:
# Importing
import numpy as np
import pandas as pd
import os
from pathlib import Path
import urllib.request
import tensorflow as tf
import argparse
from keras.preprocessing.sequence import pad_sequences

### Data preprocessing
Dataset preuzet s: https://deepai.org/dataset/conll-2003-english

In [3]:
def split_text_label(filename):
    f = open(filename)
    split_labeled_text = []
    sentence = []
    for line in f:
        if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
            if len(sentence) > 0:
                split_labeled_text.append(sentence)
                sentence = []
            continue
        splits = line.split(' ')
        sentence.append([splits[0], splits[-1].rstrip("\n")])
    if len(sentence) > 0:
        split_labeled_text.append(sentence)
        sentence = []
    return split_labeled_text

In [4]:
split_train = split_text_label(os.path.join('conll2003', "train.txt"))
split_valid = split_text_label(os.path.join('conll2003', "valid.txt"))
split_test = split_text_label(os.path.join('conll2003', "test.txt"))

In [5]:

labelSet = set()
wordSet = set()
# words and labels
for data in [split_train, split_valid, split_test]:
    for labeled_text in data:
        for word, label in labeled_text:
            labelSet.add(label)
            wordSet.add(word.lower())

In [6]:
# Sort the set to ensure '0' is assigned to 0
sorted_labels = sorted(list(labelSet), key=len)
# Create mapping for labels
label2Idx = {}
for label in sorted_labels:
    label2Idx[label] = len(label2Idx)
idx2Label = {v: k for k, v in label2Idx.items()}
# Create mapping for words
word2Idx = {}
if len(word2Idx) == 0:
    word2Idx["PADDING_TOKEN"] = len(word2Idx)
    word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
for word in wordSet:
    word2Idx[word] = len(word2Idx)

In [7]:
def createMatrices(data, word2Idx, label2Idx):
    sentences = []
    labels = []
    for split_labeled_text in data:
        wordIndices = []
        labelIndices = []
        for word, label in split_labeled_text:
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]
            else:
                wordIdx = word2Idx['UNKNOWN_TOKEN']
            wordIndices.append(wordIdx)
            labelIndices.append(label2Idx[label])
        sentences.append(wordIndices)
        labels.append(labelIndices)
    return sentences, labels

In [8]:
train_sentences, train_labels = createMatrices(split_train, word2Idx, label2Idx)
valid_sentences, valid_labels = createMatrices(split_valid, word2Idx, label2Idx)
test_sentences, test_labels = createMatrices(split_test, word2Idx, label2Idx)

In [38]:
def padding(sentences, labels, max_len, padding='post'):
    padded_sentences = pad_sequences(sentences, max_len, padding='post')
    padded_labels = pad_sequences(labels, max_len, padding='post')
    return padded_sentences, padded_labels

In [40]:
# parser = argparse.ArgumentParser()
# parser.add_argument("--max_seq_length",
#                     default=128,
#                     type=int,
#                     help="The maximum total input sequence length after WordPiece tokenization. \n"
#                          "Sequences longer than this will be truncated, and sequences shorter \n"
#                          "than this will be padded.")
# args = parser.parse_args()
# processor = NerProcessor()
# # Required parameters
# parser.add_argument("--data_dir",
#                     default=None,
#                     type=str,
#                     required=True,
#                     help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
# parser.add_argument("--bert_model", default=None, type=str, required=True,
#                     help="Bert pre-trained model selected in the list: bert-base-cased,bert-large-cased")
# parser.add_argument("--output_dir",
#                     default=None,
#                     type=str,
#                     required=True,
#                     help="The output directory where the model predictions and checkpoints will be written.")

In [39]:
train_features, train_labels = padding(train_sentences, train_labels, max_seq_len, padding='post')
valid_features, valid_labels = padding(valid_sentences, valid_labels, max_seq_len, padding='post')
test_features, test_labels = padding(test_sentences, test_labels, max_seq_len, padding='post')

In [30]:
# train_examples = processor.get_train_examples(args.data_dir)

In [44]:
all_input_ids = tf.data.Dataset.from_tensor_slices(np.asarray([f.input_ids for f in train_features]))
all_input_mask = tf.data.Dataset.from_tensor_slices(np.asarray([f.input_mask for f in train_features]))
all_segment_ids = tf.data.Dataset.from_tensor_slices(np.asarray([f.segment_ids for f in train_features]))
all_valid_ids = tf.data.Dataset.from_tensor_slices(np.asarray([f.valid_ids for f in train_features]))
all_label_mask = tf.data.Dataset.from_tensor_slices(np.asarray([f.label_mask for f in train_features]))
all_label_ids = tf.data.Dataset.from_tensor_slices(np.asarray([f.label_id for f in train_features]))

AttributeError: 'numpy.ndarray' object has no attribute 'input_ids'

In [45]:
# Dataset using tf.data
train_data = tf.data.Dataset.zip((all_input_ids, all_input_mask, all_segment_ids, all_valid_ids, all_label_ids,all_label_mask))
shuffled_train_data = train_data.shuffle(buffer_size=int(len(train_features) * 0.1),seed = args.seed, reshuffle_each_iteration=True)
batched_train_data = shuffled_train_data.batch(args.train_batch_size)

NameError: name 'all_input_ids' is not defined

In [38]:
# Model architecture

In [16]:
class BertNer(tf.keras.Model):
  def __init__(self, bert_model,float_type, num_labels,   
  max_seq_length, final_layer_initializer=None):
    super(BertNer, self).__init__()
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,),   
    dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,),     
    dtype=tf.int32, name='input_mask')
    input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), 
    dtype=tf.int32, name='input_type_ids')
    
    bert_config = BertConfig.from_json_file (os.path.join(bert_model, "bert_config.json"))
    bert_layer = BertModel(config=bert_config,
      float_type=float_type)
    _, sequence_output = bert_layer(input_word_ids,  
    input_mask,input_type_ids)
    self.bert = tf.keras.Model(inputs=[input_word_ids, input_mask,  
    input_type_ids],outputs=[sequence_output])
    if type(bert_model) == str:
      init_checkpoint = os.path.join(bert_model,"bert_model.ckpt")     
      checkpoint = tf.train.Checkpoint(model=self.bert)
      checkpoint.restore(init_checkpoint).assert_
      existing_objects_matched()
   
    self.dropout = tf.keras.layers.Dropout(    
    rate=bert_config.hidden_dropout_prob)
    if final_layer_initializer is not None: 
      initializer = final_layer_initializer
    else:
      initializer = tf.keras.initializers.TruncatedNormal(   
      stddev=bert_config.initializer_range)
    self.classifier = tf.keras.layers.Dense(num_labels, 
    kernel_initializer=initializer, name='output', dtype=float_type)

In [17]:
def call(self, input_word_ids,input_mask=None,input_type_ids=None,
valid_ids=None, **kwargs):
    sequence_output = self.bert([input_word_ids, input_mask,  
    input_type_ids],**kwargs)
    valid_output = []
    for i in range(sequence_output.shape[0]): 
      r = 0
      temp = []
      for j in range(sequence_output.shape[1]):
        if valid_ids[i][j] == 1:
           temp = temp + [sequence_output[i][j]]
        else:
           r += 1
      temp = temp + r * [tf.zeros_like(sequence_output[i][j])]
      valid_output = valid_output + temp
    valid_output = tf.reshape(tf.stack(valid_output)
    ,sequence_output.shape)
    sequence_output = self.dropout(valid_output,  
    training=kwargs.get('training', False)) 
    logits = self.classifier(sequence_output)
    return logits

In [18]:
#Custom training loop

In [19]:
def train_step(input_ids, input_mask, segment_ids, valid_ids, label_ids,label_mask):
  with tf.GradientTape() as tape:
    logits = ner(input_ids, input_mask,segment_ids, valid_ids, 
    training=True) #batchsize, max_seq_length, num_labels
    label_ids_masked = tf.boolean_mask(label_ids,label_mask)        
    logits_masked = tf.boolean_mask(logits,label_mask)
    loss = loss_fct(label_ids_masked, logits_masked)
  grads = tape.gradient(loss, ner.trainable_variables)
  optimizer.apply_gradients(list(zip(grads,       
  ner.trainable_variables)))
  return loss

In [58]:
# Evaluation on valid dataset
# if args.do_eval:
# load tokenizer
tokenizer = FullTokenizer(os.path.join(args.output_dir, "vocab.txt"), args.do_lower_case)
# model build hack : fix
config = json.load(open(os.path.join(args.output_dir, "bert_config.json")))
ner = BertNer(config, tf.float32, num_labels, 
args.max_seq_length)
ids = tf.ones((1,128),dtype=tf.int64)
_ = ner(ids,ids,ids,ids, training=False)
ner.load_weights(os.path.join(args.output_dir,"model.h5"))
# load test or development set based on argsK
if args.eval_on == "dev":
    eval_examples = processor.get_dev_examples(args.data_dir)
elif args.eval_on == "test":
    eval_examples = processor.get_test_examples(args.data_dir)
    eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
    all_input_ids = tf.data.Dataset.from_tensor_slices(np.asarray([f.input_ids for f in eval_features]))  
    all_input_mask = tf.data.Dataset.from_tensor_slices(np.asarray([f.input_mask for f in eval_features]))
    all_segment_ids = tf.data.Dataset.from_tensor_slices(np.asarray([f.segment_ids for f in eval_features]))
    all_valid_ids = tf.data.Dataset.from_tensor_slices(np.asarray([f.valid_ids for f in eval_features]))
    all_label_ids = tf.data.Dataset.from_tensor_slices(np.asarray([f.label_id for f in eval_features]))
    eval_data = tf.data.Dataset.zip((all_input_ids, all_input_mask, all_segment_ids, all_valid_ids, all_label_ids))
    batched_eval_data = eval_data.batch(args.eval_batch_size)

loss_metric = tf.keras.metrics.Mean()
epoch_bar = master_bar(range(1))
pb_max_len = math.ceil(
float(len(eval_features))/float(args.eval_batch_size))
y_true = []
y_pred = []
label_map = {i : label for i, label in enumerate(label_list,1)}
for epoch in epoch_bar:
  for (input_ids, input_mask, segment_ids, valid_ids, label_ids) in progress_bar(batched_eval_data, total=pb_max_len, parent=epoch_bar):
    logits = ner(input_ids, input_mask, segment_ids, valid_ids, training=False)
    logits = tf.argmax(logits,axis=2)
    for i, label in enumerate(label_ids):
     temp_1 = []
     temp_2 = []
     for j,m in enumerate(label):
       if j == 0:
         continue
       elif label_ids[i][j].numpy() == len(label_map):     
         y_true.append(temp_1)
         y_pred.append(temp_2)
         break
       else:
         temp_1.append(label_map[label_ids[i][j].numpy()])
         temp_2.append(label_map[logits[i][j].numpy()])

report = classification_report(y_true, y_pred,digits=4)
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
  logger.info("***** Eval results *****")
  logger.info("\n%s", report)
writer.write(report)

NameError: name 'FullTokenizer' is not defined

In [29]:
for epoch in epoch_bar:
    for (input_ids, input_mask, segment_ids, valid_ids, 
    label_ids,label_mask) in progress_bar(batched_train_data, 
    total=pb_max_len, parent=epoch_bar):
      loss = train_step(input_ids, input_mask, segment_ids,
      valid_ids, label_ids,label_mask)
      loss_metric(loss)
      epoch_bar.child.comment = f'loss : {loss_metric.result()}'
    loss_metric.reset_states()

NameError: name 'epoch_bar' is not defined