In [None]:
# !git clone https://github.com/nirajkale/ResumeNER.git

In [None]:
!pip install -r /content/ResumeNER/req.txt



In [None]:
from tensorflow.python.keras.backend import dropout
from ResumeNER import *
import transformers
from transformers import ElectraTokenizer, TFElectraModel
from os import path
import tensorflow as tf
import math
from tensorflow import optimizers
import numpy as np
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

In [None]:
# Run-wide configuration.
seed = 232
# Apply the seed to the global NumPy and TensorFlow generators so that random
# operations (initialisation, dropout, shuffling) are repeatable between runs.
np.random.seed(seed)
tf.random.set_seed(seed)

model_name = 'google/electra-base-discriminator'
tokenizer = ElectraTokenizer.from_pretrained(model_name)
# True when the tokenizer lists "token_type_ids" among its model inputs.
use_token_type_ids = "token_type_ids" in tokenizer.model_input_names

use_iob2_format = True
model_meta = ModelMeta()
# NOTE(review): 'bert' is set although the checkpoint is ELECTRA; presumably this
# selects BERT-style feature conversion inside ResumeNER; confirm.
model_meta.model_type = 'bert'
batch_size = 5

In [None]:
def read_data(filepath):
    """Read annotation data and convert it into tokenised model features.

    Relies on the notebook-level ``model_meta``, ``tokenizer`` and
    ``use_iob2_format`` settings.

    Args:
        filepath: Forwarded unchanged to ``read_annotation_file``; the call
            site in this notebook passes a list of JSON paths.

    Returns:
        ``(features, class_map)`` where ``class_map`` maps each integer index
        to the label stored at that position of the class list.
    """
    raw_examples, annotations, labels = read_annotation_file(filepath)
    ner_examples = convert_platform_data_to_ner(
        raw_examples,
        annotations,
        labels,
        use_iob2_format = use_iob2_format,
    )
    # Built between the two conversion calls, exactly as before, in case the
    # helpers modify the label list in place (not visible from here).
    class_map = dict(enumerate(labels))
    features = convert_examples_to_features(
        model_meta,
        ner_examples,
        labels,
        tokenizer,
        use_iob2_format = use_iob2_format,
    )
    return features, class_map

def create_tensorflow_dataset(features):
    """Wrap a sequence of feature objects in a ``tf.data.Dataset``.

    Each element is a ``(inputs, labels)`` pair: ``inputs`` is a dict with the
    keys ``input_ids``, ``attention_mask`` and ``token_type_ids`` (int32,
    variable-length 1-D), and ``labels`` is the feature's ``label_ids``
    (int64, variable-length 1-D).

    Args:
        features: Iterable of objects exposing ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``label_ids``.

    Returns:
        An unbatched dataset backed by a Python generator over ``features``.
    """
    input_keys = ("input_ids", "attention_mask", "token_type_ids")

    def feature_stream():
        for feature in features:
            model_inputs = {key: getattr(feature, key) for key in input_keys}
            yield (model_inputs, feature.label_ids)

    output_types = ({key: tf.int32 for key in input_keys}, tf.int64)
    output_shapes = (
        {key: tf.TensorShape([None]) for key in input_keys},
        tf.TensorShape([None]),
    )
    return tf.data.Dataset.from_generator(feature_stream, output_types, output_shapes)
    
def build_model(num_labels, use_dropout=True, dropout_rate=0.15):
    """Build a Keras token classifier on top of the pretrained ELECTRA encoder.

    The encoder named by the notebook-level ``model_name`` is loaded, fed with
    three int32 inputs of length ``model_meta.max_seq_length`` and its first
    output is passed through optional dropout and a softmax ``Dense`` head.

    Args:
        num_labels: Number of units in the final ``Dense`` layer.
        use_dropout: Whether dropout may be inserted before the head.
        dropout_rate: Dropout rate; dropout is only added when this is > 0.

    Returns:
        A ``tf.keras.Model`` whose output is per-position softmax
        probabilities (not logits), so a matching loss must not treat them as
        logits.
    """
    encoder = TFElectraModel.from_pretrained(model_name)
    sequence_length = model_meta.max_seq_length

    def make_input(input_name):
        return tf.keras.layers.Input(shape=(sequence_length,), name=input_name, dtype='int32')

    model_inputs = [
        make_input('input_ids'),
        make_input('attention_mask'),
        make_input('token_type_ids'),
    ]
    # First element of the encoder output is what the classification head consumes.
    sequence_output = encoder(model_inputs)[0]
    if use_dropout and dropout_rate > 0:
        sequence_output = tf.keras.layers.Dropout(dropout_rate)(sequence_output)
    classifier_head = tf.keras.layers.Dense(
        num_labels,
        activation = 'softmax',
        kernel_initializer='glorot_uniform',
    )
    return tf.keras.Model(inputs= model_inputs, outputs = classifier_head(sequence_output))


In [None]:
from sklearn.model_selection import train_test_split
features, class_map = read_data([r'/content/ResumeNER/traindata.json', r'/content/ResumeNER/testdata.json'])
print(class_map)
# Pass the shared seed so the train/test partition is identical on every run;
# without random_state the split (and every reported metric) changes per run.
features_train,features_test = train_test_split(features, test_size=0.2, shuffle=True, random_state=seed)

# Training pipeline: shuffle over the whole training set, then batch and prefetch.
ds_train = create_tensorflow_dataset(features_train)\
            .shuffle(len(features_train), seed=seed)\
            .batch(batch_size, drop_remainder=False)\
            .prefetch(tf.data.experimental.AUTOTUNE)

# Evaluation pipeline: kept in a fixed order (no shuffle).
ds_test = create_tensorflow_dataset(features_test)\
    .batch(batch_size, drop_remainder=False)\
    .prefetch(tf.data.experimental.AUTOTUNE)

# drop_remainder=False keeps the final partial batch, hence ceil().
steps_per_epoch = math.ceil(len(features_train)/batch_size)
validation_steps = math.ceil(len(features_test)/batch_size)


converting samples:   5%|▍         | 10/220 [00:00<00:02, 99.86it/s]

Total Samples Processed: 220
labels skipped: 0


converting samples: 100%|██████████| 220/220 [00:01<00:00, 151.37it/s]
generating features: 100%|██████████| 220/220 [00:11<00:00, 19.30it/s]


{0: 'O', 1: 'B-COMPANIES WORKED AT', 2: 'I-COMPANIES WORKED AT', 3: 'B-COLLEGE NAME', 4: 'I-COLLEGE NAME', 5: 'B-DEGREE', 6: 'I-DEGREE', 7: 'B-DESIGNATION', 8: 'I-DESIGNATION', 9: 'B-EMAIL ADDRESS', 10: 'I-EMAIL ADDRESS', 11: 'B-LOCATION', 12: 'I-LOCATION', 13: 'B-NAME', 14: 'I-NAME', 15: 'B-YEARS OF EXPERIENCE', 16: 'I-YEARS OF EXPERIENCE'}


In [None]:
@tf.function
def train_step(x, y, model, loss_fn, optimizer):
  """Run one optimisation step on a single batch and return its loss.

  Args:
    x: Model inputs for the batch.
    y: Target labels for the batch.
    model: Callable model exposing ``trainable_variables``.
    loss_fn: Callable invoked as ``loss_fn(y, model_output)``.
    optimizer: Optimizer exposing ``apply_gradients``.

  Returns:
    The loss value computed for this batch.
  """
  # Record the forward pass so gradients can be taken afterwards.
  with tf.GradientTape() as tape:
    batch_output = model(x, training=True)
    batch_loss = loss_fn(y, batch_output)
  weights = model.trainable_variables
  grads = tape.gradient(batch_loss, weights)
  # Pair each gradient with its variable and apply the update.
  optimizer.apply_gradients(zip(grads, weights))
  return batch_loss

@tf.function
def evaluation_step(x, y, model, loss_fn):
  """Compute the loss of ``model`` on one batch without updating weights.

  Args:
    x: Model inputs for the batch.
    y: Target labels for the batch.
    model: Callable model, invoked with ``training=False``.
    loss_fn: Callable invoked as ``loss_fn(y, model_output)``.

  Returns:
    The loss value computed for this batch.
  """
  batch_output = model(x, training=False)
  return loss_fn(y, batch_output)

In [None]:
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray, attention_masks: np.ndarray, class_map: Dict) -> Tuple[List[List[str]], List[List[str]]]:
    """Convert per-position scores and label ids into label sequences.

    The predicted class at each position is the argmax over the last axis of
    ``predictions``. Only positions whose attention mask equals 1 are kept.

    Args:
        predictions: Array indexed as ``[example, position, class]``.
        label_ids: Gold label ids indexed as ``[example][position]``.
        attention_masks: Array indexed as ``[example, position]``.
        class_map: Mapping from class id to label.

    Returns:
        ``(preds_list, out_label_list)``: two lists with one inner list per
        example, holding the mapped predicted and gold labels respectively.

    Raises:
        KeyError: If an id at a kept position is missing from ``class_map``.
    """
    pred_ids = np.argmax(predictions, axis=2)
    num_examples, seq_len = pred_ids.shape
    preds_list, out_label_list = [], []

    for i in range(num_examples):
        # Positions that the attention mask marks as active for this example.
        kept = [j for j in range(seq_len) if attention_masks[i, j] == 1]
        out_label_list.append([class_map[label_ids[i][j]] for j in kept])
        preds_list.append([class_map[pred_ids[i][j]] for j in kept])

    return preds_list, out_label_list

def evaluate_model(model, dataset, batch_size, steps, class_map:Dict, return_report=True):
    """Score ``model`` on up to ``steps`` batches of ``dataset`` with seqeval.

    Args:
        model: Model exposing ``predict``.
        dataset: Iterable of ``(inputs, labels)`` batches where ``inputs`` is a
            dict containing ``'attention_mask'`` and both the mask and labels
            expose ``.numpy()``.
        batch_size: Forwarded to ``model.predict``.
        steps: Maximum number of batches to evaluate.
        class_map: Mapping from class id to label.
        return_report: When true return a metrics dict, otherwise only the F1.

    Returns:
        A dict with ``precision``, ``recall``, ``f1`` and
        ``classification_report`` when ``return_report`` is true, else the F1
        score alone.

    Raises:
        ValueError: If no batch could be read from ``dataset``.
    """
    preds = []
    labels = []
    attention_masks = []
    # Create the iterator once. Calling iter(dataset) on every step restarts the
    # dataset, so the same leading batch (or, for a shuffled dataset, a random
    # batch with possible repeats) would be scored over and over.
    batch_iterator = iter(dataset)
    for _ in tqdm(range(steps)):
        try:
            data_batch, labels_batch = next(batch_iterator)
        except StopIteration:
            # Fewer than `steps` batches are available; score what was read.
            break
        preds_batch = model.predict(data_batch, batch_size = batch_size, verbose=0)
        preds.append(preds_batch)
        labels.append(labels_batch.numpy())
        attention_masks.append( data_batch['attention_mask'].numpy())
    if not preds:
        raise ValueError('evaluate_model received no batches to evaluate')
    preds = np.concatenate(preds, axis=0)
    labels = np.concatenate(labels, axis=0)
    attention_masks = np.concatenate(attention_masks, axis=0)
    preds_list, out_label_list = align_predictions(preds, labels, attention_masks, class_map)
    f1 = f1_score(out_label_list, preds_list)
    if return_report:
        return {
                "precision": precision_score(out_label_list, preds_list),
                "recall": recall_score(out_label_list, preds_list),
                "f1": f1,
                'classification_report': classification_report(out_label_list, preds_list)
            }
    return f1

In [None]:
# Build the classifier and the training objects.
model = build_model(len(class_map), True, dropout_rate = 0.45)
optimizer = optimizers.Adam(learning_rate=1e-4)
# build_model ends in a softmax layer, so the loss receives probabilities and
# must not apply softmax a second time.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False)
epoch_loss_metric = tf.keras.metrics.Mean()
training_loss_history, val_loss_history = [], []
num_epochs = 10
print('steps per epochs:', steps_per_epoch )
for epoch in range(num_epochs):
  tf.print('Epoch ',epoch+1,' started')

  # Training pass: accumulate every batch loss, then read the epoch mean once.
  epoch_loss_metric.reset_states()
  for x_batch_train, y_batch_train in tqdm(ds_train, total= steps_per_epoch, desc='training'):
    batch_loss = train_step(x_batch_train, y_batch_train, model, loss_fn, optimizer)
    epoch_loss_metric.update_state(batch_loss)
  epoch_loss_train = epoch_loss_metric.result().numpy().item()

  # Validation pass: start from a clean metric so training losses do not leak in.
  epoch_loss_metric.reset_states()
  for x_batch_val, y_batch_val in tqdm(ds_test, total= validation_steps, desc='validating'):
    batch_loss = evaluation_step(x_batch_val, y_batch_val, model, loss_fn)
    epoch_loss_metric.update_state(batch_loss)
  epoch_loss_val = epoch_loss_metric.result().numpy().item()

  # Entity-level F1 on both splits after this epoch.
  result_train = evaluate_model(model, ds_train, batch_size, steps= steps_per_epoch, class_map = class_map, return_report= False)
  result_val = evaluate_model(model, ds_test, batch_size, steps= validation_steps, class_map = class_map, return_report= False)
  print('training loss:', epoch_loss_train, ' val loss:', epoch_loss_val)
  print('Epoch ',epoch+1, ' Training F1:', result_train, ' validation f1:', result_val)
  training_loss_history.append(epoch_loss_train)
  val_loss_history.append(epoch_loss_val)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing TFElectraModel: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFElectraModel were initialized from the model checkpoint at google/electra-base-discriminator.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further training.


steps per epochs: 74
Epoch  1  started


training: 100%|██████████| 74/74 [00:58<00:00,  1.26it/s]
validating: 100%|██████████| 19/19 [00:06<00:00,  3.03it/s]
100%|██████████| 74/74 [00:27<00:00,  2.64it/s]
100%|██████████| 19/19 [00:04<00:00,  4.12it/s]

training loss: 0.2728392779827118  val loss: 0.14711977541446686
Epoch  1  Training F1: 0.11469344608879492  validation f1: 0.27027027027027023
Epoch  2  started



training: 100%|██████████| 74/74 [00:48<00:00,  1.52it/s]
validating: 100%|██████████| 19/19 [00:03<00:00,  4.99it/s]
100%|██████████| 74/74 [00:26<00:00,  2.78it/s]
100%|██████████| 19/19 [00:04<00:00,  4.11it/s]

training loss: 0.11617723107337952  val loss: 0.10369794815778732
Epoch  2  Training F1: 0.4544  validation f1: 0.5714285714285714
Epoch  3  started



training: 100%|██████████| 74/74 [00:48<00:00,  1.52it/s]
validating: 100%|██████████| 19/19 [00:03<00:00,  4.97it/s]
100%|██████████| 74/74 [00:26<00:00,  2.77it/s]
100%|██████████| 19/19 [00:04<00:00,  4.11it/s]

training loss: 0.11705130338668823  val loss: 0.08094348013401031
Epoch  3  Training F1: 0.5328960852633736  validation f1: 0.6956521739130435
Epoch  4  started



training: 100%|██████████| 74/74 [00:48<00:00,  1.52it/s]
validating: 100%|██████████| 19/19 [00:03<00:00,  5.00it/s]
100%|██████████| 74/74 [00:26<00:00,  2.77it/s]
100%|██████████| 19/19 [00:04<00:00,  4.09it/s]

training loss: 0.022395947948098183  val loss: 0.08070254325866699
Epoch  4  Training F1: 0.7229571984435798  validation f1: 0.7843137254901961
Epoch  5  started



training: 100%|██████████| 74/74 [00:48<00:00,  1.52it/s]
validating: 100%|██████████| 19/19 [00:03<00:00,  4.99it/s]
100%|██████████| 74/74 [00:26<00:00,  2.78it/s]
100%|██████████| 19/19 [00:04<00:00,  4.09it/s]

training loss: 0.10959555953741074  val loss: 0.08559748530387878
Epoch  5  Training F1: 0.7819581958195819  validation f1: 0.8163265306122449
Epoch  6  started



training: 100%|██████████| 74/74 [00:48<00:00,  1.52it/s]
validating: 100%|██████████| 19/19 [00:03<00:00,  4.95it/s]
100%|██████████| 74/74 [00:26<00:00,  2.78it/s]
100%|██████████| 19/19 [00:04<00:00,  4.10it/s]

training loss: 0.12367884069681168  val loss: 0.08858359605073929
Epoch  6  Training F1: 0.7896857373086219  validation f1: 0.7826086956521738
Epoch  7  started



training: 100%|██████████| 74/74 [00:48<00:00,  1.53it/s]
validating: 100%|██████████| 19/19 [00:03<00:00,  4.99it/s]
100%|██████████| 74/74 [00:26<00:00,  2.80it/s]
100%|██████████| 19/19 [00:04<00:00,  4.10it/s]

training loss: 0.10675588250160217  val loss: 0.09347495436668396
Epoch  7  Training F1: 0.8307692307692308  validation f1: 0.75
Epoch  8  started



training: 100%|██████████| 74/74 [00:48<00:00,  1.53it/s]
validating: 100%|██████████| 19/19 [00:03<00:00,  4.97it/s]
100%|██████████| 74/74 [00:26<00:00,  2.80it/s]
100%|██████████| 19/19 [00:04<00:00,  4.10it/s]

training loss: 0.02852211892604828  val loss: 0.11413860321044922
Epoch  8  Training F1: 0.8050570962479608  validation f1: 0.68
Epoch  9  started



training: 100%|██████████| 74/74 [00:48<00:00,  1.53it/s]
validating: 100%|██████████| 19/19 [00:03<00:00,  4.97it/s]
100%|██████████| 74/74 [00:26<00:00,  2.76it/s]
100%|██████████| 19/19 [00:04<00:00,  4.08it/s]

training loss: 0.06559143215417862  val loss: 0.13148558139801025
Epoch  9  Training F1: 0.8801054018445322  validation f1: 0.6545454545454545
Epoch  10  started



training: 100%|██████████| 74/74 [00:48<00:00,  1.53it/s]
validating: 100%|██████████| 19/19 [00:03<00:00,  4.96it/s]
100%|██████████| 74/74 [00:26<00:00,  2.76it/s]
100%|██████████| 19/19 [00:04<00:00,  4.10it/s]

training loss: 0.008254670538008213  val loss: 0.12229684740304947
Epoch  10  Training F1: 0.9423548650858545  validation f1: 0.8695652173913043





In [None]:
# Full metric reports for both splits. Each split is evaluated with its own
# step count: steps_per_epoch for training data, validation_steps for test data.
result_train = evaluate_model(model, ds_train, batch_size, steps= steps_per_epoch, class_map = class_map, return_report= True)
result_test = evaluate_model(model, ds_test, batch_size, steps= validation_steps, class_map = class_map, return_report= True)

100%|██████████| 74/74 [00:26<00:00,  2.77it/s]
100%|██████████| 74/74 [00:18<00:00,  4.08it/s]


In [None]:
# Print the seqeval classification report for the training split, then the test split.
for split_result in (result_train, result_test):
    print(split_result['classification_report'])

                     precision    recall  f1-score   support

             DEGREE       0.96      0.98      0.97       245
           LOCATION       0.98      0.97      0.97       294
COMPANIES WORKED AT       0.91      0.96      0.94       575
       COLLEGE NAME       0.96      0.96      0.96       265
        DESIGNATION       0.89      0.95      0.92       370
      EMAIL ADDRESS       0.90      0.96      0.93       187
YEARS OF EXPERIENCE       0.94      0.92      0.93        37
               NAME       0.99      1.00      1.00       169

          micro avg       0.93      0.97      0.95      2142
          macro avg       0.93      0.97      0.95      2142

                     precision    recall  f1-score   support

        DESIGNATION       1.00      1.00      1.00       222
           LOCATION       1.00      1.00      1.00       592
             DEGREE       0.00      0.00      0.00        74
               NAME       1.00      1.00      1.00       222
COMPANIES WORKED AT 