In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np 
from pathlib import Path 
import os 
import matplotlib.pyplot as plt
import json
import logging
import re

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

from transformers import (
    BertForTokenClassification, 
    BertTokenizerFast, 
    TrainingArguments, 
    Trainer
)
import torch 
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

In [2]:
base_dir = Path.cwd().parent
data_dir = base_dir / 'data'

dataset_name = 'resume_data.json'

In [3]:
df_data = pd.read_json(data_dir / dataset_name, lines=True)
df_data.head()

Unnamed: 0,content,annotation,extras
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20...",


In [4]:
# Reference: https://github.com/singhsourabh/Resume-NER/blob/master/utils.py

import json
import re
import logging

def convert_goldparse(dataturks_JSON_FilePath):
    """
    Converts labeled data from a Dataturks JSON file format into spaCy's NER training format.
    
    Args:
        dataturks_JSON_FilePath (str): Path to the Dataturks JSON file containing the labeled data.
        
    Returns:
        list: A list of tuples representing spaCy-compatible training data in the format:
              [
                  (
                      "content",
                      {
                          "entities": [(start, end, entity_label), ...]
                      }
                  ),
                  ...
              ]
        or None if an error occurs.
    """
    try:
        training_data = []
        
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)  
            text = data['content'].replace("\n", " ")  
            entities = []  
            data_annotations = data.get('annotation', None)
            
            if data_annotations:
                for annotation in data_annotations:
                    point = annotation['points'][0]  
                    labels = annotation['label']

                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        point_start = point['start']  # Start position of the entity
                        point_end = point['end']      # End position of the entity
                        point_text = point['text']    # Text covered by this entity

                        # Calculate differences for leading/trailing whitespaces in the entity text
                        lstrip_diff = len(point_text) - len(point_text.lstrip())
                        rstrip_diff = len(point_text) - len(point_text.rstrip())
                        
                        # Adjust start and end indices to remove extra whitespaces
                        if lstrip_diff != 0:
                            point_start += lstrip_diff
                        if rstrip_diff != 0:
                            point_end -= rstrip_diff
                        
                        entities.append((point_start, point_end + 1, label))
            
            training_data.append((text, {"entities": entities}))
        
        return training_data

    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None


def trim_entity_spans(data: list) -> list:
    """
    Cleans spaCy-compatible training data by removing leading and trailing white spaces
    from entity spans.
    
    Args:
        data (list): The training data in spaCy format with each entry as a tuple:
                     [
                         (
                             "content",
                             {
                                 "entities": [(start, end, label), ...]
                             }
                         ),
                         ...
                     ]
    
    Returns:
        list: The cleaned training data with precise entity spans after removing whitespace.
    """
    # Regular expression pattern to match whitespace characters
    invalid_span_tokens = re.compile(r'\s')

   
    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']  
        valid_entities = []  

        for start, end, label in entities:
            valid_start = start  # Initialize valid start position
            valid_end = end      # Initialize valid end position

            # Adjust valid_start to skip leading whitespace characters
            while valid_start < len(text) and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            
            # Adjust valid_end to skip trailing whitespace characters
            while valid_end > 1 and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    
    return cleaned_data

In [5]:
tags_vals = ["UNKNOWN", "O", "Name", "Degree", "Skills", "College Name", "Email Address",
             "Designation", "Companies worked at", "Graduation Year", "Years of Experience", "Location"]

tag2idx = {t: i for i, t in enumerate(tags_vals)}
idx2tag = {i: t for i, t in enumerate(tags_vals)}

def get_label(offset, labels):
    if offset[0] == 0 and offset[1] == 0:
        return 'O'
    for label in labels:
        if offset[1] >= label[0] and offset[0] <= label[1]:
            return label[2]
    return 'O'

def process_resume(data, tokenizer, tag2idx, max_len, is_test=False):
    tok = tokenizer.encode_plus(
        data[0], max_length=max_len, return_offsets_mapping=True)
    curr_sent = {'orig_labels': [], 'labels': []}

    padding_length = max_len - len(tok['input_ids'])

    if not is_test:
        labels = data[1]['entities']
        labels.reverse()
        for off in tok['offset_mapping']:
            label = get_label(off, labels)
            curr_sent['orig_labels'].append(label)
            curr_sent['labels'].append(tag2idx[label])
        curr_sent['labels'] = curr_sent['labels'] + ([0] * padding_length)

    curr_sent['input_ids'] = tok['input_ids'] + ([0] * padding_length)
    curr_sent['token_type_ids'] = tok['token_type_ids'] + \
        ([0] * padding_length)
    curr_sent['attention_mask'] = tok['attention_mask'] + \
        ([0] * padding_length)
    return curr_sent


class ResumeDataset(Dataset):
    def __init__(self, resume, tokenizer, tag2idx, max_len, is_test=False):
        self.resume = resume
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.tag2idx = tag2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.resume)

    def __getitem__(self, idx):
        data = process_resume(
            self.resume[idx], self.tokenizer, self.tag2idx, self.max_len, self.is_test)
        return {
            'input_ids': torch.tensor(data['input_ids'], dtype=torch.long),
            'token_type_ids': torch.tensor(data['token_type_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(data['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(data['labels'], dtype=torch.long),
            'orig_label': data['orig_labels']
        }


In [6]:
cleaned_data = trim_entity_spans(convert_goldparse(data_dir / dataset_name))

In [7]:
TEST_SPLIT=0.15
text = [_[0] for _ in cleaned_data]
entities = [_[1] for _ in cleaned_data]

train_text, valid_text, train_entities, valid_entities = train_test_split(text, 
                                                            entities, 
                                                            test_size=TEST_SPLIT, 
                                                            random_state=10
                                                            )

train_data = [(train_text[i], train_entities[i])   for i in range(len(train_text))]
valid_data = [(valid_text[i], valid_entities[i])   for i in range(len(valid_text))]

print("Number of Training Samples : {TRAIN_DATA}".format(TRAIN_DATA=len(train_data)))
print("Number of Validation Samples : {VALID_DATA}".format(VALID_DATA=len(valid_data)))

Number of Training Samples : 187
Number of Validation Samples : 33


In [14]:
base_model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(base_model_name, truncation=True)
model = BertForTokenClassification.from_pretrained(base_model_name, num_labels=len(tag2idx))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
max_len = 500  # Set based on your requirements

train_dataset = ResumeDataset(train_data, tokenizer, tag2idx, max_len)
val_dataset = ResumeDataset(valid_data, tokenizer, tag2idx, max_len)

In [10]:
def get_special_tokens(tokenizer, tag2idx):
    vocab = tokenizer.get_vocab()
    pad_tok = vocab["[PAD]"]
    sep_tok = vocab["[SEP]"]
    cls_tok = vocab["[CLS]"]
    o_lab = tag2idx["O"]
    return pad_tok, sep_tok, cls_tok, o_lab



pad_tok, sep_tok, cls_tok, o_lab = get_special_tokens(tokenizer, tag2idx)

In [23]:
def compute_metrics(p):
    predictions, labels = p

    # Apply argmax to get the predicted class indices
    predictions = np.argmax(predictions, axis=2)
    
    # Initialize lists to store filtered predictions and true labels
    valid_predictions, valid_labels = [], []

    for pred, label in zip(predictions, labels):
        # Mask the predictions to exclude [CLS], [SEP], [PAD]
        preds_mask = (label != pad_tok) & (label != sep_tok) & (label != cls_tok)

        # Apply the mask to the predictions only
        valid_pred = np.array(pred)[preds_mask]
        valid_label = np.array(label)[preds_mask]

        # Convert to tag names using idx2tag mapping
        valid_predictions.append([idx2tag[p] for p in valid_pred])
        valid_labels.append([idx2tag[l] for l in valid_label])

    # Calculate metrics
    precision = precision_score(valid_labels, valid_predictions)
    recall = recall_score(valid_labels, valid_predictions)
    f1 = f1_score(valid_labels, valid_predictions)
    accuracy = accuracy_score(valid_labels, valid_predictions)

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

In [27]:
experiment_name = 'ner_exp1'

training_args = TrainingArguments(
    output_dir=base_dir / f"checkpoints/{experiment_name}",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_steps=10_000,
    logging_dir= base_dir / f"checkpoints/{experiment_name}/logs",
    logging_steps=200,
    learning_rate=5e-5,
    weight_decay=0.01,
)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# 10. Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.364806,0.618047,0.549149,0.581564,0.890943
2,No log,0.304992,0.671306,0.555373,0.607861,0.903156
3,No log,0.293687,0.632105,0.659528,0.645525,0.900251
4,No log,0.279549,0.71322,0.576754,0.637769,0.909163
5,No log,0.285981,0.647279,0.666118,0.656563,0.906852
6,No log,0.266024,0.674265,0.616228,0.643942,0.908173
7,No log,0.254174,0.696078,0.623833,0.657979,0.912266
8,No log,0.248865,0.665811,0.711075,0.687699,0.912662
9,No log,0.249045,0.681084,0.717738,0.69893,0.914906
10,No log,0.249668,0.689767,0.698465,0.694089,0.914708


TrainOutput(global_step=30, training_loss=0.22910205523173013, metrics={'train_runtime': 37.5013, 'train_samples_per_second': 49.865, 'train_steps_per_second': 0.8, 'total_flos': 477215929080000.0, 'train_loss': 0.22910205523173013, 'epoch': 10.0})

In [29]:
model.save_pretrained(base_dir / f"checkpoints/final_model")
tokenizer.save_pretrained(base_dir / f"checkpoints/final_model")

('/nfs/home/scg1143/ATSChecker/ner-extraction/checkpoints/final_model/tokenizer_config.json',
 '/nfs/home/scg1143/ATSChecker/ner-extraction/checkpoints/final_model/special_tokens_map.json',
 '/nfs/home/scg1143/ATSChecker/ner-extraction/checkpoints/final_model/vocab.txt',
 '/nfs/home/scg1143/ATSChecker/ner-extraction/checkpoints/final_model/added_tokens.json',
 '/nfs/home/scg1143/ATSChecker/ner-extraction/checkpoints/final_model/tokenizer.json')