<a href="https://colab.research.google.com/github/atikjain55/Automatic-Resume-Evaluation-System/blob/main/NER_using_Spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import spacy

import os
# Print the full path of every file found anywhere below ./NLP/.
for folder, _, names in os.walk('./NLP/'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# import logging
import json
import re

# JSON formatting functions
def convert_dataturks_to_spacy(json_file_path):
    """Convert a JSON-lines annotation export into spaCy-style training tuples.

    Every non-blank line of the file is parsed as one JSON record holding a
    ``content`` string and an ``annotation`` list (or ``None``).  Each
    annotation contributes one ``(start, end, label)`` tuple per label, built
    from the annotation's first point only.

    Args:
        json_file_path: Path to a UTF-8 text file with one JSON object per line.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per record.  ``text`` is ``content`` with every newline replaced by
        a single space.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``content`` or ``annotation``.
    """
    training_data = []
    with open(json_file_path, 'r', encoding="utf-8") as f:
        for line in f:
            # Lines read from a file keep their trailing newline, so a blank
            # line is "\n" (never ""); strip before testing for emptiness.
            if not line.strip():
                print("empty line")
                continue
            data = json.loads(line)
            # One-character-for-one replacement, so annotation offsets stay valid.
            text = data['content'].replace("\n", " ")
            entities = []
            data_annotations = data['annotation']
            if data_annotations is not None:
                for annotation in data_annotations:
                    # only a single point in text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    # Move the span boundaries inwards past any leading or
                    # trailing whitespace in the annotated text.  This does
                    # not depend on the label, so compute it once per point.
                    point_text = point['text']
                    point_start = point['start'] + (len(point_text) - len(point_text.lstrip()))
                    point_end = point['end'] - (len(point_text) - len(point_text.rstrip()))

                    for label in labels:
                        # NOTE(review): the +1 presumably turns an inclusive
                        # end offset into an exclusive one — confirm against
                        # the export format.
                        entities.append((point_start, point_end + 1, label))
            training_data.append((text, {"entities": entities}))
    return training_data

def trim_entity_spans(data: list) -> list:
    """Shrink every entity span so it neither starts nor ends on whitespace.

    Args:
        data: Iterable of ``(text, {"entities": [(start, end, label), ...]})``
            items; ``end`` is treated as exclusive (``text[end - 1]`` is the
            last character examined).

    Returns:
        A new list of ``[text, {"entities": [[start, end, label], ...]}]``
        items with adjusted offsets.  The input is not modified.
    """
    whitespace = re.compile(r'\s')

    def _shrink(text, begin, finish):
        # Advance the start past leading whitespace characters.
        while begin < len(text) and whitespace.match(text[begin]):
            begin += 1
        # Pull the end back past trailing whitespace characters.
        while finish > 1 and whitespace.match(text[finish - 1]):
            finish -= 1
        return begin, finish

    result = []
    for text, annotations in data:
        spans = []
        for start, end, label in annotations['entities']:
            new_start, new_end = _shrink(text, start, end)
            spans.append([new_start, new_end, label])
        result.append([text, {'entities': spans}])
    return result

In [None]:
# Convert the annotation export to (text, {"entities": ...}) examples, then
# trim whitespace off the span edges. The path is relative to the working
# directory, so the file must be present there.
data = trim_entity_spans(convert_dataturks_to_spacy("./NLP/Entity Recognition in Resumes.json"))
# Display the first converted example as a quick sanity check.
data[0]

FileNotFoundError: ignored

In [None]:
def clean_entities(training_data):
    """Remove entity spans that lose out to an overlapping span.

    Within one example, an entity is dropped when another entity overlaps it
    and is either strictly longer, or equally long and listed earlier.  Of a
    group of identical or equal-length overlapping spans exactly one (the
    first) therefore survives.  Overlap is tested with inclusive bounds, so
    spans that merely touch (one ends where the other starts) count as
    overlapping.

    Args:
        training_data: Iterable of ``(text, annotation)`` pairs where
            ``annotation`` is a dict whose ``'entities'`` value is a list of
            ``(start, end, label)`` sequences.  A missing or ``None``
            ``'entities'`` value is treated as an empty list.

    Returns:
        A list of ``(text, {'entities': kept_entities})`` tuples.  The input
        lists are left untouched; surviving entities keep their original order.
    """
    clean_data = []
    for text, annotation in training_data:
        # Work from a snapshot and build a fresh result list: removing items
        # from the caller's list raised ValueError as soon as one entity was
        # overlapped by two longer ones (second remove() of the same item).
        entities = list(annotation.get('entities') or [])
        kept = []
        for i, entity in enumerate(entities):
            e_start, e_end = entity[0], entity[1]
            e_len = e_end - e_start
            dominated = False
            for j, other in enumerate(entities):
                # Skip self
                if i == j:
                    continue
                oe_start, oe_end = other[0], other[1]
                # Either end of this entity lies inside the other span.
                overlaps = (oe_start <= e_start <= oe_end) or (oe_start <= e_end <= oe_end)
                if not overlaps:
                    continue
                oe_len = oe_end - oe_start
                # Longer span wins; on a tie the earlier one wins so that one
                # of the two is always retained.
                if e_len < oe_len or (e_len == oe_len and j < i):
                    dominated = True
                    break
            if not dominated:
                kept.append(entity)
        clean_data.append((text, {'entities': kept}))

    return clean_data
# Rebind `data` to the examples with overlapping entity spans cleaned up.
data = clean_entities(data)

In [None]:
import random
import math

def train_test_split(data, test_size, random_state):
    """Shuffle ``data`` deterministically and split it into train and test parts.

    Args:
        data: Sequence of examples.  It is copied, so the caller's sequence
            keeps its order and repeated calls give the same split.
        test_size: Fraction of the examples to place in the test set; the
            test set holds ``floor(test_size * len(data))`` items.
        random_state: Seed for the private ``random.Random`` used to shuffle.

    Returns:
        ``(train_set, test_set)`` as two lists; the test set is taken from
        the tail of the shuffled copy.
    """
    # Shuffle a copy: shuffling the argument in place reordered the caller's
    # list and made a re-run of the split produce different sets.
    shuffled = list(data)
    random.Random(random_state).shuffle(shuffled)

    test_idx = len(shuffled) - math.floor(test_size * len(shuffled))
    train_set = shuffled[:test_idx]
    test_set = shuffled[test_idx:]

    return train_set, test_set

In [None]:
# Hold out 10% of the examples for evaluation, shuffling with a fixed seed (42).
train_data, test_data = train_test_split(data, test_size = 0.1, random_state = 42)

In [None]:
def train_spacy(examples=None, n_iter=10, drop=0.2):
    """Train a blank English pipeline that contains only an NER component.

    Written against the spaCy v2 training calls used throughout this notebook
    (``create_pipe`` / ``begin_training`` / ``update`` with raw texts).

    Args:
        examples: Iterable of ``(text, {"entities": [(start, end, label), ...]})``
            items.  Defaults to the notebook-level ``train_data``.
        n_iter: Number of passes over the examples (default 10).
        drop: Dropout rate passed to ``nlp.update`` (default 0.2).

    Returns:
        The trained ``nlp`` object.  The summed loss dict is printed once per
        iteration.
    """
    if examples is None:
        # Backward-compatible default: the notebook's global training set.
        examples = train_data
    # Copy so the per-iteration shuffle below does not reorder the caller's list.
    examples = list(examples)

    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even when the pipeline already has the component.
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in examples:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=drop,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [None]:
# Train the NER pipeline; progress and the loss dict are printed per iteration.
nlp = train_spacy()

Starting iteration 0
{'ner': 22286.373561714707}
Starting iteration 1
{'ner': 17487.633669415176}
Starting iteration 2
{'ner': 13402.510354205631}
Starting iteration 3
{'ner': 13303.678709161606}
Starting iteration 4
{'ner': 13300.484113648534}
Starting iteration 5
{'ner': 12342.20715512839}
Starting iteration 6
{'ner': 10887.051619787746}
Starting iteration 7
{'ner': 9454.769237496577}
Starting iteration 8
{'ner': 10497.795937107152}
Starting iteration 9
{'ner': 9891.889112478688}


In [None]:
from spacy.gold import GoldParse
from itertools import groupby

def doc_to_bilou(nlp, text):
    """Return the pipeline's predicted entities for ``text`` as per-token tags.

    The text is processed once with ``nlp``; the predicted entity spans are
    turned into character offsets and handed to ``GoldParse``, whose ``ner``
    attribute supplies one tag per token.

    Args:
        nlp: A loaded/trained spaCy pipeline (callable on a string).
        text: The raw text to tag.

    Returns:
        The ``ner`` tag sequence of the ``GoldParse`` built from the predictions.
    """
    doc = nlp(text)
    # Read the spans from doc.ents rather than grouping consecutive tokens by
    # ent_type_: grouping fuses two adjacent entities that share a label into
    # a single span, which changes their boundary tags.
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

    # Reuse the doc processed above instead of running the pipeline on the
    # same text a second time.
    gold = GoldParse(doc, entities = entities)
    pred_ents = gold.ner

    return pred_ents

# Collect one tag sequence per held-out text: reference tags derived from the
# annotated offsets (y_test) and tags derived from the model output (y_pred).
y_test, y_pred = [], []

for text, annots in test_data:
    gold = GoldParse(nlp.make_doc(text), entities = annots.get("entities"))
    y_test.append(gold.ner)
    y_pred.append(doc_to_bilou(nlp, text))
    
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from itertools import chain

def ner_report(y_true, y_pred):
    """Build a per-tag classification report and an overall accuracy score.

    Args:
        y_true: Iterable of gold tag sequences (one sequence per document).
        y_pred: Iterable of predicted tag sequences, aligned with ``y_true``.

    Returns:
        ``(report, accuracy)``: the result of ``classification_report`` over
        the flattened, binarized tags, and the result of ``accuracy_score``
        over the same matrices.
    """
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_pred))

    lb = LabelBinarizer()
    # Fit on gold AND predicted tags. A binarizer fitted on gold only encodes
    # any predicted tag it has never seen as an all-zero row, so such
    # predictions silently vanished from the report.
    lb.fit(flat_true + flat_pred)
    y_true_combined = lb.transform(flat_true)
    y_pred_combined = lb.transform(flat_pred)

    # Order rows by entity name first, then by tag prefix (B/I/L/U).
    tagset = set(lb.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset
    ), accuracy_score(y_true_combined, y_pred_combined)
    
# Score the predicted tag sequences against the gold ones and show the
# per-tag report; `accuracy` is kept for the next cell.
report, accuracy = ner_report(y_test, y_pred)
print(report)

                       precision    recall  f1-score   support

                    -       0.00      0.00      0.00       142
       B-College Name       0.65      0.81      0.72        32
       I-College Name       0.60      0.86      0.71        63
       L-College Name       0.62      0.78      0.69        32
       U-College Name       1.00      1.00      1.00         1
B-Companies worked at       0.59      0.67      0.62        30
I-Companies worked at       0.00      0.25      0.01         4
L-Companies worked at       0.53      0.60      0.56        30
U-Companies worked at       0.32      0.27      0.29        41
             B-Degree       0.91      0.83      0.87        24
             I-Degree       0.88      0.92      0.90        66
             L-Degree       0.91      0.83      0.87        24
             U-Degree       0.20      0.67      0.31         3
        B-Designation       0.65      0.68      0.67        47
        I-Designation       0.82      0.57      0.68  

  _warn_prf(average, modifier, msg_start, len(result))



In [None]:
# Accuracy value returned by ner_report for the held-out set.
print(accuracy)

0.9011957205789805


In [None]:
# Save the trained pipeline under the relative path 'nlp_model'.
nlp.to_disk('nlp_model')

In [None]:
# Load the pipeline back from the path it was just saved to.
nlp_model = spacy.load('nlp_model')

In [None]:
# Take the text of the first training example as demo input. This is training
# data, so the result below is not an unbiased quality check.
text = train_data[0][0]

In [None]:
# Run the reloaded pipeline and print each predicted entity as
# "<LABEL padded to 30 characters>- <entity text>".
doc = nlp_model(text)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Sridevi H
LOCATION                      - Bangalore
EMAIL ADDRESS                 - indeed.com/r/Sridevi-H/63703b24aaaa54e4
DESIGNATION                   - Principal System Engineer
COMPANIES WORKED AT           - Aricent Technologies
DESIGNATION                   - Technical Lead
DEGREE                        - M.S in Software Systems
COLLEGE NAME                  - BITS Pilani
LOCATION                      - Pilani
DEGREE                        - B.E. in Computer Science
COLLEGE NAME                  - Board of Technical Education
SKILLS                        - Networking/Platform/Drivers/Vxworks
GRADUATION YEAR               - 2016


In [None]:
!pip install PyMuPDF



In [None]:
import sys, fitz

In [None]:
fname = 'Alice Clark CV.pdf'
page_texts = []

# Open the PDF in a context manager so the file is closed once it has been read.
with fitz.open(fname) as doc:
    for page in doc:
        # get_text() is the current PyMuPDF name; fall back to the deprecated
        # camelCase getText() only when running against an old release.
        extract_text = getattr(page, "get_text", None) or page.getText
        page_texts.append(str(extract_text()))

# Join once at the end instead of growing a string page by page.
text = "".join(page_texts)

In [None]:
# Flatten the extracted text onto one line: every newline becomes a space.
tx = text.replace('\n', ' ')

In [None]:
# Display the flattened resume text.
tx

'Alice Clark  AI / Machine Learning    Delhi, India Email me on Indeed  •  20+ years of experience in data handling, design, and development  •  Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to  data warehousing and business intelligence  •  Database: Experience in database designing, scalability, back-up and recovery, writing and  optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes.  Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure,  Stream Analytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure data lake  analytics(U-SQL)  Willing to relocate anywhere    WORK EXPERIENCE  Software Engineer  Microsoft – Bangalore, Karnataka  January 2000 to Present  1. Microsoft Rewards Live dashboards:  Description: - Microsoft rewards is loyalty program that rewards Users for browsing and shopping  online. Microsoft Rewards members can earn points when searching with Bing, br

In [None]:
# Run the pipeline on the text extracted from the PDF and print each predicted
# entity as "<LABEL padded to 30 characters>- <entity text>".
doc = nlp_model(tx)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Alice Clark
LOCATION                      - Delhi
EMAIL ADDRESS                 - •
COMPANIES WORKED AT           - Microsoft
DESIGNATION                   - Software Engineer
COMPANIES WORKED AT           - Microsoft
LOCATION                      - Bangalore
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
LOCATION                      - Store
DEGREE                        - EDUCATION
COLLEGE NAME                  - Indian Institute of Technology – Mumbai
GRADUATION YEAR               - 2001
SKILLS                        - Machine Learning, Natural Language Processing, and Big Data Handling    ADDITIONAL INFORMATION  Professional Skills  • Excellent analytical, problem solving, communication, knowledge transfer and interpersonal
