# Data Prepocessing

In [1]:
# import logging
import json
import re

# JSON formatting functions
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    training_data = []
    lines=[]
    with open(dataturks_JSON_FilePath, 'r', encoding="utf8") as f:
        lines = f.readlines()

    for line in lines:
        data = json.loads(line)
        text = data['content'].replace("\n", " ")
        
        entities = []
        data_annotations = data['annotation']
        if data_annotations is not None:
            for annotation in data_annotations:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    point_start = point['start']
                    point_end = point['end']
                    point_text = point['text']

                    lstrip_diff = len(point_text) - len(point_text.lstrip())
                    rstrip_diff = len(point_text) - len(point_text.rstrip())
                    if lstrip_diff != 0:
                        point_start = point_start + lstrip_diff
                    if rstrip_diff != 0:
                        point_end = point_end - rstrip_diff
                    entities.append((point_start, point_end + 1 , label))
        training_data.append((text, {"entities" : entities}))
    return training_data

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append((valid_start, valid_end, label))
        cleaned_data.append((text,valid_entities))
    return cleaned_data


In [2]:
data = trim_entity_spans(convert_dataturks_to_spacy("Entity Recognition in Resumes.json"))

len(data)


220

In [3]:
def SelfAnnoted(Path):
    Cleaned=[]
    
    with open(Path, 'r', encoding="utf8") as f:
            lines = f.readlines()

    for line in lines:
        sample = json.loads(line)
        annot=sample["annotations"]
        text=annot[0][0]
        text= re.sub(r'[\s]'," ", text)
        entity=annot[0][1]["entities"]
        Cleaned.append((text,entity))
    return Cleaned
data2=SelfAnnoted("Self_Annot.json")    
data=data+data2


In [4]:
    len(data)

267

In [5]:
data[0]

("Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

# Making the data ready for Training

In [9]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

def get_spacy_doc(data):
    nlp = spacy.blank("en") # load a new spacy model
    db = DocBin() # create a DocBin object

    for text, annot in tqdm(data): # data in previous format
        doc = nlp(text) # create doc object from text
#         annot=annot["entities"]
        ents=[]
        entity_indices=[]
        for start, end, label in annot :
            # add character indexes
            skip_entity=False
            for idx in range(start,end):
                if idx in entity_indices:
                    skip_entity=True
                    break
            if skip_entity==True:
                continue
            entity_indices=entity_indices+list(range(start,end))
            
            
            try:
                span = doc.char_span(start, end, label=label)
            except:
                continue
            if span is None:               
                continue
            else:
                ents.append(span)
            
        try:
            doc.ents=ents
            db.add(doc)
        except:
            pass  
    return db

db=get_spacy_doc(data)
db.to_disk("./train.spacy") # save the docbin object

100%|████████████████████████████████████████| 267/267 [00:01<00:00, 144.61it/s]


# Training the Data

In [82]:
!python -m spacy init fill-config base_config.cfg config.cfg



[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [83]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2022-11-25 23:40:21,782] [INFO] Set up nlp object from config
[2022-11-25 23:40:21,788] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-11-25 23:40:21,791] [INFO] Created vocabulary
[2022-11-25 23:40:22,206] [INFO] Added vectors: en_core_web_sm
[2022-11-25 23:40:22,207] [INFO] Finished initializing nlp object
[2022-11-25 23:40:35,179] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     68.09    0.00    0.00    0.00    0.00
  0     200       2115.03  12825.30   27.75   36.33   22.45    0.28
  1     400       5112.15   7197.44   49.68   60.98   41.91    0.50
 

# Loading best model saved after training

In [7]:
import spacy
nlp = spacy.load(r"./output/model-best") #load the best model



# Function to extract Phone Number and Email

In [8]:
#Function to extract Phone Numbers from string using regular expressions
def extract_phone_numbers(string):
    r = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    phone_numbers = r.findall(string)
    return [re.sub(r'\D', '', number) for number in phone_numbers]
#Function to extract Email address from a string using regular expressions
def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)


# Function for Cosine Similarity

In [9]:
import math
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def cosine_similarity(X,Y):
    X=" ".join(X)
    X_list = word_tokenize(X)
    Y_list = word_tokenize(Y)

    l1 =[];l2 =[]
    for w in X_list:
        if w in Y_list:
            l1.append(1)
            l2.append(1)
            
        else:
            l1.append(1)
            l2.append(0)
        
            
    XY = 0
    for i in range(len(X_list)):
            XY+= l1[i]*l2[i]
    if (sum(l1)*sum(l2))**0.5!=0:
        
        cosine = XY / float((sum(l1)*sum(l2))**0.5)
    else:
        cosine=0
        
    return cosine

def getSkills(nlp,text):
    doc=nlp(text)
    skill=""
    for ent in doc.ents:
        if ent.label_=="Skills":
            skill=skill+" "+str(ent.text)

    return skill

In [10]:
Computer_Science_Skills=["C++","Java","Python","SQL","Database","web development","MySQL","Oracle","Cloud Computing","Problem-solving","Communication","Javascript","UNIX","Linux","Java"]
Data_Science_Skills=["Python","Machine Learning","Deep Learning","Tableau","Data Visualisation","Big Data","Data Mining","Natural Language Processing","NLP","TensorFlow","Statistical Modeling","Data Science","Web Scraping"]

# Reading PDF files and processing to get skills, similarity value , Phone No. and Email Id


In [15]:
from spacy import displacy
import PyPDF2
import re
def Skills(n,Profile,nlp):
    Skill_Score={}
    
    for i in range(1,n):
        pdfFileObj = open(str(i)+".pdf", 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        text_sample=""
        for i in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(i)
            text_sample=text_sample+pageObj.extractText()
        pdfFileObj.close()
        
        text_sample=text_sample.replace("\n"," ")
        text_sample=re.sub(r"[^a-zA-Z0-9\-\+\.\/\@]"," ",text_sample)
        doc = nlp(text_sample)
        displacy.render(doc,style="ent",jupyter=True)
        print("-----------------------------------------------------------------------------------------------")
        Skill=getSkills(nlp,text_sample)
#         Skill_Score[i]=cosine_similarity(Profile,Skill)
        print("Similarity with Required Skills: ",cosine_similarity(Profile,Skill))
        print("Mobile No:",extract_phone_numbers(text_sample))
        print("Email Id: ",extract_email_addresses(text_sample))
        print("------------------------------------------------------------------------------------------------")
    return None

In [16]:
Skills(7,Data_Science_Skills,nlp)

-----------------------------------------------------------------------------------------------
Similarity with Required Skills:  0.6915640748081247
Mobile No: []
Email Id:  []
------------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
Similarity with Required Skills:  0.6915640748081247
Mobile No: ['8628963924']
Email Id:  ['abhijeetmanhas720@gmail.com']
------------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
Similarity with Required Skills:  0.6255432421712243
Mobile No: ['8058041116', '0202021']
Email Id:  ['deepalisingh2k@gmail.com']
------------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
Similarity with Required Skills:  0.20851441405707477
Mobile No: []
Email Id:  ['475b19144@students.iitmandi.ac.in']
------------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
Similarity with Required Skills:  0.20851441405707477
Mobile No: ['9816111272']
Email Id:  ['samagnaneshwarreddy@gmail.com', 'cgauss@codeforces']
------------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
Similarity with Required Skills:  0
Mobile No: []
Email Id:  []
------------------------------------------------------------------------------------------------


# Thank You