# RESUME PARSER

### Installing dependencies

In [5]:
#!pip install tika

### Extracting text from the PDF

In [6]:
from tika import parser
# Location of the resume PDF to parse (hard-coded absolute path; adjust for your environment).
file = r'/content/candidate_042.pdf'
# Run the file through Tika's parser; the result is indexed by key below.
file_data = parser.from_file(file)
# The extracted text body of the document.
# NOTE(review): presumably this can be None when no text could be extracted — confirm,
# because later cells call string methods on `text`.
text = file_data['content']
print(text)






































candidate_042


Ryan Nelson
F R E S H E R  S O F T W A R E  D E V E L O P E R  

Executive Profile

I consider myself a team player and ethics
maintainer. When it comes to coding I
always try to be be well informed about
the technology and its application. Would
like to join any firm as a software
developer in machine learning projects.

Other Activities

Machine learning - Stanford University
SAP Fundamentals - Arjuvo Limited

Projects

Classification of Cassava Leaves

Skills

Machine Learning, Software Engineering,
Python, Java, SAP,C programming

Work Experience

Accenture  
Software Developer, Apr 2019 - Dec 2020 

Worked on multiple projects that dealt with system frame
development and solution supply.

Education

B.Tech Computer Science From JNTU, 2019





#### Creating a dictionary to store parsed content

In [7]:
# Accumulates every extracted field: later cells add 'E-mail', 'Phone number', 'Name'
# and one entry per detected section heading, then dump the dictionary to JSON.
parsed_content = {}

### Extracting E-Mail from the text

In [8]:
#E-MAIL
import re
# Local part, '@', then a domain made of at least two dot-separated labels.
# The match must end on a label rather than on a dot, so sentence punctuation
# that directly follows an address ("... at jane@site.com.") is not captured.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')


def get_email_addresses(string):
    """Return every e-mail address found in ``string``.

    Args:
        string: Text to scan. ``None`` or an empty string is accepted
            (for example when text extraction produced nothing).

    Returns:
        A list of the matched addresses in order of appearance; empty when
        nothing matches. Addresses whose domain has no dot (``user@host``)
        are not reported.
    """
    if not string:
        return []
    return _EMAIL_RE.findall(string)

# Pull the addresses out of the resume text, file them under 'E-mail', then echo them
parsed_content['E-mail'] = email = get_email_addresses(text)
print(email)

[]


### Extracting Phone Number

In [9]:
#PHONE NUMBER
import re
# Three accepted shapes, each with optional '-', '.' or whitespace separators:
#   ddd ddd dddd   |   (ddd) ddd dddd   |   ddd dddd
# The digit look-arounds stop the pattern from carving a 7- or 10-digit slice
# out of a longer run of digits (IDs, account numbers, ...).
_PHONE_RE = re.compile(
    r'(?:(?<!\d)\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'
    r'|\(\d{3}\)\s*\d{3}[-.\s]?\d{4}'
    r'|(?<!\d)\d{3}[-.\s]?\d{4})(?!\d)'
)


def get_phone_numbers(string):
    """Return the phone numbers found in ``string`` as digit-only strings.

    Args:
        string: Text to scan. ``None`` or an empty string is accepted.

    Returns:
        A list with one entry per match, in order of appearance, with every
        non-digit character (spaces, dashes, dots, parentheses) removed.
        Digit runs longer than the accepted shapes are skipped rather than
        truncated.
    """
    if not string:
        return []
    return [re.sub(r'\D', '', num) for num in _PHONE_RE.findall(string)]

# Always record the result (possibly an empty list) so the 'Phone number' key is
# present in every parsed resume. The previous `len(phone_number) <= 10` guard
# compared the number of matches, not the number of digits, and silently
# dropped the key whenever it failed.
phone_number = get_phone_numbers(text)
print(phone_number)
parsed_content['Phone number'] = phone_number

[]


### Extracting Name

In [10]:
import spacy
# Load the small English spaCy pipeline used for part-of-speech tagging.
# NOTE(review): assumes the 'en_core_web_sm' model is installed in this environment.
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import Matcher
# Token-pattern matcher sharing the pipeline's vocabulary; extract_name() registers its pattern on it.
matcher = Matcher(nlp.vocab)

def extract_name(text):
    """Guess the candidate's name from the resume text.

    The heuristic assumes a name shows up as two consecutive tokens that the
    spaCy pipeline tags as proper nouns, and returns the first such pair the
    matcher reports.

    Args:
        text: Resume text, with its original casing.

    Returns:
        The text of the first matched two-token span, or ``None`` when the
        matcher finds no pair of proper nouns.
    """
    # The matcher is a shared module-level object: register the pattern only
    # once so repeated calls do not add the same rule again.
    if 'NAME' not in matcher:
        pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
        matcher.add('NAME', [pattern], on_match=None)

    nlp_text = nlp(text)

    for _match_id, start, end in matcher(nlp_text):
        return nlp_text[start:end].text

    return None

# Run the proper-noun heuristic on the resume text, store the guess, then echo it
parsed_content['Name'] = name = extract_name(text)
print(name)

Ryan Nelson


### List of Keywords to identify 'Headings' in the Resume text

In [11]:
# Lowercase section headings to look for in the resume. Each entry is searched
# as a plain substring of the cleaned (lowercased) text, so an entry also
# matches inside longer headings or body prose — e.g. "experience" inside
# "work experience", or "publication" inside "publications".
Keywords = ["education",
            "summary",
            "accomplishments",
            "executive profile",
            "professional profile",
            "personal profile",
            "work background",
            "academic profile",
            "other activities",
            "qualifications",
            "experience",
            "interests",
            "skills",
            "achievements",
            "publications",
            "publication",
            "certifications",
            "workshops",
            "projects",
            "internships",
            "trainings",
            "hobbies",
            "overview",
            "objective",
            "position of responsibility",
            "jobs"
           ]

### Cleaning the resume text

In [12]:
# Flatten the text onto one line and lowercase it so the (lowercase) heading
# keywords can be located with plain substring search.
#
# Two statements that used to sit here were removed because they never changed
# `text`:
#   * text.replace("[^a-zA-Z0-9]", " ") - str.replace takes its argument
#     literally, not as a regular expression;
#   * re.sub('\W+', '', text) - its result was discarded (and '\W' in a
#     non-raw string is an invalid escape sequence).
# Punctuation therefore stays in the text, exactly as before.
text = text.replace("\n", " ")
text = text.lower()
print(text)

                                     candidate_042   ryan nelson f r e s h e r  s o f t w a r e  d e v e l o p e r    executive profile  i consider myself a team player and ethics maintainer. when it comes to coding i always try to be be well informed about the technology and its application. would like to join any firm as a software developer in machine learning projects.  other activities  machine learning - stanford university sap fundamentals - arjuvo limited  projects  classification of cassava leaves  skills  machine learning, software engineering, python, java, sap,c programming  work experience  accenture   software developer, apr 2019 - dec 2020   worked on multiple projects that dealt with system frame development and solution supply.  education  b.tech computer science from jntu, 2019   


### Finding the headings and corresponding indices

In [13]:
# For every heading keyword present in the text, remember where it first
# occurs. `content` maps the keyword to everything after that occurrence;
# `indices` and `keys` are parallel lists of start offsets and keywords.
# NOTE: only the FIRST occurrence is used, and it is a plain substring hit, so
# a keyword appearing in body prose before its real heading wins.
content = {}
indices = []
keys = []
for key in Keywords:
    position = text.find(key)
    if position == -1:
        # This heading does not appear in the resume - skip it. Any other
        # problem (e.g. `text` not being a string) is left to surface instead
        # of being swallowed.
        continue
    content[key] = text[position + len(key):]
    indices.append(position)
    keys.append(key)


In [14]:
# Order the headings by their position in the text. `indices` and `keys` stay
# parallel lists; both simply end up empty when no heading was found (the
# previous zip(*...) unpacking raised ValueError in that case).
sorted_pairs = sorted(zip(indices, keys))

indices = [position for position, _ in sorted_pairs]
keys = [heading for _, heading in sorted_pairs]
keys

['executive profile',
 'projects',
 'other activities',
 'skills',
 'experience',
 'education']

In [15]:
# Cut the text into one slice per heading: each slice runs from its heading's
# offset up to the next heading's offset, and the last one runs to the end
# (an end bound of None means "until the end of the string").
content = [
    text[start:end]
    for start, end in zip(indices, indices[1:] + [None])
]

### Storing the parsed content in the dictionary

In [16]:
# File each section's text under its heading keyword
for heading, section in zip(keys, content):
    parsed_content[heading] = section

In [17]:
# Display the assembled result: the contact fields plus one entry per detected heading
parsed_content

{'E-mail': [],
 'Phone number': [],
 'Name': 'Ryan Nelson',
 'executive profile': 'executive profile  i consider myself a team player and ethics maintainer. when it comes to coding i always try to be be well informed about the technology and its application. would like to join any firm as a software developer in machine learning ',
 'projects': 'projects.  ',
 'other activities': 'other activities  machine learning - stanford university sap fundamentals - arjuvo limited  projects  classification of cassava leaves  ',
 'skills': 'skills  machine learning, software engineering, python, java, sap,c programming  work ',
 'experience': 'experience  accenture   software developer, apr 2019 - dec 2020   worked on multiple projects that dealt with system frame development and solution supply.  ',
 'education': 'education  b.tech computer science from jntu, 2019   '}

### Dumping the dictionary to a JSON file

In [18]:
import json
# Serialise the parsed fields to JSON text and write it to disk in one go
with open("Parsed_Resume.json", "w") as outfile:
    outfile.write(json.dumps(parsed_content))

### Displaying the contents of the JSON file

In [19]:
# Read the file back and pretty-print it. The context manager closes the
# handle even if json.load raises on a malformed file.
with open("Parsed_Resume.json", "r") as a_file:
    a_json = json.load(a_file)
pretty_json = json.dumps(a_json, indent=4)
print(pretty_json)

{
    "E-mail": [],
    "Phone number": [],
    "Name": "Ryan Nelson",
    "executive profile": "executive profile  i consider myself a team player and ethics maintainer. when it comes to coding i always try to be be well informed about the technology and its application. would like to join any firm as a software developer in machine learning ",
    "projects": "projects.  ",
    "other activities": "other activities  machine learning - stanford university sap fundamentals - arjuvo limited  projects  classification of cassava leaves  ",
    "skills": "skills  machine learning, software engineering, python, java, sap,c programming  work ",
    "experience": "experience  accenture   software developer, apr 2019 - dec 2020   worked on multiple projects that dealt with system frame development and solution supply.  ",
    "education": "education  b.tech computer science from jntu, 2019   "
}
