Skip to content

Commit

Permalink
Adding university matching
Browse files Browse the repository at this point in the history
  • Loading branch information
Brendan J. Herger committed Oct 20, 2017
1 parent 099a141 commit d5f72d9
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 16 deletions.
29 changes: 22 additions & 7 deletions bin/field_extraction.py
@@ -1,16 +1,14 @@
import logging

import re
from gensim.utils import simple_preprocess

import lib


# Pattern for e-mail addresses: local part, "@", domain, then a 2-4 letter
# top-level domain. Only upper-case letter classes are listed, so lower-case
# addresses match only if the pattern is applied case-insensitively.
# NOTE(review): presumably lib.term_match passes re.IGNORECASE -- confirm.
EMAIL_REGEX = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"
# Pattern for phone numbers written as 3-3-4 digit groups. The leading
# three-digit group (optionally wrapped in parentheses) is optional; groups may
# be separated by up to two whitespace, "." or "-" characters. It defines three
# capture groups, one per digit group.
PHONE_REGEX = r"\(?(\d{3})?\)?[\s\.-]{0,2}?(\d{3})[\s\.-]{0,2}(\d{4})"


def candidate_name_extractor(input_string, nlp):

input_string = unicode(input_string)

doc = nlp(input_string)
Expand All @@ -27,16 +25,16 @@ def candidate_name_extractor(input_string, nlp):
candidate_name = doc_persons[0]
return candidate_name

def extract_skills(resume_text):

def extract_skills(resume_text):
potential_skills_dict = dict()
matched_skills = set()

# TODO This skill input formatting could happen once per run, instead of once per observation.
for skill_input in lib.get_conf('skills'):

# Format list inputs
if type(skill_input) is list and len(skill_input) >=1:
if type(skill_input) is list and len(skill_input) >= 1:
potential_skills_dict[skill_input[0]] = skill_input

# Format string inputs
Expand All @@ -51,12 +49,29 @@ def extract_skills(resume_text):
skill_matches = 0
# Iterate through aliases
for skill_alias in skill_alias_list:

# Add the number of matches for each alias
skill_matches += lib.term_count(resume_text, skill_alias.lower())

# If at least one alias is found, add skill name to set of skills
if skill_matches > 0:
matched_skills.add(skill_name)

return matched_skills
return matched_skills


def extract_universities(resume_text):
    """Return the set of configured universities mentioned in a resume.

    Both the resume text and every entry of the ``universities``
    configuration list are passed through ``simple_preprocess`` and re-joined
    with single spaces before being compared, so matching is done on the
    normalized token stream rather than on the raw text.

    :param resume_text: Full text of one resume.
    :return: Set of university names, in their normalized form (not as
        written in the configuration), for which ``lib.term_count`` reports
        at least one occurrence in the normalized resume text.
    """
    # Normalize the resume once; it is reused for every candidate university.
    resume_tokens = simple_preprocess(resume_text)
    searchable_text = ' '.join(resume_tokens)

    # Lazily normalize each configured name so that normalization and counting
    # stay interleaved, one university at a time.
    normalized_names = (
        ' '.join(simple_preprocess(raw_name))
        for raw_name in lib.get_conf('universities')
    )

    return {
        name
        for name in normalized_names
        if lib.term_count(searchable_text, name) > 0
    }
3 changes: 2 additions & 1 deletion bin/main.py
Expand Up @@ -41,7 +41,6 @@ def main():
pass

def extract():
# TODO Docstring
logging.info('Begin extract')

# Reference variables
Expand Down Expand Up @@ -86,6 +85,8 @@ def transform(observations, nlp):
observations['email'] = observations['text'].apply(lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
observations['phone'] = observations['text'].apply(lambda x: lib.term_match(x,field_extraction.PHONE_REGEX))

# Extract university
observations['universities'] = observations['text'].apply(field_extraction.extract_universities)
# Extract skills
observations['skills'] = observations['text'].apply(field_extraction.extract_skills)

Expand Down
9 changes: 8 additions & 1 deletion confs/confs.yaml.template
Expand Up @@ -6,4 +6,11 @@ skills:
- [skill2, skill2_alias_A, skill2_alias_B]
- python
- [sklearn, scikit-learn, sk-learn]
- [tensorflow, tf, tensor-flow]
- [tensorflow, tf, tensor-flow]
universities:
- University One
- University Two
- University Three
- University of San Francisco
- Bucknell University
- University of California
Binary file modified data/input/example_resumes/SGresume-1.pdf
Binary file not shown.
12 changes: 6 additions & 6 deletions data/output/resume_summary.csv
@@ -1,4 +1,4 @@
index,file_path,extension,text,candidate_name,email,phone,skills
index,file_path,extension,text,candidate_name,email,phone,universities,skills
1,../data/input/example_resumes/Brendan_Herger_Resume.pdf,.pdf,"Brendan Herger

Hergertarian.com | 13herger@gmail.com | + 1 (415) 582-7457
Expand Down Expand Up @@ -85,7 +85,7 @@ linkedin.com/in/bjherger

hergertarian.wordpress.com/

",Brendan Herger,13herger@gmail.com,"(415, 582, 7457)","{python, sklearn, java, sql}"
",Brendan Herger,13herger@gmail.com,"(415, 582, 7457)",{university of san francisco},"{python, sklearn}"
2,../data/input/example_resumes/john_smith.docx,.docx,"John Smith


Expand Down Expand Up @@ -194,7 +194,7 @@ HONORS

Summit County Alumni Association Scholarship Spring 2014 - Present

Dean’s List Spring 2013 - Spring 2015",John Smith,sresume@kent.edu,"(614, 555, 5555)",{sql}
Dean’s List Spring 2013 - Spring 2015",John Smith,sresume@kent.edu,"(614, 555, 5555)",{},{}
3,../data/input/example_resumes/Layla_Martin_Resume.pdf,.pdf,"Layla Martin

2038 McAllister St
Expand Down Expand Up @@ -282,7 +282,7 @@ NCAA Division I Women’s Soccer: USF, 2010-2014
• Athletic scholarship (full scholarship when combined with University Scholarship).
• Committed 30 hours per week to training, meetings, travel, competition.

",Layla Martin,layla.d.martin@gmail.com,"(520, 271, 2492)","{python, sql}"
",Layla Martin,layla.d.martin@gmail.com,"(520, 271, 2492)",{university of san francisco},{python}
4,../data/input/example_resumes/resume_Meyer.pdf,.pdf,"MONICA MEYER

(415) · 497 · 7282 (cid:5) monica.meyer@comcast.net
Expand Down Expand Up @@ -346,7 +346,7 @@ Python, R
XML, JSON, REST
MySQL, PostgreSQL

",MONICA MEYER,monica.meyer@comcast.net,,"{python, sql}"
",MONICA MEYER,monica.meyer@comcast.net,,"{university of san francisco, university of california}",{python}
5,../data/input/example_resumes/SGresume-1.pdf,.pdf,"Sébastien Genty

1209 Page St, Apt 7, San Francisco, CA 94117
Expand Down Expand Up @@ -466,4 +466,4 @@ physics, research, reading, video
games, solving problems and fixing
things

",Sébastien Genty,sgenty@me.com,"(713, 301, 5648)","{python, tensorflow, java, sql}"
",Sébastien Genty,sgenty@me.com,"(713, 301, 5648)",{bucknell university},"{python, tensorflow}"
1 change: 1 addition & 0 deletions data/schema/transform.csv
Expand Up @@ -5,4 +5,5 @@ text,object,observations
candidate_name,object,observations
email,object,observations
phone,object,observations
universities,object,observations
skills,object,observations
31 changes: 30 additions & 1 deletion environment.yml
Expand Up @@ -24,20 +24,49 @@ dependencies:
- pip:
- argcomplete==1.8.2
- beautifulsoup4==4.5.3
- chardet==2.3.0
- boto==2.48.0
- bz2file==0.98
- certifi==2017.7.27.1
- chardet==3.0.4
- cymem==1.31.2
- cytoolz==0.8.2
- dill==0.2.7.1
- docx2txt==0.6
- ebooklib==0.15
- ftfy==4.4.3
- gensim==3.0.1
- html5lib==0.999999999
- idna==2.6
- lxml==4.1.0
- murmurhash==0.26.4
- numpy==1.13.3
- olefile==0.44
- pandas==0.20.3
- pathlib==1.0.1
- pillow==4.3.0
- plac==0.9.6
- pocketsphinx==0.1.3
- preshed==1.0.0
- python-dateutil==2.6.1
- python-pptx==0.6.5
- pytz==2017.2
- pyyaml==3.12
- regex==2017.9.23
- requests==2.18.4
- scipy==0.19.1
- six==1.10.0
- smart-open==1.5.3
- spacy==1.9.0
- speechrecognition==3.6.3
- termcolor==1.1.0
- textract==1.6.1
- thinc==6.5.2
- toolz==0.8.2
- tqdm==4.19.4
- ujson==1.35
- urllib3==1.22
- wcwidth==0.1.7
- webencodings==0.5.1
- wrapt==1.10.11
- xlrd==1.0.0
- xlsxwriter==1.0.2

0 comments on commit d5f72d9

Please sign in to comment.