In [85]:
import spacy
from spacy.matcher import *

# Load the English pipeline; the patterns used later match on POS and LEMMA attributes
nlp = spacy.load("en")

In [86]:
# Let's use spaCy's Matcher to find job titles!
job_titles = []  # declared here; not populated in this cell

# Sample document that the patterns below are matched against
doc = nlp("Bridge the gap between the clinical, the imaging modalities, and IT stakeholders at a clinical site, and the research and prototyping back office within Philips. We are looking for a qualified data scientist to explore cutting edgle solutions in health informatics. You should be able to work in a multidisciplinary team of data scientists, software engineers, clinical scientists, physicians, department staff, and researchers. The Scientist will join an experienced team of Philips personnel and will be trained at the business and research facilities.")

# Noun phrase: one determiner, optional adjectives, then one or more nouns
noun_phrase_pattern = [
    {"POS": "DET"},               # determiner, exactly 1
    {"POS": "ADJ", "OP": "*"},    # adjective(s), 0 or more
    {"POS": "NOUN", "OP": "+"},   # noun(s), at least 1
]

# Job title: "<pronoun> <be> looking <preposition>" followed by a noun phrase
job_title_pattern = [
    {"POS": "PRON", "OP": "+"},   # We/I/They/etc., at least 1
    {"LEMMA": "be"},              # are/is/am
    {"LOWER": "looking"},         # the literal word "looking" (case-insensitive)
    {"POS": "ADP"},               # preposition, exactly 1
    {"POS": "DET"},               # determiner, exactly 1
    {"POS": "ADJ", "OP": "*"},    # adjective(s), 0 or more
    {"POS": "NOUN", "OP": "+"},   # noun(s) naming the role, at least 1
]



In [87]:
# Create a Matcher bound to the pipeline's vocabulary and register both patterns
# under their names (no on-match callback, hence the None)
matcher = Matcher(nlp.vocab)
for pattern_name, token_pattern in (
    ("Noun_Phrases", noun_phrase_pattern),
    ("Job_Titles", job_title_pattern),
):
    matcher.add(pattern_name, None, token_pattern)

# Run the matcher over the sample document
matches = matcher(doc)

In [88]:
# Loop through the matches and print out the relevant portion of each match
for match_id, start, end in matches:  # each match is (pattern_id, start_idx, end_idx)
    span = doc[start:end]
    # The Matcher reports the hash of the pattern name as the match id, so look
    # the name up in the vocab's string store instead of hard-coding the hash.
    if nlp.vocab.strings[match_id] == "Noun_Phrases":
        print("Noun Phrase: " + " ".join(token.orth_ for token in span))
    else:
        # A job-title match also covers the "We are looking for a ..." lead-in,
        # so keep only the tokens tagged as nouns.
        print("Job Title: " + " ".join(token.orth_ for token in span if token.pos_ == "NOUN"))

Noun Phrase: the gap
Noun Phrase: the imaging modalities
Noun Phrase: a clinical site
Noun Phrase: the research
Job Title: data scientist
Noun Phrase: a qualified data scientist
Noun Phrase: a multidisciplinary team
Noun Phrase: an experienced team
Noun Phrase: the business
