In [1]:
import logging
from pathlib import Path

import blacktape
from blacktape.pipeline import Pipeline

# Logger named after the current module; no cell visible in this section uses it.
logger = logging.getLogger(__name__)

In [2]:
# Display the installed blacktape version so the outputs below can be tied to it.
blacktape.__version__

'0.0.1.dev30'

### Test Data

In [3]:
# Sample document, addressed relative to the current working directory
test_file = Path('..') / 'tests' / 'data' / 'sample.txt'

In [4]:
# Read the whole sample file into a single string, decoding it as UTF-8
text = test_file.read_text(encoding='UTF-8')
# print(text)  # uncomment to inspect the raw input

### Entities

In [5]:
# Entity labels passed to the NER job; only these types are of interest.
# NOTE(review): 'FOOD' does not appear to be a label produced by the stock
# 'en_core_web_sm' model -- confirm it is intentional (e.g. a custom model).
target_entities = {
    'PERSON',
    'ORG',
    'DATE',
    'FOOD',
}

In [6]:
# spaCy model name; handed to Pipeline via its `spacy_model` argument
model = 'en_core_web_sm'

### Regexes

In [7]:
# Example regexes: runs of digits, and words that start with a capital letter
NUMBER_RE = r"[0-9]+"
CAPITALIZED_WORD_RE = r"\b[A-Z][a-zA-Z]*\b"

# Each entry is a (regex, label) pair; one regex job is submitted per entry
patterns = [
    (NUMBER_RE, "number"),
    (CAPITALIZED_WORD_RE, "capitalized word"),
]

### Pipeline

In [8]:
# Open the pipeline as a context manager, configured with the chosen spaCy model
with Pipeline(spacy_model=model) as pipeline:
    # One NER job over the full text, passing the entity types of interest
    pipeline.submit_ner_job(text, target_entities)

    # One regex job per (pattern, label) pair, each over the same text
    for pattern, label in patterns:
        pipeline.submit_regex_job(text, pattern, label)

    # Each item yielded by results() is itself iterable; print every match in it.
    # Regex matches print as dicts with type/pattern/label/text/offset keys
    # (see the output below).
    # NOTE(review): how result order relates to submission order is not evident
    # from this cell -- confirm before relying on it.
    for result in pipeline.results():
        for match in result:
            print(match)

{'type': 'pattern', 'pattern': '[0-9]+', 'label': 'number', 'text': '12', 'offset': 98}
{'type': 'pattern', 'pattern': '[0-9]+', 'label': 'number', 'text': '2021', 'offset': 102}
{'type': 'pattern', 'pattern': '[0-9]+', 'label': 'number', 'text': '2', 'offset': 109}
{'type': 'pattern', 'pattern': '[0-9]+', 'label': 'number', 'text': '00', 'offset': 111}
{'type': 'pattern', 'pattern': '[0-9]+', 'label': 'number', 'text': '9', 'offset': 305}
{'type': 'pattern', 'pattern': '[0-9]+', 'label': 'number', 'text': '15', 'offset': 311}
{'type': 'pattern', 'pattern': '[0-9]+', 'label': 'number', 'text': '50', 'offset': 339}
{'type': 'pattern', 'pattern': '[0-9]+', 'label': 'number', 'text': '1', 'offset': 358}
{'type': 'pattern', 'pattern': '[0-9]+', 'label': 'number', 'text': '60', 'offset': 1119}
{'type': 'pattern', 'pattern': '[0-9]+', 'label': 'number', 'text': '90', 'offset': 1257}
{'type': 'pattern', 'pattern': '[0-9]+', 'label': 'number', 'text': '26', 'offset': 1274}
{'type': 'pattern', 