In [1]:
import logging
from pathlib import Path

import blacktape
from blacktape.pipeline import Pipeline

# Logger named after this module's `__name__`; none of the cells shown here log through it.
logger = logging.getLogger(__name__)

In [2]:
# Display the installed blacktape version so the outputs below can be tied to it.
blacktape.__version__

'0.0.1.dev13'

### Test Data

In [3]:
# Sample input document; the path is relative, so it resolves against the
# kernel's current working directory.
test_file = Path('../tests/sample.txt')

In [4]:
# Read the whole sample file into a single string, decoding it as UTF-8.
text = test_file.read_text(encoding='UTF-8')
# Uncomment to inspect the raw input:
# print(text)

### Entities

In [5]:
# Named-entity labels to look for; passed to `submit_ner_job` together with the text.
# NOTE(review): 'FOOD' does not appear among the NER labels documented for
# the `en_core_web_sm` model selected below -- confirm it is intentional.
target_entities = {
    'PERSON',
    'ORG',
    'DATE',
    'FOOD',
}

In [6]:
# Name of the spaCy model; passed to `Pipeline` as `spacy_model` in the Pipeline section.
model = 'en_core_web_sm'

### Regexes

In [7]:
# Example regexes; each one is submitted to the pipeline as its own regex job.
patterns = [
    r"[0-9]+",                # runs of one or more ASCII digits
    r"\b[A-Z][a-zA-Z]*\b",    # whole words that start with an uppercase ASCII letter
]

### Pipeline

In [8]:
# Run one NER job and one regex job per pattern over the same text, then
# print every match the pipeline reports.
with Pipeline(spacy_model=model) as pipeline:
    # Queue the entity-recognition job first, then the regex jobs in list order.
    pipeline.submit_ner_job(text, target_entities)
    for pattern in patterns:
        pipeline.submit_regex_job(text, pattern)

    # Flatten the per-job results lazily so matches are printed in the same
    # order as they are yielded by `pipeline.results()`.
    all_matches = (
        found
        for job_result in pipeline.results()
        for found in job_result
    )
    for match in all_matches:
        print(match)

{'pattern': '[0-9]+', 'text': '12', 'offset': 98}
{'pattern': '[0-9]+', 'text': '2021', 'offset': 102}
{'pattern': '[0-9]+', 'text': '2', 'offset': 109}
{'pattern': '[0-9]+', 'text': '00', 'offset': 111}
{'pattern': '[0-9]+', 'text': '9', 'offset': 305}
{'pattern': '[0-9]+', 'text': '15', 'offset': 311}
{'pattern': '[0-9]+', 'text': '50', 'offset': 339}
{'pattern': '[0-9]+', 'text': '1', 'offset': 358}
{'pattern': '[0-9]+', 'text': '60', 'offset': 1119}
{'pattern': '[0-9]+', 'text': '90', 'offset': 1257}
{'pattern': '[0-9]+', 'text': '26', 'offset': 1274}
{'pattern': '[0-9]+', 'text': '24', 'offset': 1697}
{'pattern': '[0-9]+', 'text': '1', 'offset': 1710}
{'pattern': '[0-9]+', 'text': '6421', 'offset': 1944}
{'pattern': '[0-9]+', 'text': '29', 'offset': 1961}
{'pattern': '[0-9]+', 'text': '8', 'offset': 1993}
{'pattern': '[0-9]+', 'text': '95', 'offset': 2004}
{'pattern': '[0-9]+', 'text': '9', 'offset': 2559}
{'pattern': '[0-9]+', 'text': '15', 'offset': 2565}
{'pattern': '[0-9]+', '