In [None]:
import logging
from pathlib import Path

import blacktape
from blacktape.lib import match_entities_in_text, match_patterns_in_text, get_entities_for_spacy_model

# Logger for this notebook; the entity-validation cell further down reports unknown labels through it.
logger = logging.getLogger(__name__)

In [None]:
# Show which blacktape version this notebook runs against
# (a bare last expression is displayed as the cell's output).
blacktape.__version__

### spaCy Setup

In [None]:
# Shell escape: invoke the `spacy-model` CLI to install the model that the "Entities" section uses.
# NOTE(review): assumes the `spacy-model` command is on PATH in the kernel's environment — confirm.
!spacy-model install en_core_web_sm

### Test Data

In [None]:
# Sample input document; the relative path is resolved against the kernel's working directory.
test_file = Path('../tests/data/sample.txt')

In [None]:
# Read the whole sample document into a single string, decoding it as UTF-8.
text = test_file.read_text(encoding='UTF-8')
# print(text)  # uncomment to inspect the raw input

### Entities

In [None]:
# Entity labels we want to find in the text.
# NOTE(review): 'FOOD' is presumably not a label of the model used below and is
# here to exercise the validation cell — confirm.
target_entities = {
    'PERSON',
    'ORG',
    'DATE',
    'FOOD',
}

In [None]:
# spaCy model name; the same model that the "spaCy Setup" section installs
model = 'en_core_web_sm'

In [None]:
# List known entity types for this model and display them.
# NOTE(review): the container type returned by get_entities_for_spacy_model is not
# visible here; the validation cell only relies on it supporting `in` — confirm.
known_entities = get_entities_for_spacy_model(model)
print(known_entities)

In [None]:
# Validate our target entities: warn about every requested label the model does not know.
# Iterating in sorted order keeps the warnings in the same order on every run
# (the iteration order of a set of strings can change between kernel restarts).
for label in sorted(target_entities):
    if label not in known_entities:
        # Lazy %-style arguments: the message is only formatted if the record is actually emitted.
        logger.warning("Unknown entity type: %s", label)

In [None]:
# Look for the target entity types in the text using the named model.
# The result is iterated and printed in the next cell.
ents = match_entities_in_text(text, model, target_entities)

In [None]:
# Print each entity match on its own line.
for ent in ents:
    print(ent)

### Regexes

In [None]:
# Example (regex, label) pairs to search for in the text.
patterns = [
    # one or more consecutive ASCII digits
    (r"[0-9]+", "number"),
    # a whole word that starts with an ASCII uppercase letter
    (r"\b[A-Z][a-zA-Z]*\b", "capitalized word"),
]

In [None]:
# Apply the (regex, label) pairs to the text and print each result on its own line.
for res in match_patterns_in_text(text, patterns):
    print(res)