In [None]:
from pathlib import Path

import spacy
from rich.console import Console

import blacktape
from blacktape.lib import chunks
from blacktape.pipeline import Pipeline

In [None]:
# Shared rich console; the pipeline cell further down uses it to show a status
# message while jobs are being submitted.
console = Console()

In [None]:
# Version of the imported blacktape package (bare expression, so the notebook
# shows it as this cell's output).
blacktape.__version__

### Test Data

In [None]:
# Input text for the demo. The path is relative, so it resolves against the
# kernel's current working directory.
test_file = Path('../tests/data/Crime_and_Punishment.txt')

### Chunking Options

In [None]:
# Language model used to find sentence boundaries while chunking
model = "en_core_web_sm"

# Load the model with the "parser" component disabled, then switch on the
# "senter" component.
nlp = spacy.load(
    model,
    disable=["parser"],
)
nlp.enable_pipe("senter")

In [None]:
# Keyword arguments unpacked into chunks() when the test file is read
file_read_options = dict(
    nlp=nlp,
    # Value in characters (not bytes) to stay below spaCy's max doc size of
    # 1_000_000 characters by default
    max_chunk_size=10_000,
    encoding="UTF-8",
    errors="ignore",
    newline='',  # To preserve line endings
)

### Target Entities

In [None]:
# Entity types we're interested in; this set is passed along with every NER
# job submitted to the pipeline.
target_entities = {
    'PERSON',
    'ORG',
}

### Feed chunks into a pipeline

In [None]:
# Pipeline model doesn't necessarily have to be the model used for chunking
# (this cell simply reuses the same model name).
with Pipeline(spacy_model=model) as pipeline:

    # Split the test file into chunks and submit each chunk to the pipeline as
    # an NER job, passing target_entities along. A status message is shown on
    # the console while this loop runs.
    # NOTE(review): `status` is bound here but never used in this cell.
    with console.status("[bold green]Chunking file and submitting jobs...") as status:
        for chunk in chunks(test_file, **file_read_options):
            pipeline.submit_ner_job(chunk, target_entities)

    # After the submission loop has finished (and still inside the Pipeline
    # context), iterate over the pipeline's results and print every match.
    for result in pipeline.results():
        for match in result:
            print(match)