In [None]:
from pathlib import Path
from pprint import pprint

import spacy
from rich.console import Console

import blacktape
from blacktape.lib import chunks
from blacktape.pipeline import Pipeline

In [None]:
# Shared rich console; used further down via console.status(...) while jobs are submitted
console = Console()

In [None]:
# Display the version of the imported blacktape package as the cell output
blacktape.__version__

### Test Data

In [None]:
# Sample input document; the path is relative, so it resolves against the
# notebook's current working directory
test_file = Path('../tests/data/Crime_and_Punishment.txt')

### Chunking Options

In [None]:
# Pick a language model to detect sentences
# (the same model name is reused when the Pipeline is created further down)
model = "en_core_web_sm"
# Load the model with its "parser" component disabled, then switch on "senter".
# NOTE(review): presumably this makes sentence boundaries come from the senter
# component instead of the parser — confirm against the spaCy docs.
nlp = spacy.load(model, disable=["parser"])
nlp.enable_pipe("senter")

In [None]:
# Keyword arguments forwarded to chunks() when the test file is read
file_read_options = dict(
    nlp=nlp,
    # Measured in characters (not bytes); kept far under spaCy's default
    # maximum document size of 1_000_000 characters
    max_chunk_size=10_000,
    encoding="UTF-8",
    errors="ignore",
    # Empty string so that line endings are preserved
    newline='',
)

### Target Entities

In [None]:
# Entity types we're interested in; this set is handed to every NER job
# submitted to the pipeline below (presumably spaCy entity labels — confirm)
target_entities = {'PERSON', 'ORG'}

### Feed chunks into a pipeline

In [None]:
# The pipeline is handed the same model name used for chunking here,
# but it does not have to be the same model
with Pipeline(spacy_model=model) as pipeline:

    matches = []

    with console.status("[bold green]Chunking file and submitting jobs...") as status:

        source_path = str(test_file.resolve())
        chars_submitted = 0  # Combined length of every chunk submitted so far

        for text_chunk in chunks(test_file, **file_read_options):
            # Queue an NER job for this chunk, tagged with the position
            # at which the chunk starts and the file it came from
            pipeline.submit_ner_job(
                text_chunk,
                target_entities,
                base_offset=chars_submitted,
                file=source_path,
            )
            chars_submitted += len(text_chunk)

    # Consume match results as they become available
    for job_matches in pipeline.results():
        for hit in job_matches:
            # Convert the chunk-relative offset into a document-wide offset
            hit["offset"] += hit.pop("base_offset")
            matches.append(hit)

### Validate match results

In [None]:
# Load the entire text in memory.
# The file is decoded with the very same options that were given to chunks()
# (encoding, error handling, newline mode) so that the character offsets
# reported by the pipeline line up with indices into `full_text`.
with test_file.open(
    mode="r",
    encoding=file_read_options["encoding"],
    errors=file_read_options["errors"],
    newline=file_read_options["newline"],
) as f:
    full_text = f.read()

for match in matches:
    # View match result from the pipeline
    pprint(match)

    # Check what's in the original text at that offset
    start = match["offset"]
    end = start + len(match["text"])
    text_extract = full_text[start:end]

    print(text_extract, end="")

    # Fail loudly (AssertionError) on the first mismatch, saying what was
    # expected and what was actually found at that position
    assert match["text"] == text_extract, (
        f"Offset {start}: expected {match['text']!r}, found {text_extract!r}"
    )
    print(" ✅")

    print("=======")