In [None]:
from datetime import datetime
from pathlib import Path

import spacy
from spacy_model_manager.lib import get_installed_model_version
from rich.console import Console

from blacktape.db import db_init, db_session
from blacktape.lib import chunks
from blacktape.models import Match, FileReport
from blacktape.pipeline import Pipeline
from blacktape.util import record_workflow_config

### Test Data

In [None]:
# Sample document to run the pipeline on, stored as an absolute path.
# Alternative sample: Path("../tests/data/birds_and_bees.txt")
test_file = Path("../tests/data/newport_bermuda.txt").resolve()

In [None]:
# Entity labels to extract; handed to every NER job submitted by the pipeline
target_entities = {
    "PERSON",
    "GPE",
}

In [None]:
# Trivial regular expressions, each paired with the label that is submitted
# alongside it to the regex jobs
four_digit_number = (r"[0-9]{4}", "4-digit number")  # no word boundaries: also matches inside longer digit runs
acronym = (r"\b[A-Z]{2,}\b", "acronym")  # whole words made of two or more capital letters

patterns = [four_digit_number, acronym]

### Setup

In [None]:
# Rich console, used further down to display a status message while jobs are being submitted
console = Console()

#### spaCy

In [None]:
# Use spaCy model en_core_web_sm for both text chunking and entity extraction:
# the same name is loaded into `nlp` (given to the chunker) and passed to the Pipeline
model = "en_core_web_sm"

In [None]:
# Make sure the model is installed
# (shells out to the `spacy-model` command-line tool; presumably a no-op when the
# model is already present — TODO confirm)
!spacy-model install {model}

In [None]:
# Load the model with the "parser" component disabled
nlp = spacy.load(model, disable=["parser"])
# Enable the "senter" component — presumably so sentence boundaries are still
# available for chunking without the parser (TODO confirm against the chunker)
nlp.enable_pipe("senter")

#### Text processing

In [None]:
# Keyword arguments forwarded to chunks() when the source file is split up
chunking_options = dict(
    nlp=nlp,
    # Value in characters (not bytes) to stay below spaCy's max doc size
    # of 1_000_000 characters by default
    max_chunk_size=10_000,
)

# Keyword arguments used whenever the source text file is opened for reading
text_file_open_options = dict(
    encoding="UTF-8",
    errors="ignore",  # undecodable bytes are skipped instead of raising
    newline="",  # To preserve line endings
)

#### DB output

In [None]:
SQLITE3_FILENAME_TEMPLATE = "{}_{}.sqlite3"

# Directory of sqlite3 files, created under the current working directory if missing
results_root = Path.cwd()
out_dir = results_root / "results"
out_dir.mkdir(exist_ok=True)

# Compact timestamp for this pipeline run, e.g. 20240131T235959
# (an ISO 8601 timestamp to the second, without the "-" and ":" separators)
run_timestamp = datetime.now().strftime("%Y%m%dT%H%M%S")

# DB file named after the source file and timestamped for this pipeline run
db_file = out_dir / SQLITE3_FILENAME_TEMPLATE.format(test_file.name, run_timestamp)

# DB setup
Session = db_init(db_file)

#### Redacting

In [None]:
# Every redaction is rendered as the same fixed-length run of U+2588 characters,
# whatever the length of the text it replaces
redacted_block_char = "\u2588"
redacted_block_length = 8

redacted_block = "".join([redacted_block_char] * redacted_block_length)

### Pipeline with DB session

In [None]:
%%time

with Pipeline(spacy_model=model) as pipeline, db_session(Session) as session:

    # The source path is recorded and passed around as a plain string
    source_path = str(test_file)

    # Record source file info
    file_report = FileReport(path=source_path)
    session.add(file_report)

    # Record workflow parameters
    record_workflow_config(
        session,
        source=source_path,
        spacy_model=model,
        spacy_model_version=get_installed_model_version(model),
    )

    # Submit jobs
    with console.status("[bold green]Chunking file and submitting jobs...") as status:

        # Cumulative string length of the chunks submitted so far, i.e. the
        # offset of the current chunk within the whole document
        base_offset = 0

        for text_chunk in chunks(test_file, **chunking_options, **text_file_open_options):

            # One NER extraction job per chunk
            pipeline.submit_ner_job(text_chunk, target_entities, base_offset=base_offset, file=source_path)

            # One regex matching job per pattern per chunk
            for regex, regex_label in patterns:
                pipeline.submit_regex_job(text_chunk, regex, regex_label, base_offset=base_offset, file=source_path)

            base_offset += len(text_chunk)

    # Process job results as they become available
    for job_matches in pipeline.results():

        for match in job_matches:
            # Turn the chunk-relative offset into an offset in the document
            match["offset"] += match.pop("base_offset")

            # Store the result as a Match row attached to the file report
            session.add(Match(**match, file_report=file_report))

### Examine DB output

#### Direct DB queries

In [None]:
# Match count
!sqlite3 -box {db_file} "select count(*) as match_count from match;"

In [None]:
# All matches
!sqlite3 -box {db_file} "select * from match;"

In [None]:
# Workflow config
!sqlite3 -box {db_file} "select * from configuration;"

In [None]:
# Source file
!sqlite3 -box {db_file} "select * from file_report;"

#### ORM queries

In [None]:
from dataclasses import dataclass

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [None]:
@dataclass
class Block:
    """Span of characters to redact, expressed as offsets into the source text.

    Instances are mutable: the merge step in this notebook extends ``end`` in place.
    """
    start: int  # offset of the first character of the span
    end: int  # offset just past the last character (built as start + len(text))

In [None]:
# Open an ORM session on this run's results database for the queries below
engine = create_engine(f"sqlite:///{db_file}")
make_session = sessionmaker(bind=engine)
session = make_session()

In [None]:
# Blocks of text that will need redacting, ordered by start offset.
# Each block covers the characters from match.offset up to (not including)
# match.offset + len(match.text).
blocks = [
    Block(start=match.offset, end=match.offset + len(match.text))
    for match in session.query(Match).order_by(Match.offset)
]

# Merge overlapping or contiguous blocks in one forward pass.
#
# Because the blocks are sorted by start offset, comparing each block with the
# last merged block is enough. Merging backwards pairwise is not: a long block
# followed by several short ones it contains, e.g. (0, 22), (0, 4), (18, 22),
# would only absorb its immediate neighbour and leave (18, 22) overlapping it.
#
# The "+ 1" keeps the original threshold: blocks separated by a single
# character are merged too, so that character ends up redacted as well.
merged_blocks = []
for block in blocks:
    if merged_blocks and block.start <= merged_blocks[-1].end + 1:
        merged_blocks[-1].end = max(merged_blocks[-1].end, block.end)
    else:
        merged_blocks.append(block)
blocks = merged_blocks

len(blocks)

In [None]:
# The redacted copy is written next to the source file, with "_redacted"
# inserted between the file's stem and its suffix
redacted_name = f"{test_file.stem}_redacted{test_file.suffix}"
redacted_file = test_file.parent / redacted_name

In [None]:
# Copy the source text to the redacted file, replacing every block with the
# fixed-size redaction marker.
#
# The destination is opened with newline="" as well: the source is read with
# newline="" to preserve its line endings, so they must be written back
# untranslated (otherwise "\r\n" would become "\r\r\n" on Windows).
with test_file.open(mode="r", **text_file_open_options) as src, redacted_file.open(
    mode="w", encoding="UTF-8", newline=""
) as dest:

    # Characters read so far from source file (cannot rely on file seek/tell in text mode)
    chars_read = 0

    for block in blocks:

        # Block lies entirely inside text that was already redacted: nothing left to do
        if block.end <= chars_read:
            continue

        # A block may start before the current position if it overlaps the
        # previous one; never compute a negative read length, because
        # read() with a negative size returns the whole rest of the file
        redact_from = max(block.start, chars_read)

        # Write up to the block to redact
        dest.write(src.read(redact_from - chars_read))

        # Skip block (the text is read and discarded)
        src.read(block.end - redact_from)
        chars_read = block.end

        # Write a redacted block
        dest.write(redacted_block)

    # Write remaining text past the last block
    dest.write(src.read())

In [None]:
# Close the ORM session that was opened for the queries above
session.close()