# Extract a Word Document

**The old way:** Open Word, copy-paste text into another tool, manually export images one by one, and lose table formatting in the process.

**The new way:** Upload a `.docx` to Istari, run extraction, and automatically get structured text, individual paragraphs, embedded images, and table snapshots.

This notebook:
1. Navigates the Istari system hierarchy to find the Word document
2. Runs Word data extraction using Microsoft Word 2021 on Windows Server 2022
3. Views the 26 extracted artifacts (text, paragraphs, images, table snapshots)
4. Snapshots the results so the team can see what was extracted

See [`example-output/`](example-output/) for pre-computed results.

In [None]:
# Setup
import sys, json
from pathlib import Path

try:
    import istari_digital_client
except ImportError:
    !pip install istari-digital-client python-dotenv -q

repo_root = str(Path.cwd().parent.parent)
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

from istari_client import get_client

client = get_client()
user = client.get_current_user()
print(f"Connected as: {user.display_name} ({user.email})")

In [None]:
# Explore the system — navigate the Istari hierarchy
#
# This system tracks one Word document (100KB_DOCX.docx).
#
# Istari links:
#   System: https://demo.istari.app/systems/df00ca2e-8295-401a-9e6c-ee46241d15c8

SYSTEM_ID = "df00ca2e-8295-401a-9e6c-ee46241d15c8"  # Example: Extract Word Document
CONFIG_ID = "f5481d4d-c869-4569-8979-a8c7f03442f3"  # Baseline configuration

# System header: name first, description on an indented line below it.
system = client.get_system(SYSTEM_ID)
print(f"System: {system.name}")
print(f"  {system.description}\n")

# Only the first page (up to 50 entries) of configurations is requested.
configs = client.list_system_configurations(SYSTEM_ID, page=1, size=50)
print(f"Configurations ({configs.total}):\n")

for config in configs.items:
    print(f"  {config.name}")
    print(f"    Config ID: {config.id}")

    # Files tracked by this configuration, each with its specifier mode.
    tracked = client.list_tracked_files(config.id, page=1, size=50)
    print(f"    Tracked files ({tracked.total}):")
    for tracked_file in tracked.items:
        print(f"      {tracked_file.file_id} ({tracked_file.specifier_type.value})")

    # First 10 snapshots of this configuration, with their tags and file revisions.
    snapshots = client.list_snapshots(configuration_id=config.id, page=1, size=10)
    print(f"    Snapshots ({snapshots.total}):")
    for snap in snapshots.items:
        tags = client.list_tags(snapshot_id=snap.id, page=1, size=10)
        tag_names = [entry.tag for entry in tags.items]
        if tag_names:
            tag_str = "  [" + ", ".join(tag_names) + "]"
        else:
            tag_str = ""
        revs = client.list_snapshot_revisions(snap.id, page=1, size=50)
        short_id = snap.id[:8]  # abbreviated ID keeps the listing readable
        print(f"      {short_id}...{tag_str}  ({revs.total} file(s))")
        for revision in revs.items:
            print(f"        - {revision.name} ({revision.size / 1024:.1f} KB)")

In [None]:
# Load the model — show file details and revision history

MODEL_ID = "42623b64-26eb-47b6-b562-53a1cd62189c"  # 100KB_DOCX.docx

model = client.get_model(MODEL_ID)
revisions = model.file.revisions

# Fall back to the first revision's file name when the model has no display name.
name = model.display_name or revisions[0].name
print(f"Model: {name}")
print(f"  File ID:   {model.file.id}")
print(f"  Revisions: {len(revisions)}")
for rev in revisions:
    # A revision may lack a version name (or the attribute entirely); show it only when set.
    version_name = getattr(rev, "version_name", None)
    ver = f"  ({version_name})" if version_name else ""
    print(f"    {rev.name} — {rev.size / 1024:.1f} KB{ver}")
print(f"  Artifacts: {len(model.artifacts)}")
print(f"\nTracked in config: {CONFIG_ID[:8]}... (LATEST)")

In [None]:
# Run Word extraction
#
# Sends the .docx to an Istari agent running Microsoft Word.
# Extracts: full text, individual paragraphs, embedded images, and table snapshots.
#
# Docs: https://docs.istaridigital.com/integrations/documents/microsoft_office_word

from datetime import datetime
from time import monotonic, sleep

from istari_digital_client import JobStatusName

POLL_INTERVAL_S = 10        # seconds between status checks
MAX_WAIT_S = 30 * 60        # stop polling after 30 minutes so the cell cannot hang forever
TERMINAL_STATUSES = {JobStatusName.COMPLETED, JobStatusName.FAILED}

print("Submitting Word extraction job...")
job = client.add_job(
    model_id=MODEL_ID,
    function="@istari:extract",
    tool_name="microsoft_office_word",
    tool_version="2021",
    operating_system="Windows Server 2022",
    parameters={},
)
print(f"Job: {job.id}")

# Poll until the job reaches a terminal status or the deadline passes.
# monotonic() is used for the deadline so wall-clock adjustments cannot
# shorten or extend the wait.
deadline = monotonic() + MAX_WAIT_S
while True:
    sleep(POLL_INTERVAL_S)
    job = client.get_job(job.id)
    ts = datetime.now().strftime("%H:%M:%S")
    status = job.status.name.value
    print(f"  [{ts}] {status}")
    if job.status.name in TERMINAL_STATUSES:
        break
    if monotonic() >= deadline:
        print(f"  Stopped waiting after {MAX_WAIT_S // 60} minutes (last status: {status})")
        break

if job.status.name == JobStatusName.COMPLETED:
    print("\nExtraction complete!")
elif job.status.name == JobStatusName.FAILED:
    print("\nExtraction failed!")
else:
    # Reached only when the deadline expired before a terminal status was seen.
    print("\nExtraction did not finish within the wait limit — check the job in Istari.")

In [None]:
# View extraction results
#
# The agent produced 26 artifacts:
#   document.docx, all_text.txt, multiple Par-*.txt files,
#   image1.jpeg, Table-1.jpeg, Table-2.jpeg

# Re-fetch the model so the artifact list reflects the job that just ran.
model = client.get_model(MODEL_ID)
print(f"Artifacts: {len(model.artifacts)}\n")

PREVIEW_CHARS = 500  # cap on the inline JSON preview length

for a in model.artifacts:
    if not a.file.revisions:
        continue  # no stored revision, so there is no name or size to show
    rev = a.file.revisions[0]
    print(f"  {rev.name} ({rev.size / 1024:.1f} KB)")

    # Only artifacts whose file name ends in ".json" (case-insensitive) get a preview.
    _, dot, extension = rev.name.rpartition(".")
    if not dot or extension.lower() != "json":
        continue

    # Show JSON contents inline. The preview is best-effort: a read or parse
    # failure is reported instead of being silently dropped, and the listing
    # continues with the next artifact.
    try:
        data = json.loads(a.read_text())
    except Exception as exc:
        print(f"    (could not preview JSON: {exc})")
    else:
        dumped = json.dumps(data, indent=2)
        print(f"    {dumped[:PREVIEW_CHARS]}")
        if len(dumped) > PREVIEW_CHARS:
            print("    ...")
    print()

In [None]:
# Snapshot: capture state after extraction

from istari_digital_client import NewSnapshot, NewSnapshotTag

print("Creating snapshot: post-extraction...")
snap_response = client.create_snapshot(CONFIG_ID, NewSnapshot())
snapshot = snap_response.actual_instance

# A result without an `id` is treated as "no changes": in that case the first
# snapshot returned by the listing for this configuration is tagged instead.
if not hasattr(snapshot, "id"):
    snaps = client.list_snapshots(configuration_id=CONFIG_ID, page=1, size=1)
    snap_id = snaps.items[0].id
    print(f"  No changes — tagging existing snapshot: {snap_id[:8]}...")
else:
    snap_id = snapshot.id
    print(f"  New snapshot: {snap_id[:8]}...")

# Label the chosen snapshot so it can be found by name later.
client.create_tag(snap_id, NewSnapshotTag(tag="post-extraction"))
print("  Tagged: post-extraction")

# Report how many file revisions the snapshot contains.
revs = client.list_snapshot_revisions(snap_id, page=1, size=50)
print(f"  Files: {revs.total}")

In [None]:
# View system state — all snapshots with tags, files, and sizes


def _format_size(num_bytes):
    """Render a byte count in KB, switching to MB once it exceeds 1024 KB."""
    kilobytes = num_bytes / 1024
    if kilobytes > 1024:
        return f"{kilobytes / 1024:.1f} MB"
    return f"{kilobytes:.1f} KB"


print(f"System: {system.name}")
print(f"  {system.description}\n")

configs = client.list_system_configurations(SYSTEM_ID, page=1, size=50)
for config in configs.items:
    print(f"Configuration: {config.name}")
    print(f"  ID: {config.id}")

    tracked = client.list_tracked_files(config.id, page=1, size=50)
    print(f"  Tracked files: {tracked.total}")

    snapshots = client.list_snapshots(configuration_id=config.id, page=1, size=20)
    print(f"\n  Snapshots ({snapshots.total}):\n")

    # Number the snapshots from 1 in the order the API returns them.
    for position, snap in enumerate(snapshots.items, start=1):
        tags = client.list_tags(snapshot_id=snap.id, page=1, size=10)
        tag_names = [entry.tag for entry in tags.items]
        if tag_names:
            tag_str = ", ".join(tag_names)
        else:
            tag_str = "untagged"
        revs = client.list_snapshot_revisions(snap.id, page=1, size=50)

        print(f"    {position}. [{tag_str}]  ({revs.total} files)")
        for revision in revs.items:
            print(f"       {revision.name} ({_format_size(revision.size)})")
        print()

## Summary

**What we did:**
1. Located the previously uploaded `100KB_DOCX.docx` in the Istari system hierarchy
2. Ran automated extraction — full text, individual paragraphs, embedded images, and table snapshots
3. Viewed all 26 artifacts without opening Word
4. Snapshotted the extraction results for the team

### Version History

| # | Snapshot Tag | Files | What happened |
|---|-------------|-------|---------------|
| 1 | `initial-upload` | 1 | Raw 100KB_DOCX.docx uploaded |
| 2 | `post-extraction` | 27 | Original .docx + 26 extracted artifacts |

### What Istari Did

| Step | Inner Loop (Microsoft Word 2021 on Windows Server 2022) | Outer Loop (Istari) |
|------|--------------------------------------------------------|---------------------|
| Upload document | — | Stored .docx, tracked as LATEST |
| Extract data | Opened document in Word, exported full text, split into per-paragraph files, exported embedded images as JPEG, rendered tables as JPEG snapshots | Ran job on Windows agent, stored all 26 artifacts |
| Snapshot | — | Captured point-in-time state with all 27 files |

### What's Next?

- **Search paragraphs** — iterate the `Par-*.txt` files to find specific sections without loading the full document
- **Extract images** — use the `image*.jpeg` files for downstream vision analysis or archiving
- **Compare revisions** — upload an updated .docx, re-extract, and diff paragraph text across versions