# CaPo: Evaluating Large Language Models for Strategic Knowledge Extraction in Capability-Based Planning

## Extraction with LLMs

#### Environment and imports

In [3]:
import os
import sys
import json
from enum import Enum
from pathlib import Path

from dotenv import load_dotenv
try:
    from tqdm.notebook import tqdm
except Exception:
    from tqdm import tqdm

def get_project_root(start=None, markers=("scripts", ".git")) -> Path:
    """Find the project's root directory by looking for a known file/folder.

    The search begins at ``start`` and walks upwards through every ancestor,
    including the filesystem root, returning the first directory that contains
    at least one of ``markers``.

    Args:
        start: Directory to start searching from. Defaults to the current
            working directory.
        markers: Names of files/folders whose presence identifies the project
            root. Defaults to ``("scripts", ".git")``.

    Returns:
        The first matching directory, or the starting directory itself when no
        ancestor contains a marker (fallback).
    """
    # Path.cwd() is already absolute; an explicit start is resolved so that a
    # relative path still exposes all of its ancestors via `.parents`.
    origin = Path.cwd() if start is None else Path(start).resolve()

    # `parents` ends with the filesystem root, so the root is checked as well.
    for candidate in (origin, *origin.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return origin  # Fallback

# Resolve the project root once; the data and output directories are built from it.
PROJECT_ROOT = get_project_root()

# Put the root at the front of sys.path (only if it is not already listed) so
# the project packages imported in the next cell can be found.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

print("Project root:", PROJECT_ROOT)

# Load environment variables from the project's own .env when it exists;
# otherwise call load_dotenv() without an explicit path.
dotenv_path = PROJECT_ROOT / ".env"
if not dotenv_path.exists():
    load_dotenv()
else:
    load_dotenv(dotenv_path)

Project root: C:\Users\kolkhc\Coding\CaPo\git_code\capo


#### Project modules imports

In [4]:
from model.extractor import LLMExtractor
from schemas.capability import Capabilities
from utils.formatter import build_document_annotation, append_document_annotation
from utils.io import save_json_to_output

#### Config (models, providers, files to analyze)

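We experimented with several models from two providers: Azure and Ollama. To try your own models, add them to both the `Model` enum (the identifier passed to the extractor) and the `ModelName` enum (a short label used for the annotator id and the output file name). The documents to analyze should be saved as `.txt` files in a folder called `/data` located at the same level as this notebook, and listed by base name (without the extension) in `filenames`.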

In [5]:
class Model(str, Enum):
    """Model identifiers that are passed to ``LLMExtractor`` as ``model``.

    Each member is a ``str`` subclass instance, so it compares equal to its
    raw value and can be looked up with ``Model("<value>")``.
    """

    GPT_4_1 = "gpt-4.1"
    GPT_4_1_mini = "gpt-4.1-mini"
    GPT_5_mini = "gpt-5-mini"
    GEMMA_3 = "hf.co/bartowski/google_gemma-3-27b-it-GGUF:Q8_0"
    MISTRAL_SMALL = (
        "hf.co/bartowski/mistralai_Mistral-Small-3.1-24B-Instruct-2503-GGUF:Q8_0"
    )
    QWEN_3 = "hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0"

    def __str__(self) -> str:
        """Return the raw identifier.

        Without this override ``str()`` yields ``"Model.GPT_4_1"`` and
        f-string formatting differs between Python versions (value up to
        3.10, ``"Model.GPT_4_1"`` from 3.11 on). Returning the value makes
        both stable.
        """
        return self.value

class ModelName(str, Enum):
    """Short, human-readable model labels.

    The selected member is interpolated into the annotator id and the output
    file name, so its string form must be the plain value.
    """

    GPT_4_1 = "gpt-4.1"
    GPT_4_1_mini = "gpt-4.1-mini"
    GPT_5_mini = "gpt-5-mini"
    GEMMA_3 = "gemma-3-27b"
    MISTRAL_SMALL = "mistral-small"
    QWEN_3 = "qwen3-4b"

    def __str__(self) -> str:
        """Return the raw label.

        Without this override ``str()`` yields ``"ModelName.GPT_4_1"`` and
        f-string formatting differs between Python versions (value up to
        3.10, ``"ModelName.GPT_4_1"`` from 3.11 on). Returning the value
        makes both stable.
        """
        return self.value


class Provider(str, Enum):
    """Providers that can be passed to ``LLMExtractor`` as ``provider``.

    The selected member is also interpolated into the annotator id and the
    output file name, so its string form must be the plain value.
    """

    AZURE = "azure"
    OLLAMA = "ollama"

    def __str__(self) -> str:
        """Return the raw provider name.

        Without this override ``str()`` yields ``"Provider.AZURE"`` and
        f-string formatting differs between Python versions (value up to
        3.10, ``"Provider.AZURE"`` from 3.11 on). Returning the value makes
        both stable.
        """
        return self.value


# Choose defaults (you can tweak interactively):
# `model` and `provider` are passed to LLMExtractor; `model_name` is the short
# label that ends up in the annotator id and the output file name.
provider = Provider.AZURE
model = Model.GPT_5_mini
model_name = ModelName.GPT_5_mini

# Input and output locations, both relative to the project root.
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("output", "llm_annotations")

# Base names of the documents to annotate; ".txt" is appended when they are read.
filenames = ["document_1_capiciteitsdruk", "document_2_cybercrime"]

#### Load capabilities JSON

The capabilities should be saved in the `/data` folder at the same level as this notebook. The file must be named `capabilities.json` and have the structure shown below; the top-level `capabilities` key is required, and validation fails without it:
```json
{
  "capabilities": [
    {
      "name": "<Capability_Name>",
      "definition": "<Capability_Definition>",
      "category": "<Capability_Category>"
    }
  ]
}
```
For more information see the capability schema in `/schemas/capability.py`.

In [8]:
# Read the raw capability definitions from DATA_DIR.
capabilities_path = DATA_DIR.joinpath("capabilities.json")
with capabilities_path.open(mode="r", encoding="utf-8") as f:
    capabilities_json = json.loads(f.read())

# Validate against the project schema. A pydantic ValidationError is raised
# when the data does not match it, e.g. when the top-level "capabilities" key
# is missing.
capabilities = Capabilities.model_validate(capabilities_json)

ValidationError: 1 validation error for Capabilities
capabilities
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing

#### Core extraction loop

In [None]:
# Identifier under which every document annotation of this run is stored.
# `.value` is used explicitly: formatting a `(str, Enum)` member directly in an
# f-string yields the value on Python <= 3.10 but "Provider.AZURE"-style names
# on Python >= 3.11, which would make the id depend on the interpreter version.
annotator_id = f"{provider.value}_{model_name.value}"

# `bundle` starts empty and is replaced by the return value of
# append_document_annotation after each document.
bundle = None
run_input_tokens = 0
run_output_tokens = 0

for filename in tqdm(filenames, desc="Annotating", leave=False):
    print(f"\nAnnotating {filename}...")
    data_path = DATA_DIR / f"{filename}.txt"
    document = data_path.read_text(encoding="utf-8")

    # A fresh extractor per document: its token totals are added to the run
    # totals below, so reusing one instance could count tokens twice.
    extractor = LLMExtractor(model=model, provider=provider)

    # 1) Strategies
    strategies = extractor.extract_strategies(document=document, temperature=0.2)
    # 2) Goals (extraction receives the strategies found in step 1)
    goals = extractor.extract_goals(
        document=document, temperature=0.1, strategies=strategies
    )
    # 3) Trends (extraction receives both strategies and goals)
    trends = extractor.extract_trends(
        document=document, temperature=0.1, strategies=strategies, goals=goals
    )
    # 4) Grouping
    grouped_strategies = extractor.group_strategies(
        document=document, temperature=0.2, strategies=strategies
    )
    grouped_goals = extractor.group_goals(
        document=document, temperature=0.2, goals=goals
    )
    grouped_trends = extractor.group_trends(
        document=document, temperature=0.2, trends=trends
    )
    # 5) Strategy ↔ Goal links
    strategy_goal_relations = extractor.link_strategies_and_goals(
        document=document, temperature=0.2, strategies=strategies, goals=goals
    )
    # 6) Strategy ↔ Trend links
    strategy_trend_relations = extractor.link_strategies_and_trends(
        document=document, temperature=0.2, strategies=strategies, trends=trends
    )
    # 7) Strategy ↔ Capability links (uses the capabilities loaded earlier)
    strategy_capability_relations = extractor.link_strategies_and_capabilities(
        document=document,
        temperature=0.2,
        strategies=strategies,
        capabilities=capabilities,
    )

    # getattr with a default keeps the loop working when the extractor does
    # not expose token counters.
    run_input_tokens += getattr(extractor, "input_tokens_total", 0)
    run_output_tokens += getattr(extractor, "output_tokens_total", 0)

    document_annotation = build_document_annotation(
        document_id=filename,
        strategies=strategies,
        goals=goals,
        trends=trends,
        grouped_strategies=grouped_strategies,
        grouped_goals=grouped_goals,
        grouped_trends=grouped_trends,
        strategy_goal_relations=strategy_goal_relations,
        strategy_trend_relations=strategy_trend_relations,
        strategy_capability_relations=strategy_capability_relations,
    )

    bundle = append_document_annotation(
        bundle=bundle,
        annotator_id=annotator_id,
        document_annotation=document_annotation,
    )

print("\nRun complete.")
print(f"TOTAL input tokens:  {run_input_tokens}")
print(f"TOTAL output tokens: {run_output_tokens}")
print(f"TOTAL tokens:        {run_input_tokens + run_output_tokens}")

#### Save output to JSON

In [None]:
# Serialise the accumulated annotations; ensure_ascii=False writes non-ASCII
# characters as-is instead of \u escapes.
result_json = json.dumps(bundle, indent=2, ensure_ascii=False)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Build the file stem once from the enum values so that the saved file and the
# message below always agree. Formatting the enum members directly is
# version-dependent (value on Python <= 3.10, "Provider.AZURE" on >= 3.11),
# and str() differs from f-string formatting on older versions.
output_stem = f"{provider.value}_{model_name.value}"

save_json_to_output(
    filename=output_stem,
    json_str=result_json,
    foldername="output",
)

# NOTE(review): the file is saved with foldername="output" while this message
# reports OUTPUT_DIR (<root>/output/llm_annotations) — confirm where
# save_json_to_output actually writes.
print(f"Saved to: {OUTPUT_DIR / (output_stem + '.json')}")