# Enrichment API Experiments

This notebook lets you explore the external data sources used by the German Verb Master enrichment pipeline without running the application server.

## Requirements

Install the packages below in your Python environment before running the notebook:

- Python 3.10 or newer
- [`requests`](https://pypi.org/project/requests/) for HTTP calls
- [`python-dotenv`](https://pypi.org/project/python-dotenv/) *(optional, load API keys from a `.env` file)*
- [`pydantic`](https://pypi.org/project/pydantic/) *(optional, validate parsed payloads)*

Environment variables (only needed for the OpenAI helper cell):

- `OPENAI_API_KEY` — secret token for the Chat Completions API

All other data sources are public and can be queried without credentials.

In [23]:
import json
import re
from dataclasses import dataclass, field
from html import unescape
from typing import Any, Dict, List, Optional, Tuple

import requests

# Default headers used by prepare_request() when the caller passes no headers.
REQUEST_HEADERS = {
    "Accept": "application/json",
    "User-Agent": "GermanVerbMaster/1.0 (data enrichment experiments)",
}

# Endpoints of the external services queried by the lookup helpers below.
WIKTIONARY_API = "https://de.wiktionary.org/w/api.php"
OPEN_THESAURUS_API = "https://www.openthesaurus.de/synonyme/search"
MY_MEMORY_API = "https://api.mymemory.translated.net/get"
TATOEBA_API = "https://tatoeba.org/en/api_v0/search"
# Base of the Kaikki prefix search; "/<encoded prefix>.json" is appended per request.
WIKTEXTRACT_SEARCH_API = "https://kaikki.org/dewiktionary/search/start"
# Literal text searched for in a Kaikki detail page; the raw JSON is read from
# the first <pre> element found after it.
WIKTEXTRACT_RAW_MARKER = "[Show JSON for raw wiktextract data ▼]"
OPENAI_CHAT_COMPLETIONS = "https://api.openai.com/v1/chat/completions"

@dataclass
class WiktionaryVerbForms:
    """Verb forms collected by extract_verb_forms(); unknown forms stay None."""

    # Value of the template's Präteritum argument (first alternative only).
    praeteritum: Optional[str] = None
    # Value of the template's Partizip II argument (first alternative only).
    partizip_ii: Optional[str] = None
    # Perfekt phrase; may be synthesised as "ist"/"hat" + partizip_ii.
    perfekt: Optional[str] = None
    # Auxiliary verb, normalised to "haben" or "sein".
    aux: Optional[str] = None

@dataclass
class WiktionaryLookup:
    """Parsed result of lookup_wiktionary_summary()."""

    # Short text built from the start of the page extract, or None if empty.
    summary: Optional[str]
    # Titles of the page's English language links.
    english_hints: List[str]
    # Verb forms parsed from the page's wikitext, if any were found.
    forms: Optional[WiktionaryVerbForms]

@dataclass
class SynonymLookup:
    """Parsed result of lookup_open_thesaurus_synonyms()."""

    # De-duplicated synonym terms (the lookup keeps at most ten).
    synonyms: List[str]

@dataclass
class TranslationLookup:
    """Parsed result of lookup_translation()."""

    # Translated text.
    translation: Optional[str]
    # Name of the service that produced the translation.
    source: str
    # Score copied from the match "quality", or the response "match" value
    # multiplied by 100; None when no numeric score was available.
    confidence: Optional[float]

@dataclass
class ExampleLookup:
    """Bilingual example sentence returned by the example-producing lookups."""

    # German sentence.
    example_de: Optional[str]
    # English rendering of the sentence.
    example_en: Optional[str]
    # Name of the service the example came from (e.g. "tatoeba.org").
    source: str

@dataclass
class WiktextractLookup:
    """Parsed result of lookup_wiktextract()."""

    # English translations of the lemma (the lookup keeps at most ten).
    translations: List[str]
    # Synonym words listed in the raw entry.
    synonyms: List[str]
    # Example sentence picked from the entry's senses, if any.
    example: Optional[ExampleLookup]

@dataclass
class RequestRecord:
    """Printable snapshot of an HTTP request, as built by prepare_request()."""

    # HTTP method, e.g. "GET" or "POST".
    method: str
    # Final URL of the prepared request (query string included).
    url: str
    # Query parameters as passed by the caller ({} when none).
    params: Dict[str, Any]
    # Headers of the prepared request.
    headers: Dict[str, Any]
    # Request body decoded to text, or None when the request has no body.
    body: Optional[str] = None

@dataclass
class ProviderResponse:
    """Bundle returned by every lookup helper: request, raw payload, parsed result."""

    # The request whose response is described by this object.
    request: RequestRecord
    # Unprocessed payload (decoded JSON, HTML text, or None).
    raw: Any
    # Provider-specific parsed result, or None when nothing usable was found.
    parsed: Any
    # Auxiliary requests issued before the main one (used by lookup_wiktextract).
    history: List[RequestRecord] = field(default_factory=list)

# Shared HTTP session used by prepare_request() and send_request().
session = requests.Session()


def prepare_request(method: str, url: str, *, params: Optional[Dict[str, Any]] = None,
                    headers: Optional[Dict[str, str]] = None, json_body: Any = None) -> Tuple[RequestRecord, requests.PreparedRequest]:
    """Prepare a request on the shared session without sending it.

    Args:
        method: HTTP method name.
        url: Target URL (without the query string).
        params: Optional query parameters.
        headers: Optional headers; ``REQUEST_HEADERS`` is used when omitted or empty.
        json_body: Optional object serialised as the JSON request body.

    Returns:
        A ``(record, prepared)`` pair: a printable snapshot of the request and
        the prepared request ready for ``send_request``.
    """
    effective_headers = headers if headers else REQUEST_HEADERS
    prepared = session.prepare_request(
        requests.Request(
            method=method,
            url=url,
            params=params,
            headers=effective_headers,
            json=json_body,
        )
    )

    # The prepared body may be bytes (e.g. encoded JSON); record it as text.
    payload = prepared.body
    if isinstance(payload, bytes):
        payload = payload.decode("utf-8")

    snapshot = RequestRecord(
        method=method,
        url=prepared.url,
        params=params if params else {},
        headers=dict(prepared.headers),
        body=payload,
    )
    return snapshot, prepared


def send_request(prepared: requests.PreparedRequest, timeout: float = 20.0) -> requests.Response:
    """Send a prepared request through the shared session.

    Args:
        prepared: Request produced by ``prepare_request``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout an unresponsive host would block the notebook indefinitely.

    Returns:
        The response, guaranteed to carry a non-error status code.

    Raises:
        requests.HTTPError: If the server answered with a 4xx/5xx status.
        requests.Timeout: If the server did not answer within ``timeout``.
    """
    response = session.send(prepared, timeout=timeout)
    response.raise_for_status()
    return response


def to_array(value: Any) -> List[Any]:
    """Return ``value`` unchanged when it is a list, otherwise an empty list."""
    if isinstance(value, list):
        return value
    return []


def split_template_arguments(template: str) -> List[str]:
    """Split a ``{{name|arg|key=value}}`` template into its top-level parts.

    Pipes inside nested templates (``{{...}}``) or wiki links
    (``[[target|label]]``) belong to the enclosing argument and do not split it.

    Args:
        template: Full template text including the outer ``{{`` and ``}}``.

    Returns:
        The stripped, non-empty parts; the first one is the template name.
    """
    content = template[2:-2]
    parts: List[str] = []
    buffer: List[str] = []
    template_depth = 0
    link_depth = 0
    i = 0
    while i < len(content):
        pair = content[i:i + 2]
        if pair == "{{":
            template_depth += 1
        elif pair == "}}":
            # Clamp at zero so stray closers cannot disable later splitting.
            template_depth = max(template_depth - 1, 0)
        elif pair == "[[":
            link_depth += 1
        elif pair == "]]":
            link_depth = max(link_depth - 1, 0)
        else:
            char = content[i]
            if char == "|" and template_depth == 0 and link_depth == 0:
                parts.append("".join(buffer).strip())
                buffer = []
            else:
                buffer.append(char)
            i += 1
            continue
        # A two-character delimiter is copied verbatim into the current part.
        buffer.append(pair)
        i += 2
    if buffer:
        parts.append("".join(buffer).strip())
    return [part for part in parts if part]


def extract_template(content: str, template_name: str) -> Optional[str]:
    """Return the first ``{{template_name ...}}`` block found in ``content``.

    Nested templates are balanced, so the returned text ends at the ``}}``
    that closes the outer template.

    Args:
        content: Wikitext to search.
        template_name: Template name expected directly after the opening ``{{``.

    Returns:
        The template text including both outer delimiters, or ``None`` when the
        template is absent or never closed.
    """
    needle = f"{{{{{template_name}"
    start = content.find(needle)
    if start == -1:
        return None
    depth = 0
    i = start
    while i < len(content) - 1:
        pair = content[i:i + 2]
        if pair == "{{":
            depth += 1
            i += 2
        elif pair == "}}":
            depth -= 1
            i += 2
            if depth == 0:
                # ``i`` already points just past the closing braces.
                return content[start:i]
        else:
            i += 1
    return None


def strip_wiki_markup(value: str) -> str:
    """Reduce a wikitext fragment to plain text.

    Removes bold/italic quotes, replaces HTML tags and ``&nbsp;`` with spaces,
    unwraps link/lang templates and wiki links to their visible text, blanks
    any remaining template, and collapses whitespace.
    """
    text = value.replace("'''", "").replace("''", "")
    text = re.sub(r"<[^>]+>", " ", text)
    text = text.replace("&nbsp;", " ")

    # Applied strictly in this order: specific templates before the generic one.
    substitutions = (
        (r"\{\{(?:[lL]|link-de)\|[^|}]+\|([^}|]+)(?:\|[^}]*)*}}", r"\1"),
        (r"\{\{lang\|[^|}]+\|([^}]+)}}", r"\1"),
        (r"\{\{[^}]+}}", " "),
        (r"\[\[([^|\]]*\|)?([^\]]+)]]", r"\2"),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()


def normalise_key(value: str) -> str:
    """Fold a template key to lowercase ASCII letters and digits only.

    Combining marks are dropped after NFD decomposition (so "ä" becomes "a");
    every other character outside ``a-z0-9`` is removed.
    """
    import unicodedata

    decomposed = unicodedata.normalize("NFD", value)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return re.sub(r"[^a-z0-9]+", "", "".join(base_chars).lower())


def normalise_auxiliary(value: Optional[str]) -> Optional[str]:
    """Map free text to ``"sein"`` or ``"haben"``; ``None`` if neither occurs.

    ``"sein"`` wins when both auxiliaries are mentioned.
    """
    if not value:
        return None
    lowered = value.strip().lower()
    for auxiliary in ("sein", "haben"):
        if auxiliary in lowered:
            return auxiliary
    return None


def select_primary(value: Optional[str]) -> Optional[str]:
    """Return the first non-empty alternative of a form listing.

    Alternatives are separated by ``;``, ``,``, ``/``, " oder " or "bzw";
    each candidate is cleaned with ``strip_wiki_markup``.
    """
    if not value:
        return None
    for fragment in re.split(r"[;,/]| oder | bzw\.| bzw ", value):
        candidate = strip_wiki_markup(fragment).strip()
        if candidate:
            return candidate
    # No fragment survived cleaning; fall back to the cleaned whole value.
    return strip_wiki_markup(value).strip() or None


def match_from_markup(content: str, pattern: re.Pattern[str]) -> Optional[str]:
    """Search ``content`` and return the cleaned first capture group.

    Returns ``None`` when the pattern does not match, and the cleaned empty
    string when the match captured no group.
    """
    found = pattern.search(content)
    if found is None:
        return None
    captured = found.group(1) if found.lastindex else ""
    return strip_wiki_markup(captured)


def extract_verb_forms(content: str) -> Optional[WiktionaryVerbForms]:
    """Extract Präteritum, Partizip II, Perfekt and auxiliary from wikitext.

    Named and positional arguments of the known inflection templates are read
    first; bold-label patterns in the raw text serve as fallback, and the
    Perfekt is synthesised from auxiliary + participle as a last resort.

    Args:
        content: Raw wikitext of a Wiktionary page.

    Returns:
        The collected forms, or ``None`` when nothing at all was found.
    """
    # Templates inspected for inflection data, in this order.
    template_names = ["Deutsch Verb Übersicht", "Deutsch Verb Flexion"]
    forms = WiktionaryVerbForms()
    # Named arguments (normalised key -> cleaned value); later templates overwrite earlier ones.
    collected_keys: Dict[str, str] = {}
    # Unnamed arguments of all templates, in order of appearance.
    positional: List[str] = []

    for name in template_names:
        template = extract_template(content, name)
        if not template:
            continue
        parts = split_template_arguments(template)
        # parts[0] is the template name itself.
        for part in parts[1:]:
            if "=" not in part:
                positional.append(strip_wiki_markup(part))
                continue
            key, value = part.split("=", 1)
            key = normalise_key(key)
            value = strip_wiki_markup(value)
            if key and value:
                collected_keys[key] = value.strip()

    def first(*selectors: Any) -> Optional[str]:
        """Return the first non-empty value matching a selector.

        String selectors are exact key lookups; compiled patterns are searched
        against every collected key in insertion order.
        """
        for selector in selectors:
            if isinstance(selector, str):
                value = collected_keys.get(selector)
                if value:
                    return value
            elif isinstance(selector, re.Pattern):
                for key, value in collected_keys.items():
                    if selector.search(key) and value:
                        return value
        return None

    # Named keys take precedence; fixed positional slots are the fallback.
    forms.praeteritum = select_primary(
        first("prateritumich", "praeteritumich", "prateritum", "praeteritum", re.compile(r"^(praeteritum|prateritum)"))
        or (positional[2] if len(positional) > 2 else None)
    )
    forms.partizip_ii = select_primary(
        first("partizipii", "partizip2", "partizipii1", re.compile(r"^(partizip|partizipii|partizip2)"))
        or (positional[3] if len(positional) > 3 else None)
    )
    aux_value = first("hilfsverb", "auxiliar", "auxiliary", "perfektauxiliar",
                      re.compile(r"^hilfsverb"), re.compile(r"^aux"))
    forms.aux = normalise_auxiliary(aux_value)

    perfekt_candidate = first("perfektich", "perfektwir", "perfekt", "perfekt1", re.compile(r"^perfekt"))
    if not perfekt_candidate and len(positional) > 4:
        perfekt_candidate = positional[4]
    forms.perfekt = select_primary(perfekt_candidate)

    # Fallback: bold labels in the raw wikitext.
    # NOTE(review): the capture group `[^ ]+` stops at the first space and needs a
    # non-space character directly after the closing quotes — confirm that this,
    # rather than "rest of the line", is intended.
    if not forms.praeteritum:
        forms.praeteritum = select_primary(match_from_markup(content, re.compile(r"'''Pr[aä]teritum:?'''([^ ]+)", re.I)))
    if not forms.partizip_ii:
        forms.partizip_ii = select_primary(match_from_markup(content, re.compile(r"'''Partizip(?:\s+II)?[:]?'''([^ ]+)", re.I)))
    if not forms.aux:
        forms.aux = normalise_auxiliary(match_from_markup(content, re.compile(r"'''Hilfsverb:?'''([^ ]+)", re.I)))
    if not forms.perfekt:
        perfekt_text = match_from_markup(content, re.compile(r"'''Perfekt:?'''([^ ]+)", re.I))
        forms.perfekt = select_primary(perfekt_text)

    # Last resort: build the Perfekt from auxiliary and participle.
    if not forms.perfekt and forms.partizip_ii and forms.aux:
        auxiliary = "ist" if forms.aux == "sein" else "hat"
        forms.perfekt = f"{auxiliary} {forms.partizip_ii}"

    # Second select_primary pass over values that were already reduced above.
    if forms.partizip_ii:
        forms.partizip_ii = select_primary(forms.partizip_ii)
    if forms.praeteritum:
        forms.praeteritum = select_primary(forms.praeteritum)
    if forms.perfekt:
        forms.perfekt = select_primary(forms.perfekt)

    if not any([forms.praeteritum, forms.partizip_ii, forms.perfekt, forms.aux]):
        return None
    # Defensive: only "haben"/"sein" are accepted as auxiliary values.
    if forms.aux not in (None, "haben", "sein"):
        forms.aux = None
    return forms


def build_wiktionary_request(lemma: str) -> Tuple[str, Dict[str, str]]:
    """Return the Wiktionary API URL and query parameters for ``lemma``.

    The query asks for the plain-text extract, English language links and the
    main-slot revision content of the page titled ``lemma``.
    """
    query = dict(
        action="query",
        format="json",
        formatversion="2",
        titles=lemma,
        prop="extracts|langlinks|revisions",
        explaintext="1",
        origin="*",
        lllang="en",
        rvslots="main",
        rvprop="content",
    )
    return WIKTIONARY_API, query


def lookup_wiktionary_summary(lemma: str) -> ProviderResponse:
    """Fetch summary, English hints and verb forms for ``lemma`` from Wiktionary.

    Args:
        lemma: Page title to look up.

    Returns:
        A ``ProviderResponse`` whose ``parsed`` is a ``WiktionaryLookup``, or
        ``None`` when the page is missing.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    url, params = build_wiktionary_request(lemma)
    record, prepared = prepare_request("GET", url, params=params)
    response = send_request(prepared)
    data = response.json()
    page_list = (data.get("query") or {}).get("pages")
    page = page_list[0] if isinstance(page_list, list) and page_list else None
    if not isinstance(page, dict) or page.get("missing"):
        return ProviderResponse(request=record, raw=data, parsed=None)

    # The plain-text extract is newline-delimited; the summary is made of its
    # first four non-empty lines (splitting on spaces would keep four words).
    extract = page.get("extract") or ""
    lines = [line.strip() for line in extract.splitlines() if line.strip()]
    summary = " ".join(lines[:4]) or None

    english_links: List[str] = [
        link["title"].strip()
        for link in to_array(page.get("langlinks"))
        if isinstance(link, dict) and link.get("lang") == "en" and link.get("title")
    ]

    # Wikitext of the latest revision lives at revisions[0].slots.main.content.
    revisions = to_array(page.get("revisions"))
    raw_content = ""
    if revisions and isinstance(revisions[0], dict):
        main_slot = (revisions[0].get("slots") or {}).get("main") or {}
        raw_content = main_slot.get("content", "")
    forms = extract_verb_forms(raw_content) if raw_content else None

    parsed = WiktionaryLookup(summary=summary, english_hints=english_links, forms=forms)
    return ProviderResponse(request=record, raw=data, parsed=parsed)


def build_open_thesaurus_request(lemma: str) -> Tuple[str, Dict[str, str]]:
    """Return the OpenThesaurus search URL and query parameters for ``lemma``."""
    query = {"q": lemma, "format": "application/json"}
    return OPEN_THESAURUS_API, query


def lookup_open_thesaurus_synonyms(lemma: str) -> ProviderResponse:
    """Fetch synonyms for ``lemma`` from OpenThesaurus.

    Terms equal to the lemma (case-insensitively) and exact duplicates are
    dropped; at most ten synonyms are kept, in response order.
    """
    endpoint, query = build_open_thesaurus_request(lemma)
    record, prepared = prepare_request("GET", endpoint, params=query)
    payload = send_request(prepared).json()

    lemma_folded = lemma.lower()
    # A dict keeps first-seen order while discarding repeated terms.
    unique: Dict[str, None] = {}
    synsets = [entry for entry in to_array(payload.get("synsets")) if isinstance(entry, dict)]
    for synset in synsets:
        for term in to_array(synset.get("terms")):
            if not isinstance(term, dict):
                continue
            candidate = (term.get("term") or "").strip()
            if candidate and candidate.lower() != lemma_folded:
                unique.setdefault(candidate, None)

    result = SynonymLookup(synonyms=list(unique)[:10])
    return ProviderResponse(request=record, raw=payload, parsed=result)


def build_translation_request(lemma: str) -> Tuple[str, Dict[str, str]]:
    """Return the MyMemory URL and German-to-English query for ``lemma``."""
    query = {"q": lemma, "langpair": "de|en"}
    return MY_MEMORY_API, query


def lookup_translation(lemma: str) -> ProviderResponse:
    """Fetch an English translation for ``lemma`` from MyMemory.

    The first match with a numeric quality of at least 80 wins; otherwise the
    response-level translated text is used unless it merely repeats the lemma.
    """
    provider = "mymemory.translated.net"
    endpoint, query = build_translation_request(lemma)
    record, prepared = prepare_request("GET", endpoint, params=query)
    payload = send_request(prepared).json()

    def first_confident_match() -> Optional[TranslationLookup]:
        # Scan matches in response order and stop at the first good one.
        for candidate in to_array(payload.get("matches")):
            if not isinstance(candidate, dict):
                continue
            text = (candidate.get("translation") or "").strip()
            score = candidate.get("quality")
            if text and isinstance(score, (int, float)) and score >= 80:
                return TranslationLookup(translation=text, source=provider, confidence=float(score))
        return None

    result = first_confident_match()
    if result is None:
        summary = payload.get("responseData") or {}
        text = summary.get("translatedText")
        if isinstance(text, str):
            text = text.strip()
            if text and text.lower() != lemma.lower():
                ratio = summary.get("match")
                # "match" is scaled by 100 to share the range of "quality".
                confidence = float(ratio) * 100 if isinstance(ratio, (int, float)) else None
                result = TranslationLookup(translation=text, source=provider, confidence=confidence)
    return ProviderResponse(request=record, raw=payload, parsed=result)


def build_tatoeba_request(lemma: str) -> Tuple[str, Dict[str, str]]:
    """Return the Tatoeba search URL and query for one German-English example."""
    query = {
        "query": lemma,
        "from": "deu",
        "to": "eng",
        "sort": "relevance",
        "limit": "1",
    }
    return TATOEBA_API, query


def lookup_example_sentence(lemma: str) -> ProviderResponse:
    """Fetch one German sentence with an English translation from Tatoeba.

    Args:
        lemma: Word to search for.

    Returns:
        A ``ProviderResponse`` whose ``parsed`` is an ``ExampleLookup``, or
        ``None`` when the first result has no usable English translation.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """

    def iter_translations(value: Any):
        # Accept both a flat list of translation objects and a list of groups
        # (nested lists), which the search API may return — flat input keeps
        # working exactly as before.
        for item in to_array(value):
            if isinstance(item, dict):
                yield item
            elif isinstance(item, list):
                for entry in item:
                    if isinstance(entry, dict):
                        yield entry

    url, params = build_tatoeba_request(lemma)
    record, prepared = prepare_request("GET", url, params=params)
    response = send_request(prepared)
    data = response.json()
    results = data.get("results") if isinstance(data, dict) else None
    result = results[0] if isinstance(results, list) and results else None
    parsed: Optional[ExampleLookup] = None
    if isinstance(result, dict):
        german = (result.get("text") or "").strip()
        if german:
            for translation in iter_translations(result.get("translations")):
                if translation.get("lang") != "eng":
                    continue
                english_text = (translation.get("text") or "").strip()
                if english_text:
                    parsed = ExampleLookup(example_de=german, example_en=english_text, source="tatoeba.org")
                    break
    return ProviderResponse(request=record, raw=data, parsed=parsed)


def encode_wiktextract_prefix(prefix: str) -> str:
    """Replace characters that are unsafe in a search file name with ``_name_`` tokens."""
    # Single-pass equivalent of the chained replacements: no token contains a
    # character that is itself replaced.
    escapes = str.maketrans({
        "/": "_slash_",
        "\\": "_backslash_",
        "*": "_star_",
        "?": "_ques_",
        "#": "_hash_",
        ".": "_dot_",
    })
    return prefix.translate(escapes)


def resolve_wiktextract_url(lemma: str) -> Tuple[Optional[str], List[RequestRecord]]:
    """Find the Kaikki detail-page URL for ``lemma`` via the prefix search index.

    Index files for the 3-, 2- and 1-character prefix are tried in turn. In
    each, an exact (case-insensitive) title match is preferred over the first
    title that merely starts with the lemma.

    Args:
        lemma: Word to resolve.

    Returns:
        ``(url, history)``: the resolved URL (``None`` if nothing matched) and
        the records of all search requests that were issued.

    Raises:
        requests.HTTPError: For error statuses other than 404.
    """
    trimmed = lemma.strip()
    if not trimmed:
        return None, []
    lower = trimmed.lower()
    history: List[RequestRecord] = []
    for length in (3, 2, 1):
        if length > len(lower):
            continue
        prefix = lower[:length]
        search_url = f"{WIKTEXTRACT_SEARCH_API}/{encode_wiktextract_prefix(prefix)}.json"
        record, prepared = prepare_request("GET", search_url)
        history.append(record)
        try:
            response = send_request(prepared)
        except requests.HTTPError as error:
            # A missing index for this prefix must not abort the fallback to
            # shorter prefixes; any other HTTP failure is still reported.
            if error.response is not None and error.response.status_code == 404:
                continue
            raise
        data = response.json()
        entries = data[1] if isinstance(data, list) and len(data) > 1 else []
        # Keep only well-formed [title, url, ...] entries.
        candidates = [
            item
            for item in to_array(entries)
            if isinstance(item, (list, tuple)) and len(item) > 1 and isinstance(item[0], str)
        ]
        direct = next((item for item in candidates if item[0].lower() == lower), None)
        if direct is not None:
            return direct[1], history
        loose = next((item for item in candidates if item[0].lower().startswith(lower)), None)
        if loose is not None:
            return loose[1], history
    return None, history


def lookup_wiktextract(lemma: str) -> ProviderResponse:
    """Fetch translations, synonyms and an example for ``lemma`` from Kaikki.

    The detail page is located with ``resolve_wiktextract_url``; the raw entry
    is the JSON inside the first ``<pre>`` element after
    ``WIKTEXTRACT_RAW_MARKER``.

    Args:
        lemma: Word to look up.

    Returns:
        A ``ProviderResponse``. ``raw`` is the decoded entry, or the page HTML
        when no JSON object could be extracted; ``parsed`` is a
        ``WiktextractLookup`` or ``None``; ``history`` holds the search requests.

    Raises:
        requests.HTTPError: If a request answers with an error status.
    """
    url, history = resolve_wiktextract_url(lemma)
    if not url:
        return ProviderResponse(
            request=RequestRecord(method="GET", url="", params={}, headers={}),
            raw=None,
            parsed=None,
            history=history,
        )
    record, prepared = prepare_request("GET", url)
    response = send_request(prepared)
    html = response.text

    raw_data = None
    marker_index = html.find(WIKTEXTRACT_RAW_MARKER)
    if marker_index != -1:
        pre_start = html.find("<pre>", marker_index)
        pre_end = html.find("</pre>", pre_start) if pre_start != -1 else -1
        if pre_start != -1 and pre_end != -1:
            # 5 == len("<pre>"); the embedded JSON is HTML-escaped.
            raw_json = unescape(html[pre_start + 5:pre_end])
            try:
                raw_data = json.loads(raw_json)
            except json.JSONDecodeError:
                # Treat an unparsable block like a missing one instead of crashing.
                raw_data = None
    if not isinstance(raw_data, dict):
        return ProviderResponse(request=record, raw=html, parsed=None, history=history)

    translations: List[str] = []
    for translation in to_array(raw_data.get("translations")):
        if not isinstance(translation, dict):
            continue
        lang = (translation.get("lang") or "").lower()
        code = (translation.get("lang_code") or "").lower()
        word = (translation.get("word") or "").strip()
        if not word or word.lower() == lemma.lower():
            continue
        # Language names starting with "engl" or the code "en" count as English.
        if lang.startswith("engl") or code == "en":
            translations.append(word)

    synonyms: List[str] = [
        synonym["word"].strip()
        for synonym in to_array(raw_data.get("synonyms"))
        if isinstance(synonym, dict) and synonym.get("word")
    ]

    # Stop at the first example that has an English translation; otherwise the
    # last example seen with any text is kept.
    example: Optional[ExampleLookup] = None
    for sense in to_array(raw_data.get("senses")):
        if not isinstance(sense, dict):
            continue
        for candidate in to_array(sense.get("examples")):
            if not isinstance(candidate, dict):
                continue
            example_de = (candidate.get("text") or "").strip()
            example_en = (candidate.get("translation") or "").strip()
            if example_de or example_en:
                example = ExampleLookup(example_de=example_de or None, example_en=example_en or None, source="kaikki.org")
                if example.example_en:
                    break
        if example and example.example_en:
            break

    parsed = None
    if translations or synonyms or example:
        parsed = WiktextractLookup(translations=translations[:10], synonyms=synonyms, example=example)
    return ProviderResponse(request=record, raw=raw_data, parsed=parsed, history=history)


def build_openai_request(lemma: str, pos: str) -> Tuple[str, Dict[str, Any]]:
    """Return the Chat Completions URL and JSON body asking about ``lemma``.

    Args:
        lemma: German word to describe.
        pos: Part of speech inserted into the prompt (e.g. "verb").
    """
    system_message = {
        "role": "system",
        "content": "You are a linguistics assistant that responds with valid JSON only. Provide translations and simple bilingual example sentences for German vocabulary.",
    }
    prompt = f"Return a JSON object with keys translation, exampleDe, exampleEn for the German {pos} '{lemma}'. Use neutral tone and CEFR A2 difficulty. If unsure, omit the key."
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }
    body = {
        "model": "gpt-4o-mini",
        "temperature": 0.2,
        "messages": [system_message, user_message],
        "response_format": {"type": "json_object"},
    }
    return OPENAI_CHAT_COMPLETIONS, body


def lookup_ai_assistance(lemma: str, pos: str, api_key: Optional[str]) -> ProviderResponse:
    """Ask the OpenAI Chat Completions API for a translation and example.

    Args:
        lemma: German word to describe.
        pos: Part of speech used in the prompt.
        api_key: Bearer token; when empty or ``None`` no request is sent.

    Returns:
        A ``ProviderResponse``. ``parsed`` is a dict with the keys
        ``"translation"`` and ``"example"`` (an ``ExampleLookup``), or ``None``
        when the call was skipped or the answer was not a JSON object.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    if not api_key:
        # Placeholder record: nothing was prepared or sent.
        return ProviderResponse(
            request=RequestRecord(method="POST", url=OPENAI_CHAT_COMPLETIONS, params={}, headers={}),
            raw=None,
            parsed=None,
        )
    _, body = build_openai_request(lemma, pos)
    headers = {**REQUEST_HEADERS, "Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    record, prepared = prepare_request("POST", OPENAI_CHAT_COMPLETIONS, headers=headers, json_body=body)
    response = send_request(prepared)
    data = response.json()

    content = None
    choices = data.get("choices") if isinstance(data, dict) else None
    if isinstance(choices, list) and choices and isinstance(choices[0], dict):
        message = choices[0].get("message")
        if isinstance(message, dict):
            content = message.get("content")

    parsed = None
    if isinstance(content, str):
        try:
            payload = json.loads(content)
        except json.JSONDecodeError:
            payload = None
        # Valid JSON that is not an object (list, string, ...) carries no keys.
        if isinstance(payload, dict):
            parsed = {
                "translation": payload.get("translation"),
                "example": ExampleLookup(
                    example_de=payload.get("exampleDe"),
                    example_en=payload.get("exampleEn"),
                    source="openai.com",
                ),
            }
    return ProviderResponse(request=record, raw=data, parsed=parsed)

## Wiktionary — Summary, English hints, and verb forms

**API URL:** `https://de.wiktionary.org/w/api.php`

**Sample request parameters:**
```python
url, params = build_wiktionary_request("machen")
print(url)
print(params)
```

Run the next cell to send the request, inspect the raw JSON, and view the parsed payload.

In [24]:
# Displays the request record of the Wiktionary lookup. `wiktionary` is assigned
# in the following cell, which has to be run before this one.
wiktionary.request

RequestRecord(method='GET', url='https://de.wiktionary.org/w/api.php?action=query&format=json&formatversion=2&titles=machen&prop=extracts%7Clanglinks%7Crevisions&explaintext=1&origin=%2A&lllang=en&rvslots=main&rvprop=content', params={'action': 'query', 'format': 'json', 'formatversion': '2', 'titles': 'machen', 'prop': 'extracts|langlinks|revisions', 'explaintext': '1', 'origin': '*', 'lllang': 'en', 'rvslots': 'main', 'rvprop': 'content'}, headers={'User-Agent': 'GermanVerbMaster/1.0 (data enrichment experiments)', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'application/json', 'Connection': 'keep-alive'}, body=None)

In [25]:
# Lemma reused by the lookup cells further down.
lemma = "machen"
wiktionary = lookup_wiktionary_summary(lemma)
print(wiktionary.request)
# Only the first 1000 characters of the pretty-printed raw JSON are shown.
print(json.dumps(wiktionary.raw, indent=2)[:1000])
print(wiktionary.parsed)

RequestRecord(method='GET', url='https://de.wiktionary.org/w/api.php?action=query&format=json&formatversion=2&titles=machen&prop=extracts%7Clanglinks%7Crevisions&explaintext=1&origin=%2A&lllang=en&rvslots=main&rvprop=content', params={'action': 'query', 'format': 'json', 'formatversion': '2', 'titles': 'machen', 'prop': 'extracts|langlinks|revisions', 'explaintext': '1', 'origin': '*', 'lllang': 'en', 'rvslots': 'main', 'rvprop': 'content'}, headers={'User-Agent': 'GermanVerbMaster/1.0 (data enrichment experiments)', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'application/json', 'Connection': 'keep-alive'}, body=None)
{
  "batchcomplete": true,
  "query": {
    "pages": [
      {
        "pageid": 1974,
        "ns": 0,
        "title": "machen",
        "extract": "\n== machen (Deutsch) ==\n\n\n=== Verb ===\n\nWorttrennung:\nma\u00b7chen, Pr\u00e4teritum: mach\u00b7te, Partizip II: ge\u00b7macht\nAussprache:\nIPA: [\u02c8maxn\u0329]\nH\u00f6rbeispiele:  machen (Info),  machen (

## OpenThesaurus — Synonym lookup

**API URL:** `https://www.openthesaurus.de/synonyme/search`

**Sample request parameters:**
```python
url, params = build_open_thesaurus_request("machen")
print(url)
print(params)
```

The next cell fetches synonyms and shows how the enrichment pipeline filters duplicates.

In [None]:
# Uses `lemma` from the Wiktionary lookup cell.
synonyms = lookup_open_thesaurus_synonyms(lemma)
print(synonyms.request)
# Only the first 1000 characters of the pretty-printed raw JSON are shown.
print(json.dumps(synonyms.raw, indent=2)[:1000])
print(synonyms.parsed)

## MyMemory — Translation lookup

**API URL:** `https://api.mymemory.translated.net/get`

**Sample request parameters:**
```python
url, params = build_translation_request("machen")
print(url)
print(params)
```

Use the next cell to retrieve translations and inspect both the scored matches and fallback response.

In [None]:
# Uses `lemma` from the Wiktionary lookup cell.
translation = lookup_translation(lemma)
print(translation.request)
# Only the first 1000 characters of the pretty-printed raw JSON are shown.
print(json.dumps(translation.raw, indent=2)[:1000])
print(translation.parsed)

## Tatoeba — Example sentence lookup

**API URL:** `https://tatoeba.org/en/api_v0/search`

**Sample request parameters:**
```python
url, params = build_tatoeba_request("machen")
print(url)
print(params)
```

Call the helper below to see the bilingual example the enrichment pipeline would store.

In [None]:
# Uses `lemma` from the Wiktionary lookup cell.
example = lookup_example_sentence(lemma)
print(example.request)
# Only the first 1000 characters of the pretty-printed raw JSON are shown.
print(json.dumps(example.raw, indent=2)[:1000])
print(example.parsed)

## Wiktextract — Additional translations, synonyms, and examples

**Search API URL:** `https://kaikki.org/dewiktionary/search/start/<prefix>.json`

**Sample request:** the code queries the prefix search index and resolves the URL of the matching Kaikki detail page (an HTML page) automatically.

Execute the following cell to inspect the Kaikki HTML payload and the parsed enrichment result.

In [28]:
# Displays the request record of the Kaikki lookup. `wiktextract` is assigned in
# the following cell, which has to be run before this one.
wiktextract.request

RequestRecord(method='GET', url='https://kaikki.org/dewiktionary/All%20languages%20combined/meaning/m/ma/machen.html', params={}, headers={'User-Agent': 'GermanVerbMaster/1.0 (data enrichment experiments)', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'application/json', 'Connection': 'keep-alive'}, body=None)

In [26]:
# Uses `lemma` from the Wiktionary lookup cell.
wiktextract = lookup_wiktextract(lemma)
# Search-index requests issued while resolving the detail-page URL.
print(wiktextract.history)
print(wiktextract.request)
# raw is either the decoded entry or the page HTML; show the first 1000 characters.
print(str(wiktextract.raw)[:1000])
print(wiktextract.parsed)

[RequestRecord(method='GET', url='https://kaikki.org/dewiktionary/search/start/mac.json', params={}, headers={'User-Agent': 'GermanVerbMaster/1.0 (data enrichment experiments)', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'application/json', 'Connection': 'keep-alive'}, body=None)]
RequestRecord(method='GET', url='https://kaikki.org/dewiktionary/All%20languages%20combined/meaning/m/ma/machen.html', params={}, headers={'User-Agent': 'GermanVerbMaster/1.0 (data enrichment experiments)', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'application/json', 'Connection': 'keep-alive'}, body=None)
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>machen in All languages combined</title>
  <link rel="canonical" href="https://kaikki.org/dewiktionary/All languages combined/meaning/m/ma/machen.html"/>
  <link rel="shortcut icon" type="image/png" href="data:image/png;base64,iVBORw0KGgoAAAANSUhE

## Optional: OpenAI helper

The enrichment pipeline can fall back to OpenAI for suggestions when configured. This is optional and requires an API key.

```python
import os
api_key = os.getenv("OPENAI_API_KEY")
ai_result = lookup_ai_assistance(lemma, "verb", api_key)
print(ai_result.request)
print(json.dumps(ai_result.raw, indent=2)[:1000])
print(ai_result.parsed)
```

The call is skipped automatically when `OPENAI_API_KEY` is unset.

In [32]:
import requests
import json

# Root of the per-word JSONL files for German entries on Kaikki.
BASE = "https://kaikki.org/dictionary/German/meaning"

def kaikki_jsonl_url(word: str) -> str:
    """Build the Kaikki JSONL URL for ``word``.

    The path is ``<first letter>/<first two letters>/<word>.jsonl``, all
    lower-cased, below ``BASE``. Surrounding whitespace is ignored.
    """
    cleaned = word.strip()
    segments = [
        BASE,
        cleaned[0].lower(),
        cleaned[:2].lower(),
        cleaned.lower() + ".jsonl",
    ]
    return "/".join(segments)

def get_wiktextract_german_verb(word: str):
    """Download the Kaikki JSONL for ``word`` and summarise its German verb entry.

    Args:
        word: German verb lemma.

    Returns:
        A dict with the keys ``word``, ``forms`` (``{"form", "tags"}`` dicts),
        ``examples`` and ``translations`` (both aggregated over all senses) and
        ``source`` (the URL that was read).

    Raises:
        LookupError: If the JSONL file does not exist (HTTP 404) or contains no
            German verb entry.
        requests.HTTPError: For any other error status.
    """
    url = kaikki_jsonl_url(word)
    r = requests.get(url, timeout=20)
    if r.status_code == 404:
        raise LookupError(f"Not found on Kaikki: {url}")
    r.raise_for_status()

    # One JSON document per non-blank line.
    entries = [json.loads(line) for line in r.text.splitlines() if line.strip()]

    # Pick the first entry that is a German verb.
    entry = next(
        (e for e in entries if isinstance(e, dict) and e.get("lang") == "German" and e.get("pos") == "verb"),
        None,
    )
    if entry is None:
        # Sometimes only inflected-form pages exist.
        raise LookupError("No German verb lemma entry found in JSONL.")

    # `or []` also covers keys that are present but null.
    forms = [{"form": f.get("form"), "tags": f.get("tags")} for f in entry.get("forms") or []]

    # Examples and translations aggregated across senses.
    examples = []
    translations = []
    for sense in entry.get("senses") or []:
        examples.extend(sense.get("examples") or [])
        translations.extend(
            {"word": t.get("word"), "lang": t.get("lang")}
            for t in sense.get("translations") or []
        )

    return {
        "word": entry.get("word", word),
        "forms": forms,
        "examples": examples,
        "translations": translations,
        "source": url,
    }


# pprint is not imported by any import block above, so bring it in here.
from pprint import pprint

pprint(get_wiktextract_german_verb("machen"))


{'examples': [{'bold_text_offsets': [[4, 7], [25, 32]],
               'bold_translation_offsets': [[2, 6]],
               'english': 'I made you a pie!',
               'text': 'Ich hab dir einen Kuchen gemacht!',
               'translation': 'I made you a pie!',
               'type': 'example'},
              {'bold_text_offsets': [[3, 7], [21, 28]],
               'bold_translation_offsets': [[4, 8]],
               'english': 'You made a mistake.',
               'text': 'Du hast einen Fehler gemacht.',
               'translation': 'You made a mistake.',
               'type': 'example'},
              {'bold_text_offsets': [[0, 6]],
               'bold_translation_offsets': [[0, 3], [8, 14]],
               'english': 'Are you making dinner today?',
               'text': 'Machst du heute das Essen?',
               'translation': 'Are you making dinner today?',
               'type': 'example'},
              {'bold_text_offsets': [[16, 22]],
               'bold_translation

In [36]:
import requests, json, urllib.parse

# Documentation page for the Kaikki raw data; not referenced by the functions in this cell.
KAiKKI_EN_RAW = "https://kaikki.org/dictionary/rawdata.html"  # docs
# Roots of the per-word JSONL files for German and English entries.
BASE_DE = "https://kaikki.org/dictionary/German/meaning"
BASE_EN = "https://kaikki.org/dictionary/English/meaning"

def _jsonl(url: str):
    """Download a JSONL file and return its decoded records.

    A 404 yields an empty list; any other error status raises via
    ``raise_for_status``. Blank lines are skipped.
    """
    response = requests.get(url, timeout=20)
    if response.status_code == 404:
        return []
    response.raise_for_status()
    records = []
    for raw_line in response.text.splitlines():
        if raw_line.strip():
            records.append(json.loads(raw_line))
    return records

def _entry_url(base: str, word: str):
    """Build ``<base>/<first letter>/<first two letters>/<quoted word>.jsonl``.

    The word is stripped and lower-cased first; only the file name part is
    percent-encoded.
    """
    key = word.strip().lower()
    filename = urllib.parse.quote(key) + ".jsonl"
    return "/".join((base, key[0], key[:2], filename))

def get_german_verb_with_translations(word: str, pivot=True):
    """Summarise the German verb entry for ``word``, with translation fallbacks.

    Translation sources, in order of preference:
      1. translations listed on the German entry's senses;
      2. (if ``pivot``) translations listed on the English entry whose headword
         equals the first gloss of the German entry;
      3. the glosses themselves, labelled as English.

    Args:
        word: German verb lemma.
        pivot: Whether to try source 2 when source 1 is empty.

    Returns:
        A dict with the keys ``word``, ``forms``, ``examples``,
        ``translations``, ``source_de`` and ``pivot_used`` (True only when the
        returned translations actually come from the pivot lookup).

    Raises:
        ValueError: If no German verb entry exists for ``word``.
        requests.HTTPError: For HTTP errors other than 404.
    """
    # 1) German lemma entry
    url_de = _entry_url(BASE_DE, word)
    entries = _jsonl(url_de)
    verb = next((e for e in entries if e.get("lang") == "German" and e.get("pos") == "verb"), None)
    if not verb:
        raise ValueError(f"No German verb entry for {word!r} at {url_de}")

    forms = [{"form": f.get("form"), "tags": f.get("tags")} for f in verb.get("forms") or []]

    examples, translations, glosses = [], [], []
    for sense in verb.get("senses") or []:
        examples += sense.get("examples") or []
        glosses += sense.get("glosses") or []
        for t in sense.get("translations") or []:
            translations.append({"word": t.get("word"), "lang": t.get("lang")})

    # 2) Pivot through the English headword, but only when it can matter:
    #    direct translations always win, so no request is made if they exist.
    # NOTE(review): glosses are often phrases rather than single headwords, so
    # this lookup may frequently find nothing — confirm the intended pivot key.
    pivot_translations = []
    if pivot and not translations and glosses:
        en_head = glosses[0]
        if isinstance(en_head, str) and en_head.strip():
            for entry in _jsonl(_entry_url(BASE_EN, en_head)):
                if entry.get("lang") != "English":
                    continue
                for sense in entry.get("senses") or []:
                    for t in sense.get("translations") or []:
                        pivot_translations.append({"word": t.get("word"), "lang": t.get("lang")})

    # 3) Last resort: present the glosses as English translations.
    fallback_en = [{"word": g, "lang": "English"} for g in glosses] if not translations else []

    return {
        "word": verb.get("word", word),
        "forms": forms,
        "examples": examples,
        "translations": translations or pivot_translations or fallback_en,
        "source_de": url_de,
        "pivot_used": bool(pivot_translations),
    }

# Example:
# pprint is not imported by any import block above, so bring it in here.
from pprint import pprint

# "ab" has no German verb entry, so this call raises ValueError (see the output below).
pprint(get_german_verb_with_translations("ab"))


ValueError: No German verb entry for 'ab' at https://kaikki.org/dictionary/German/meaning/a/ab/ab.jsonl

In [48]:
# Fetch the raw JSONL records for "lesen" to inspect their structure.
url_de = _entry_url(BASE_DE, "lesen")
entries = _jsonl(url_de)

In [49]:
# Display the decoded records fetched in the previous cell.
entries

[{'senses': [{'categories': ['German intransitive verbs',
     'German transitive verbs'],
    'links': [['read', 'read'],
     ['look', 'look'],
     ['understand', 'understand'],
     ['symbol', 'symbol'],
     ['word', 'word'],
     ['data', 'data']],
    'raw_glosses': ['(transitive or intransitive) to read (look at and understand symbols, words, or data)'],
    'glosses': ['to read (look at and understand symbols, words, or data)'],
    'tags': ['class-5', 'intransitive', 'strong', 'transitive']},
   {'categories': ['German transitive verbs'],
    'links': [['select', 'select'],
     ['gather', 'gather'],
     ['harvest', 'harvest'],
     ['grape', 'grape']],
    'raw_glosses': ['(transitive) to select and gather or harvest (things like grapes)'],
    'glosses': ['to select and gather or harvest (things like grapes)'],
    'tags': ['class-5', 'strong', 'transitive']}],
  'pos': 'verb',
  'head_templates': [{'name': 'de-verb',
    'args': {'1': 'lesen<liest#las,gelesen,läse>'},
   