# Testing

In [1]:
# Install requirements if needed
# !pip install beautifulsoup4 lxml

# 1. Import your parser
import sys
import yaml
import json
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

from semantic_html.parser import parse_note

# 2. Define a small test HTML string
# The fixture is wrapped in "zotero-notes"/"zotero-note" divs and contains one
# example of every construct the mapping below has a rule for:
#   - <h1>/<h2>/<h3> headings
#   - <code> page markers ("p.44", "p.54")
#   - plain <p> comment paragraphs
#   - <blockquote> passages with inline <em>, <strong> (wrapping a "wd:" link)
#     and <u> elements, plus '#'-prefixed tokens in the first quotation
#   - a <span class="citation"> holding a zotero:// link
#   - a trailing <pre> block, which the mapping's IGNORE rule targets
html = """
<!DOCTYPE html><html><head><meta charset="utf-8"></head><body><div class="zotero-notes"><div class="zotero-note"><h1>Header1</h1>
<h2>header2</h2>
<h3>header3</h3>
<p><code>p.44</code></p>
<p>comment related to p. 44 and header 3</p>
<blockquote>
<p style="margin-left: 30px;">“<em>M@áan</em> is a concept like #animal oder #human#oid. <strong><a href="wd:Paris" rel="noopener noreferrer nofollow">Paris</a></strong> is a <em>city</em>. that is testified by <u>wikipedia</u>.”</p>
</blockquote>
<p><span class="citation">(<a href="zotero://select/library/items/KUHYUVEH">“test”</a>)</span></p>
<h3>header4</h3>
<p><code>p.54</code></p>
<p>comment related to p. 54 and header 4</p>
<blockquote>
<p style="margin-left: 30px;">“<em>Woman </em>is a concept. <strong><a href="wd:Rome" rel="noopener noreferrer nofollow">Rome</a></strong> is a <em>city</em>. that is testified by <u>wikipedia</u>.”</p>
</blockquote>
<pre>This will be ignored <em>Man22</em></pre>
</div></div></body></html>
"""

# PROV-style key/value pairs handed to parse_note() through its `metadata`
# keyword argument: who the note is attributed to and which notebook made it.
metadata = {
    "prov:wasAttributedTo": {
        "@id": "https://github.com/ch-sander",
    },
    "prov:wasGeneratedBy": "https://github.com/ch-sander/semantic-html/blob/main/tests/Testing.ipynb",
}


# 3. Define your mapping
# Every top-level key names a semantic class used during parsing and has to
# match one of the parser's expected categories (Note, Document, Annotation, ...).
mapping = {
    # Default type of the root Note.
    "@type": ["ResearchNote"],

    # Text blocks (sentences or paragraphs) of the note: all <p> tags are
    # matched and the resulting nodes get the type "Doc".
    "Document": {"tags": ["p"], "types": ["Doc"]},

    # Inline elements that may be nested inside a Document.
    # One sub-entry per annotation category.
    "Annotation": {
        # Concepts such as keywords or philosophical terms: <em>term</em>,
        # or alternatively text carrying an inline italic CSS style.
        "Concept": {
            "tags": ["em"],
            "styles": ["font-style:italic"],
            "types": ["Annotation", "Concept"],
        },

        # Named entities, typically rendered bold: <strong>Rome</strong>.
        "Entity": {"tags": ["strong"], "types": ["Entity"]},

        # Entities written in plain text, e.g. #human# or #animal.
        # Pattern: '#', a word, then an optional closing '#'.
        # Note: applied before the IGNORE rules!
        "Entity2": {"regex": [r"#\w+#?"], "types": ["Entity"]},

        # Generic references, e.g. underlined text: <u>source</u>.
        "Reference1": {"tags": ["u"], "types": ["Reference"]},

        # Zotero-style citations, recognised by their class:
        # <span class="citation">...</span>.
        "Reference2": {
            "tags": ["span"],
            "class": "citation",
            "types": ["Reference Zotero"],
        },
    },

    # Location information such as page numbers: <code>p.44</code>.
    "Locator": {"tags": ["code"], "types": ["Page-Locator"]},

    # Document structure and hierarchy (headings); models the logical
    # organisation of the note (chapter, section, ...).
    "Structure": {"tags": ["h1", "h2", "h3"], "types": ["Structure"]},

    # Quoted text blocks (citations or sources): <blockquote>...</blockquote>.
    "Quotation": {"tags": ["blockquote"], "types": ["Quotation"]},

    # Content excluded from parsing, at HTML level and at text level alike.
    "IGNORE": {
        # <pre> blocks are removed entirely.
        "tags": ["pre"],
        # Standalone '#' characters are removed from the visible text.
        # Use with care: this affects all visible text nodes.
        "regex": ["#"],
    },

    # Optionally, a JSON-LD context may be supplied here to control the URI
    # mappings, for example:
    # "@context": {
    #     "@vocab": "https://semantic-html.org/vocab#",
    #     "same:as": {"@id": "owl:sameAs", "@type": "@id"},
    #     "locator": {"@id": "in_locator", "@type": "@id"},
    #     "structure": {"@id": "in_structure", "@type": "@id"},
    #     "note": {"@id": "in_note", "@type": "@id"}
    # },
}


# 4. Parse the note
# The annotated HTML is requested as well, so the optional step 6 below can
# be switched on without re-running the parser with different arguments.
result = parse_note(
    html,
    mapping,
    return_annotated_html=True,
    note_uri="https://semantic-html.org/note1",
    metadata=metadata,
)

# 5. Show JSON-LD result
# ensure_ascii=False keeps non-ASCII characters readable in the dump.
# print("=== JSON-LD ===")
print(json.dumps(result["jsonld"], ensure_ascii=False, indent=4))

# Optional outputs; uncomment to enable.

# # 6. Show annotated HTML
# print("\n=== Annotated HTML ===")
# print(result["RDFa"])

# # 7. Show YAML
# print("\n=== YAML ===")
# print(yaml.dump(result["jsonld"], allow_unicode=True, default_flow_style=False, Dumper=yaml.Dumper))


{
    "@context": {
        "xsd": "http://www.w3.org/2001/XMLSchema#",
        "schema": "http://schema.org/",
        "doco": "http://purl.org/spar/doco/",
        "dcterms": "http://purl.org/dc/terms/",
        "prov": "http://www.w3.org/ns/prov#",
        "owl": "http://www.w3.org/2002/07/owl#",
        "@vocab": "https://semantic-html.org/vocab#",
        "Note": "doco:Document",
        "Structure": "doco:DiscourseElement",
        "Locator": "ex:Locator",
        "Doc": "doco:Section",
        "Annotation": "schema:Comment",
        "Quotation": "doco:BlockQuotation",
        "note": {
            "@id": "inNote",
            "@type": "@id"
        },
        "structure": {
            "@id": "inStructure",
            "@type": "@id"
        },
        "locator": {
            "@id": "hasLocator",
            "@type": "@id"
        },
        "sameAs": {
            "@id": "owl:sameAs",
            "@type": "@id"
        },
        "doc": {
            "@id": "dcterms:isPartOf",