In [1]:
#!/usr/bin/env python3
"""
classify_temp_strings.py
────────────────────────
usage:
    python classify_temp_strings.py  In.csv  Out.csv

The input CSV must contain at least a column called ‘object’.
Ten Boolean columns (has_number … has_any_other_text) are added.
"""

'\nclassify_temp_strings.py\n────────────────────────\nusage:\n    python classify_temp_strings.py  In.csv  Out.csv\n\nThe input CSV must contain at least a column called ‘object’.\nTen Boolean columns (has_number … has_any_other_text) are added.\n'

In [2]:
from __future__ import annotations

import re
import sys
from collections import OrderedDict
from decimal import Decimal
from typing import Callable, TypeVar
from typing import Dict, Optional
from typing import Iterable, Final, Pattern
from typing import Union
from uuid import uuid4

import numpy as np
import pandas as pd
from rdflib import (
    Graph, Namespace, BNode, URIRef, Literal, RDF, XSD,
)

from pathlib import Path
import os

In [3]:

# Feature-detection patterns for free-text temperature strings.
# Every group is non-capturing: pandas' `Series.str.contains` emits a
# UserWarning ("pattern ... has match groups") whenever it is handed a regex
# with capturing groups, and none of these patterns is used for extraction.
ABOVE_RX = re.compile(r"\babove\b", re.I)
AND_RX = re.compile(r"\band\b", re.I)
AT_RX = re.compile(r"\bat\b", re.I)
BELOW_RX = re.compile(r"\bbelow\b", re.I)
CELSIUS_RX = re.compile(r"(?<![A-Za-z])(?:c|℃)(?![A-Za-z])", re.I)
COMMA_RX = re.compile(r",")
DEGREE_RX = re.compile(r"[°º˚℃]")
DIGIT_RX = re.compile(r"\d")
FAHREN_RX = re.compile(r"(?<![A-Za-z])(?:°\s*F|f(?:ahrenheit)?)(?![A-Za-z])", re.I)
HYPHEN_RX = re.compile(r"[-–—−]")
INEQ_RX = re.compile(r"[<>≤≥⩽⩾]")
INITIAL_HYPHEN_RX = re.compile(r"^\s*[-–—−]")
INIT_ABOVE_RX = re.compile(r"^\s*above\s*", re.I)
INIT_BELOW_RX = re.compile(r"^\s*below\s*", re.I)
INTERNAL_ABOVE_RX = re.compile(r'^\s*\S+.*\babove\b', re.I)
INTERNAL_BELOW_RX = re.compile(r'^\s*\S+.*\bbelow\b', re.I)
KELVIN_RX = re.compile(r"(?<![A-Za-z])(?:°\s*K|kelvin|k)(?![A-Za-z])", re.I)
LETTER_RX = re.compile(r"[A-Za-z]")
MIN_RX = re.compile(r"\bmin\b", re.I)
OPHIL_RX = re.compile(r"ophil", re.I)
OR_RX = re.compile(r"\bor\b", re.I)
PAREN_RX = re.compile(r"[()]")
PLUSMINUS_RX = re.compile(r"±|\+/?-")
POOR_RX = re.compile(r"\bpoor\b", re.I)
QUALIFIER_RX = re.compile(r"\b(?:optimum|optimal|max(?:imum)?|min(?:imum)?|upper limit|lower limit)\b", re.I)
RANGEWORD_RX = re.compile(r"\b(?:between|from|until)\b|<\s*\d+\s*<", re.I)
SLASH_RX = re.compile(r"[\/|]")
TERM_PUNCT_RX = re.compile(r"[.;,]$")
TIME_RX = re.compile(r"\b(?:min(?:ute)?s?|h(?:ours?)?|hr?s?|sec(?:ond)?s?|days?)\b", re.I)
TOLERAN_RX = re.compile(r"toleran", re.I)
TO_RX = re.compile(r"\bto\b", re.I)
UP_TO_RX = re.compile(r"\bup\s*to\b", re.I)  # also matches "upto"
WEAK_RX = re.compile(r"\b(?:weak|weakly)\b", re.I)
WORD_DEGREE_RX = re.compile(r"\bdeg(?:ree)?s?\s*(?:c|cel(?:sius)?)\b", re.I)


In [4]:
# ------------------------------------------------------------------
#  Patterns whose matches should be removed before "other text" test
# ------------------------------------------------------------------
# Patterns whose matches are blanked out before the leftover-letters check.
# The purely symbolic patterns are commented out below, so they are not part
# of the strip list.
_STRIPPABLE_RXES: list[re.Pattern] = [
    # HYPHEN_RX,
    # INEQ_RX,
    # INITIAL_HYPHEN_RX,
    # PAREN_RX,
    # PLUSMINUS_RX,
    # SLASH_RX,
    ABOVE_RX,
    AND_RX,
    AT_RX,
    BELOW_RX,
    CELSIUS_RX,
    COMMA_RX,
    DEGREE_RX,
    FAHREN_RX,
    KELVIN_RX,
    MIN_RX,
    OPHIL_RX,
    OR_RX,
    POOR_RX,
    QUALIFIER_RX,
    RANGEWORD_RX,
    TIME_RX,
    TOLERAN_RX,
    TO_RX,
    UP_TO_RX,
    WEAK_RX,
    WORD_DEGREE_RX,
]

In [5]:
# Build one big strip-regex for the “other text” test by OR-ing the pattern
# sources together.  Only `.pattern` is reused, so each pattern's own flags
# are dropped and re.I is applied to the combined expression instead.
STRIP_RX = re.compile("|".join(rx.pattern for rx in _STRIPPABLE_RXES), re.I)

In [6]:
# This works in Jupyter and with papermill --cwd
# The current working directory is taken as the project root when it holds a
# Makefile; otherwise its parent directory is used.
notebook_dir = Path().resolve()
project_root = notebook_dir if (notebook_dir / "Makefile").exists() else notebook_dir.parent
assets_dir = project_root / "assets"
local_dir = project_root / "local"

In [7]:
# Input CSV plus the two output paths; all three live under `local_dir`.
in_path = local_dir / "n4l-temperature.csv"
ttl_out = local_dir / "n4l-temperature.ttl"
unparsed_out = local_dir / "n4l-temperature-un-parsed.csv"

In [8]:
# Read every column as text and replace missing cells with empty strings.
df = pd.read_csv(in_path, dtype=str).fillna("")

In [9]:
# Stop right away when the column read by the following cells is absent.
# NOTE(review): sys.exit raises SystemExit; confirm that is the intended way
# to stop a notebook run.
if "object" not in df.columns:
    sys.exit("Input must contain a column called ‘object’")

In [10]:
s = df["object"]

In [11]:
# ----------------------------------------------------------------------
#  3.  Vectorised feature checks
# ----------------------------------------------------------------------
# Maps each output flag column name to the compiled pattern that decides it.
tests = {
    "has_above": ABOVE_RX,
    "has_and": AND_RX,
    "has_any_hyphen": HYPHEN_RX,
    "has_any_initial_hyphen": INITIAL_HYPHEN_RX,
    "has_at": AT_RX,
    "has_below": BELOW_RX,
    "has_celsius": CELSIUS_RX,
    "has_degree_symbol": DEGREE_RX,
    "has_inequality_symbol": INEQ_RX,
    "has_init_above": INIT_ABOVE_RX,
    "has_init_below": INIT_BELOW_RX,
    "has_internal_above": INTERNAL_ABOVE_RX,
    "has_internal_below": INTERNAL_BELOW_RX,
    "has_internal_comma": re.compile(r",(?=.)"),  # comma not at end
    "has_number": DIGIT_RX,
    "has_ophil": OPHIL_RX,
    "has_or": OR_RX,
    "has_parentheses": PAREN_RX,
    "has_plus_minus": PLUSMINUS_RX,
    "has_poor": POOR_RX,
    "has_qualifier_word": QUALIFIER_RX,
    "has_range_keyword": RANGEWORD_RX,
    "has_slash_separator": SLASH_RX,
    "has_terminal_punctuation": TERM_PUNCT_RX,
    "has_time_expression": TIME_RX,
    "has_to": TO_RX,
    "has_toleran": TOLERAN_RX,
    "has_unit_fahrenheit": FAHREN_RX,
    "has_unit_kelvin": KELVIN_RX,
    "has_up_to": UP_TO_RX,
    "has_weak": WEAK_RX,
    "has_word_degree": WORD_DEGREE_RX,
}

In [12]:
# Boolean DataFrame built in one comprehension
# One column per entry of `tests`; a cell is True when the pattern matches
# anywhere in the string, and `na=False` maps missing values to False.
flag_df = pd.DataFrame({
    name: s.str.contains(rx, regex=True, na=False)
    for name, rx in tests.items()
})

  name: s.str.contains(rx, regex=True, na=False)


In [13]:
# ----------------------------------------------------------------------
#  4.  “Has any OTHER text”  (also vectorised)
# ----------------------------------------------------------------------
# Blank out every strippable match, then flag rows that still hold an
# ASCII letter: those contain text none of the listed patterns accounts for.
stripped = s.str.replace(STRIP_RX, " ", regex=True)
flag_df["has_any_other_text"] = stripped.str.contains(LETTER_RX, na=False)

In [14]:
# Input columns and flag columns side by side (column-wise concatenation).
enriched = pd.concat([df, flag_df], axis=1)

# Row count of the enriched frame.
enriched_len = enriched.shape[0]


In [15]:
# Flag column names in sorted order; a flag's position in this list is the
# bit it occupies in `pattern_id`.
FLAGS = sorted(c for c in enriched.columns if c.startswith("has_"))

In [16]:
NFLAGS = len(FLAGS)

In [17]:
# Refuse to continue when there are more than 63 flags to pack into the
# integer bitmask.
if NFLAGS > 63:
    raise ValueError("bitmask needs uint64 – you have >63 flags")

In [18]:
# Bit weights 2**0 … 2**(NFLAGS-1), deliberately built as uint64.  A default
# (signed) integer weight vector would make numpy promote the uint64·int64
# dot product to float64, which cannot hold bit patterns above 2**53 exactly.
powers_of_two = np.uint64(1) << np.arange(NFLAGS, dtype=np.uint64)

In [19]:
# Pack each row's flags into one integer: the dot product adds up the
# power of two of every flag that is True in that row.
enriched["pattern_id"] = (
    enriched[FLAGS].astype(np.uint64).dot(powers_of_two).astype(np.uint64)
)

In [20]:
# Flag name → bit index, following the order of FLAGS.
flag2bit = {f: i for i, f in enumerate(FLAGS)}

In [21]:
# Inverse lookup: bit index → flag name.
bit2flag = {i: f for f, i in flag2bit.items()}

In [22]:
# ---------------------------------------------------------------------
# 1.  ↔ transformation helpers
# ---------------------------------------------------------------------
def flags_to_pid(flag_names):
    """
    Convert an iterable of flag names to one integer pattern_id.

    Parameters
    ----------
    flag_names : iterable of str
        Names that must all be keys of `flag2bit`.  One-shot iterables
        (generators, iterators) are accepted.

    Returns
    -------
    int
        The OR of ``1 << flag2bit[name]`` over all names (0 when empty).

    Raises
    ------
    ValueError  if any name is not a known flag.
    """
    # Materialise once: the validation below would otherwise exhaust a
    # generator and the encoding loop would silently yield 0.
    flag_names = list(flag_names)

    missing = set(flag_names) - flag2bit.keys()
    if missing:
        # sorted() keeps the message stable from run to run
        raise ValueError(f"Unknown flag name(s): {', '.join(sorted(missing))}")

    pid = 0
    for name in flag_names:
        pid |= 1 << flag2bit[name]
    return pid

In [23]:
def pid_to_flags(pid):
    """
    Decode a pattern_id into the names of the flags whose bits are set.

    Names come back in ascending bit order; bits at or above NFLAGS are
    ignored.
    """
    names = []
    for bit in range(NFLAGS):
        if pid & (1 << bit):
            names.append(bit2flag[bit])
    return names

In [24]:
# ---------------------------------------------------------------------
# 2.  row-selection helpers  (with optional false_flags)
# ---------------------------------------------------------------------
def _validate_flags(df, flags):
    """Raise if any flag name is unknown."""
    missing = set(flags) - set(df.columns)
    if missing:
        raise KeyError(f"Unknown flag name(s): {', '.join(missing)}")

In [25]:
def select_exact(df, true_flags, *, false_flags=None, flags_col_prefix="has_"):
    """
    Return the rows whose flag columns match the request exactly.

    A row is kept when
        • each column named in true_flags is True,
        • each column named in false_flags (optional) is False, and
        • every other column starting with flags_col_prefix is False.

    Raises KeyError (via _validate_flags) for unknown flag names.
    """
    wanted_true = list(true_flags)
    wanted_false = list(false_flags or [])
    named = wanted_true + wanted_false
    _validate_flags(df, named)

    # rows satisfying the positive constraints
    keep = df[wanted_true].all(axis=1)

    # rows satisfying the explicit negative constraints
    if wanted_false:
        keep &= (~df[wanted_false]).all(axis=1)

    # "exact": no flag outside the named ones may be set
    unnamed = [
        col for col in df.columns
        if col.startswith(flags_col_prefix) and col not in named
    ]
    keep &= (~df[unnamed]).all(axis=1)

    return df[keep]

In [26]:
def select_at_least(df, true_flags, *, false_flags=None):
    """
    Return the rows that satisfy the named constraints, ignoring the rest.

    A row is kept when
        • each column named in true_flags is True, and
        • each column named in false_flags (optional) is False.
    Columns not named in either list are not inspected.

    Raises KeyError (via _validate_flags) for unknown flag names.
    """
    wanted_true = list(true_flags)
    wanted_false = list(false_flags or [])
    _validate_flags(df, wanted_true + wanted_false)

    keep = df[wanted_true].all(axis=1)
    if wanted_false:
        none_set = (~df[wanted_false]).all(axis=1)
        keep &= none_set

    return df[keep]

----

In [27]:
def most_frequent_pattern_id(df: pd.DataFrame, rank: int = 1) -> tuple[int, int, int]:
    """
    Look up the *rank*-th most common pattern_id in the DataFrame.

    Parameters
    ----------
    df   : DataFrame
           Must contain a column called 'pattern_id'.
    rank : int, default 1
           1 → most common, 2 → second most common, …

    Returns
    -------
    (pattern_id, frequency, rank) – the rank is echoed back unchanged.

    Raises
    ------
    ValueError  if rank < 1 or rank exceeds the number of distinct patterns.
    """
    if rank < 1:
        raise ValueError("rank must be ≥ 1")

    counts = df["pattern_id"].value_counts()
    n_patterns = len(counts)

    if rank > n_patterns:
        raise ValueError(
            f"rank={rank} exceeds the number of distinct patterns ({n_patterns})"
        )

    position = rank - 1  # value_counts is ordered most-frequent first
    return int(counts.index[position]), int(counts.iloc[position]), rank

In [28]:
# ------------------------------------------------------------
#  new helper: summary up to a given rank
# ------------------------------------------------------------
def top_pattern_summary(df: pd.DataFrame, top_n: int = 10,
                        pattern_col: str = "pattern_id") -> OrderedDict:
    """
    Summarise the `top_n` most frequent patterns in *df*.

    Returns an OrderedDict
        rank  →  {"pattern_id": int,
                  "count": int,
                  "true_flags": list[str]}
    with ranks starting at 1 for the most frequent pattern.

    Raises KeyError when *pattern_col* is not a column of *df*.
    """
    if pattern_col not in df.columns:
        raise KeyError(f"Column {pattern_col!r} not found")

    counts = df[pattern_col].value_counts()

    summary = OrderedDict()
    for position, (raw_pid, n_rows) in enumerate(counts.items(), start=1):
        if position <= top_n:
            pattern = int(raw_pid)
            summary[position] = {
                "pattern_id": pattern,
                "count": int(n_rows),
                "true_flags": pid_to_flags(pattern),
            }
            continue
        break  # everything past top_n is ignored
    return summary

In [29]:
def value_counts_df(
        df: pd.DataFrame,
        column: Union[str, int],
        normalize: bool = False,
        sort_desc: bool = True,
        dropna: bool = True,
        name_value: str = "value",
        name_count: str = "count"
) -> pd.DataFrame:
    """
    Tabulate the value distribution of *column* as a two-column DataFrame.

    Parameters
    ----------
    df        : DataFrame
    column    : str | int
                Column to tabulate.
    normalize : bool, default False
                Report relative frequencies (0–1) rather than raw counts.
    sort_desc : bool, default True
                Most frequent first (True) or least frequent first (False).
    dropna    : bool, default True
                When False, NaN/None is reported as its own category.
    name_value: str, default "value"
                Header of the column holding the distinct values.
    name_count: str, default "count"
                Header of the column holding the counts / frequencies.

    Returns
    -------
    DataFrame  with columns [name_value, name_count].
    """
    ascending = not sort_desc
    counts = df[column].value_counts(
        normalize=normalize,
        dropna=dropna,
        ascending=ascending,
    )

    # name the index first so reset_index turns it into the value column
    labelled = counts.rename_axis(name_value)
    return labelled.reset_index(name=name_count)

In [30]:
summary = top_pattern_summary(enriched, top_n=30)

In [31]:
summary

OrderedDict([(1,
              {'pattern_id': 33156,
               'count': 2886,
               'true_flags': ['has_any_hyphen',
                'has_celsius',
                'has_degree_symbol',
                'has_number']}),
             (2,
              {'pattern_id': 33152,
               'count': 2276,
               'true_flags': ['has_celsius',
                'has_degree_symbol',
                'has_number']}),
             (3,
              {'pattern_id': 65552,
               'count': 529,
               'true_flags': ['has_any_other_text', 'has_ophil']}),
             (4,
              {'pattern_id': 67142016,
               'count': 426,
               'true_flags': ['has_celsius',
                'has_degree_symbol',
                'has_number',
                'has_to']}),
             (5,
              {'pattern_id': 49536,
               'count': 345,
               'true_flags': ['has_celsius',
                'has_degree_symbol',
                'has_internal_

begin here

In [32]:
# Inspect the 14th most common flag pattern: decode its flags, keep the rows
# whose flags match exactly, and tabulate their distinct 'object' strings.
requested_pid, pid_count, pid_rank = most_frequent_pattern_id(enriched, rank=14)
true_flags = pid_to_flags(requested_pid)
subset_exact = select_exact(enriched, true_flags)
positive_object_values = value_counts_df(
    subset_exact,
    "object",
)

In [33]:
print(f"pattern {requested_pid} has rank {pid_rank} with {pid_count} rows")

pattern 16810372 has rank 14 with 15 rows


In [34]:
true_flags

['has_any_hyphen',
 'has_celsius',
 'has_degree_symbol',
 'has_number',
 'has_terminal_punctuation']

In [35]:
positive_object_values

Unnamed: 0,value,count
0,"20°C-40°C,",4
1,50-60°C.,3
2,"37.0-39.0 °C,",2
3,"4-43 °C,",2
4,"15–45 °C,",2
5,4–38 °C.,2


In [36]:
# ==========================================================================
# 1.  RANGE-ONLY temperature parser  ---------------------------------------
# ==========================================================================
# (regex, UCUM code) pairs, tried in order; the first regex found wins.
_UNIT_TABLE = [
    (r'(?:°|º)?\s*C(?:elsius)?', 'Cel'),
    (r'(?:°|º)?\s*F', '[degF]'),
    (r'\bK\b', 'K'),
]

# Every dash variant is folded to an ASCII hyphen before parsing.
_DASHES = "-–—‒−‐"
_DASH_TRANS = str.maketrans({d: "-" for d in _DASHES})
# The look-behind stops the hyphen in "25-30" from being read as a minus sign.
_NUMBER_RGX = re.compile(r'(?<!\d)[+-]?\d+(?:\.\d+)?')
_RANGE_TOKEN_RGX = re.compile(r'-|\bto\b', re.I)
_UNIT_STRIP_RGX = re.compile(r'(?:°|º)?\s*[CFK](?:elsius)?', flags=re.I)


def parse_hyphen_range_celsius_degree_values(raw: str) -> Dict:
    """
    Parse a "low-high" temperature range such as "20°C-40°C," or "15–45 °C".

    The result always carries "component_text" (the input, verbatim) and,
    when a unit is recognised, "unit" (a UCUM code).  On success it also
    holds "minimum_value" / "maximum_value" as Decimals, swapped if the
    text lists them in descending order.  Otherwise "unparsed_text" is set
    to the input.
    """
    normalised = raw.translate(_DASH_TRANS)
    for narrow_space in ("\u2009", "\u202f"):
        normalised = normalised.replace(narrow_space, " ")

    result: Dict = {"component_text": raw}

    unit = next(
        (ucum for pattern, ucum in _UNIT_TABLE
         if re.search(pattern, normalised, flags=re.I)),
        None,
    )
    if unit is not None:
        result["unit"] = unit

    # drop unit markers so that only numbers and the range token remain
    bare = _UNIT_STRIP_RGX.sub("", normalised)

    numbers = []
    if _RANGE_TOKEN_RGX.search(bare):
        numbers = [Decimal(token) for token in _NUMBER_RGX.findall(bare)]

    if len(numbers) < 2:
        # not a clear range
        result["unparsed_text"] = raw
        return result

    first, second = numbers[0], numbers[1]
    if first > second:
        first, second = second, first
    result["minimum_value"] = first
    result["maximum_value"] = second
    return result

In [37]:
# ============================================================================
# 1.  PARSER  (spot value in °C / °F / K)   ----------------------------------
# ============================================================================

# (regex, UCUM code) pairs, tried in order; the first regex found wins.
_UNIT_TABLE = [
    (r'(?:°|º)?\s*C(?:elsius)?', 'Cel'),
    (r'(?:°|º)?\s*F', '[degF]'),
    (r'\bK\b', 'K'),
]

# The look-behind prevents “25-30” from yielding “-30”.
_NUMBER_RGX = re.compile(r'(?<!\d)[+-]?\d+(?:\.\d+)?')


def parse_spot_celsius_degree_value(raw: str) -> Dict:
    """
    Parse a single temperature reading such as "37 °C".

    The result always carries "component_text".  "spot_value" (Decimal) is
    the first number found and "unit" the first matching UCUM code; when
    either one is missing, "unparsed_text" is set to the input as well.
    """
    result: Dict = {"component_text": raw}

    number = _NUMBER_RGX.search(raw)
    if number is not None:
        result["spot_value"] = Decimal(number.group())

    for pattern, ucum in _UNIT_TABLE:
        if re.search(pattern, raw, flags=re.I) is None:
            continue
        result["unit"] = ucum
        break

    if not {"spot_value", "unit"} <= result.keys():
        result["unparsed_text"] = raw

    return result

<https://w3id.org/biolink/vocab/has_phenotype> values from KG-Microbe

* https://www.example.org/UNKNOWN/temperature:hyperthermophilic	"263"^^xsd:integer
* https://www.example.org/UNKNOWN/temperature:mesophilic	"26633"^^xsd:integer
* https://www.example.org/UNKNOWN/temperature:psychrophilic	"1406"^^xsd:integer
* https://www.example.org/UNKNOWN/temperature:thermophilic	"1942"^^xsd:integer

In [38]:
# ==========================================================================
# 1.  Canonical categorical & qualifier vocabulary  ------------------------
#    – keys are canonical tokens we want in RDF
#    – each value is ONE regex that matches plural, singular, adjective …
# ==========================================================================
# canonical category token → one regex covering its spelling variants
_CATEGORIES = {
    "hyperthermophilic": r"\bhyperthermophil(?:e|es|ic|ics)?\b",
    "mesophilic": r"\bmesophil(?:e|es|ic|ics|ism|s)?\b",
    "psychrophilic": r"\bpsychrophil(?:e|es|ic|ics|s)?\b",
    "psychrotolerant": r"\bpsychrotolerant(?:s)?\b",
    "psychrotrophic": r"\bpsychrotroph(?:e|es|ic|ics|s)?\b",
    "thermoacidophilic": r"\bthermoacidophil(?:e|es|ic|ics)?\b",
    "thermophilic": r"\b(?:thermophil(?:e|es|ic|ics|ism|s)|thermophile)\b",
}

# canonical qualifier token → regex (extend as new qualifiers turn up)
_QUALIFIERS = {
    "moderately": r"\bmoderate(?:ly)?\b",
    "slightly": r"\bslight(?:ly)?\b",
    "extremely": r"\bextreme(?:ly)?\b",
    "strict": r"\bstrict(?:ly)?\b",
    "obligate": r"\bobligat(?:e|ely)\b",
}

# compiled once, case-insensitive
_CAT_RGX = {canon: re.compile(rx, re.I) for canon, rx in _CATEGORIES.items()}
_QUAL_RGX = {canon: re.compile(rx, re.I) for canon, rx in _QUALIFIERS.items()}


def parse_categorical_label(raw: str) -> Dict:
    """
    Extract a temperature category and an optional qualifier from *raw*.

    The first category in `_CATEGORIES` order whose regex matches becomes
    "categorical_label"; likewise the first matching entry of `_QUALIFIERS`
    becomes "qualifier_label".  When no category matches, "unparsed_text"
    is set to the input (a qualifier may still be reported).
    "component_text" always holds the input verbatim.
    """
    result: Dict = {"component_text": raw}

    category = next(
        (canon for canon, rgx in _CAT_RGX.items() if rgx.search(raw)), None
    )
    if category is not None:
        result["categorical_label"] = category

    qualifier = next(
        (canon for canon, rgx in _QUAL_RGX.items() if rgx.search(raw)), None
    )
    if qualifier is not None:
        result["qualifier_label"] = qualifier

    if category is None:
        result["unparsed_text"] = raw

    return result



In [39]:
# -------------------------------------------------------------------------
# 1.  pre-compiled regexes
# -------------------------------------------------------------------------
_RANGE_TO_RGX = re.compile(
    r"""
    (?P<min>[+-]?\d+(?:\.\d+)?)     # first number
    \s*(?:°|º)?\s*C?                # optional °,º,C after first number
    \s*to\s*                        # the word 'to' (any case)
    (?P<max>[+-]?\d+(?:\.\d+)?)     # second number
    \s*(?:°|º)?\s*C?                # optional °,º,C after second number
    """,
    flags=re.I | re.X,  # ignore case + verbose mode
)

# A degree glyph, or a "C" that is not part of a longer word.  Letter
# look-arounds are used instead of \b because digits are word characters:
# \bC\b can never match the "C" of "45C".
_UNIT_PRESENT_RGX = re.compile(r'[°º]|(?<![A-Za-z])C(?![A-Za-z])', flags=re.I)


# -------------------------------------------------------------------------
# 2.  parser function
# -------------------------------------------------------------------------
def parse_to_temperature_range(raw: str) -> Dict:
    """
    Parse strings like
        "10 to 45"
        "10° C to 45° C"
        "10°to45C"
    into {minimum_value, maximum_value, unit?} or fall back to unparsed_text.

    Returns a dict that always contains "component_text".  On a match it
    holds "minimum_value" and "maximum_value" as Decimals (swapped if given
    in descending order), plus "unit" = "Cel" when a degree glyph or a
    stand-alone "C" occurs anywhere in *raw*.  Without a match it holds
    "unparsed_text" instead.
    """
    comp: Dict = {"component_text": raw}

    m = _RANGE_TO_RGX.search(raw)
    if m is None:
        comp["unparsed_text"] = raw
        return comp

    low, high = Decimal(m.group("min")), Decimal(m.group("max"))
    # swap if entered in descending order (rare but safe)
    if low > high:
        low, high = high, low
    comp["minimum_value"] = low
    comp["maximum_value"] = high

    # Unit: set only when any ° or C is present
    if _UNIT_PRESENT_RGX.search(raw):
        comp["unit"] = "Cel"

    return comp



In [40]:
# ───────────────────────────────────────────────────────────────
#  pre-compiled regex
#     ┌──────── dir            (above / below, case-insensitive)
#     │┌─────── optional leading ° / C (any whitespace)
#     ││   ┌─── number (int or float, signed)
#     ││   │┌─ optional trailing ° / C
HALF_RANGE_RX = re.compile(
    r"""
    (?P<dir>above|below)               # direction keyword
    \s*                                # optional blanks
    (?:[°º]\s* C? \s*)?                # optional “°C” (before number)
    (?P<val>[-+]?\d+(?:\.\d+)?)        # the numeric value
    \s* (?:[°º]\s* C?)?                # optional “°C” (after number)
    """,
    re.I | re.VERBOSE,
)


def parse_half_range_celsius_degree_value(text: str) -> Dict:
    """
    Parse an open-ended bound such as “above 45°C” or “below 30 C”.

    Returns a dict with "component_text" (the stripped input), "unit"
    (always "Cel") and exactly one bound: "minimum_value" for “above”,
    "maximum_value" for “below”.  The bound is a float.

    Raises ValueError when *text* is None or holds no above/below + number.
    """
    if text is None:
        raise ValueError("input is None")

    cleaned = text.strip()
    match = HALF_RANGE_RX.search(cleaned)
    if match is None:
        raise ValueError(f"cannot parse half-range: {text!r}")

    # “above” gives only a lower bound, “below” only an upper bound
    is_lower_bound = match.group("dir").lower() == "above"
    bound_key = "minimum_value" if is_lower_bound else "maximum_value"

    return {
        "component_text": cleaned,  # stripped input, kept for provenance
        "unit": "Cel",
        bound_key: float(match.group("val")),
    }

In [41]:
# ==========================================================================
#  Updated “generic RDF writer”
#
#  •  to_parse   – the string that was actually fed into the temperature
#                  parser (usually the cleaned / normalised token)
#  •  to_claim   – the *verbatim* text that appears in the source; it is
#                  what you want in ENV.raw_text for provenance
#
#  If you pass only to_parse, it is used for both purposes (same behaviour
#  as the previous version).  If you supply two different strings you keep
#  the original-substring provenance while still parsing the cleaned text.
# ==========================================================================


ENV = Namespace("http://example.org/env-parse#")
BASE_PG = Namespace("http://example.org/pg/")


def add_parsegroup(comp: Dict,
                   *,
                   graph: Graph,
                   to_parse: str,
                   to_claim: Optional[str] = None,
                   pg_uri: Optional[URIRef] = None) -> URIRef:
    """
    Write one ParseGroup with a single ParseComponent into `graph`.

    Parameters
    ----------
    comp      : dict
        Parser output.  "component_text" is required; the keys
        minimum_value, maximum_value, spot_value, unit, categorical_label,
        qualifier_label and unparsed_text are written when present.
    graph     : rdflib.Graph
        Graph that receives the triples.
    to_parse  : str
        The string that was handed to the parser.
    to_claim  : str | None
        Verbatim source text stored as ENV.raw_text; defaults to
        `to_parse` when None.
    pg_uri    : URIRef | None
        URI to use for the ParseGroup; a UUID-based one under BASE_PG is
        minted when omitted.

    Returns
    -------
    URIRef
        URI of the ParseGroup that was written.
    """
    if to_claim is None:
        to_claim = to_parse

    if not pg_uri:
        pg_uri = BASE_PG[str(uuid4())]
    component = BNode()

    def as_string(value):
        # every non-numeric literal is typed xsd:string
        return Literal(value, datatype=XSD.string)

    # ---- ParseGroup -----------------------------------------------------
    graph.add((pg_uri, RDF.type, ENV.ParseGroup))
    graph.add((pg_uri, ENV.raw_text, as_string(to_claim)))
    if to_parse != to_claim:
        # keep the parsed text too whenever it differs from the claim
        graph.add((pg_uri, ENV.parse_text, as_string(to_parse)))
    graph.add((pg_uri, ENV.parse_component, component))

    # ---- ParseComponent -------------------------------------------------
    graph.add((component, RDF.type, ENV.ParseComponent))
    graph.add((component, ENV.component_text, as_string(comp["component_text"])))

    optional_fields = (
        ("minimum_value", ENV.minimum_value),
        ("maximum_value", ENV.maximum_value),
        ("spot_value", ENV.spot_value),
        ("unit", ENV.unit),
        ("categorical_label", ENV.categorical_label),
        ("qualifier_label", ENV.qualifier_label),
        ("unparsed_text", ENV.unparsed_text),
    )
    for key, prop in optional_fields:
        if key not in comp:
            continue
        if key.endswith("_value"):
            # numeric fields are written as xsd:decimal
            literal = Literal(str(comp[key]), datatype=XSD.decimal)
        else:
            literal = as_string(comp[key])
        graph.add((component, prop, literal))

    return pg_uri

In [42]:
T = TypeVar("T")  # parsed result of any type
# Signature of a domain parser: takes the raw string, returns the parsed result.
ParserFunc = Callable[[str], T]

In [43]:
# ----------------------------------------------------------------------
#  helper: take a row, run the domain parser, write the triples
# ----------------------------------------------------------------------
def _row_to_graph(
        row,
        *,
        graph: Graph,
        s_col: str,
        p_col: str,
        to_parse_col: str,
        to_claim_col: str | None,
        parser: Callable[[str], dict],
):
    subj_uri = URIRef(row[s_col])
    pred_uri = URIRef(row[p_col])
    to_parse = row[to_parse_col]
    to_claim = row[to_claim_col] if to_claim_col is not None else None

    # run the domain-specific parser (returns dict with component_text etc.)
    comp = parser(to_parse)

    # add_parsegroup inserts all ParseGroup / ParseComponent triples
    add_parsegroup(
        comp,
        graph=graph,
        to_parse=to_parse,
        to_claim=to_claim,  # <- may be None → falls back to to_parse
        pg_uri=None,  # new URI minted automatically
    )

    # link the ParseGroup to the subject / predicate (your vocab!)
    # (remove / change if you already do this elsewhere)
    pg_uri = BASE_PG[str(uuid4())]
    graph.add((subj_uri, pred_uri, pg_uri))


# ----------------------------------------------------------------------
#  main helper
# ----------------------------------------------------------------------
def _pick_claim_column(
        df: pd.DataFrame,
        *,
        to_parse_col: str,
        to_claim_col: str | None,
        auto_suffixes: tuple[str, ...] = ("_intact", "_raw", "_orig"),
) -> str | None:
    """
    Decide which column to take for the verbatim `raw_text`.

    • If `to_claim_col` was supplied and exists → use it.
    • Else look for <to_parse_col><suffix> in *df* (common patterns).
    • Else return None  → add_parsegroup() will fall back to parse string.
    """
    if to_claim_col and to_claim_col in df.columns:
        return to_claim_col

    for suf in auto_suffixes:
        cand = f"{to_parse_col}{suf}"
        if cand in df.columns:
            return cand

    return None  # let add_parsegroup() fall back


def parse_triple_dataframe(
        df: pd.DataFrame,
        *,
        s_col: str = "subject",
        p_col: str = "predicate",
        to_parse_col: str = "object",
        to_claim_col: str | None = None,
        parser: Callable[[str], dict],
        graph: Graph | None = None,
) -> Graph:
    """
    Parse every row of *df* and emit its ParseGroup triples.

    For each row the value in `to_parse_col` is run through `parser`, the
    result is written with add_parsegroup(), and the row's subject and
    predicate are linked to the resulting ParseGroup URI.  The claim
    column is resolved once via _pick_claim_column().  A new Graph is
    created when none is supplied; the graph that was written to is
    returned.
    """
    target = Graph() if graph is None else graph

    # resolved once, up front, for all rows
    claim_col = _pick_claim_column(
        df,
        to_parse_col=to_parse_col,
        to_claim_col=to_claim_col,
    )

    for _, record in df.iterrows():
        subject = URIRef(record[s_col])
        predicate = URIRef(record[p_col])

        text = record[to_parse_col]
        verbatim = record[claim_col] if claim_col else None

        group_uri = add_parsegroup(
            parser(text),
            graph=target,
            to_parse=text,
            to_claim=verbatim,  # None → add_parsegroup reuses `text`
        )
        target.add((subject, predicate, group_uri))

    return target

In [44]:
"""
Temperature-list splitter + unit normaliser
───────────────────────────────────────────
For strings such as

    "4 °C, 45°C"          →  ["4 C", "45 C"]
    "4, 52 °C"            →  ["4 C", "52 C"]
    "25°C, 30°C"          →  ["25 C", "30 C"]

1. Decide which temperature unit is present (C / F / K).
2. Split on the internal comma / “and” / “or”.
3. Remove every occurrence of the unit or degree glyph inside each part.
4. Re-append the normalised unit to every part.
5. `explode_value_column()` does the usual DataFrame explode, now
   delegating the heavy lifting to the new splitter class.
"""

# ──────────────────────────────────────────────────────────────
#  Local helpers
# ──────────────────────────────────────────────────────────────
# Separators between list items: a comma, or the words “and” / “or”.
_SPLIT_RX: Final[Pattern] = re.compile(r"\s*(?:,|\band\b|\bor\b)\s*", re.I)
# Trailing punctuation / whitespace removed from each cleaned part.
_TERM_PUNCT_RX: Final[Pattern] = re.compile(r"[.;,\s]+$")

# Unit markers removed from each part, applied in this order.
# WORD_DEGREE_RX must run before CELSIUS_RX: in “10 degrees C” the bare-“C”
# pattern would otherwise eat the “C” first, after which the word pattern no
# longer matches and “degrees” is left behind in the output.
_UNIT_STRIP_RXES: Final[tuple[Pattern, ...]] = (
    WORD_DEGREE_RX,
    CELSIUS_RX,
    FAHREN_RX,
    KELVIN_RX,
    DEGREE_RX,
)


class TemperatureSplitter:
    """
    Splitter + unit-normaliser for comma-separated temperature lists.

    Example
    -------
    >>> TemperatureSplitter().split("4 °C, 45°C")
    ['4 C', '45 C']
    """

    __slots__ = ()

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––
    #  Public interface
    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––
    def split(self, text: str | None) -> list[str]:
        """
        Split `text` into normalised temperature parts.

        • If `text` is None, returns [].
        • If `text` is empty/whitespace, returns [''].
        • If the heuristic decides that no split is required,
          the cleaned value (unit normalised once) is wrapped in a list.
        • Otherwise every non-empty piece is cleaned and gets the unit
          detected for the whole string.
        """
        if text is None:
            return []

        text = text.strip()
        if not text:
            return [""]

        unit = self._detect_unit(text)
        if not self._should_split(text):
            return [self._clean(text, unit)]

        # empty pieces arise from a leading or trailing separator
        parts: Iterable[str] = (p for p in _SPLIT_RX.split(text) if p)

        return [self._clean(p, unit) for p in parts] or [text]

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––
    #  Internals
    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––
    @staticmethod
    def _detect_unit(text: str) -> str:
        """Return 'C', 'F', 'K' or ''. Fahrenheit and Kelvin are tested first."""
        if FAHREN_RX.search(text):
            return "F"
        if KELVIN_RX.search(text):
            return "K"
        if CELSIUS_RX.search(text) or WORD_DEGREE_RX.search(text):
            return "C"
        return ""

    @staticmethod
    def _should_split(text: str) -> bool:
        """
        Heuristic: does *text* look like a list of temperature values?
        (number(s) + unit + at least one recognised separator)

        The unit test accepts the same patterns as `_detect_unit`, so
        spelled-out units such as “deg celsius” count as well.
        """
        # ignore a possible trailing comma when looking for a separator
        searchable = text[:-1] if text.endswith(",") else text

        has_separator = bool(_SPLIT_RX.search(searchable))
        has_number = bool(DIGIT_RX.search(text))
        has_unit = any(
            rx.search(text)
            for rx in (CELSIUS_RX, FAHREN_RX, KELVIN_RX, WORD_DEGREE_RX)
        )
        return has_separator and has_number and has_unit

    @staticmethod
    def _clean(part: str, unit: str) -> str:
        """Strip units & punctuation from *part*, then append *unit*."""
        for rx in _UNIT_STRIP_RXES:
            part = rx.sub("", part)

        part = _TERM_PUNCT_RX.sub("", part).strip()
        return f"{part} {unit}".strip() if unit else part


# Singleton used by the thin convenience wrapper below
_DEFAULT_SPLITTER: Final[TemperatureSplitter] = TemperatureSplitter()


def split_temperature_values(text: str | None) -> list[str]:
    """
    Split *text* using the shared module-level `TemperatureSplitter`.

    Thin wrapper kept for backwards compatibility: it forwards *text* to
    ``_DEFAULT_SPLITTER.split`` and returns that result unchanged.
    """
    pieces = _DEFAULT_SPLITTER.split(text)
    return pieces


# ──────────────────────────────────────────────────────────────
#  DataFrame helper
# ──────────────────────────────────────────────────────────────
def explode_value_column(
        df: pd.DataFrame,
        value_col: str = "object",
        *,
        drop_original: bool = True,
        splitter: TemperatureSplitter | None = None,
) -> pd.DataFrame:
    """
    Turn each value of *value_col* into one row per split piece.

    Two columns are added before exploding:

    ``{value_col}_intact``
        the original value, repeated on every row produced from it
    ``{value_col}_split``
        one element of ``splitter.split(value)`` per row

    Parameters
    ----------
    df : frame that must contain *value_col*
    value_col : name of the column to split
    drop_original : when true, *value_col* itself is removed from the result
    splitter : object providing ``split``; falls back to the module default

    Returns
    -------
    A new frame with a fresh 0..n-1 index (``ignore_index=True``).

    Raises
    ------
    KeyError
        if *value_col* is not a column of *df*.
    """
    if value_col not in df.columns:
        raise KeyError(f"{value_col!r} is not a column of the DataFrame")

    if not splitter:
        splitter = _DEFAULT_SPLITTER

    split_col = f"{value_col}_split"
    intact_col = f"{value_col}_intact"

    originals = df[value_col]
    new_columns = {
        intact_col: originals,  # untouched copy of the source value
        split_col: originals.apply(splitter.split),  # list of pieces per row
    }
    exploded = df.assign(**new_columns).explode(split_col, ignore_index=True)

    if drop_original:
        return exploded.drop(columns=value_col)
    return exploded

In [45]:
# ------------------------------------------------------------
# 1.  frequency-rank lookup for flag combinations
# ------------------------------------------------------------
# every "has_*" column of `enriched` is treated as a flag column
FLAG_COLS = [name for name in enriched.columns if name.startswith("has_")]

# per row: the frozenset of flag names whose value is True
# (stored on `enriched` itself as the helper column "_flag_set")
flag_frame = enriched[FLAG_COLS]
enriched["_flag_set"] = flag_frame.apply(
    lambda row: frozenset(row.index[row]), axis=1
)

# count each distinct flag-set, then turn the counts into dense ranks
# (1 = most common; equal counts share a rank)
flag_set_counts = enriched["_flag_set"].value_counts()
dense_ranks = flag_set_counts.rank(ascending=False, method="dense")
freq = dense_ranks.astype(int)


# ------------------------------------------------------------
# 2.  quick helper
# ------------------------------------------------------------
def freq_rank_of_flags(flags) -> int | None:
    """
    Look up how common an exact combination of flags is.

    *flags* is any iterable of flag names; it is converted to a frozenset and
    looked up in the module-level ``freq`` Series.  Returns the rank
    (1 = most common) or ``None`` when that exact combination is not in ``freq``.
    """
    wanted = frozenset(flags)
    return freq.get(wanted)


----

In [46]:

# ──────────────────────────────────────────────────────────────
# 1.  cookbook that drives the whole parsing pipeline
# ──────────────────────────────────────────────────────────────
# Each entry describes one parsing task consumed by the main loop:
#   flag_sets   – list of flag combinations; rows matching ANY of them are used
#   parser      – function handed to parse_triple_dataframe()
#   to_parse    – column whose text is parsed
#   to_claim    – column passed as `to_claim_col` (None → callee decides)
#   explode     – True → rows go through explode_value_column() first, which
#                 provides the "object_split" / "object_intact" columns
#   out_file    – file name for the optional per-task serialisation;
#                 must be unique per task so no task overwrites another
#   subsetting  – "exact" → select_exact(), "at_least" → select_at_least()
#   false_flags – optional; forwarded to the selector as `false_flags`
TASKS = [
    # hyphenated ranges  -------------------------------------------------
    dict(
        flag_sets=[
            ['has_any_hyphen', 'has_celsius', 'has_degree_symbol', 'has_number', 'has_terminal_punctuation'],
            ['has_any_hyphen', 'has_celsius', 'has_degree_symbol', 'has_number'],
            ['has_any_hyphen', 'has_celsius', 'has_number', 'has_terminal_punctuation'],
            ['has_any_hyphen', 'has_celsius', 'has_number'],
            ['has_any_hyphen', 'has_degree_symbol', 'has_number', 'has_terminal_punctuation'],
            ['has_any_hyphen', 'has_degree_symbol', 'has_number'],
        ],
        parser=parse_hyphen_range_celsius_degree_values,
        to_parse="object",
        to_claim=None,  # let add_parsegroup() fall back
        explode=False,
        out_file="parse_hyphen_range_celsius_degree_values.ttl",
        subsetting="exact",
    ),

    # single values  -----------------------------------------------------
    dict(
        flag_sets=[
            ['has_celsius', 'has_degree_symbol', 'has_number', 'has_terminal_punctuation'],
            ['has_celsius', 'has_degree_symbol', 'has_number'],
            ['has_celsius', 'has_number', 'has_terminal_punctuation'],
            ['has_celsius', 'has_number'],
            ['has_degree_symbol', 'has_number', 'has_terminal_punctuation'],
            ['has_degree_symbol', 'has_number'],
        ],
        parser=parse_spot_celsius_degree_value,
        to_parse="object",
        to_claim=None,
        explode=False,
        out_file="parse_spot_celsius_degree_value.ttl",
        subsetting="exact",
    ),

    # categorical labels (matched with "at_least")  ----------------------
    dict(
        flag_sets=[
            ['has_ophil'],
            ['has_toleran'],
        ],
        parser=parse_categorical_label,
        to_parse="object",
        to_claim=None,
        explode=False,
        out_file="parse_categorical_label.ttl",
        subsetting="at_least",
    ),

    # rank 4  -----------------------------------------------------------
    dict(
        flag_sets=[
            ['has_to', 'has_celsius', 'has_degree_symbol', 'has_number', 'has_terminal_punctuation'],
            ['has_to', 'has_celsius', 'has_degree_symbol', 'has_number'],
            ['has_to', 'has_celsius', 'has_number', 'has_terminal_punctuation'],
            ['has_to', 'has_celsius', 'has_number'],
            ['has_to', 'has_degree_symbol', 'has_number', 'has_terminal_punctuation'],
            ['has_to', 'has_degree_symbol', 'has_number'],
        ],
        parser=parse_to_temperature_range,
        to_parse="object",
        to_claim=None,
        explode=False,
        out_file="parse_to_temperature_range.ttl",
        subsetting="exact",
    ),

    # rank 5 / 6 / 7  ---------------------------------------------------
    dict(
        flag_sets=[
            ['has_celsius', 'has_and', 'has_number', 'has_terminal_punctuation'],
            ['has_celsius', 'has_and', 'has_number'],
            ['has_celsius', 'has_degree_symbol', 'has_and', 'has_number', 'has_terminal_punctuation'],
            ['has_celsius', 'has_degree_symbol', 'has_and', 'has_number'],
            ['has_celsius', 'has_degree_symbol', 'has_internal_comma', 'has_number', 'has_terminal_punctuation'],
            ['has_celsius', 'has_degree_symbol', 'has_internal_comma', 'has_number'],
            ['has_celsius', 'has_degree_symbol', 'has_or', 'has_number', 'has_terminal_punctuation'],
            ['has_celsius', 'has_degree_symbol', 'has_or', 'has_number'],
            ['has_celsius', 'has_internal_comma', 'has_number', 'has_terminal_punctuation'],
            ['has_celsius', 'has_internal_comma', 'has_number'],
            ['has_celsius', 'has_or', 'has_number', 'has_terminal_punctuation'],
            ['has_celsius', 'has_or', 'has_number'],
            ['has_degree_symbol', 'has_and', 'has_number', 'has_terminal_punctuation'],
            ['has_degree_symbol', 'has_and', 'has_number'],
            ['has_degree_symbol', 'has_internal_comma', 'has_number', 'has_terminal_punctuation'],
            ['has_degree_symbol', 'has_internal_comma', 'has_number'],
            ['has_degree_symbol', 'has_or', 'has_number', 'has_terminal_punctuation'],
            ['has_degree_symbol', 'has_or', 'has_number'],
        ],
        parser=parse_spot_celsius_degree_value,
        to_parse="object_split",
        to_claim="object_intact",
        explode=True,
        out_file="parse_exploded_temperature_range.ttl",
        subsetting="exact",
    ),

    # rank 9 & 11  ------------------------------------------------------
    dict(
        flag_sets=[
            ['has_celsius', 'has_degree_symbol', 'has_init_above', 'has_above', 'has_number',
             'has_terminal_punctuation'],
            ['has_celsius', 'has_degree_symbol', 'has_init_above', 'has_above', 'has_number'],
            ['has_celsius', 'has_degree_symbol', 'has_init_below', 'has_below', 'has_number',
             'has_terminal_punctuation'],
            ['has_celsius', 'has_degree_symbol', 'has_init_below', 'has_below', 'has_number'],
            ['has_celsius', 'has_init_above', 'has_above', 'has_number', 'has_terminal_punctuation'],
            ['has_celsius', 'has_init_above', 'has_above', 'has_number'],
            ['has_celsius', 'has_init_below', 'has_below', 'has_number', 'has_terminal_punctuation'],
            ['has_celsius', 'has_init_below', 'has_below', 'has_number'],
            ['has_degree_symbol', 'has_init_above', 'has_above', 'has_number', 'has_terminal_punctuation'],
            ['has_degree_symbol', 'has_init_above', 'has_above', 'has_number'],
            ['has_degree_symbol', 'has_init_below', 'has_below', 'has_number', 'has_terminal_punctuation'],
            ['has_degree_symbol', 'has_init_below', 'has_below', 'has_number'],
        ],
        parser=parse_half_range_celsius_degree_value,
        to_parse="object",
        to_claim=None,
        explode=False,
        out_file="parse_half_range_celsius_degree_value.ttl",
        subsetting="exact",
    ),

    # combined above + below half-ranges, exploded (rank not determined) --
    dict(
        flag_sets=[
            [
                'has_above',
                'has_below',
                'has_celsius',
                'has_degree_symbol',
                'has_internal_comma',
                'has_number',
            ],
            [
                'has_above',
                'has_below',
                'has_celsius',
                'has_degree_symbol',
                'has_and',
                'has_number',
            ],
            [
                'has_above',
                'has_below',
                'has_celsius',
                'has_degree_symbol',
                'has_or',
                'has_number',
            ]
        ],
        parser=parse_half_range_celsius_degree_value,
        to_parse="object_split",
        to_claim="object_intact",
        explode=True,
        # distinct from the non-exploded half-range task above, which would
        # otherwise be overwritten if per-task serialisation is switched on
        out_file="parse_exploded_half_range_celsius_degree_value.ttl",
        subsetting="at_least",
        false_flags=['has_at', 'has_any_other_text'],
    ),

]



In [47]:
# ──────────────────────────────────────────────────────────────
# 2.  state shared by the main loop (parse, collect graphs)
# ──────────────────────────────────────────────────────────────
processed_idx = set()  # index labels of rows already matched by some task
cumulative = 0  # running row count across tasks
graphs = []  # one graph per task → later union



In [48]:
# Dispatch table: legal `subsetting` values → row-selector function.
_SUBSET_SELECTORS = {"exact": select_exact, "at_least": select_at_least}

for task in TASKS:
    # 2.1  show ranks once per flag-set
    for fs in task["flag_sets"]:
        print(f"{fs}  is ranked  {freq_rank_of_flags(fs)}")

    # -----------------------------------------------------------------
    # 2.2  matching rows (union of all flag-sets in the task)
    # -----------------------------------------------------------------
    false_flags = task.get("false_flags")  # None when the task has no exclusions

    select_rows = _SUBSET_SELECTORS.get(task["subsetting"])
    if select_rows is None:
        raise ValueError("Illegal subsetting value: "
                         f"{task['subsetting']!r}")

    matched = pd.concat(
        [
            select_rows(enriched, fs, false_flags=false_flags)
            for fs in task["flag_sets"]
        ]
    )

    # Book-keeping is done on *all* matched rows, before value-based
    # de-duplication: a row that is identical to another matched row is still
    # covered and must not show up as "unprocessed" later.  Only index labels
    # not seen in an earlier task are counted, so `cumulative` never counts a
    # row twice.  (Assumes the selectors keep the index labels of `enriched`.)
    newly_covered = set(matched.index) - processed_idx
    processed_idx.update(newly_covered)
    rows_added = len(newly_covered)

    # the parser only needs one copy of value-identical rows
    subset = matched.drop_duplicates()

    # optional explode for parsing only
    if task["explode"]:
        subset = explode_value_column(subset, value_col="object")

    # 2.3  run the domain parser → per-task graph
    g = parse_triple_dataframe(
        subset,
        s_col="subject",
        p_col="predicate",
        to_parse_col=task["to_parse"],
        to_claim_col=task["to_claim"],
        parser=task["parser"],
    )
    graphs.append(g)

    # # optional individual serialisation
    # g.serialize(task["out_file"], format="turtle")

    # accumulate coverage using the number of newly covered source rows
    cumulative += rows_added
    print("rows added (unique):", rows_added,
          "cumulative:", cumulative, "\n")



['has_any_hyphen', 'has_celsius', 'has_degree_symbol', 'has_number', 'has_terminal_punctuation']  is ranked  13
['has_any_hyphen', 'has_celsius', 'has_degree_symbol', 'has_number']  is ranked  1
['has_any_hyphen', 'has_celsius', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_any_hyphen', 'has_celsius', 'has_number']  is ranked  19
['has_any_hyphen', 'has_degree_symbol', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_any_hyphen', 'has_degree_symbol', 'has_number']  is ranked  None


rows added (unique): 2910 cumulative: 2910 

['has_celsius', 'has_degree_symbol', 'has_number', 'has_terminal_punctuation']  is ranked  26
['has_celsius', 'has_degree_symbol', 'has_number']  is ranked  2
['has_celsius', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_celsius', 'has_number']  is ranked  11
['has_degree_symbol', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_degree_symbol', 'has_number']  is ranked  25


rows added (unique): 2302 cumulative: 5212 

['has_ophil']  is ranked  None
['has_toleran']  is ranked  None
rows added (unique): 630 cumulative: 5842 

['has_to', 'has_celsius', 'has_degree_symbol', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_to', 'has_celsius', 'has_degree_symbol', 'has_number']  is ranked  4
['has_to', 'has_celsius', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_to', 'has_celsius', 'has_number']  is ranked  22
['has_to', 'has_degree_symbol', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_to', 'has_degree_symbol', 'has_number']  is ranked  None


rows added (unique): 432 cumulative: 6274 

['has_celsius', 'has_and', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_celsius', 'has_and', 'has_number']  is ranked  None
['has_celsius', 'has_degree_symbol', 'has_and', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_celsius', 'has_degree_symbol', 'has_and', 'has_number']  is ranked  6
['has_celsius', 'has_degree_symbol', 'has_internal_comma', 'has_number', 'has_terminal_punctuation']  is ranked  10
['has_celsius', 'has_degree_symbol', 'has_internal_comma', 'has_number']  is ranked  5
['has_celsius', 'has_degree_symbol', 'has_or', 'has_number', 'has_terminal_punctuation']  is ranked  25
['has_celsius', 'has_degree_symbol', 'has_or', 'has_number']  is ranked  7
['has_celsius', 'has_internal_comma', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_celsius', 'has_internal_comma', 'has_number']  is ranked  None
['has_celsius', 'has_or', 'has_number', 'has_terminal_punctuation']  is ranked  N

rows added (unique): 885 cumulative: 7159 

['has_celsius', 'has_degree_symbol', 'has_init_above', 'has_above', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_celsius', 'has_degree_symbol', 'has_init_above', 'has_above', 'has_number']  is ranked  9
['has_celsius', 'has_degree_symbol', 'has_init_below', 'has_below', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_celsius', 'has_degree_symbol', 'has_init_below', 'has_below', 'has_number']  is ranked  10
['has_celsius', 'has_init_above', 'has_above', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_celsius', 'has_init_above', 'has_above', 'has_number']  is ranked  None
['has_celsius', 'has_init_below', 'has_below', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_celsius', 'has_init_below', 'has_below', 'has_number']  is ranked  None
['has_degree_symbol', 'has_init_above', 'has_above', 'has_number', 'has_terminal_punctuation']  is ranked  None
['has_degree_symbol', 'has_in

In [49]:
# ──────────────────────────────────────────────────────────────
# 3.  after-the-fact union  →  MASTER graph
# ──────────────────────────────────────────────────────────────
# start from an empty rdflib Graph; the per-task graphs are merged into it below
master = Graph()


In [50]:
# merge every per-task graph collected in `graphs` into `master`, in place
for g in graphs:
    master += g  # rdflib unions duplicates automatically


In [51]:

# write the merged graph in Turtle format to `ttl_out`
# (`ttl_out` is not defined in this part of the notebook — presumably an output path set earlier; TODO confirm)
master.serialize(ttl_out, format="turtle")


<Graph identifier=N6b4f360dad75433b8de8ec1657e1cbca (<class 'rdflib.graph.Graph'>)>

In [52]:
# Fraction of `enriched` rows matched by at least one task.  Based on the set
# of processed index labels (the same set that defines `unprocessed`), so a
# row matched by several tasks, or a re-run of the loop cell, cannot push the
# figure above the true share.
print("overall coverage:", len(processed_idx) / len(enriched))

overall coverage: 0.9413066111400312


In [53]:
# rows of `enriched` whose index label was never recorded in `processed_idx`
already_processed = enriched.index.isin(processed_idx)
unprocessed = enriched.loc[~already_processed]

In [54]:
# (rows, columns) of the rows no task matched
unprocessed.shape

(451, 39)

In [55]:
# export the unmatched rows for manual inspection (index labels are not written)
# NOTE(review): the helper column `_flag_set` is still part of this frame and is written too — confirm that is intended
# (`unparsed_out` is not defined in this part of the notebook — presumably an output path set earlier; TODO confirm)
unprocessed.to_csv(unparsed_out, index=False)

In [56]:
# alphabetical list of all columns, including the `has_*` flags and the `_flag_set` helper
sorted(enriched.columns)

['_flag_set',
 'graph',
 'has_above',
 'has_and',
 'has_any_hyphen',
 'has_any_initial_hyphen',
 'has_any_other_text',
 'has_at',
 'has_below',
 'has_celsius',
 'has_degree_symbol',
 'has_inequality_symbol',
 'has_init_above',
 'has_init_below',
 'has_internal_above',
 'has_internal_below',
 'has_internal_comma',
 'has_number',
 'has_ophil',
 'has_or',
 'has_parentheses',
 'has_plus_minus',
 'has_poor',
 'has_qualifier_word',
 'has_range_keyword',
 'has_slash_separator',
 'has_terminal_punctuation',
 'has_time_expression',
 'has_to',
 'has_toleran',
 'has_unit_fahrenheit',
 'has_unit_kelvin',
 'has_up_to',
 'has_weak',
 'has_word_degree',
 'object',
 'pattern_id',
 'predicate',
 'subject']