In [25]:
import pandas as pd
import re
from pydantic import BaseModel, Field
from typing import Optional, List
from oaklib import get_adapter
from quantulum3 import parser as quant_parser

from tqdm.notebook import tqdm

In [2]:
# Load input: path of the CSV file to parse. A later cell reads its `raw_text` column.
input_csv = "parsed_salinity.csv"


In [3]:
# Read the input CSV into a DataFrame.
df = pd.read_csv(input_csv)

In [4]:
# Ontology adapters for individual ontologies: short ontology names, each of which is
# expanded into a "sqlite:obo:<name>" adapter selector when the adapters are created.
ontology_sources = ["envo", "chebi", "pato"]


In [5]:
# Build one OAK adapter per ontology listed in `ontology_sources`.
# The recorded cell output shows the envo/chebi/pato database archives being downloaded here.
# NOTE(review): in this notebook only the commented-out draft of extract_metadata
# references `oak_adapters` — confirm whether the adapters are still needed.
oak_adapters = [get_adapter(f"sqlite:obo:{src}") for src in ontology_sources]

Downloading envo.db.gz: 0.00B [00:00, ?B/s]

Downloading chebi.db.gz: 0.00B [00:00, ?B/s]

Downloading pato.db.gz: 0.00B [00:00, ?B/s]

In [26]:
# Custom rule-based regex patterns for open-ended percentage ranges.
# Each entry is (compiled pattern, extractor); the extractor turns a match into a
# (range_min, range_max) tuple, using None for a side that has no bound.
#
# A number is digits with an optional fraction ("4", "4.", "0.5", ".5"). Unlike a bare
# [\d.]+ it can never capture strings such as "1.2.3" or "." that float() rejects, so the
# extractors cannot raise ValueError. Matching is case-insensitive so that phrases at the
# start of a sentence ("Up to 4%") are recognised as well.
_NUMBER = r"(\d+(?:\.\d*)?|\.\d+)"
range_patterns = [
    (re.compile(rf"up to {_NUMBER}\s*%", re.IGNORECASE), lambda m: (0, float(m.group(1)))),
    (re.compile(rf"below {_NUMBER}\s*%", re.IGNORECASE), lambda m: (0, float(m.group(1)))),
    (re.compile(rf"more than {_NUMBER}\s*%", re.IGNORECASE), lambda m: (float(m.group(1)), None)),
]

In [27]:
# Pydantic model for structured output
class SaltConditionMetadata(BaseModel):
    """Structured record describing one free-text salt-condition string.

    Only ``raw_text`` is required. Every other field defaults to ``None`` and is
    filled in by the parsing step when the corresponding information is found.
    """

    # Unit inferred for the record as a whole (from parsed quantities or leftover text).
    consensus_unit: Optional[str] = None
    # The original, unmodified input string.
    raw_text: str
    # Single concentration value, when the text gives a point value.
    concentration_value: Optional[float] = None
    # Unit name reported for the parsed quantity.
    concentration_unit: Optional[str] = None
    # Qualifier attached to the concentration (the parser sets "w/v").
    concentration_qualifier: Optional[str] = None
    # Lower / upper bound of a concentration range; None means that side is unbounded or unknown.
    concentration_range_min: Optional[float] = None
    concentration_range_max: Optional[float] = None
    # NOTE(review): the label/CURIE pairs below look like ontology annotations for the
    # compound, the growth medium and the phenotype; the active extract_metadata in this
    # notebook never assigns them — confirm whether they are still needed.
    compound_label: Optional[str] = None
    compound_curie: Optional[str] = None
    medium_label: Optional[str] = None
    medium_curie: Optional[str] = None
    phenotype_label: Optional[str] = None
    phenotype_curie: Optional[str] = None
    # Growth strength modifier found in the text (the parser sets "weak").
    growth_modifier: Optional[str] = None
    # Space-joined tokens of raw_text that no parsing rule accounted for.
    unparsed_text: Optional[str] = None

In [12]:
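# NOTE: earlier draft of extract_metadata, kept commented out for reference only.
# It is superseded by the definition in the following cell.
# def extract_metadata(text: str) -> SaltConditionMetadata: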
#     result = SaltConditionMetadata(raw_text=text)
#     consumed_spans = []
#
#     # Quantities (e.g., 3%)
#     for q in quant_parser.parse(text):
#         result.concentration_value = q.value
#         if q.unit:
#             result.concentration_unit = q.unit.name
#         if q.surface.lower().endswith("(w/v)") or "wt/vol" in q.surface.lower():
#             result.concentration_qualifier = "w/v"
#         consumed_spans.append(q.span)
#
#     # Range phrases
#     for pattern, extractor in range_patterns:
#         match = pattern.search(text)
#         if match:
#             result.concentration_range_min, result.concentration_range_max = extractor(match)
#             consumed_spans.append(match.span())
#             break
#
#     # OAK annotations (chemical, medium, phenotype)
#     for oak in oak_adapters:
#         annotations = list(oak.annotate_text(text))
#         for ann in annotations:
#             label = getattr(ann, "subject_label", None)
#             curie = getattr(ann, "subject", None)
#             matched = getattr(ann, "match_string", None)
#             span = getattr(ann, "start", None), getattr(ann, "end", None)
#             consumed_spans.append(span)
#
#             if "NaCl" in matched or "salt" in matched:
#                 result.compound_label = label
#                 result.compound_curie = curie
#             elif "marine" in matched or "medium" in matched:
#                 result.medium_label = label
#                 result.medium_curie = curie
#             elif "halo" in matched or "tolerant" in matched:
#                 result.phenotype_label = label
#                 result.phenotype_curie = curie
#
#     # Growth modifier (weak/strong/etc)
#     if "weak" in text:
#         result.growth_modifier = "weak"
#         consumed_spans.append((text.find("weak"), text.find("weak") + 4))
#
#     # Unparsed text
#     tokens = [(m.start(), m.end()) for m in re.finditer(r"\S+", text)]
#     unexplained = []
#     for start, end in tokens:
#         if not any((cs is not None and ce is not None and (cs <= start < ce or cs < end <= ce)) for cs, ce in consumed_spans):
#             unexplained.append(text[start:end])
#     if unexplained:
#         result.unparsed_text = " ".join(unexplained)
#
#     return result


In [28]:
# Quantulum3 unit names (lower-cased) that are known misreadings of concentration
# notation, mapped to the unit label that should be reported instead.
_UNIT_CORRECTIONS = {
    "metre": "molar",
    "millimetre": "molar",
    "mole": "molar",
    "metre mole": "molar",
    "millimolar": "mM",
    "mole per litre": "mol/L",
    "mol/i": "mol/L",
    "nanoampere centilitre": "molar",
}

# Substrings whose presence in the unparsed text suggests that a unit is mentioned there.
_UNIT_HINTS = ("molar", "mol/l", "m/l", "mmol", "mm", "g/l", "%")


def _has_wv_qualifier(surface: str) -> bool:
    """Return True if a quantity's surface text carries a weight/volume marker."""
    lowered = surface.lower()
    return lowered.endswith("(w/v)") or "wt/vol" in lowered


def extract_metadata(text: str) -> SaltConditionMetadata:
    """Parse one free-text salt-condition string into a ``SaltConditionMetadata`` record.

    Processing steps:

    1. Quantities are extracted with Quantulum3. When several quantities are found,
       the *last* one populates the concentration fields and only its span counts as
       consumed, so earlier quantities remain visible in ``unparsed_text``.
       A quantity with an uncertainty is reported as a range (value +/- uncertainty)
       and leaves ``concentration_value`` empty.
    2. The first matching entry of ``range_patterns`` (if any) overrides the range bounds.
    3. The word "weak" sets ``growth_modifier``.
    4. Whitespace-separated tokens not covered by a consumed span are joined into
       ``unparsed_text``.
    5. ``consensus_unit`` is the lower-cased unit of the first parsed quantity, or,
       failing that, a coarse guess based on unit hints in the unparsed text.

    Args:
        text: The raw description to parse.

    Returns:
        The populated record; fields for which nothing was found stay ``None``.
        Text without any parsable quantity is handled and yields empty concentration
        fields instead of raising.
    """
    result = SaltConditionMetadata(raw_text=text)
    consumed_spans = []
    parsed_units = []

    # Quantities (e.g., 3%)
    quantities = quant_parser.parse(text)
    for quantity in quantities:
        if quantity.unit:
            parsed_units.append(quantity.unit.name)
        if _has_wv_qualifier(quantity.surface):
            result.concentration_qualifier = "w/v"

    # Guard: without this check, text containing no quantity would fail on an unbound name.
    if quantities:
        last = quantities[-1]

        # An uncertainty implies a range, so no single value is reported in that case.
        if last.uncertainty:
            result.concentration_range_min = last.value - last.uncertainty
            result.concentration_range_max = last.value + last.uncertainty
        else:
            result.concentration_value = last.value

        # Fix known misinterpretations from Quantulum3; unknown units pass through unchanged.
        if last.unit:
            unit_name = last.unit.name
            result.concentration_unit = _UNIT_CORRECTIONS.get(unit_name.lower(), unit_name)

        consumed_spans.append(last.span)

    # Range phrases: the first matching pattern wins and overrides the bounds set above.
    for pattern, extractor in range_patterns:
        match = pattern.search(text)
        if match:
            result.concentration_range_min, result.concentration_range_max = extractor(match)
            consumed_spans.append(match.span())
            break

    # Growth modifier (weak / strong / etc); only "weak" is recognised.
    weak_at = text.find("weak")
    if weak_at != -1:
        result.growth_modifier = "weak"
        consumed_spans.append((weak_at, weak_at + 4))

    # Unparsed text: keep every token whose start or end does not fall inside a consumed span.
    unexplained = []
    for token in re.finditer(r"\S+", text):
        start, end = token.start(), token.end()
        covered = any(
            cs is not None and ce is not None and (cs <= start < ce or cs < end <= ce)
            for cs, ce in consumed_spans
        )
        if not covered:
            unexplained.append(text[start:end])
    if unexplained:
        result.unparsed_text = " ".join(unexplained)

    # Infer consensus_unit from structured + unparsed evidence
    if parsed_units:
        result.consensus_unit = parsed_units[0].lower()
    elif result.unparsed_text:
        leftover = result.unparsed_text.lower()
        if any(hint in leftover for hint in _UNIT_HINTS):
            # NOTE(review): every hint other than "molar" (including "g/l" and "mmol")
            # currently falls back to "%"; confirm that this coarse mapping is intended.
            result.consensus_unit = "molar" if "molar" in leftover else "%"

    return result

In [29]:
# Parse every non-null raw_text entry, reporting progress while the rows are processed.
results = list(
    map(extract_metadata, tqdm(df["raw_text"].dropna(), desc="Parsing salt conditions"))
)


Parsing salt conditions:   0%|          | 0/4308 [00:00<?, ?it/s]

In [22]:
# Convert each parsed record to a plain dict and assemble them into a single DataFrame
# (one row per parsed text, one column per model field).
structured_df = pd.DataFrame([r.model_dump() for r in results])

In [30]:
# Display the structured table for a visual sanity check.
structured_df

Unnamed: 0,raw_text,concentration_value,concentration_unit,concentration_qualifier,concentration_range_min,concentration_range_max,compound_label,compound_curie,medium_label,medium_curie,phenotype_label,phenotype_curie,growth_modifier,unparsed_text
0,0–2 %,1.00,dimensionless,,0.0,2.0,,,,,,,,%
1,0–2 %,1.00,dimensionless,,0.0,2.0,,,,,,,,%
2,0.5% and 21%,21.00,percentage,,,,,,,,,,,0.5% and
3,more than 7 %,7.00,dimensionless,,7.0,,,,,,,,,
4,up to 4% (w/v),4.00,percentage,,0.0,4.0,,,,,,,,(w/v) [POSSIBLE QUALIFIER PRESENT]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4303,0.01,0.01,dimensionless,,,,,,,,,,,
4304,0,0.00,dimensionless,,,,,,,,,,,
4305,0,0.00,dimensionless,,,,,,,,,,,
4306,2-3%,2.50,percentage,,2.0,3.0,,,,,,,,


In [31]:
# Write the structured table as tab-separated values, omitting the DataFrame index column.
structured_df.to_csv("structured_salt_conditions.tsv", sep="\t", index=False)