In [None]:
import pandas as pd
import re
from typing import List, Tuple

In [None]:
# Path to the source CSV, given relative to the notebook's working directory.
# NOTE(review): the name `input` shadows Python's built-in input() for the
# rest of the session; consider renaming it together with every cell that reads it.
input = "../assets/n4l-temperature.csv"

In [None]:
# Load the CSV at the configured path into a DataFrame (pandas default parsing options).
df = pd.read_csv(input)

In [None]:
# df

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
def filter_rows(
        df: pd.DataFrame,
        column_name: str,
        list_of_patterns: List[str],
        case_sensitive: bool = False
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split `df` into two DataFrames based on whether *column_name*
    fully matches **any** pattern in *list_of_patterns*.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    column_name : str
        Column whose cells should be tested.  Values are converted to
        text with ``astype(str)`` before matching.
    list_of_patterns : List[str]
        Regex patterns.  Each one is wrapped as ``^(?:pattern)$`` so it
        must match the whole cell, even when the pattern itself contains
        a top-level ``|``.  Patterns that already carry ``^``/``$``
        anchors keep working unchanged.  An empty list matches nothing.
    case_sensitive : bool, default False
        If False (default) matching is case-insensitive (re.IGNORECASE);
        if True, matching is case-sensitive.

    Returns
    -------
    (matches, non_matches) : Tuple[pd.DataFrame, pd.DataFrame]
        • matches     – rows whose column value matches at least
                        one wrapped pattern
        • non_matches – the remaining rows, including every row whose
                        value in *column_name* is missing (NaN / None)

    Raises
    ------
    KeyError
        If *column_name* is not a column of `df`.
    re.error
        If the combined pattern is not a valid regular expression.
    """
    column = df[column_name]

    if not list_of_patterns:
        # Joining zero patterns would give "", which matches every string;
        # "no patterns" must mean "no matches" instead.
        mask = pd.Series(False, index=df.index, dtype=bool)
        return df[mask].copy(), df[~mask].copy()

    # Group each pattern before anchoring: without the (?:...) a pattern
    # such as "a|b" would become "^a|b$" and stop being a full match.
    wrapped = [f"^(?:{pat})$" for pat in list_of_patterns]
    flags = 0 if case_sensitive else re.IGNORECASE
    combined = re.compile("|".join(wrapped), flags=flags)

    matched = column.astype(str).str.match(combined, na=False)
    # astype(str) can turn missing values into the text "nan"/"None";
    # exclude them explicitly so a missing cell never counts as a match.
    mask = column.notna() & matched
    return df[mask].copy(), df[~mask].copy()

In [None]:
# Regex fragments describing cell values that count as well-formed temperature
# expressions or bare phenotype labels.  They are written without ^/$ anchors
# (only the last entry carries its own trailing $).
# NOTE(review): several entries overlap and one is an exact duplicate (see
# inline notes); they could be pruned without changing what is accepted.
patterns = [
    # Two or more values joined by commas / "and" / "or", optionally ending
    # with a range introduced by "to" or a dash
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?(?:\s*(?:,|and|or)\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?)+(?:\s*(?:to|[-–—])\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?)?\s*[,.;]?",

    # One or more dash ranges separated by commas, e.g. 25–30°C, 40–50°C
    r"\s*(?:-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*[-–—]\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?)(?:\s*,\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*[-–—]\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?)*\s*[,.;]?",

    # Exactly two values joined by "and" or "or"; accepts º as well as °,
    # and a bare "C" without any degree sign
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:[°º]?\s*C?)?\s*(?:and|or)\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:[°º]?\s*C?)?[,.;]?",

    # One or more comma-separated values
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?(?:\s*,\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?)*\s*[,.;]?",

    # Single value
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?[,.;]?",

    # Simple range using "to" or a dash
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*(?:to|[-–—])\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?[,.;]?",

    # Exactly two values separated by a comma (already accepted by the
    # comma-separated list pattern above)
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*,\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?[,.;]?",

    # Two values joined by "and" / by "or", one pattern each (already accepted
    # by the combined and/or pattern above)
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*and\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?[,.;]?",
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*or\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?[,.;]?",

    # Single value introduced by a leading qualifier:
    # above / below / max[:] / optimum / tmax: / up to
    r"\s*above\s+-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:[°º]?\s*C?)?\s*[,.;]?",
    r"\s*below\s+-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:[°º]?\s*C?)?\s*[,.;]?",
    r"\s*max:?\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:[°º]?\s*C?)?\s*[,.;]?",
    r"\s*optimum\s+-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:[°º]?\s*C?)?\s*[,.;]?",
    r"\s*tmax:\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:[°º]?\s*C?)?\s*[,.;]?",
    r"\s*up\s+to\s+-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:[°º]?\s*C?)?\s*[,.;]?",

    # Bare phenotypic labels, optionally followed by a comma or a period
    r"cold-adapted[\,\.]?",
    r"extremely thermophilic[\,\.]?",
    r"hyperthermophilic[\,\.]?",
    r"mesophile[\,\.]?",
    r"mesophiles[\,\.]?",
    r"mesophilic[\,\.]?",
    r"moderate thermophile[\,\.]?",
    r"moderately thermoacidophilic[\,\.]?",
    r"moderately thermophilic[\,\.]?",
    r"moderately thermotolerant[\,\.]?",
    r"obligate thermophile[\,\.]?",
    r"obligately thermophilic[\,\.]?",
    r"psychrophilic[\,\.]?",
    r"psychrotolerant[\,\.]?",
    r"psychrotrophic[\,\.]?",
    r"slightly thermophilic[\,\.]?",
    r"strictly psychrophilic[\,\.]?",
    r"strictly thermophilic[\,\.]?",
    r"thermoacidophilic[\,\.]?",
    r"thermophile[\,\.]?",
    r"thermophilic[\,\.]?",
    r"thermotolerant[\,\.]?",
    # I. Single value followed by a parenthesised qualifier
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*[°º]?\s*C?\s*\([^)]+\)\s*[,.;]?",

    # II. Comma-separated list of two or more values, each optionally
    #     followed by a parenthesised qualifier
    r"(?:-?(?:\d+(?:\.\d+)?|\.\d+)\s*[°º]?\s*C?(?:\s*\([^)]+\))?)(?:\s*,\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*[°º]?\s*C?(?:\s*\([^)]+\))?){1,}\s*[,.;]?",

    # III. Value preceded by a comparison operator: > < ≥ ≤ ⩾ ⩽
    r"\s*[><≥≤⩾⩽]\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*[°º]?\s*C?\s*[,.;]?",

    # IV. Dash range followed by a parenthesised qualifier
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*[°º]?\s*C?\s*[-–—‒]\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*[°º]?\s*C?\s*\([^)]+\)\s*[,.;]?",

    # V. "minimum" / "maximum" / "optimum" (optionally followed by "about")
    #    with a single value or a dash range
    r"\s*(?:minimum|maximum|optimum)\s*(?:about\s*)?-?(?:\d+(?:\.\d+)?|\.\d+)(?:\s*[-–—‒]\s*-?(?:\d+(?:\.\d+)?|\.\d+))?\s*[°º]?\s*C?\s*[,.;]?",

    # VI. Value (optionally preceded by "at") held for a whole number of
    #     minutes, e.g. "at 60°C for 30 min"
    r"\s*(?:at\s+)?-?(?:\d+(?:\.\d+)?|\.\d+)\s*[°º]?\s*C?\s+for\s+\d+\s*(?:min|minutes)\s*[,.;]?",

    # VII. Value with a parenthesised qualifier -- exact duplicate of pattern I
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*[°º]?\s*C?\s*\([^)]+\)\s*[,.;]?",
    # 1. "between X and Y °C"
    r"\s*between\s+-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*and\s+-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*[,.;]?",

    # 2. "minimum X–Y°C" or "maximum X–Y°C" (dash set includes Unicode minus)
    r"\s*(?:minimum|maximum)\s+-?(?:\d+(?:\.\d+)?|\.\d+)\s*[-–—‒−]\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*[,.;]?",

    # 3. "X°C or above Y°C" / "X°C or below Y°C"
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*or\s+(?:above|below)\s+-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*[,.;]?",

    # 4. "X to Y °C" with dash or "to" (dash set includes Unicode minus);
    #    a superset of the simple range pattern above
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*(?:to|[-–—‒−])\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*(?:°\s*C?)?\s*[,.;]?",

    # 5. Isolated value that must end with a period, e.g. "50 ºC.";
    #    this entry already carries its own trailing $ anchor
    r"\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*[°º]?\s*C?\.\s*$",
]


In [None]:
# Partition the loaded rows by whether the "object" column is fully described
# by one of the temperature / phenotype patterns (case-insensitively).
matches, non_matches = filter_rows(
    df=df,
    column_name="object",
    list_of_patterns=patterns,
    case_sensitive=False,
)

In [None]:
# (rows, columns) of the rows whose "object" value matched a pattern.
matches.shape

In [None]:
# (rows, columns) of the rows that matched none of the patterns.
non_matches.shape

In [None]:
# Inspect the rows that no pattern accepted.
# NOTE(review): this renders the whole frame; .head() or .sample() keeps the output small.
non_matches

In [None]:
# Canonical temperature-preference category -> regex recognising that label
# or one of its listed synonyms.  Each regex is wrapped in .* on both sides so
# the label may sit anywhere within a single line of text.
# The categories are not mutually exclusive: the bare "thermophil(e|ic)"
# alternative also fires inside "moderately thermophilic" (thermotolerant)
# and "extremely thermophilic" (hyperthermophile), so such text yields two hits.
categorical_patterns = {
    "psychrophile":  r".*\b(?:psychrophil(?:e|ic)|psychrotolerant|psychrotrophic|cold[- ]adapted)\b.*",
    "mesophile":     r".*\bmesophil(?:e|es|ic)\b.*",
    "thermotolerant":r".*\b(?:thermotolerant|moderate(?:ly)? thermophil(?:e|ic)|slightly thermophil(?:e|ic))\b.*",
    "thermophile":   r".*\b(?:thermophil(?:e|ic)|obligate(?:ly)? thermophil(?:e|ic)|strictly thermophil(?:e|ic))\b.*",
    "hyperthermophile": r".*\b(?:hyperthermophil(?:e|ic)|extreme(?:ly)? thermophil(?:e|ic))\b.*",
}

In [None]:
def extract_category(row: str) -> List[str]:
    """
    Return the canonical categories whose pattern occurs in *row*.

    Parameters
    ----------
    row : str
        Free-text cell value to classify.

    Returns
    -------
    List[str]
        Keys of ``categorical_patterns`` (in dictionary order) whose regex
        is found in *row*, ignoring case.  May be empty or contain several
        entries, because the category patterns overlap.
    """
    # re.search rather than re.match: the patterns start with ".*", which
    # cannot cross a newline, so anchoring at position 0 would miss a label
    # that appears on the second or a later line of a multi-line cell.
    return [
        canon
        for canon, pat in categorical_patterns.items()
        if re.search(pat, row, flags=re.IGNORECASE)
    ]

In [None]:
# Tag every unmatched row with the list of categories found in its "object" text.
non_matches["temperature_category"] = (
    non_matches["object"]
    .astype(str)
    .map(extract_category)
)

In [None]:
# Show the unmatched rows together with their new temperature_category column.
non_matches