In [None]:
import os
import re
from pathlib import Path
from urllib.parse import quote, urlparse

import pandas as pd
from rdflib import Dataset, Literal, URIRef

In [None]:
# Works both interactively and under CLI execution (e.g., papermill):
# use this file's directory when `__file__` is defined, otherwise the current working directory.
if "__file__" in globals():
    notebook_path = Path(__file__).parent
else:
    notebook_path = Path.cwd()

# The project root is whichever of (notebook directory, its parent) contains `assets/`;
# the notebook directory is checked first and the parent is the fallback.
if notebook_path.joinpath("assets").is_dir():
    project_root = notebook_path
else:
    project_root = notebook_path.parent
assets_dir = project_root.joinpath("assets")

In [None]:
# Directory under `assets/` from which the N4L source files are read
n4l_data_directory = assets_dir.joinpath("N4L_phenotypic_ontology_2016")

In [None]:
# Tab-separated file describing how each Excel sheet should be parsed
xlsx_config_path = assets_dir.joinpath("n4l-xlsx-parsing-config.tsv")

In [None]:
# CSV that maps original predicates to their normalized form
predicate_mapping_normalization_file = assets_dir.joinpath("n4l_predicate_mapping_normalization.csv")

In [None]:
# Destination of the serialized N-Quads output
nq_out = project_root.joinpath("local", "n4l-tables.nq")

In [None]:
# Base namespace prepended to every subject, predicate and named-graph identifier built in this notebook.
n4l_prefix = "http://example.com/n4l/"

In [None]:
# Load the predicate normalization table; its `original_predicate` and
# `normalized_predicate` columns are turned into a lookup dict further down.
mapping_df = pd.read_csv(predicate_mapping_normalization_file)

In [None]:
# One config dict per delimited-text table: the file name, its location inside
# `n4l_data_directory`, the column holding row identifiers, and the field delimiter.
delimited_text_configs = [
    {
        "filename": source_name,
        "path": f"{n4l_data_directory}/{source_name}",
        "id_column": source_id_column,
        "delimiter": source_delimiter,
    }
    for source_name, source_id_column, source_delimiter in (
        ("N4L_Taxonomy_20220802_pruned.tsv", "N4LID", "\t"),
        ("N4L_Taxonomy_20220802.tsv", "N4LID", "\t"),
        ("reference_id_mapping.csv", "refid", ","),
    )
]

In [None]:
def safe_iri_component(value: str) -> str:
    """Turn an arbitrary string into a single percent-encoded IRI path component.

    Leading/trailing whitespace is removed, every run of whitespace and/or
    colons collapses to one underscore, and the result is percent-encoded
    with the underscore as the only extra character left unescaped.
    """
    trimmed = value.strip()
    underscored = re.sub(r"[\s:]+", "_", trimmed)
    return quote(underscored, safe="_")

In [None]:
def str_to_bool(val):
    """Interpret a config cell as a boolean flag.

    Missing values (None/NaN) are False. Anything else is stringified,
    trimmed and lower-cased, and is True only for "1", "true", "yes", "y" or "t".
    """
    truthy_tokens = ("1", "true", "yes", "y", "t")
    if not pd.isna(val):
        return str(val).strip().lower() in truthy_tokens
    return False

In [None]:
def process_transposed_sheet(df, id_column):
    """Melt a sheet whose records are laid out as columns rather than rows.

    The frame is transposed, the first transposed row becomes the header,
    rows without a value in `id_column` are discarded, and the remainder is
    returned in long form with columns `id_column`, "predicate" and "object_value".

    Raises:
        ValueError: if `id_column` is not among the transposed headers.
    """
    flipped = df.T
    header = flipped.iloc[0]
    flipped.columns = header
    records = flipped[1:].reset_index(drop=True)

    if id_column in records.columns:
        return records.dropna(subset=[id_column]).melt(
            id_vars=[id_column], var_name="predicate", value_name="object_value"
        )
    raise ValueError(f"'{id_column}' not found in transposed headers")

In [None]:
def process_standard_sheet(df, id_column):
    """Melt a conventionally oriented sheet (one record per row) into long form.

    Rows without a value in `id_column` are discarded; the result has the
    columns `id_column`, "predicate" and "object_value".

    Raises:
        ValueError: if `id_column` is not one of the frame's columns.
    """
    if id_column in df.columns:
        return df.dropna(subset=[id_column]).melt(
            id_vars=[id_column], var_name="predicate", value_name="object_value"
        )
    raise ValueError(f"'{id_column}' not found in standard headers")

In [None]:
def safe_object_term(val):
    """Convert a cell value into an RDF object term.

    Args:
        val: any cell value (string, number, NaN, ...).

    Returns:
        None for missing values; a `URIRef` when the trimmed string is an
        http(s) URL with a network location and no character that would make
        it unserialisable as an IRI; otherwise a `Literal` of the trimmed string.
    """
    if pd.isna(val):
        return None
    val = str(val).strip()

    # urlparse raises ValueError on malformed input such as an unbalanced "[".
    # Such a value is plain text, not a reason to abort.
    try:
        parsed = urlparse(val)
    except ValueError:
        return Literal(val)

    # Characters that may not appear inside an N-Triples/N-Quads IRI. The space
    # character, other whitespace and control characters are screened below.
    invalid_iri_chars = '<>"{}|\\^`'
    looks_like_iri = (
        parsed.scheme in ("http", "https")
        and bool(parsed.netloc)
        and not any(ch.isspace() or ord(ch) < 0x20 or ch in invalid_iri_chars for ch in val)
    )
    if looks_like_iri:
        try:
            return URIRef(val)
        except Exception:
            # Best effort: fall back to a literal rather than lose the value.
            return Literal(val)
    return Literal(val)

In [None]:
# Accumulators filled by the ingestion loops: usable long-form frames, and rows set aside during QC.
melted_frames, melted_dropped_frames = [], []

In [None]:
# Convert each delimited-text table into long-form (subject, predicate, object_value) rows.
# Usable rows are appended to `melted_frames`; rows rejected during QC (missing or
# non-unique IDs) are melted and appended to `melted_dropped_frames` with a `drop_reason`.
for config in delimited_text_configs:
    try:
        df = pd.read_csv(config["path"], sep=config["delimiter"], low_memory=False)

        # Every row from this file goes into a named graph derived from the file name
        graph_iri = f"{n4l_prefix}{safe_iri_component(config['filename'].strip())}"

        # Remove exact duplicates before anything else
        before = df.shape[0]
        df = df.drop_duplicates()
        after = df.shape[0]
        if after < before:
            print(f"[QC] {config['filename']} - Removed {before - after} fully duplicated rows")

        id_column = config["id_column"]
        if id_column not in df.columns:
            print(
                f"[ERROR] {config['filename']} - ID column '{id_column}' not found. Available columns: {df.columns.tolist()}")
            continue

        # Drop and log missing ID rows
        missing_id_rows = df[df[id_column].isna()]
        if not missing_id_rows.empty:
            print(f"[QC] {config['filename']} - Dropped {len(missing_id_rows)} rows missing '{id_column}'")
            melted_missing = missing_id_rows.melt(var_name="predicate", value_name="object_value")
            melted_missing["subject"] = None
            melted_missing["source_file"] = config["filename"]
            melted_missing["drop_reason"] = "missing_id"
            melted_missing["graph"] = graph_iri
            melted_dropped_frames.append(melted_missing)

        df = df.dropna(subset=[id_column])

        # Drop and log duplicated IDs (every row sharing an ID is removed, not just the repeats)
        duplicated_mask = df[id_column].duplicated(keep=False)
        if duplicated_mask.any():
            duplicated_ids = df.loc[duplicated_mask, id_column].unique()
            print(
                f"[DUPLICATES] {config['filename']} - {duplicated_mask.sum()} duplicate rows on '{id_column}' → {duplicated_ids.tolist()}")
            # Melt with the ID as id_vars so every melted row carries its own ID.
            # Building the subject column separately with `repeat` gave the wrong
            # length (and order), which raised and caused the whole file to be skipped.
            melted_dupes = (
                df.loc[duplicated_mask]
                .melt(id_vars=[id_column], var_name="predicate", value_name="object_value")
                .rename(columns={id_column: "subject"})
            )
            melted_dupes["source_file"] = config["filename"]
            melted_dupes["drop_reason"] = "duplicate_id"
            melted_dupes["graph"] = graph_iri
            melted_dropped_frames.append(melted_dupes)
            df = df[~duplicated_mask]
        else:
            print(f"[DUPLICATES] {config['filename']} - No duplicates in '{id_column}'")

        # Melt and append: the ID becomes the subject, every other column a predicate
        melted = df.melt(id_vars=[id_column], var_name="predicate", value_name="object_value")
        melted = melted.rename(columns={id_column: "subject"})
        melted = melted.dropna(subset=["subject", "predicate", "object_value"])
        melted["subject"] = melted["subject"].astype(str).apply(
            lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
        melted["predicate"] = melted["predicate"].astype(str).apply(
            lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
        melted["source_file"] = config["filename"]
        melted["graph"] = graph_iri
        melted_frames.append(melted)
        print(f"[INFO] {config['filename']} → {melted.shape[0]} melted rows")

    except Exception as e:
        # Best effort: report the failure and carry on with the next file
        print(f"[ERROR] Failed processing {config['filename']} - {e}")


In [None]:
# Read the tab-separated sheet configuration: one row per Excel sheet. The loop below reads the
# columns filename, sheet_name, id_column, skip, spo_already and requires_transposition from it.
xlsx_sheet_configs = pd.read_csv(xlsx_config_path, sep="\t")

In [None]:
# Display the sheet configuration for a visual check before processing.
xlsx_sheet_configs

In [None]:
# Convert every configured Excel sheet into long-form (subject, predicate, object_value) rows
# and append the result to `melted_frames`. Per-sheet behaviour is driven by the config row:
#   skip                    -> sheet is ignored
#   requires_transposition  -> sheet is transposed and its first row used as header
#   spo_already             -> sheet already has exactly subject/predicate/object columns
#   id_column               -> column holding the row ID; "A|B" builds a synthetic ID from several columns
for _, row in xlsx_sheet_configs.iterrows():
    print(row)

    skip = str_to_bool(row.get("skip", False))
    spo_already = str_to_bool(row.get("spo_already", False))
    requires_transposition = str_to_bool(row.get("requires_transposition", False))
    id_column = row.get("id_column")
    composite_columns = None

    if skip:
        continue

    file_path = os.path.join(n4l_data_directory, row["filename"])
    sheet_name = row["sheet_name"]
    # Named graph is derived from the file name and the sheet name
    graph_iri = f"{n4l_prefix}{safe_iri_component(row['filename'].strip())}/{safe_iri_component(sheet_name.strip())}"

    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name, header=None if requires_transposition else 0)

        if requires_transposition:
            df = df.transpose()
            df.columns = df.iloc[0]
            df = df[1:].reset_index(drop=True)

        df.columns = df.columns.map(lambda x: str(x).strip())  # Normalize column names

        if spo_already:
            if df.shape[1] != 3:
                print(
                    f"[ERROR] {row['filename']}:{sheet_name} - Expected 3 columns for SPO format, found {df.shape[1]}")
                continue
            df.columns = ["subject", "predicate", "object_value"]
            df = df.dropna(subset=["subject", "predicate", "object_value"])
            df["subject"] = df["subject"].astype(str).apply(lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
            df["predicate"] = df["predicate"].astype(str).apply(
                lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
            df["source_file"] = row["filename"]
            df["source_sheet"] = sheet_name
            df["graph"] = graph_iri
            melted_frames.append(df)
            print(f"[INFO] {row['filename']}:{sheet_name} (SPO) → {df.shape[0]} rows")
            continue

        # --- Composite ID handling ---
        if isinstance(id_column, str) and "|" in id_column:
            composite_columns = [col.strip() for col in id_column.split("|")]

            normalized_cols = {str(col).strip(): col for col in df.columns}
            missing = [col for col in composite_columns if col not in normalized_cols]
            if missing:
                print(f"[ERROR] {row['filename']}:{sheet_name} - Missing composite ID columns: {missing}")
                continue

            matched = [normalized_cols[col] for col in composite_columns]
            id_column = "_".join(composite_columns)

            # Work out which components are missing BEFORE stringifying: once NaN has
            # become the string "nan", isna() can no longer see it. Blank strings and
            # a literal "nan" (any case) are treated as missing too.
            raw_components = df[matched]
            components = raw_components.apply(lambda col: col.astype(str).str.strip())
            lowered = components.apply(lambda col: col.str.lower())
            component_missing = raw_components.isna() | lowered.isin(["", "nan"])

            # Keep the stripped string values, but restore real NaN for missing cells so
            # they are dropped after melting instead of surfacing as "nan" objects.
            df[matched] = components.mask(component_missing)

            # Log partial composite IDs
            partial_ids = component_missing.any(axis=1) & ~component_missing.all(axis=1)
            if partial_ids.any():
                print(f"[QC] {row['filename']}:{sheet_name} - {partial_ids.sum()} rows with partial composite IDs")

            # Join only the components that are present
            df[id_column] = [
                "_".join(value for value, absent in zip(values, flags) if not absent)
                for values, flags in zip(components.to_numpy(), component_missing.to_numpy())
            ]

            # Drop rows where all components were missing
            blank_ids = component_missing.all(axis=1)
            if blank_ids.any():
                print(f"[QC] {row['filename']}:{sheet_name} - Dropped {blank_ids.sum()} rows with blank synthetic ID")
                df = df[~blank_ids]

            print(f"[INFO] Created synthetic ID column '{id_column}' from: {composite_columns}")

        else:
            if pd.isna(id_column):
                print(f"[ERROR] {row['filename']}:{sheet_name} - ID column is NaN")
                continue

            normalized_cols = {str(col).strip(): col for col in df.columns}
            # Column labels were stringified above, so compare against the string form
            id_key = str(id_column).strip()
            if id_key not in normalized_cols:
                print(
                    f"[ERROR] {row['filename']}:{sheet_name} - ID column '{id_column}' not found. Available columns: {df.columns.tolist()}")
                continue
            id_column = normalized_cols[id_key]

        # Remove exact duplicate rows, counting them so the removal is actually reported
        rows_before = df.shape[0]
        df = df.drop_duplicates()
        removed = rows_before - df.shape[0]
        if removed:
            print(f"[QC] {row['filename']}:{sheet_name} - Removed {removed} fully duplicated rows")

        df = df.dropna(subset=[id_column])

        # Every row sharing an ID with another row is removed, not just the repeats
        duplicated_mask = df[id_column].duplicated(keep=False)
        duplicated_rows = df[duplicated_mask]

        if not duplicated_rows.empty:
            duplicated_ids = df[duplicated_mask][id_column].unique()
            print(
                f"[DUPLICATES] {row['filename']}:{sheet_name} - {len(duplicated_rows)} duplicate rows on '{id_column}' → {list(duplicated_ids)}")
            df = df[~duplicated_mask]
        else:
            print(f"[DUPLICATES] {row['filename']}:{sheet_name} - No duplicates in '{id_column}'")

        # The ID becomes the subject, every other column a predicate
        melted = df.melt(id_vars=[id_column], var_name="predicate", value_name="object_value")
        melted = melted.rename(columns={id_column: "subject"})
        melted = melted.dropna(subset=["subject", "predicate", "object_value"])
        melted["subject"] = melted["subject"].astype(str).apply(
            lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
        melted["predicate"] = melted["predicate"].astype(str).apply(
            lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
        melted["source_file"] = row["filename"]
        melted["source_sheet"] = sheet_name
        melted["graph"] = graph_iri
        melted_frames.append(melted)
        print(f"[INFO] {row['filename']}:{sheet_name} → {melted.shape[0]} melted rows")

    except Exception as e:
        # Best effort: report the failure and carry on with the next sheet
        print(f"[ERROR] Failed processing {row['filename']}:{sheet_name} - {e}")


In [None]:
# Combine all into one frame: every long-form frame collected in `melted_frames`
# is stacked row-wise, and the index is renumbered from 0.
combined_df = pd.concat(melted_frames, ignore_index=True)

In [None]:
# (rows, columns) right after concatenation
combined_df.shape


In [None]:
# Keep only the first occurrence of each fully identical row
repeated_rows = combined_df.duplicated()
combined_df = combined_df[~repeated_rows]

In [None]:
# (rows, columns) after removing duplicate rows
combined_df.shape

In [None]:
# Display the de-duplicated frame for a visual check
combined_df

In [None]:
# Keep only rows in which subject, predicate and object value are all present
has_full_triple = combined_df[["subject", "predicate", "object_value"]].notna().all(axis=1)
combined_df = combined_df[has_full_triple]


In [None]:
# (rows, columns) after discarding incomplete triples
combined_df.shape

In [None]:
# Column names of the combined frame
combined_df.columns

In [None]:
# Display the frame that will be converted to quads
combined_df

In [None]:
# rdflib Dataset that collects the quads (subject, predicate, object, named graph) added below
ds = Dataset()

In [None]:
# Display the predicate normalization table
mapping_df

In [None]:
# Lookup from original predicate to normalized predicate (for repeated keys the last row wins)
predicate_mapping = {
    original: normalized
    for original, normalized in zip(
        mapping_df["original_predicate"], mapping_df["normalized_predicate"], strict=False
    )
}

In [None]:
# Fail fast: check every predicate against the mapping before adding anything, so a
# gap in the mapping is reported immediately instead of part-way through a long load
# that would leave `ds` half-populated.
unmapped_predicates = combined_df.loc[
    ~combined_df["predicate"].isin(list(predicate_mapping)), "predicate"
]
if not unmapped_predicates.empty:
    raise ValueError(f"Predicate not found in mapping: {unmapped_predicates.iloc[0]}")

# Add one quad per row. itertuples yields plain tuples and avoids building a Series
# per row as iterrows does.
# NOTE(review): a blank `normalized_predicate` in the mapping would be passed to URIRef as-is — confirm the mapping has none.
quad_source = combined_df[["subject", "predicate", "object_value", "graph"]]
for subject, predicate, object_value, graph in quad_source.itertuples(index=False, name=None):
    ds.add(
        (
            URIRef(subject),
            URIRef(predicate_mapping[predicate]),  # normalized predicate IRI
            safe_object_term(object_value),        # URIRef for clean http(s) URLs, Literal otherwise
            URIRef(graph),                         # named graph of the source file/sheet
        )
    )

Note: the cell above took about 11 minutes to run.

In [None]:
# ✅ Serialize to N-Quads file
# Create the output directory first so a missing `local/` folder cannot make the
# write fail after all the quads have been built.
nq_out.parent.mkdir(parents=True, exist_ok=True)
ds.serialize(destination=nq_out, format="nquads")

Next step: zip the generated N-Quads file and load it into GraphDB.