In [1]:
import os
import re
from urllib.parse import quote
from urllib.parse import urlparse

import pandas as pd
from rdflib import Dataset, URIRef, Literal

from pathlib import Path

In [2]:
# Handle both interactive and CLI (e.g., papermill) execution
notebook_path = Path(__file__).parent if "__file__" in globals() else Path.cwd()

# Project root is the parent of `metpo/`, even though the notebook lives in `metpo/`
project_root = notebook_path if (notebook_path / "assets").is_dir() else notebook_path.parent
assets_dir = project_root / "assets"

In [3]:
n4l_data_directory = assets_dir / "N4L_phenotypic_ontology_2016"

In [4]:
xlsx_config_path = assets_dir / "n4l-xlsx-parsing-config.tsv"

In [5]:
predicate_mapping_normalization_file = assets_dir / "n4l_predicate_mapping_normalization.csv"

In [6]:
nq_out = project_root / "local" / "n4l-tables.nq"

In [7]:
n4l_prefix = "http://example.com/n4l/"

In [8]:
mapping_df = pd.read_csv(predicate_mapping_normalization_file)

In [9]:
delimited_text_configs = [
    {
        "filename": "N4L_Taxonomy_20220802.tsv",
        "path": f"{n4l_data_directory}/N4L_Taxonomy_20220802.tsv",
        "id_column": "N4LID",
        "delimiter": "\t"
    },
    {
        "filename": "N4L_Taxonomy_20220802_pruned.tsv",
        "path": f"{n4l_data_directory}/N4L_Taxonomy_20220802_pruned.tsv",
        "id_column": "N4LID",
        "delimiter": "\t"
    },
    {
        "filename": "reference_id_mapping.csv",
        "path": f"{n4l_data_directory}/reference_id_mapping.csv",
        "id_column": "refid",
        "delimiter": ","
    }
]

In [10]:
def safe_iri_component(value: str) -> str:
    # Replace all whitespace characters and colons with underscores
    cleaned = re.sub(r'[\s:]+', '_', value.strip())
    # Percent-encode everything else, but preserve underscores
    return quote(cleaned, safe='_')

In [11]:
def str_to_bool(val):
    """Convert common string values to boolean."""
    if pd.isna(val):
        return False
    return str(val).strip().lower() in {"1", "true", "yes", "y", "t"}

In [12]:
def process_transposed_sheet(df, id_column):
    df = df.transpose()
    df.columns = df.iloc[0]
    df = df[1:].reset_index(drop=True)

    if id_column not in df.columns:
        raise ValueError(f"'{id_column}' not found in transposed headers")

    df = df.dropna(subset=[id_column])
    melted = df.melt(id_vars=[id_column], var_name="predicate", value_name="object_value")
    return melted

In [13]:
def process_standard_sheet(df, id_column):
    if id_column not in df.columns:
        raise ValueError(f"'{id_column}' not found in standard headers")

    df = df.dropna(subset=[id_column])
    melted = df.melt(id_vars=[id_column], var_name="predicate", value_name="object_value")
    return melted

In [14]:
def safe_object_term(val):
    if pd.isna(val):
        return None
    val = str(val).strip()
    parsed = urlparse(val)
    if parsed.scheme in ("http", "https") and parsed.netloc and " " not in val:
        try:
            return URIRef(val)  # Only if it's really URI-safe
        except:
            pass
    return Literal(val)

In [15]:
melted_frames = []
melted_dropped_frames = []

In [16]:
for config in delimited_text_configs:
    try:
        df = pd.read_csv(config["path"], sep=config["delimiter"], low_memory=False)

        graph_iri = f"{n4l_prefix}{safe_iri_component(config['filename'].strip())}"

        # Remove exact duplicates before anything else
        before = df.shape[0]
        df = df.drop_duplicates()
        after = df.shape[0]
        if after < before:
            print(f"[QC] {config['filename']} - Removed {before - after} fully duplicated rows")

        id_column = config["id_column"]
        if id_column not in df.columns:
            print(
                f"[ERROR] {config['filename']} - ID column '{id_column}' not found. Available columns: {df.columns.tolist()}")
            continue

        # Drop and log missing ID rows
        missing_id_rows = df[df[id_column].isna()]
        if not missing_id_rows.empty:
            print(f"[QC] {config['filename']} - Dropped {len(missing_id_rows)} rows missing '{id_column}'")
            melted_missing = missing_id_rows.melt(var_name="predicate", value_name="object_value")
            melted_missing["subject"] = None
            melted_missing["source_file"] = config["filename"]
            melted_missing["drop_reason"] = "missing_id"
            melted_missing["graph"] = graph_iri
            melted_dropped_frames.append(melted_missing)

        df = df.dropna(subset=[id_column])

        # Drop and log duplicated IDs
        duplicated_mask = df[id_column].duplicated(keep=False)
        if duplicated_mask.any():
            duplicated_ids = df.loc[duplicated_mask, id_column].unique()
            print(
                f"[DUPLICATES] {config['filename']} - {duplicated_mask.sum()} duplicate rows on '{id_column}' → {duplicated_ids.tolist()}")
            melted_dupes = df.loc[duplicated_mask].melt(var_name="predicate", value_name="object_value")
            melted_dupes["subject"] = df.loc[duplicated_mask, id_column].values.repeat(len(df.columns) - 1)
            melted_dupes["source_file"] = config["filename"]
            melted_dupes["drop_reason"] = "duplicate_id"
            melted_dupes["graph"] = graph_iri
            melted_dropped_frames.append(melted_dupes)
            df = df[~duplicated_mask]
        else:
            print(f"[DUPLICATES] {config['filename']} - No duplicates in '{id_column}'")

        # Melt and append
        melted = df.melt(id_vars=[id_column], var_name="predicate", value_name="object_value")
        melted = melted.rename(columns={id_column: "subject"})
        melted = melted.dropna(subset=["subject", "predicate", "object_value"])
        melted["subject"] = melted["subject"].astype(str).apply(
            lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
        melted["predicate"] = melted["predicate"].astype(str).apply(
            lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
        melted["source_file"] = config["filename"]
        melted["graph"] = graph_iri
        melted_frames.append(melted)
        print(f"[INFO] {config['filename']} → {melted.shape[0]} melted rows")

    except Exception as e:
        print(f"[ERROR] Failed processing {config['filename']} - {e}")


[DUPLICATES] N4L_Taxonomy_20220802.tsv - No duplicates in 'N4LID'


[INFO] N4L_Taxonomy_20220802.tsv → 1273296 melted rows


[DUPLICATES] N4L_Taxonomy_20220802_pruned.tsv - No duplicates in 'N4LID'


[INFO] N4L_Taxonomy_20220802_pruned.tsv → 957331 melted rows
[DUPLICATES] reference_id_mapping.csv - No duplicates in 'refid'


[INFO] reference_id_mapping.csv → 64272 melted rows


In [17]:
xlsx_sheet_configs = pd.read_csv(xlsx_config_path, sep="\t")

In [18]:
xlsx_sheet_configs

Unnamed: 0,filename,sheet_name,id_column,skip,requires_transposition,spo_already,note
0,article_download_status_20161222.xlsx,all_protologs,UID,False,False,,UID style: rid.10014_nm.6832
1,article_download_status_20161222.xlsx,parsed_protologs,UID,False,False,,
2,complete.term.book_07.18.2013_CTP.xlsx,Sheet1,Term|class,False,False,,
3,complete.term.book_07.18.2013_CTP.xlsx,Sheet2,,True,False,,blank
4,complete.term.book_07.18.2013_CTP.xlsx,Sheet3,,True,False,,blank
5,media_normalized_20130916.xlsx,chemical_exemplar,chemical_exemplar.id,False,False,,
6,media_normalized_20130916.xlsx,chemical_name,chemical_name.id,False,False,,
7,media_normalized_20130916.xlsx,chemical_taxon,chemical_taxon.id,False,False,,
8,media_normalized_20130916.xlsx,chemical_taxon_exemplar,chemical_taxon.id,False,False,,
9,media_normalized_20130916.xlsx,ingredient,substance_id,False,False,,(inconsistent) Excel formulae in substance_nam...


In [19]:
for _, row in xlsx_sheet_configs.iterrows():
    print(row)

    skip = str_to_bool(row.get("skip", False))
    spo_already = str_to_bool(row.get("spo_already", False))
    requires_transposition = str_to_bool(row.get("requires_transposition", False))
    id_column = row.get("id_column")
    composite_columns = None

    if skip:
        continue

    file_path = os.path.join(n4l_data_directory, row["filename"])
    sheet_name = row["sheet_name"]
    graph_iri = f"{n4l_prefix}{safe_iri_component(row['filename'].strip())}/{safe_iri_component(sheet_name.strip())}"

    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name, header=None if requires_transposition else 0)

        if requires_transposition:
            df = df.transpose()
            df.columns = df.iloc[0]
            df = df[1:].reset_index(drop=True)

        df.columns = df.columns.map(lambda x: str(x).strip())  # Normalize column names

        if spo_already:
            if df.shape[1] != 3:
                print(
                    f"[ERROR] {row['filename']}:{sheet_name} - Expected 3 columns for SPO format, found {df.shape[1]}")
                continue
            df.columns = ["subject", "predicate", "object_value"]
            df = df.dropna(subset=["subject", "predicate", "object_value"])
            df["subject"] = df["subject"].astype(str).apply(lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
            df["predicate"] = df["predicate"].astype(str).apply(
                lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
            df["source_file"] = row["filename"]
            df["source_sheet"] = sheet_name
            df["graph"] = graph_iri
            melted_frames.append(df)
            print(f"[INFO] {row['filename']}:{sheet_name} (SPO) → {df.shape[0]} rows")
            continue

        # --- Composite ID handling (improved) ---
        if isinstance(id_column, str) and "|" in id_column:
            composite_columns = [col.strip() for col in id_column.split("|")]

            normalized_cols = {str(col).strip(): col for col in df.columns}
            missing = [col for col in composite_columns if col not in normalized_cols]
            if missing:
                print(f"[ERROR] {row['filename']}:{sheet_name} - Missing composite ID columns: {missing}")
                continue

            matched = [normalized_cols[col] for col in composite_columns]
            id_column = "_".join(composite_columns)

            df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)

            # Log partial composite IDs
            partial_ids = df[matched].isna().any(axis=1) & ~df[matched].isna().all(axis=1)
            if partial_ids.any():
                print(f"[QC] {row['filename']}:{sheet_name} - {partial_ids.sum()} rows with partial composite IDs")

            # Join only valid components
            def safe_join(vals):
                return "_".join([str(v).strip() for v in vals if v and str(v).strip().lower() != "nan"])

            df[id_column] = df[matched].agg(safe_join, axis=1)

            # Drop rows where all components were missing
            blank_ids = df[matched].isna().all(axis=1)
            if blank_ids.any():
                print(f"[QC] {row['filename']}:{sheet_name} - Dropped {blank_ids.sum()} rows with blank synthetic ID")
                df = df[~blank_ids]

            print(f"[INFO] Created synthetic ID column '{id_column}' from: {composite_columns}")

        else:
            if pd.isna(id_column):
                print(f"[ERROR] {row['filename']}:{sheet_name} - ID column is NaN")
                continue

            normalized_cols = {str(col).strip(): col for col in df.columns}
            if id_column.strip() not in normalized_cols:
                print(
                    f"[ERROR] {row['filename']}:{sheet_name} - ID column '{id_column}' not found. Available columns: {df.columns.tolist()}")
                continue
            id_column = normalized_cols[id_column.strip()]

        df = df.drop_duplicates()
        df = df.dropna(subset=[id_column])

        full_dupes = df.duplicated()
        if full_dupes.any():
            print(f"[QC] {row['filename']}:{sheet_name} - Removed {full_dupes.sum()} fully duplicated rows")
            df = df[~full_dupes]

        duplicated_mask = df[id_column].duplicated(keep=False)
        duplicated_rows = df[duplicated_mask]

        if not duplicated_rows.empty:
            duplicated_ids = df[duplicated_mask][id_column].unique()
            print(
                f"[DUPLICATES] {row['filename']}:{sheet_name} - {len(duplicated_rows)} duplicate rows on '{id_column}' → {list(duplicated_ids)}")
            df = df[~duplicated_mask]
        else:
            print(f"[DUPLICATES] {row['filename']}:{sheet_name} - No duplicates in '{id_column}'")

        melted = df.melt(id_vars=[id_column], var_name="predicate", value_name="object_value")
        melted = melted.rename(columns={id_column: "subject"})
        melted = melted.dropna(subset=["subject", "predicate", "object_value"])
        melted["subject"] = melted["subject"].astype(str).apply(
            lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
        melted["predicate"] = melted["predicate"].astype(str).apply(
            lambda x: f"{n4l_prefix}{safe_iri_component(x.strip())}")
        melted["source_file"] = row["filename"]
        melted["source_sheet"] = sheet_name
        melted["graph"] = graph_iri
        melted_frames.append(melted)
        print(f"[INFO] {row['filename']}:{sheet_name} → {melted.shape[0]} melted rows")

    except Exception as e:
        print(f"[ERROR] Failed processing {row['filename']}:{sheet_name} - {e}")


filename                  article_download_status_20161222.xlsx
sheet_name                                        all_protologs
id_column                                                   UID
skip                                                      False
requires_transposition                                    False
spo_already                                                 NaN
note                               UID style: rid.10014_nm.6832
Name: 0, dtype: object


[DUPLICATES] article_download_status_20161222.xlsx:all_protologs - No duplicates in 'UID'


[INFO] article_download_status_20161222.xlsx:all_protologs → 58601 melted rows
filename                  article_download_status_20161222.xlsx
sheet_name                                     parsed_protologs
id_column                                                   UID
skip                                                      False
requires_transposition                                    False
spo_already                                                 NaN
note                                                        NaN
Name: 1, dtype: object


[DUPLICATES] article_download_status_20161222.xlsx:parsed_protologs - No duplicates in 'UID'
[INFO] article_download_status_20161222.xlsx:parsed_protologs → 39440 melted rows
filename                  complete.term.book_07.18.2013_CTP.xlsx
sheet_name                                                Sheet1
id_column                                             Term|class
skip                                                       False
requires_transposition                                     False
spo_already                                                  NaN
note                                                         NaN
Name: 2, dtype: object


  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)


[INFO] Created synthetic ID column 'Term_class' from: ['Term', 'class']
[DUPLICATES] complete.term.book_07.18.2013_CTP.xlsx:Sheet1 - No duplicates in 'Term_class'
[INFO] complete.term.book_07.18.2013_CTP.xlsx:Sheet1 → 4309 melted rows
filename                  complete.term.book_07.18.2013_CTP.xlsx
sheet_name                                                Sheet2
id_column                                                    NaN
skip                                                        True
requires_transposition                                     False
spo_already                                                  NaN
note                                                       blank
Name: 3, dtype: object
filename                  complete.term.book_07.18.2013_CTP.xlsx
sheet_name                                                Sheet3
id_column                                                    NaN
skip                                                        True
requires_transposition     

[DUPLICATES] media_normalized_20130916.xlsx:chemical_name - No duplicates in 'chemical_name.id'
[INFO] media_normalized_20130916.xlsx:chemical_name → 258 melted rows
filename                  media_normalized_20130916.xlsx
sheet_name                                chemical_taxon
id_column                              chemical_taxon.id
skip                                               False
requires_transposition                             False
spo_already                                          NaN
note                                                 NaN
Name: 7, dtype: object
[DUPLICATES] media_normalized_20130916.xlsx:chemical_taxon - No duplicates in 'chemical_taxon.id'
[INFO] media_normalized_20130916.xlsx:chemical_taxon → 114 melted rows
filename                  media_normalized_20130916.xlsx
sheet_name                       chemical_taxon_exemplar
id_column                              chemical_taxon.id
skip                                               False
requires_transp

[DUPLICATES] media_normalized_20130916.xlsx:ingredient - 920 duplicate rows on 'substance_id' → ['CHEBI:17634', 'N4L.m_p:00001', 'CHEBI:26710', 'N4L.m_p:00004', 'N4L.m_p:00002', 'N4L.m_p:00003', 'N4L.m_p:00005', 'CHEBI:17716', 'CHEBI:17992', 'N4L.m_p:00008', 'CHEBI:31991', 'N4L.m_p:00006', 'N4L.m_p:00021', 'N4L.m_p:00009', 'CHEBI:50144', 'CHEBI:32599', 'N4L.m_p:00010', 'N4L.m_p:00011', 'N4L.m_p:00012', 'N4L.m_p:00013', 'CHEBI:63005', 'CHEBI:32588', 'CHEBI:31795', 'N4L.m_p:00014', 'N4L.m_p:00015', 'CHEBI:32954', 'CHEBI:53426', 'CHEBI:53258', 'N4L.m_p:00019', 'N4L.m_p:00020', 'N4L.m_p:00030', 'N4L.m_p:00022', 'CHEBI:3311', 'CHEBI:62946', 'CHEBI:63041', 'CHEBI:32312', 'CHEBI:34683', 'CHEBI:63036', 'N4L.m_p:00023', 'N4L.m_p:00040', 'CHEBI:16899', 'N4L.m_p:00031', 'CHEBI:31604', 'N4L.m_p:00045', 'CHEBI:22653', 'N4L.m_p:00024', 'CHEBI:72449', 'CHEBI:17754', 'N4L.m_p:00054', 'CHEBI:16015', 'CHEBI:31440', 'CHEBI:16709', 'CHEBI:15956', 'CHEBI:3312', '10.1601/mixture.exemplar.43', '10.1601/mixtu

[INFO] media_normalized_20130916.xlsx:medium_features (SPO) → 627 rows
filename                  media_normalized_20130916.xlsx
sheet_name                                   medium_name
id_column                              substance_name.id
skip                                               False
requires_transposition                             False
spo_already                                          NaN
note                                                 NaN
Name: 12, dtype: object
[DUPLICATES] media_normalized_20130916.xlsx:medium_name - No duplicates in 'substance_name.id'
[INFO] media_normalized_20130916.xlsx:medium_name → 989 melted rows
filename                  media_normalized_20130916.xlsx
sheet_name                                  medium_taxon
id_column                             substance_taxon.id
skip                                               False
requires_transposition                             False
spo_already                                          NaN
n

[DUPLICATES] media_normalized_20130916.xlsx:medium_taxon - No duplicates in 'substance_taxon.id'
[INFO] media_normalized_20130916.xlsx:medium_taxon → 362 melted rows
filename                  media_normalized_20130916.xlsx
sheet_name                         medium_taxon_exemplar
id_column                                       taxon_id
skip                                               False
requires_transposition                             False
spo_already                                          NaN
note                                                 NaN
Name: 14, dtype: object
[ERROR] media_normalized_20130916.xlsx:medium_taxon_exemplar - ID column 'taxon_id' not found. Available columns: ['medium_taxon_exemplar.taxon_id', 'medium_taxon_exemplar.exemplar_id']
filename                                     media_normalized_20130916.xlsx
sheet_name                                                  medium_taxonomy
id_column                 substance_taxonomy.parent_taxon_id|substance_t.

  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)


[INFO] Created synthetic ID column 'substance_taxonomy.parent_taxon_id_substance_taxonomy.child_taxon_id' from: ['substance_taxonomy.parent_taxon_id', 'substance_taxonomy.child_taxon_id']
[DUPLICATES] media_normalized_20130916.xlsx:medium_taxonomy - No duplicates in 'substance_taxonomy.parent_taxon_id_substance_taxonomy.child_taxon_id'
[INFO] media_normalized_20130916.xlsx:medium_taxonomy → 722 melted rows
filename                  media_normalized_20130916.xlsx
sheet_name                          medium_taxonomy_full
id_column                              substance_name.id
skip                                               False
requires_transposition                             False
spo_already                                          NaN
note                                                 NaN
Name: 16, dtype: object
[DUPLICATES] media_normalized_20130916.xlsx:medium_taxonomy_full - No duplicates in 'substance_name.id'
[INFO] media_normalized_20130916.xlsx:medium_taxonomy_full → 22

[DUPLICATES] media_normalized_20130916.xlsx:mixture_exemplar - No duplicates in 'exemplar_id'
[INFO] media_normalized_20130916.xlsx:mixture_exemplar → 99 melted rows
filename                  media_normalized_20130916.xlsx
sheet_name                            mixture_ingredient
id_column                  ingredient.list_id|sort_order
skip                                               False
requires_transposition                             False
spo_already                                          NaN
note                                                 NaN
Name: 18, dtype: object
[INFO] Created synthetic ID column 'ingredient.list_id_sort_order' from: ['ingredient.list_id', 'sort_order']
[DUPLICATES] media_normalized_20130916.xlsx:mixture_ingredient - No duplicates in 'ingredient.list_id_sort_order'
[INFO] media_normalized_20130916.xlsx:mixture_ingredient → 527 melted rows
filename                  media_normalized_20130916.xlsx
sheet_name                                  mixture_nam

  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)


[DUPLICATES] media_normalized_20130916.xlsx:mixture_taxon - No duplicates in 'substance_taxon.id'
[INFO] media_normalized_20130916.xlsx:mixture_taxon → 53 melted rows
filename                  media_normalized_20130916.xlsx
sheet_name                        mixture_taxon_exemplar
id_column                                       taxon_id
skip                                               False
requires_transposition                             False
spo_already                                          NaN
note                                                 NaN
Name: 21, dtype: object
[DUPLICATES] media_normalized_20130916.xlsx:mixture_taxon_exemplar - No duplicates in 'taxon_id'
[INFO] media_normalized_20130916.xlsx:mixture_taxon_exemplar → 36 melted rows
filename                                     media_normalized_20130916.xlsx
sheet_name                                                 mixture_taxonomy
id_column                 substance_taxonomy.parent|substance_taxonomy.c...
skip   

  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)
  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)


[INFO] Created synthetic ID column 'substance_taxonomy.parent_substance_taxonomy.child' from: ['substance_taxonomy.parent', 'substance_taxonomy.child']
[DUPLICATES] media_normalized_20130916.xlsx:mixture_taxonomy - No duplicates in 'substance_taxonomy.parent_substance_taxonomy.child'
[INFO] media_normalized_20130916.xlsx:mixture_taxonomy → 106 melted rows
filename                      media_normalized_20130916.xlsx
sheet_name                                           product
id_column                 substance_name.id|vendor|productId
skip                                                   False
requires_transposition                                 False
spo_already                                              NaN
note                                                     NaN
Name: 23, dtype: object
[INFO] Created synthetic ID column 'substance_name.id_vendor_productId' from: ['substance_name.id', 'vendor', 'productId']
[DUPLICATES] media_normalized_20130916.xlsx:product - No duplicates 

[DUPLICATES] media_normalized_20130916.xlsx:substance_exemplar_type - No duplicates in 'substance_exemplar_feature_type.type'
[INFO] media_normalized_20130916.xlsx:substance_exemplar_type → 4 melted rows
filename                  media_normalized_20130916.xlsx
sheet_name                                          unit
id_column                                             id
skip                                               False
requires_transposition                             False
spo_already                                          NaN
note                                                 NaN
Name: 27, dtype: object
[DUPLICATES] media_normalized_20130916.xlsx:unit - No duplicates in 'id'
[INFO] media_normalized_20130916.xlsx:unit → 36 melted rows
filename                  media_normalized_20130916.xlsx
sheet_name                                        vendor
id_column                                         abbrev
skip                                               False
requires_tra

[DUPLICATES] media_normalized_20130916.xlsx:vendor - No duplicates in 'abbrev'
[INFO] media_normalized_20130916.xlsx:vendor → 69 melted rows
filename                  N4L_ID_to_NCBI_mappings.xlsx
sheet_name                          N4L_NM.ID_to_EX.ID
id_column                                   Name N4LID
skip                                             False
requires_transposition                           False
spo_already                                        NaN
note                                               NaN
Name: 29, dtype: object


[DUPLICATES] N4L_ID_to_NCBI_mappings.xlsx:N4L_NM.ID_to_EX.ID - No duplicates in 'Name N4LID'


[INFO] N4L_ID_to_NCBI_mappings.xlsx:N4L_NM.ID_to_EX.ID → 279917 melted rows
filename                  N4L_ID_to_NCBI_mappings.xlsx
sheet_name                     N4L_NM.ID_to_NCBI_TaxID
id_column                                   Name N4LID
skip                                             False
requires_transposition                           False
spo_already                                        NaN
note                                               NaN
Name: 30, dtype: object


[DUPLICATES] N4L_ID_to_NCBI_mappings.xlsx:N4L_NM.ID_to_NCBI_TaxID - No duplicates in 'Name N4LID'


[INFO] N4L_ID_to_NCBI_mappings.xlsx:N4L_NM.ID_to_NCBI_TaxID → 91363 melted rows
filename                  N4L_ID_to_NCBI_mappings.xlsx
sheet_name                         N4L_REF.ID_to_DOCID
id_column                                        refid
skip                                             False
requires_transposition                           False
spo_already                                        NaN
note                                               NaN
Name: 31, dtype: object


[DUPLICATES] N4L_ID_to_NCBI_mappings.xlsx:N4L_REF.ID_to_DOCID - No duplicates in 'refid'


[INFO] N4L_ID_to_NCBI_mappings.xlsx:N4L_REF.ID_to_DOCID → 64272 melted rows
filename                  N4L_Taxonomy_20220802_pruned.xlsx
sheet_name                     N4L_Taxonomy_20220802_pruned
id_column                                             N4LID
skip                                                  False
requires_transposition                                False
spo_already                                             NaN
note                                                    NaN
Name: 32, dtype: object


[DUPLICATES] N4L_Taxonomy_20220802_pruned.xlsx:N4L_Taxonomy_20220802_pruned - No duplicates in 'N4LID'


[INFO] N4L_Taxonomy_20220802_pruned.xlsx:N4L_Taxonomy_20220802_pruned → 957331 melted rows
filename                      N4L_Taxonomy_20220802.xlsx
sheet_name                N4L_Taxonomy_20220802_complete
id_column                                          N4LID
skip                                               False
requires_transposition                             False
spo_already                                          NaN
note                                                 NaN
Name: 33, dtype: object


[DUPLICATES] N4L_Taxonomy_20220802.xlsx:N4L_Taxonomy_20220802_complete - No duplicates in 'N4LID'


[INFO] N4L_Taxonomy_20220802.xlsx:N4L_Taxonomy_20220802_complete → 1273296 melted rows
filename                    N4L_Taxonomy_20220802.xlsx
sheet_name                N4L_Taxonomy_20220802_pruned
id_column                                        N4LID
skip                                             False
requires_transposition                           False
spo_already                                        NaN
note                                               NaN
Name: 34, dtype: object


[DUPLICATES] N4L_Taxonomy_20220802.xlsx:N4L_Taxonomy_20220802_pruned - No duplicates in 'N4LID'


[INFO] N4L_Taxonomy_20220802.xlsx:N4L_Taxonomy_20220802_pruned → 957331 melted rows
filename                  protolog_normalization_categories_with_1000_DB...
sheet_name                                                  1000_proto_proj
id_column                                           rid (effective)|name.id
skip                                                                  False
requires_transposition                                                 True
spo_already                                                             NaN
note                                                                    NaN
Name: 35, dtype: object


  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)


[INFO] Created synthetic ID column 'rid (effective)_name.id' from: ['rid (effective)', 'name.id']
[DUPLICATES] protolog_normalization_categories_with_1000_DB.xlsx:1000_proto_proj - 324 duplicate rows on 'rid (effective)_name.id' → ['']
[INFO] protolog_normalization_categories_with_1000_DB.xlsx:1000_proto_proj → 8856 melted rows
filename                  protolog_normalization_categories_with_1000_DB...
sheet_name                                                           Sheet1
id_column                                                               NaN
skip                                                                   True
requires_transposition                                                False
spo_already                                                             NaN
note                                    Two-level class hierarchy of traits
Name: 36, dtype: object
filename                  protolog_normalization_categories_with_1000_DB...
sheet_name                            

  warn(msg)
  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)


[INFO] Created synthetic ID column 'rid_name.id' from: ['rid', 'name.id']
[DUPLICATES] protolog_normalization_categories_with_1000_DB.xlsx:Sheet2 - 2 duplicate rows on 'rid_name.id' → ['rid.2332_RID.2332 many OCR errors']
[INFO] protolog_normalization_categories_with_1000_DB.xlsx:Sheet2 → 20948 melted rows
filename                  protolog_normalization_categories_with_1000_DB...
sheet_name                                                           Sheet3
id_column                                                       rid|name.id
skip                                                                  False
requires_transposition                                                 True
spo_already                                                             NaN
note                                                                    NaN
Name: 38, dtype: object


  warn(msg)
  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)


[INFO] Created synthetic ID column 'rid_name.id' from: ['rid', 'name.id']
[DUPLICATES] protolog_normalization_categories_with_1000_DB.xlsx:Sheet3 - No duplicates in 'rid_name.id'
[INFO] protolog_normalization_categories_with_1000_DB.xlsx:Sheet3 → 5425 melted rows
filename                  protolog_normalization_categories_with_1000_KM...
sheet_name                                                  1000_proto_proj
id_column                                rid (effective/emendation)|name.id
skip                                                                  False
requires_transposition                                                 True
spo_already                                                             NaN
note                                                                    NaN
Name: 39, dtype: object


  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)


[INFO] Created synthetic ID column 'rid (effective/emendation)_name.id' from: ['rid (effective/emendation)', 'name.id']
[DUPLICATES] protolog_normalization_categories_with_1000_KMP.xlsx:1000_proto_proj - 15 duplicate rows on 'rid (effective/emendation)_name.id' → ['', 'rid.3042_nm.4682', 'rid.15377_nm.207', 'rid.7454_nm.209']


[INFO] protolog_normalization_categories_with_1000_KMP.xlsx:1000_proto_proj → 33064 melted rows
filename                  protolog_normalization_categories_with_1000_KM...
sheet_name                                                  All(Sheet1,2,3)
id_column                                                       rid|name.id
skip                                                                  False
requires_transposition                                                 True
spo_already                                                             NaN
note                                                                    NaN
Name: 40, dtype: object


  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)


[INFO] Created synthetic ID column 'rid_name.id' from: ['rid', 'name.id']
[DUPLICATES] protolog_normalization_categories_with_1000_KMP.xlsx:All(Sheet1,2,3) - 31 duplicate rows on 'rid_name.id' → ['rid.2332_RID.2332 many OCR errors', 'rid.2351_nm.2812', 'rid.2584_nm.10201', 'rid.2692_nm.4564', 'rid.2809_nm.9168', 'rid.2868_nm.9103', 'rid.2981_nm.1983', 'rid.3042_nm.4682', '', 'rid.3107_nm.7957', 'rid.15377_nm.207', 'rid.7454_nm.209']


[INFO] protolog_normalization_categories_with_1000_KMP.xlsx:All(Sheet1,2,3) → 57109 melted rows
filename                  protolog_normalization_categories_with_1000_KM...
sheet_name                                     EffectRIDProtos(rid.2300 up)
id_column                                                       rid|name.id
skip                                                                  False
requires_transposition                                                 True
spo_already                                                             NaN
note                                                                    NaN
Name: 41, dtype: object


  warn(msg)
  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)


[INFO] Created synthetic ID column 'rid_name.id' from: ['rid', 'name.id']
[DUPLICATES] protolog_normalization_categories_with_1000_KMP.xlsx:EffectRIDProtos(rid.2300 up) - 2 duplicate rows on 'rid_name.id' → ['rid.2332_RID.2332 many OCR errors']
[INFO] protolog_normalization_categories_with_1000_KMP.xlsx:EffectRIDProtos(rid.2300 up) → 20945 melted rows
filename                  protolog_normalization_categories_with_1000_KM...
sheet_name                                                            Notes
id_column                                                               NaN
skip                                                                   True
requires_transposition                                                False
spo_already                                                             NaN
note                                                             skip notes
Name: 42, dtype: object
filename                  protolog_normalization_categories_with_1000_KM...
sheet_name    

[INFO] Created synthetic ID column 'rid_name.id' from: ['rid', 'name.id']
[DUPLICATES] protolog_normalization_categories_with_1000_KMP.xlsx:Sheet2 - No duplicates in 'rid_name.id'
[INFO] protolog_normalization_categories_with_1000_KMP.xlsx:Sheet2 → 5406 melted rows


  warn(msg)
  df[matched] = df[matched].astype(str).applymap(lambda v: v.strip() if isinstance(v, str) else v)


In [20]:
# Combine all into one frame
combined_df = pd.concat(melted_frames, ignore_index=True)

In [21]:
combined_df.shape


(6180621, 6)

In [22]:
combined_df = combined_df.drop_duplicates()

In [23]:
combined_df.shape

(6180612, 6)

In [24]:
combined_df

Unnamed: 0,subject,predicate,object_value,source_file,graph,source_sheet
0,http://example.com/n4l/nm.0,http://example.com/n4l/index,1,N4L_Taxonomy_20220802.tsv,http://example.com/n4l/N4L_Taxonomy_20220802.tsv,
1,http://example.com/n4l/nm.1,http://example.com/n4l/index,2,N4L_Taxonomy_20220802.tsv,http://example.com/n4l/N4L_Taxonomy_20220802.tsv,
2,http://example.com/n4l/nm.2,http://example.com/n4l/index,3,N4L_Taxonomy_20220802.tsv,http://example.com/n4l/N4L_Taxonomy_20220802.tsv,
3,http://example.com/n4l/nm.31636,http://example.com/n4l/index,24009,N4L_Taxonomy_20220802.tsv,http://example.com/n4l/N4L_Taxonomy_20220802.tsv,
4,http://example.com/n4l/nm.3,http://example.com/n4l/index,4,N4L_Taxonomy_20220802.tsv,http://example.com/n4l/N4L_Taxonomy_20220802.tsv,
...,...,...,...,...,...,...
6180616,http://example.com/n4l/nm.8017,http://example.com/n4l/note,The descriptions of Murinilabilia sulmonicolor...,protolog_normalization_categories_with_1000_KM...,http://example.com/n4l/protolog_normalization_...,Sheet2
6180617,http://example.com/n4l/nm.2199,http://example.com/n4l/note,The description of Acidithiobacillus thiooxida...,protolog_normalization_categories_with_1000_KM...,http://example.com/n4l/protolog_normalization_...,Sheet2
6180618,http://example.com/n4l/nm.5724,http://example.com/n4l/note,In addition to the description of the genus,protolog_normalization_categories_with_1000_KM...,http://example.com/n4l/protolog_normalization_...,Sheet2
6180619,http://example.com/n4l/rid.4142_nm.1005,http://example.com/n4l/note,no true protolog in this reference,protolog_normalization_categories_with_1000_KM...,http://example.com/n4l/protolog_normalization_...,Sheet2


In [25]:
combined_df = combined_df.dropna(subset=["subject", "predicate", "object_value"])


In [26]:
combined_df.shape

(6180612, 6)

In [27]:
combined_df.columns

Index(['subject', 'predicate', 'object_value', 'source_file', 'graph',
       'source_sheet'],
      dtype='object')

In [28]:
combined_df

Unnamed: 0,subject,predicate,object_value,source_file,graph,source_sheet
0,http://example.com/n4l/nm.0,http://example.com/n4l/index,1,N4L_Taxonomy_20220802.tsv,http://example.com/n4l/N4L_Taxonomy_20220802.tsv,
1,http://example.com/n4l/nm.1,http://example.com/n4l/index,2,N4L_Taxonomy_20220802.tsv,http://example.com/n4l/N4L_Taxonomy_20220802.tsv,
2,http://example.com/n4l/nm.2,http://example.com/n4l/index,3,N4L_Taxonomy_20220802.tsv,http://example.com/n4l/N4L_Taxonomy_20220802.tsv,
3,http://example.com/n4l/nm.31636,http://example.com/n4l/index,24009,N4L_Taxonomy_20220802.tsv,http://example.com/n4l/N4L_Taxonomy_20220802.tsv,
4,http://example.com/n4l/nm.3,http://example.com/n4l/index,4,N4L_Taxonomy_20220802.tsv,http://example.com/n4l/N4L_Taxonomy_20220802.tsv,
...,...,...,...,...,...,...
6180616,http://example.com/n4l/nm.8017,http://example.com/n4l/note,The descriptions of Murinilabilia sulmonicolor...,protolog_normalization_categories_with_1000_KM...,http://example.com/n4l/protolog_normalization_...,Sheet2
6180617,http://example.com/n4l/nm.2199,http://example.com/n4l/note,The description of Acidithiobacillus thiooxida...,protolog_normalization_categories_with_1000_KM...,http://example.com/n4l/protolog_normalization_...,Sheet2
6180618,http://example.com/n4l/nm.5724,http://example.com/n4l/note,In addition to the description of the genus,protolog_normalization_categories_with_1000_KM...,http://example.com/n4l/protolog_normalization_...,Sheet2
6180619,http://example.com/n4l/rid.4142_nm.1005,http://example.com/n4l/note,no true protolog in this reference,protolog_normalization_categories_with_1000_KM...,http://example.com/n4l/protolog_normalization_...,Sheet2


In [29]:
ds = Dataset()

In [30]:
mapping_df

Unnamed: 0,original_predicate,normalized_predicate
0,http://example.com/n4l/per_volume_units,http://example.com/n4l/per_volume_units
1,http://example.com/n4l/preparation,http://example.com/n4l/preparation
2,http://example.com/n4l/per_volume_amount,http://example.com/n4l/per_volume_amount
3,http://example.com/n4l/chemical_taxon.rank,http://example.com/n4l/chemical_taxon_rank
4,http://example.com/n4l/reference,http://example.com/n4l/reference
...,...,...
282,http://example.com/n4l/eponym,http://example.com/n4l/eponym
283,http://example.com/n4l/see_also,http://example.com/n4l/see_also
284,http://example.com/n4l/Prescott,http://example.com/n4l/prescott
285,http://example.com/n4l/eponym_meaning,http://example.com/n4l/eponym_meaning


In [31]:
predicate_mapping = dict(zip(mapping_df["original_predicate"], mapping_df["normalized_predicate"]))

In [32]:
for _, row in combined_df.iterrows():
    subj = URIRef(row["subject"])

    if row["predicate"] not in predicate_mapping:
        raise ValueError(f"Predicate not found in mapping: {row['predicate']}")

    pred_iri = predicate_mapping[row["predicate"]]
    pred = URIRef(pred_iri)

    obj = safe_object_term(row["object_value"])
    graph_iri = URIRef(row["graph"])

    ds.add((subj, pred, obj, graph_iri))

11 min

In [33]:
# ✅ Serialize to N-Quads file
ds.serialize(destination=nq_out, format="nquads")

<Graph identifier=N1c576016d4e241038309dfa99c2ad744 (<class 'rdflib.graph.Dataset'>)>

now zip and load into graphdb