In [64]:
import csv
import os
from urllib.parse import quote

import pandas as pd
from rdflib import ConjunctiveGraph, Graph
from rdflib import Dataset, URIRef, Literal
from rdflib.term import Node
from urllib.parse import urlparse

In [2]:
# Relative path of the directory that holds the N4L source files; the delimited-text
# configs and the XLSX loop both build their file paths from it.
n4l_data_directory = "N4L_phenotypic_ontology_2016"

In [3]:
# Tab-separated file describing how each XLSX sheet should be parsed
# (read with pd.read_csv(..., sep="\t") further down).
xlsx_config_path = "n4l-xlsx-parsing-config.tsv"

In [4]:
# One config dict per delimited text file. Each entry records the file name,
# its path under n4l_data_directory, the column that identifies a row, and
# the field separator handed to pd.read_csv.
delimited_text_configs = [
    {
        "filename": file_name,
        "path": f"{n4l_data_directory}/{file_name}",
        "id_column": id_col,
        "delimiter": separator,
    }
    for file_name, id_col, separator in (
        ("N4L_Taxonomy_20220802.tsv", "N4LID", "\t"),
        ("N4L_Taxonomy_20220802_pruned.tsv", "N4LID", "\t"),
        ("reference_id_mapping.csv", "refid", ","),
    )
]

In [5]:
def str_to_bool(val):
    """Convert common truthy spellings to a boolean.

    Parameters
    ----------
    val : Any
        A scalar config value: a string, bool, number, or a missing value
        (None / NaN / pd.NA).

    Returns
    -------
    bool
        True when ``val`` spells one of "1", "true", "yes", "y", "t"
        (case-insensitive, surrounding whitespace ignored) or is numerically
        equal to 1 (so ``1.0`` and ``"1.0"`` count as true). Missing values
        and everything else yield False.
    """
    if pd.isna(val):
        return False

    text = str(val).strip().lower()
    if text in {"1", "true", "yes", "y", "t"}:
        return True

    # pandas reads a 0/1 flag column as float as soon as it contains a blank,
    # so a "true" flag can arrive as 1.0 and stringify to "1.0".
    try:
        return float(text) == 1.0
    except ValueError:
        return False

In [6]:
def process_transposed_sheet(df, id_column):
    """Melt a sheet whose records run down the columns instead of across rows.

    The frame is transposed, its first transposed row becomes the header, and
    the remaining rows are reshaped to long form with one row per
    (id, predicate, object_value) combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Sheet contents with the header still stored as data (first column).
    id_column : str
        Header name that identifies each record after transposition.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_column``, ``predicate`` and ``object_value``.

    Raises
    ------
    ValueError
        If ``id_column`` is not among the transposed headers.
    """
    flipped = df.T
    flipped.columns = flipped.iloc[0]
    records = flipped[1:].reset_index(drop=True)

    if id_column in records.columns:
        # Records without an identifier cannot become subjects, so drop them first.
        keyed_records = records.dropna(subset=[id_column])
        return pd.melt(
            keyed_records,
            id_vars=[id_column],
            var_name="predicate",
            value_name="object_value",
        )

    raise ValueError(f"'{id_column}' not found in transposed headers")

In [7]:
def process_standard_sheet(df, id_column):
    """Melt a conventionally laid-out sheet (one record per row) to long form.

    Parameters
    ----------
    df : pandas.DataFrame
        Sheet contents with proper column headers.
    id_column : str
        Name of the column that identifies each record.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_column``, ``predicate`` and ``object_value``; rows whose
        identifier is missing are excluded.

    Raises
    ------
    ValueError
        If ``id_column`` is not one of the frame's columns.
    """
    if id_column in df.columns:
        keyed_rows = df.dropna(subset=[id_column])
        return pd.melt(
            keyed_rows,
            id_vars=[id_column],
            var_name="predicate",
            value_name="object_value",
        )

    raise ValueError(f"'{id_column}' not found in standard headers")

In [65]:
def safe_object_term(val):
    """Turn a cell value into an RDF object term.

    Parameters
    ----------
    val : Any
        Raw cell value from the melted frame.

    Returns
    -------
    URIRef | Literal | None
        ``None`` for missing values; a ``URIRef`` when the stripped text is a
        single http(s) IRI with a host and no characters that rdflib rejects;
        otherwise a ``Literal`` holding the stripped text.
    """
    if pd.isna(val):
        return None
    val = str(val).strip()

    try:
        parsed = urlparse(val)
    except ValueError:
        # urlparse rejects some malformed authorities (e.g. an unbalanced "[");
        # such text is certainly not a usable IRI, so keep it as plain text.
        return Literal(val)

    # URIRef() never raises on a bad IRI - it only logs a warning and the failure
    # surfaces later at serialization time - so validate up front instead of
    # wrapping the constructor in try/except. The rejected set mirrors the
    # characters rdflib treats as invalid in a URI, plus any whitespace.
    has_forbidden_char = any(ch.isspace() or ch in '<>"{}|\\^`' for ch in val)
    if parsed.scheme in ("http", "https") and parsed.netloc and not has_forbidden_char:
        return URIRef(val)
    return Literal(val)

In [8]:
# Accumulators filled by the parsing loops below.
# melted_frames: one long-form (subject / predicate / object_value) frame per parsed file or sheet.
# melted_dropped_frames: long-form copies of rows that were dropped for a missing or duplicated ID.
# The loops only append, so re-running a loop cell without re-running this cell duplicates entries.
melted_frames = []
melted_dropped_frames = []

In [9]:
# Parse every delimited text file into long form (subject, predicate, object_value).
# Per file: drop exact duplicate rows, set aside rows with a missing or duplicated ID
# (logged into melted_dropped_frames with a drop_reason), melt what is left, and turn
# subject/predicate into IRIs under http://example.com/metpo/. Any failure is reported
# and the loop moves on to the next file.
for config in delimited_text_configs:
    try:
        df = pd.read_csv(config["path"], sep=config["delimiter"], low_memory=False)

        # Named graph for this file; the file name is percent-encoded.
        graph_iri = f"http://example.com/metpo/{quote(config['filename'].strip())}"

        # Remove exact duplicates before anything else
        before = df.shape[0]
        df = df.drop_duplicates()
        after = df.shape[0]
        if after < before:
            print(f"[QC] {config['filename']} - Removed {before - after} fully duplicated rows")

        id_column = config["id_column"]
        if id_column not in df.columns:
            print(
                f"[ERROR] {config['filename']} - ID column '{id_column}' not found. Available columns: {df.columns.tolist()}")
            continue

        # Drop and log missing ID rows
        missing_id_rows = df[df[id_column].isna()]
        if not missing_id_rows.empty:
            print(f"[QC] {config['filename']} - Dropped {len(missing_id_rows)} rows missing '{id_column}'")
            melted_missing = missing_id_rows.melt(var_name="predicate", value_name="object_value")
            melted_missing["subject"] = None
            melted_missing["source_file"] = config["filename"]
            melted_missing["drop_reason"] = "missing_id"
            melted_missing["graph"] = graph_iri
            melted_dropped_frames.append(melted_missing)

        df = df.dropna(subset=[id_column])

        # Drop and log duplicated IDs
        duplicated_mask = df[id_column].duplicated(keep=False)
        if duplicated_mask.any():
            duplicated_ids = df.loc[duplicated_mask, id_column].unique()
            print(
                f"[DUPLICATES] {config['filename']} - {duplicated_mask.sum()} duplicate rows on '{id_column}' → {duplicated_ids.tolist()}")
            # Melt with the ID as id_vars so each melted row carries its own ID.
            # Melting every column and then assigning a separately repeated ID array
            # gives a length/order mismatch, which raised and discarded the whole file.
            melted_dupes = df.loc[duplicated_mask].melt(
                id_vars=[id_column], var_name="predicate", value_name="object_value")
            melted_dupes = melted_dupes.rename(columns={id_column: "subject"})
            melted_dupes["source_file"] = config["filename"]
            melted_dupes["drop_reason"] = "duplicate_id"
            melted_dupes["graph"] = graph_iri
            melted_dropped_frames.append(melted_dupes)
            df = df[~duplicated_mask]
        else:
            print(f"[DUPLICATES] {config['filename']} - No duplicates in '{id_column}'")

        # Melt and append
        melted = df.melt(id_vars=[id_column], var_name="predicate", value_name="object_value")
        melted = melted.rename(columns={id_column: "subject"})
        melted = melted.dropna(subset=["subject", "predicate", "object_value"])
        # IDs and column names become IRIs; object values are left as raw cell values.
        melted["subject"] = melted["subject"].astype(str).apply(
            lambda x: f"http://example.com/metpo/{quote(x.strip())}")
        melted["predicate"] = melted["predicate"].astype(str).apply(
            lambda x: f"http://example.com/metpo/{quote(x.strip())}")
        melted["source_file"] = config["filename"]
        melted["graph"] = graph_iri
        melted_frames.append(melted)
        print(f"[INFO] {config['filename']} → {melted.shape[0]} melted rows")

    except Exception as e:
        # Best-effort: report the failure and continue with the next file.
        print(f"[ERROR] Failed processing {config['filename']} - {e}")


[DUPLICATES] N4L_Taxonomy_20220802.tsv - No duplicates in 'N4LID'
[INFO] N4L_Taxonomy_20220802.tsv → 1273296 melted rows
[DUPLICATES] N4L_Taxonomy_20220802_pruned.tsv - No duplicates in 'N4LID'
[INFO] N4L_Taxonomy_20220802_pruned.tsv → 957331 melted rows
[DUPLICATES] reference_id_mapping.csv - No duplicates in 'refid'
[INFO] reference_id_mapping.csv → 64272 melted rows


In [10]:
# combined_delim_text_df = pd.concat(melted_frames, ignore_index=True)

In [11]:
# xlsx_sheet_configs = []

In [12]:
# for filename in xlsx_files:
#     print(f"Reading {filename}")
#     path = os.path.join(n4l_data_directory, filename)
#     try:
#         xls = pd.ExcelFile(path)
#         for sheet_name in xls.sheet_names:
#             xlsx_sheet_configs.append({
#                 "filename": filename,
#                 "sheet_name": sheet_name,
#                 "id_column": None,
#                 "skip": False
#             })
#     except Exception as e:
#         print(f"Error reading {filename}: {e}")

In [13]:
# xlsx_sheet_configs

In [14]:
# xlsx_sheet_configs_frame = pd.DataFrame(xlsx_sheet_configs)

In [15]:
# xlsx_sheet_configs_frame.to_csv("n4l-xlsx-parsing-config.tsv", sep="\t", index=False)

In [16]:
# Load the per-sheet parsing config: one row per workbook sheet. The XLSX loop below reads
# the filename, sheet_name, id_column, skip, requires_transposition and spo_already columns.
xlsx_sheet_configs = pd.read_csv(xlsx_config_path, sep="\t")

In [17]:
xlsx_sheet_configs

Unnamed: 0,filename,sheet_name,id_column,skip,requires_transposition,spo_already,note
0,article_download_status_20161222.xlsx,all_protologs,UID,False,False,,UID style: rid.10014_nm.6832
1,article_download_status_20161222.xlsx,parsed_protologs,UID,False,False,,
2,complete.term.book_07.18.2013_CTP.xlsx,Sheet1,Term|class,False,False,,
3,complete.term.book_07.18.2013_CTP.xlsx,Sheet2,,True,False,,blank
4,complete.term.book_07.18.2013_CTP.xlsx,Sheet3,,True,False,,blank
5,media_normalized_20130916.xlsx,chemical_exemplar,chemical_exemplar.id,False,False,,
6,media_normalized_20130916.xlsx,chemical_name,chemical_name.id,False,False,,
7,media_normalized_20130916.xlsx,chemical_taxon,chemical_taxon.id,False,False,,
8,media_normalized_20130916.xlsx,chemical_taxon_exemplar,chemical_taxon.id,False,False,,
9,media_normalized_20130916.xlsx,ingredient,,True,False,,I see Excel formulae


In [18]:
# melted_frames = []
# melted_dropped_frames = []

In [19]:
# Parse every configured XLSX sheet into long form (subject, predicate, object_value).
# Per config row: honour `skip`, optionally transpose the sheet, pass through sheets that
# are already in S/P/O layout, resolve the ID column (a "a|b" spec builds a synthetic ID
# by joining columns a and b with "_"), drop exact duplicates, rows without an ID and rows
# whose ID is duplicated, then melt and turn subject/predicate into IRIs under
# http://example.com/metpo/. Failures are reported and the loop moves on.
for _, row in xlsx_sheet_configs.iterrows():
    # NOTE: prints the full config row for every sheet, which makes the cell output long.
    print(row)

    skip = str_to_bool(row.get("skip", False))
    spo_already = str_to_bool(row.get("spo_already", False))
    requires_transposition = str_to_bool(row.get("requires_transposition", False))
    id_column = row.get("id_column")
    composite_columns = None

    if skip:
        continue

    file_path = os.path.join(n4l_data_directory, row["filename"])
    sheet_name = row["sheet_name"]
    # Named graph for this sheet: <base>/<file name>/<sheet name>, both percent-encoded.
    graph_iri = f"http://example.com/metpo/{quote(row['filename'].strip())}/{quote(sheet_name.strip())}"

    try:
        # Transposed sheets keep their header in the first column, so read them without a header row.
        df = pd.read_excel(file_path, sheet_name=sheet_name, header=None if requires_transposition else 0)

        if requires_transposition:
            df = df.transpose()
            df.columns = df.iloc[0]
            df = df[1:].reset_index(drop=True)

        df.columns = df.columns.map(lambda x: str(x).strip())  # Normalize column names

        if spo_already:
            if df.shape[1] != 3:
                print(
                    f"[ERROR] {row['filename']}:{sheet_name} - Expected 3 columns for SPO format, found {df.shape[1]}")
                continue
            df.columns = ["subject", "predicate", "object_value"]
            df = df.dropna(subset=["subject", "predicate", "object_value"])
            df["subject"] = df["subject"].astype(str).apply(lambda x: f"http://example.com/metpo/{quote(x.strip())}")
            df["predicate"] = df["predicate"].astype(str).apply(
                lambda x: f"http://example.com/metpo/{quote(x.strip())}")
            df["source_file"] = row["filename"]
            df["source_sheet"] = sheet_name
            df["graph"] = graph_iri
            melted_frames.append(df)
            print(f"[INFO] {row['filename']}:{sheet_name} (SPO) → {df.shape[0]} rows")
            continue

        # Handle composite columns
        if isinstance(id_column, str) and "|" in id_column:
            composite_columns = [col.strip() for col in id_column.split("|")]

            # Match normalized names
            normalized_cols = {str(col).strip(): col for col in df.columns}
            missing = [col for col in composite_columns if col not in normalized_cols]
            if missing:
                print(f"[ERROR] {row['filename']}:{sheet_name} - Missing composite ID columns: {missing}")
                continue

            matched = [normalized_cols[col] for col in composite_columns]
            id_column = "_".join(composite_columns)
            # Join the non-missing parts; after astype(str) a missing cell reads "nan".
            df[id_column] = df[matched].astype(str).agg(
                lambda vals: "_".join([v for v in vals if v != "nan"]), axis=1
            )
            blank_ids = df[id_column] == ""
            if blank_ids.any():
                print(f"[QC] {row['filename']}:{sheet_name} - Dropped {blank_ids.sum()} rows with blank synthetic ID")
                df = df[~blank_ids]
            print(f"[INFO] Created synthetic ID column '{id_column}' from: {composite_columns}")
        else:
            if pd.isna(id_column):
                print(f"[ERROR] {row['filename']}:{sheet_name} - ID column is NaN")
                continue

            normalized_cols = {str(col).strip(): col for col in df.columns}
            if id_column.strip() not in normalized_cols:
                print(
                    f"[ERROR] {row['filename']}:{sheet_name} - ID column '{id_column}' not found. Available columns: {df.columns.tolist()}")
                continue
            id_column = normalized_cols[id_column.strip()]

        # Drop exact duplicate rows and report how many were removed. The count is taken
        # around drop_duplicates(): checking df.duplicated() afterwards can never find
        # anything, so the QC message would never be printed.
        rows_before = df.shape[0]
        df = df.drop_duplicates()
        removed_full_dupes = rows_before - df.shape[0]
        if removed_full_dupes:
            print(f"[QC] {row['filename']}:{sheet_name} - Removed {removed_full_dupes} fully duplicated rows")

        df = df.dropna(subset=[id_column])

        # keep=False flags every row that shares an ID, so all of them are dropped.
        duplicated_mask = df[id_column].duplicated(keep=False)
        duplicated_rows = df[duplicated_mask]

        if not duplicated_rows.empty:
            duplicated_ids = df[duplicated_mask][id_column].unique()
            print(
                f"[DUPLICATES] {row['filename']}:{sheet_name} - {len(duplicated_rows)} duplicate rows on '{id_column}' → {list(duplicated_ids)}")
            df = df[~duplicated_mask]
        else:
            print(f"[DUPLICATES] {row['filename']}:{sheet_name} - No duplicates in '{id_column}'")

        melted = df.melt(id_vars=[id_column], var_name="predicate", value_name="object_value")
        melted = melted.rename(columns={id_column: "subject"})
        melted = melted.dropna(subset=["subject", "predicate", "object_value"])
        melted["subject"] = melted["subject"].astype(str).apply(
            lambda x: f"http://example.com/metpo/{quote(x.strip())}")
        melted["predicate"] = melted["predicate"].astype(str).apply(
            lambda x: f"http://example.com/metpo/{quote(x.strip())}")
        melted["source_file"] = row["filename"]
        melted["source_sheet"] = sheet_name
        melted["graph"] = graph_iri
        melted_frames.append(melted)
        print(f"[INFO] {row['filename']}:{sheet_name} → {melted.shape[0]} melted rows")

    except Exception as e:
        # Best-effort: report the failure and continue with the next sheet.
        print(f"[ERROR] Failed processing {row['filename']}:{sheet_name} - {e}")


filename                  article_download_status_20161222.xlsx
sheet_name                                        all_protologs
id_column                                                   UID
skip                                                      False
requires_transposition                                    False
spo_already                                                 NaN
note                               UID style: rid.10014_nm.6832
Name: 0, dtype: object
[DUPLICATES] article_download_status_20161222.xlsx:all_protologs - No duplicates in 'UID'
[INFO] article_download_status_20161222.xlsx:all_protologs → 58601 melted rows
filename                  article_download_status_20161222.xlsx
sheet_name                                     parsed_protologs
id_column                                                   UID
skip                                                      False
requires_transposition                                    False
spo_already                             

  warn(msg)


[INFO] Created synthetic ID column 'rid_name.id' from: ['rid', 'name.id']
[DUPLICATES] protolog_normalization_categories_with_1000_DB.xlsx:Sheet2 - 2 duplicate rows on 'rid_name.id' → ['rid.2332_RID.2332 many OCR errors']
[INFO] protolog_normalization_categories_with_1000_DB.xlsx:Sheet2 → 20917 melted rows
filename                  protolog_normalization_categories_with_1000_DB...
sheet_name                                                           Sheet3
id_column                                                       rid|name.id
skip                                                                  False
requires_transposition                                                 True
spo_already                                                             NaN
note                                                                    NaN
Name: 38, dtype: object


  warn(msg)


[INFO] Created synthetic ID column 'rid_name.id' from: ['rid', 'name.id']
[DUPLICATES] protolog_normalization_categories_with_1000_DB.xlsx:Sheet3 - No duplicates in 'rid_name.id'
[INFO] protolog_normalization_categories_with_1000_DB.xlsx:Sheet3 → 5248 melted rows
filename                  protolog_normalization_categories_with_1000_KM...
sheet_name                                                  1000_proto_proj
id_column                                rid (effective/emendation)|name.id
skip                                                                  False
requires_transposition                                                 True
spo_already                                                             NaN
note                                                                    NaN
Name: 39, dtype: object
[QC] protolog_normalization_categories_with_1000_KMP.xlsx:1000_proto_proj - Dropped 9 rows with blank synthetic ID
[INFO] Created synthetic ID column 'rid (effective/emendation)_na

  warn(msg)


[INFO] Created synthetic ID column 'rid_name.id' from: ['rid', 'name.id']
[DUPLICATES] protolog_normalization_categories_with_1000_KMP.xlsx:EffectRIDProtos(rid.2300 up) - 2 duplicate rows on 'rid_name.id' → ['rid.2332_RID.2332 many OCR errors']
[INFO] protolog_normalization_categories_with_1000_KMP.xlsx:EffectRIDProtos(rid.2300 up) → 20915 melted rows
filename                  protolog_normalization_categories_with_1000_KM...
sheet_name                                                            Notes
id_column                                                               NaN
skip                                                                   True
requires_transposition                                                False
spo_already                                                             NaN
note                                                             skip notes
Name: 42, dtype: object
filename                  protolog_normalization_categories_with_1000_KM...
sheet_name    

  warn(msg)


In [28]:
# Combine all into one frame
# Stacks every per-file / per-sheet melted frame; ignore_index=True renumbers the rows 0..n-1.
combined_df = pd.concat(melted_frames, ignore_index=True)

In [29]:
combined_df.shape


(6179231, 6)

In [30]:
# Drop rows that are identical in every column. The source/graph columns take part in the
# comparison, so the same statement coming from two different files or sheets is kept twice.
combined_df = combined_df.drop_duplicates()

In [31]:
combined_df.shape

(6179222, 6)

In [32]:
combined_df

Unnamed: 0,subject,predicate,object_value,source_file,graph,source_sheet
0,http://example.com/metpo/nm.0,http://example.com/metpo/index,1,N4L_Taxonomy_20220802.tsv,http://example.com/metpo/N4L_Taxonomy_20220802...,
1,http://example.com/metpo/nm.1,http://example.com/metpo/index,2,N4L_Taxonomy_20220802.tsv,http://example.com/metpo/N4L_Taxonomy_20220802...,
2,http://example.com/metpo/nm.2,http://example.com/metpo/index,3,N4L_Taxonomy_20220802.tsv,http://example.com/metpo/N4L_Taxonomy_20220802...,
3,http://example.com/metpo/nm.31636,http://example.com/metpo/index,24009,N4L_Taxonomy_20220802.tsv,http://example.com/metpo/N4L_Taxonomy_20220802...,
4,http://example.com/metpo/nm.3,http://example.com/metpo/index,4,N4L_Taxonomy_20220802.tsv,http://example.com/metpo/N4L_Taxonomy_20220802...,
...,...,...,...,...,...,...
6179226,http://example.com/metpo/nm.8017,http://example.com/metpo/note,The descriptions of Murinilabilia sulmonicolor...,protolog_normalization_categories_with_1000_KM...,http://example.com/metpo/protolog_normalizatio...,Sheet2
6179227,http://example.com/metpo/nm.2199,http://example.com/metpo/note,The description of Acidithiobacillus thiooxida...,protolog_normalization_categories_with_1000_KM...,http://example.com/metpo/protolog_normalizatio...,Sheet2
6179228,http://example.com/metpo/nm.5724,http://example.com/metpo/note,In addition to the description of the genus,protolog_normalization_categories_with_1000_KM...,http://example.com/metpo/protolog_normalizatio...,Sheet2
6179229,http://example.com/metpo/rid.4142_nm.1005,http://example.com/metpo/note,no true protolog in this reference,protolog_normalization_categories_with_1000_KM...,http://example.com/metpo/protolog_normalizatio...,Sheet2


In [33]:
# Safety net: keep only complete statements. Each per-file / per-sheet frame was already
# filtered on the same three columns before it was appended to melted_frames.
combined_df = combined_df.dropna(subset=["subject", "predicate", "object_value"])


In [34]:
combined_df.shape

(6179222, 6)

In [36]:
combined_df.columns

Index(['subject', 'predicate', 'object_value', 'source_file', 'graph',
       'source_sheet'],
      dtype='object')

In [35]:
combined_df

Unnamed: 0,subject,predicate,object_value,source_file,graph,source_sheet
0,http://example.com/metpo/nm.0,http://example.com/metpo/index,1,N4L_Taxonomy_20220802.tsv,http://example.com/metpo/N4L_Taxonomy_20220802...,
1,http://example.com/metpo/nm.1,http://example.com/metpo/index,2,N4L_Taxonomy_20220802.tsv,http://example.com/metpo/N4L_Taxonomy_20220802...,
2,http://example.com/metpo/nm.2,http://example.com/metpo/index,3,N4L_Taxonomy_20220802.tsv,http://example.com/metpo/N4L_Taxonomy_20220802...,
3,http://example.com/metpo/nm.31636,http://example.com/metpo/index,24009,N4L_Taxonomy_20220802.tsv,http://example.com/metpo/N4L_Taxonomy_20220802...,
4,http://example.com/metpo/nm.3,http://example.com/metpo/index,4,N4L_Taxonomy_20220802.tsv,http://example.com/metpo/N4L_Taxonomy_20220802...,
...,...,...,...,...,...,...
6179226,http://example.com/metpo/nm.8017,http://example.com/metpo/note,The descriptions of Murinilabilia sulmonicolor...,protolog_normalization_categories_with_1000_KM...,http://example.com/metpo/protolog_normalizatio...,Sheet2
6179227,http://example.com/metpo/nm.2199,http://example.com/metpo/note,The description of Acidithiobacillus thiooxida...,protolog_normalization_categories_with_1000_KM...,http://example.com/metpo/protolog_normalizatio...,Sheet2
6179228,http://example.com/metpo/nm.5724,http://example.com/metpo/note,In addition to the description of the genus,protolog_normalization_categories_with_1000_KM...,http://example.com/metpo/protolog_normalizatio...,Sheet2
6179229,http://example.com/metpo/rid.4142_nm.1005,http://example.com/metpo/note,no true protolog in this reference,protolog_normalization_categories_with_1000_KM...,http://example.com/metpo/protolog_normalizatio...,Sheet2


In [66]:
# Empty rdflib Dataset; the next cell adds one quad per row of combined_df, using the
# row's `graph` value as the named-graph identifier.
ds = Dataset()

In [None]:
# Load every statement into the Dataset as a quad (subject, predicate, object, graph).
# The four columns are zipped directly instead of using DataFrame.iterrows(), which builds
# a Series for every row and is very slow on a frame with millions of rows.
for subject, predicate, object_value, graph in zip(
    combined_df["subject"],
    combined_df["predicate"],
    combined_df["object_value"],
    combined_df["graph"],
):
    obj = safe_object_term(object_value)
    if obj is None:
        # safe_object_term returns None for missing values; a quad needs an object term.
        continue
    subj = URIRef(subject)
    pred = URIRef(predicate)
    graph_iri = URIRef(graph)
    ds.add((subj, pred, obj, graph_iri))

In [62]:
# for _, row in combined_df.iterrows():
#     subj = URIRef(row["subject"])
#     pred = URIRef(row["predicate"])
#
#     # Determine if object is a URI or a literal
#     obj_val = row["object_value"]
#     if isinstance(obj_val, str) and obj_val.startswith("http://"):
#         obj = URIRef(obj_val)
#     else:
#         obj = Literal(obj_val)
#
#     graph_iri = URIRef(row["graph"])
#     ds.get_context(graph_iri).add((subj, pred, obj))


http://www.bd.com/ds/technicalCenter/misc/difcobblmanual_2nded_lowres.pdf (pg 580) & http://www.neogen.com/Acumedia/pdf/ProdInfo/7100_PI.pdf & http://www.atcc.org/~/media/C34920C063D045518910B21D7F95FCD8.ashx does not look like a valid URI, trying to serialize this will break.
http://www.bd.com/ds/technicalCenter/brochures/br_3_2362.pdf & http://www.dsmz.de/microorganisms/medium/pdf/DSMZ_Medium1.pdf & Difco 619 & Atlas 666 & http://www.atcc.org/~/media/E6FA2163B72E4DCD880719A2612F2C92.ashx does not look like a valid URI, trying to serialize this will break.
http://www.bd.com/ds/productCenter/221872.asp & Atlas 867 does not look like a valid URI, trying to serialize this will break.
http://www.atcc.org/~/media/59CA2C55406C491BA70D6BED2E710710.ashx & Atlas 69 does not look like a valid URI, trying to serialize this will break.
http://www.dsmz.de/microorganisms/medium/pdf/DSMZ_Medium67.pdf & http://www.atcc.org/~/media/DEE558F092A74E06B8DC94DC84950A71.ashx does not look like a valid URI, 

In [63]:
# ✅ Serialize to N-Quads file
# Writes every quad currently in `ds` to n4l-tables.nq (relative to the working directory).
# NOTE(review): rdflib refuses to serialize a URIRef that does not look like a valid URI
# (for example one containing spaces), so object terms must be validated before this cell runs.
ds.serialize(destination="n4l-tables.nq", format="nquads")

Exception: "http://himedialabs.com/TD/M1544.pdf & Atlas 472" does not look like a valid URI, I cannot serialize this as N3/Turtle. Perhaps you wanted to urlencode it?