# Parsing HMDB including molecule classes and subclasses (full ontology)

In [None]:
import os
import qiime2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Define the working directory
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
wd = '/home/meyeanni/cloud/meyeanni/LP4/artifacts/metabolomics'

# Change to the working directory so that relative file names used afterwards
# are resolved against it (os.chdir raises OSError if the path is not usable).
os.chdir(wd)

# Verify current working directory
print("Current working directory:", os.getcwd())

In [None]:
import csv
import io
from lxml import etree
from tqdm import tqdm

# Input XML and output CSV. Both are relative names, so they are resolved
# against the current working directory.
xml_file = 'hmdb_metabolites.xml'
output_file = 'hmdb_mass_ontology.csv'
# XML namespace URI, plus the prefix map ('hmdb' -> URI) handed to XPath queries.
namespace = 'http://www.hmdb.ca'
ns = {'hmdb': namespace}

# Strip NUL bytes from the raw file before it is parsed (just in case)
def clean_xml(file_path):
    """Read *file_path* fully and return its bytes, minus NUL bytes, as a BytesIO.

    The whole file is loaded into memory; the returned in-memory stream is
    positioned at the start and contains the file content with every
    ``b'\\x00'`` removed.
    """
    with open(file_path, 'rb') as handle:
        raw = handle.read()
    without_nulls = raw.replace(b'\x00', b'')
    return io.BytesIO(without_nulls)

# Utility: safe text extraction
def extract_text(elem, path):
    """Return the stripped first result of XPath *path* on *elem*, or 'NA'.

    Args:
        elem: Object exposing ``xpath(path, namespaces=...)``.
        path: XPath expression; prefixes are resolved through the
            module-level ``ns`` mapping.

    Returns:
        The first result with surrounding whitespace stripped, or the string
        'NA' when the query yields no result or a result that is not an
        indexable sequence of strings.

    Only "field is missing / not text" failures are mapped to 'NA'. Anything
    else (Ctrl-C, SystemExit, an invalid XPath expression, ...) propagates
    instead of being silently written to the output as 'NA'.
    """
    try:
        return elem.xpath(path, namespaces=ns)[0].strip()
    except (IndexError, AttributeError, TypeError):
        # IndexError: no match; AttributeError/TypeError: the result is not a
        # list of strings (e.g. an element node or a scalar XPath value).
        return 'NA'

# Load the NUL-stripped XML into memory
xml_cleaned = clean_xml(xml_file)

# Output column -> XPath evaluated relative to each <metabolite> element.
# Insertion order defines both the CSV column order and the extraction order.
field_paths = {
    'accession': 'hmdb:accession/text()',
    'name': 'hmdb:name/text()',
    'monoisotopic_molecular_weight': 'hmdb:monoisotopic_molecular_weight/text()',
    'chemical_formula': 'hmdb:chemical_formula/text()',
    'kingdom': 'hmdb:taxonomy/hmdb:kingdom/text()',
    'super_class': 'hmdb:taxonomy/hmdb:super_class/text()',
    'class': 'hmdb:taxonomy/hmdb:class/text()',
    'sub_class': 'hmdb:taxonomy/hmdb:sub_class/text()',
    'molecular_framework': 'hmdb:taxonomy/hmdb:molecular_framework/text()',
}
fieldnames = list(field_paths)

with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Stream the document, yielding only namespaced <metabolite> elements
    context = etree.iterparse(xml_cleaned, tag=f'{{{namespace}}}metabolite', recover=True)

    for _, elem in tqdm(context, desc="Extracting HMDB data", unit="entries"):
        # One CSV row per metabolite; extract_text supplies 'NA' for missing fields
        row = {column: extract_text(elem, xpath) for column, xpath in field_paths.items()}
        writer.writerow(row)

        # Drop the element's content, then remove the siblings that precede it
        # from the parent so already-written entries do not pile up in the tree
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context

print(f"\nDone! Data saved to {output_file}")


