In [1]:
import os
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

import logging
# logging.basicConfig(level=logging.DEBUG)
logging.getLogger("openff.toolkit").setLevel(logging.ERROR)

from openff import toolkit, evaluator

import pandas as pd
import json

In [2]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase
from openff.evaluator.datasets.thermoml import thermoml_property
from openff.evaluator import properties
from openff.units import unit

In [3]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase
from openff.evaluator.datasets.thermoml import thermoml_property
from openff.units import unit
from openff.evaluator.datasets.thermoml import ThermoMLDataSet
from openff.evaluator.datasets.thermoml.thermoml import _Compound
from openff.toolkit import Molecule
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

In [4]:
import requests
from urllib.error import HTTPError
from xml.etree import ElementTree

# with open('sorted_dois.json') as f:
#     doi_dat = json.load(f)

# for d in doi_dat['working']:
#     doi=d
#     url=f"https://trc.nist.gov/ThermoML/{doi}.xml"
#     try:
#         request = requests.get(url)
#         request.raise_for_status()

#         # Handle the case where ThermoML does not return a 404 error code, but
#         # rather redirects to an error page with code 200.
#         if request.text.startswith("<html>"):
#             raise HTTPError(url, 404, "Not found", None, None)

#     except (HTTPError, requests.exceptions.HTTPError):
#         print(f"No ThermoML file could not be found at {url}")
    
# Download the ThermoML XML document for a single DOI. The response object is
# stored in `request` (name kept as-is because the parsing cell reads
# `request.text`).
doi = '10.1021/je800307g'
url = f"https://trc.nist.gov/ThermoML/{doi}.xml"

# Upper bound, in seconds, on how long to wait for the server. Without a
# timeout `requests.get` can block forever and stall the whole notebook.
REQUEST_TIMEOUT_SECONDS = 30

try:
    request = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Raises requests.exceptions.HTTPError for 4xx / 5xx status codes.
    request.raise_for_status()

    # Handle the case where ThermoML does not return a 404 error code, but
    # rather redirects to an error page with code 200.
    if request.text.startswith("<html>"):
        raise HTTPError(url, 404, "Not found", None, None)

except (HTTPError, requests.exceptions.HTTPError):
    # Best-effort reporting only: execution continues, so `request` may still
    # hold an error response when this message is printed.
    print(f"No ThermoML file could be found at {url}")

In [5]:
import re
from collections import defaultdict
from openff.toolkit.utils.exceptions import InvalidIUPACNameError

# Parse the downloaded ThermoML document and, for every <Compound> element,
# record which of its identifiers can be turned into a molecule.
tree = ElementTree.fromstring(request.text)

# ElementTree spells a namespaced tag as "{namespace-uri}LocalName"; extract
# the URI so the "ThermoML:" prefix can be used in the queries below.
namespace_match = re.search(r"{.*\}", tree.tag)
if namespace_match is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise ValueError(
        f"The root element {tree.tag!r} is not namespace-qualified, so the "
        "ThermoML namespace could not be determined."
    )
namespace_string = namespace_match.group(0)[1:-1]
namespace = {"ThermoML": namespace_string}

# One entry per <Compound>: a mapping of identifier kind -> {identifier: result}.
xml_res = []
for node in tree.findall("ThermoML:Compound", namespace):
    inchi_identifier_nodes  = node.findall("ThermoML:sStandardInChI", namespace)
    # NOTE(review): the SMILES nodes are collected but not used in this cell.
    smiles_identifier_nodes = node.findall("ThermoML:sSmiles", namespace)
    common_identifier_nodes = node.findall("ThermoML:sCommonName", namespace)

    # defaultdict so the 'Inchi' / 'Common name' sub-dicts are only created
    # when the compound actually has an identifier of that kind.
    querydict = defaultdict(dict)
    for inchi_node in inchi_identifier_nodes:
        # Despite the variable name, this holds the element text (the InChI
        # string), which is what `Chem.MolFromInchi` is given.
        inchi_key = inchi_node.text
        if inchi_key is None:
            # An empty XML element has no text; there is nothing to parse.
            continue
        # Stores whatever RDKit returns (presumably None when the InChI cannot
        # be parsed -- TODO confirm).
        rdmol = Chem.MolFromInchi(inchi_key)
        querydict['Inchi'][inchi_key] = rdmol

    for subnode in common_identifier_nodes:
        common_name = subnode.text
        if common_name is None:
            # An empty XML element has no text; there is no name to resolve.
            continue
        try:
            # Only success / failure is recorded; `mol` is not used further
            # in this cell.
            mol = Molecule.from_iupac(common_name)
            querydict['Common name'][common_name] = True
        except InvalidIUPACNameError:
            querydict['Common name'][common_name] = False
    xml_res.append(querydict)

