# Core Imports and Setup

In [1]:
import os
import pandas as pd
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

import logging
# logging.getLogger("openff.toolkit").setLevel(logging.ERROR)
# logging.basicConfig(level=logging.DEBUG)

import json
from openff import toolkit, evaluator

In [2]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw, MolFromSmiles
from rdkit.Chem.Draw import MolsToGridImage, rdMolDraw2D, MolsMatrixToGridImage
import pubchempy
from PIL import Image, ImageDraw, ImageFont
IPythonConsole.ipython_useSVG=False  #< set this to False if you want PNGs instead of SVGs

In [3]:
from rdkit.Chem import rdMolDescriptors
import pubchempy as pcp
import math

In [4]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase
from openff.evaluator.datasets.thermoml import thermoml_property
from openff.evaluator import properties
from openff.units import unit
from openff.evaluator.datasets.thermoml import ThermoMLDataSet
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase, PhysicalPropertyDataSet

In [5]:
from openff.evaluator.datasets.curation.components.filtering import FilterByPropertyTypes, FilterByPropertyTypesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByTemperature, FilterByTemperatureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByPressure, FilterByPressureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterBySmiles, FilterBySmilesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByNComponents, FilterByNComponentsSchema


In [6]:
from rich.progress import track
from collections import defaultdict
from openff.toolkit.utils.exceptions import UndefinedStereochemistryError

In [7]:
def filter_database(unfiltered_directory):
    """Sort the ``.xml`` files of a directory by whether they can be loaded.

    Every file directly inside ``unfiltered_directory`` whose name ends with
    ``.xml`` is passed to ``ThermoMLDataSet._from_file``. Files that load
    without raising are recorded as working; files that raise are grouped by
    failure type. A summary of the outcome is printed.

    Parameters
    ----------
    unfiltered_directory : str or path-like
        Directory to scan. Sub-directories are not searched.

    Returns
    -------
    list
        Paths of the files that loaded without raising, in sorted filename
        order.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and anything built from it) reproducible across machines and runs.
    xml_files = [
        os.path.join(unfiltered_directory, filename)
        for filename in sorted(os.listdir(unfiltered_directory))
        if filename.endswith('.xml')
    ]

    # Maps an outcome label ('working', 'stereo_fail', or the class name of any
    # other exception) to the list of file paths with that outcome.
    sorted_dois = defaultdict(list)
    for doi in track(xml_files, description='Filtering DOIs...'):
        try:
            # Only success/failure matters here; the loaded data is discarded.
            # NOTE(review): _from_file is a private API of ThermoMLDataSet.
            ThermoMLDataSet._from_file(doi)
        except UndefinedStereochemistryError:
            sorted_dois['stereo_fail'].append(doi)
        except Exception as other_exc:
            sorted_dois[other_exc.__class__.__name__].append(doi)
        else:
            sorted_dois['working'].append(doi)

    # Touch both standard buckets before iterating so the defaultdict is not
    # mutated while it is being traversed below.
    working_files = sorted_dois['working']
    stereo_failures = sorted_dois['stereo_fail']
    n_files = len(xml_files)

    print('Amount of failing files: %i/%i' % (len(stereo_failures), n_files))
    # Failures other than undefined stereochemistry used to go unreported,
    # which made the two counts silently not add up to the total.
    for outcome, paths in sorted_dois.items():
        if outcome not in ('working', 'stereo_fail'):
            print('Amount of files failing with %s: %i/%i' % (outcome, len(paths), n_files))
    print('Amount of working files: %i/%i' % (len(working_files), n_files))

    return working_files

In [8]:
def extract_database(database_directory, csv_cached):
    """Load a ThermoML data set, using a CSV file as an on-disk cache.

    If ``csv_cached`` exists it is read with pandas and converted back into a
    data set. Otherwise the loadable files in ``database_directory`` are found
    with ``filter_database``, loaded, and the result is written to
    ``csv_cached`` for later calls.

    Parameters
    ----------
    database_directory : str or path-like
        Directory handed to ``filter_database`` when no cache file exists.
    csv_cached : str or path-like
        Location of the CSV cache file.

    Returns
    -------
    The data set built from the cache, or from the source files when no cache
    exists.

    Notes
    -----
    NOTE(review): rows without a temperature or pressure are dropped only when
    reading from the cache, so the first (uncached) call may return rows that
    later (cached) calls do not.
    """
    cache_path = Path(csv_cached)

    if cache_path.exists():
        prop_df = pd.read_csv(cache_path, index_col=0)
        # Drop rows with undefined thermodynamic parameters to avoid indexing
        # errors when rebuilding the data set.
        prop_df = prop_df.dropna(subset=['Temperature (K)', 'Pressure (kPa)'])
        return ThermoMLDataSet.from_pandas(prop_df)

    sorted_files = filter_database(database_directory)
    data_set = ThermoMLDataSet.from_file(*sorted_files)

    # Hand the path straight to pandas. The previous version opened the file
    # itself and then ignored that handle, so the file was opened twice.
    data_set.to_pandas().to_csv(cache_path)
    return data_set

In [9]:
# Directory containing the ThermoML .xml files to load.
# Alternative location, currently disabled:
# database_directory = 'ThermoML.v2020-09-30'
database_directory = 'tests'
# Load the data set, reading from / writing to the CSV cache file.
tml_data_set = extract_database(
    database_directory=database_directory,
    csv_cached='tml_database.csv',
)

Output()

Amount of failing files: 1/6
Amount of working files: 5/6


In [9]:
# Collect the path of every .xml file found directly inside database_directory.
data_set = []
for filename in os.listdir(database_directory):
    # Skip anything that is not an .xml file
    if not filename.endswith('.xml'):
        continue
    # Prefix the directory so the entry is usable as a path
    file_path = os.path.join(database_directory, filename)
    data_set.append(file_path)

print(data_set)

['tests/acs.jced.5b00005.xml', 'tests/acs.jced.5b00021.xml', 'tests/acs.jced.5b00018.xml', 'tests/acs.jced.5b00007.xml', 'tests/acs.jced.5b00009.xml', 'tests/acs.jced.5b00011.xml']


In [18]:
# Try to load every collected file and bucket its path by outcome: 'working',
# 'stereo_fail', or the class name of any other exception that was raised.
sorted_dois = defaultdict(list)
for doi in track(data_set, description='Filtering DOIs...'):
    try:
        dataset = ThermoMLDataSet._from_file(doi)
        sorted_dois['working'].append(doi)
    except UndefinedStereochemistryError:
        sorted_dois['stereo_fail'].append(doi)
    except Exception as other_exc:
        sorted_dois[other_exc.__class__.__name__].append(doi)

print('Amount of failing files: %i/%i' % (len(sorted_dois['stereo_fail']),len(data_set)))
# Failures other than undefined stereochemistry were previously bucketed but
# never reported, so the printed counts could silently not add up to the total.
# Iterate over a snapshot so the defaultdict cannot change size mid-loop.
for failure_name, failed_paths in list(sorted_dois.items()):
    if failure_name not in ('working', 'stereo_fail'):
        print('Amount of files failing with %s: %i/%i' % (failure_name, len(failed_paths), len(data_set)))
print('Amount of working files: %i/%i' % (len(sorted_dois['working']),len(data_set)))

Output()

Amount of failing files: 1/6
Amount of working files: 5/6


In [19]:
# Build a single data set from the files that loaded successfully above.
# Note: this rebinds tml_data_set, replacing the value returned earlier by
# extract_database.
tml_data_set=ThermoMLDataSet.from_file(*sorted_dois['working'])

In [20]:
# Show the data set's summary repr (property, substance and source counts).
tml_data_set

<PhysicalPropertyDataSet n_properties=537 n_substances=215 n_sources=2>

In [21]:
# View the data set as a pandas DataFrame for inspection.
tml_data_set.to_pandas()

Unnamed: 0,Id,Temperature (K),Pressure (kPa),Phase,N Components,Component 1,Role 1,Mole Fraction 1,Exact Amount 1,Component 2,Role 2,Mole Fraction 2,Exact Amount 2,Component 3,Role 3,Mole Fraction 3,Exact Amount 3,Density Value (g / ml),Density Uncertainty (g / ml),Source
0,c53292c345a2430b94b37c836fa44e1e,278.15,101.0,Liquid,3,O.O.O=S(=O)([O-])[O-].[Ca+2],Solvent,0.000360,,[Cl-].[Na+],Solvent,0.001798,,O,Solvent,0.997843,,1.006000,0.001500,10.1021/acs.jced.5b00005
1,f1a8f964d7944e1d9466595e7f549816,278.15,101.0,Liquid,3,O.O.O=S(=O)([O-])[O-].[Ca+2],Solvent,0.000454,,[Cl-].[Na+],Solvent,0.004499,,O,Solvent,0.995047,,1.014000,0.001500,10.1021/acs.jced.5b00005
2,b3cd061edd0f4d8b87a908cca61647c9,278.15,101.0,Liquid,3,O.O.O=S(=O)([O-])[O-].[Ca+2],Solvent,0.000591,,[Cl-].[Na+],Solvent,0.008975,,O,Solvent,0.990434,,1.028000,0.001500,10.1021/acs.jced.5b00005
3,fb9b6f24eb914030855231a8d81b199b,278.15,101.0,Liquid,3,O.O.O=S(=O)([O-])[O-].[Ca+2],Solvent,0.000730,,[Cl-].[Na+],Solvent,0.017961,,O,Solvent,0.981308,,1.048000,0.001000,10.1021/acs.jced.5b00005
4,abbc10af581f47a080f51323c6b51bb0,278.15,101.0,Liquid,3,O.O.O=S(=O)([O-])[O-].[Ca+2],Solvent,0.000809,,[Cl-].[Na+],Solvent,0.027109,,O,Solvent,0.972082,,1.063000,0.001000,10.1021/acs.jced.5b00005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532,b4a13d50731d482ba4484f14e0478793,323.15,101.0,Liquid,2,CCCC#N,Solvent,0.600000,,c1ccccc1,Solvent,0.400000,,,,,,0.798661,0.000063,10.1021/acs.jced.5b00009
533,454af844c84241c6af68f1a23b093fe6,323.15,101.0,Liquid,2,CCCC#N,Solvent,0.700000,,c1ccccc1,Solvent,0.300000,,,,,,0.789869,0.000065,10.1021/acs.jced.5b00009
534,5f26e10af9c04ee18ae1ac97813ae7bc,323.15,101.0,Liquid,2,CCCC#N,Solvent,0.800000,,c1ccccc1,Solvent,0.200000,,,,,,0.780977,0.000066,10.1021/acs.jced.5b00009
535,58c042e1e44d4d73aa95c21b58beeb19,323.15,101.0,Liquid,2,CCCC#N,Solvent,0.900000,,c1ccccc1,Solvent,0.100000,,,,,,0.771989,0.000068,10.1021/acs.jced.5b00009
