# Core Imports and Setup

In [1]:
import os
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

import logging
logging.getLogger("openff.toolkit").setLevel(logging.ERROR)

from openff import toolkit, evaluator

import pandas as pd
import json

In [2]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase
from openff.evaluator.datasets.thermoml import thermoml_property
from openff.evaluator import properties
from openff.units import unit

In [3]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase
from openff.evaluator.datasets.thermoml import thermoml_property
from openff.units import unit
from openff.evaluator.datasets.thermoml import ThermoMLDataSet
from openff.evaluator.datasets.thermoml.thermoml import _Compound
from openff.toolkit import Molecule
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

# 0) Registering Custom ThermoML Properties

In [4]:
@thermoml_property("Osmotic coefficient", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class OsmoticCoefficient(PhysicalProperty):
    """A class representation of a osmotic coeff property"""

    @classmethod
    def default_unit(cls):
        return unit.dimensionless
    
setattr(properties, OsmoticCoefficient.__name__, OsmoticCoefficient)

# 1) - Loading ThermoML Data Sets

## Extracting data from ThermoML

In [5]:
mol = Molecule.from_iupac('1,4-benzenediol')

In [6]:
# smi = 'C([C@@H]1[C@@H]2[C@@H]([C@H]([C@H](O1)O[C@@H]3[C@H](O[C@@H]([C@@H]([C@H]3O)O)O[C@@H]4[C@H](O[C@@H]([C@@H]([C@H]4O)O)O[C@@H]5[C@H](O[C@@H]([C@@H]([C@H]5O)O)O[C@@H]6[C@H](O[C@@H]([C@@H]([C@H]6O)O)O[C@@H]7[C@H](O[C@H](O2)[C@@H]([C@H]7O)O)CO)CO)CO)CO)CO)O)O)O'
# Chem.MolFromSmiles(smi)

In [7]:
# ds = ThermoMLDataSet.from_doi('10.1021/je800307g')

In [8]:
# ds.substances

In [9]:
import logging
logging.basicConfig(level=logging.DEBUG)

with open('sorted_dois.json') as f:
    doi_dat = json.load(f)
    data_set = ThermoMLDataSet.from_doi(*doi_dat['working'])

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): trc.nist.gov:443
DEBUG:urllib3.connectionpool:https://trc.nist.gov:443 "GET /ThermoML/10.1016/j.jct.2013.08.018.xml HTTP/1.1" 200 16953
DEBUG:charset_normalizer:Encoding detection: utf_8 is most likely the one.


DEBUG:charset_normalizer:Encoding detection: utf_8 is most likely the one.
DEBUG:root:An unsupported property was found (Molality, mol/kg) and will be skipped.
DEBUG:root:An unsupported property was found (Molality, mol/kg) and will be skipped.
DEBUG:root:An unsupported property was found (Molality, mol/kg) and will be skipped.
DEBUG:root:An unsupported property was found (Molality, mol/kg) and will be skipped.
DEBUG:root:An unsupported property was found (Molality, mol/kg) and will be skipped.
DEBUG:root:An unsupported property was found (Molality, mol/kg) and will be skipped.
DEBUG:root:An unsupported property was found (Molality, mol/kg) and will be skipped.
DEBUG:root:An unsupported property was found (Molality, mol/kg) and will be skipped.
DEBUG:root:An unsupported property was found (Molality, mol/kg) and will be skipped.
DEBUG:root:An unsupported property was found (Molality, mol/kg) and will be skipped.
DEBUG:root:An unsupported property was found (Molality, mol/kg) and will be

In [17]:
len(data_set), data_set.property_types

(16616, {'Density', 'OsmoticCoefficient'})

In [18]:
from openff.evaluator.datasets.curation.components.filtering import FilterByPropertyTypes, FilterByPropertyTypesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByNComponents, FilterByNComponentsSchema

data_set = FilterByNComponents.apply(
    data_set, FilterByNComponentsSchema(n_components=[2])
)

print(len(data_set))

INFO:openff.evaluator.datasets.curation.components.components:15616 data points were removed after applying the FilterByNComponents component.


1000


In [19]:
df = data_set.to_pandas()
df.to_csv("filtered_data_set_1.csv")

In [13]:
# liqgas = df[df['Phase'] == 'Liquid + Gas']
# liqgas

In [14]:
# df['Component 2'].unique()

In [15]:
# Chem.MolFromSmiles('CC(=O)[O-].[K+]')

## Filtering data set

In [16]:
from openff.evaluator.datasets.curation.components.filtering import FilterByPropertyTypes, FilterByPropertyTypesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByTemperature, FilterByTemperatureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByPressure, FilterByPressureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterBySmiles, FilterBySmilesSchema

# Property
data_set = FilterByPropertyTypes.apply(
    data_set, FilterByPropertyTypesSchema(property_types=["OsmoticCoefficient"])
)
print(f"There are now {len(data_set)} properties after filtering")

INFO:openff.evaluator.datasets.curation.components.components:2424 data points were removed after applying the FilterByPropertyTypes component.


KeyError: 'Pressure (kPa)'

In [None]:
data_set.to_pandas()

In [None]:
from openff.evaluator.datasets.curation.components.filtering import FilterByPropertyTypes, FilterByPropertyTypesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByTemperature, FilterByTemperatureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByPressure, FilterByPressureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterBySmiles, FilterBySmilesSchema

# # Property
# data_set = FilterByPropertyTypes.apply(
#     data_set, FilterByPropertyTypesSchema(property_types=["Density"])
# )

# Temperature
data_set = FilterByTemperature.apply(
    data_set, FilterByTemperatureSchema(minimum_temperature=295.0, maximum_temperature=305.0)
)

# Pressure
data_set = FilterByPressure.apply(
    data_set, FilterByPressureSchema(minimum_pressure=100.0, maximum_pressure=102.0)
)

# # SMILES
# data_set = FilterBySmiles.apply(
#     data_set, FilterBySmilesSchema(smiles_to_include=["[Na+].[Cl-]"])
# )

print(len(data_set))

KeyError: 'Temperature (K)'

In [None]:
pandas_data_set = data_set.to_pandas()
pandas_data_set[
    [
        "Temperature (K)",
        "Pressure (kPa)",
        "Component 1",
        "OsmoticCoefficient Value ()",
        "Source",
    ]
].head()

In [None]:
pandas_data_set.to_csv("filtered_data_set.csv")
#pandas_data_set.to_excel("filtered_data_set.csv")