# Core Imports and Setup

In [1]:
import os
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

import logging
logging.getLogger("openff.toolkit").setLevel(logging.ERROR)

import pint
from pint import measurement
from openff import evaluator, toolkit

# from openff.units import unit

# 1) - Loading ThermoML Data Sets

## Extracting data from ThermoML

In [2]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase, PhysicalPropertyDataSet
from openff.evaluator.datasets.thermoml import thermoml_property, ThermoMLDataSet

data_set_initial = PhysicalPropertyDataSet.from_json("freesolv.json")
data_set_initial.to_pandas()

Unnamed: 0,Id,Temperature (K),Pressure (kPa),Phase,N Components,Component 1,Role 1,Mole Fraction 1,Exact Amount 1,Component 2,Role 2,Mole Fraction 2,Exact Amount 2,SolvationFreeEnergy Value (kJ / mol),SolvationFreeEnergy Uncertainty (kJ / mol),Source
0,265c7378309c4355aa01d60af70d3fca,298.15,101.325,Liquid,2,O,Solvent,1.0,,CCCCCC(=O)OC,Solute,,1,-10.41816,2.5104,10.1021/ct050097l
1,f89c43a7ef684aca89df924b29c487b1,298.15,101.325,Liquid,2,O,Solvent,1.0,,CCCCO,Solute,,1,-19.74848,2.5104,10.1021/ct050097l
2,198b886769694cc39828af5af498fd24,298.15,101.325,Liquid,2,O,Solvent,1.0,,Clc1ccc(-c2cc(Cl)c(Cl)c(Cl)c2Cl)cc1Cl,Solute,,1,-12.71936,0.4184,10.1007/s10822-012-9568-8
3,6cdf0d7abd354d8284ffe115d4c0ef33,298.15,101.325,Liquid,2,O,Solvent,1.0,,NC1CCCCC1,Solute,,1,-19.20456,2.5104,10.1021/ct050097l
4,e3e4778832a240eba754b5d4d010d21d,298.15,101.325,Liquid,2,O,Solvent,1.0,,O=COc1ccccc1,Solute,,1,-15.98288,2.5104,10.5281/zenodo.596537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637,0821476bf69143c4bd07f6e0d7f8e584,298.15,101.325,Liquid,2,O,Solvent,1.0,,Cl/C=C/Cl,Solute,,1,-3.26352,2.5104,10.1021/ct050097l
638,2e03601fd70142dd8092cc1eceb17b83,298.15,101.325,Liquid,2,O,Solvent,1.0,,CCc1ccc(C)cc1,Solute,,1,-3.97480,2.5104,10.1021/ct050097l
639,4ceca25966d64d3c88b9380829e7fa86,298.15,101.325,Liquid,2,O,Solvent,1.0,,CCBr,Solute,,1,-3.09616,2.5104,10.1021/ct050097l
640,bca16f5d94f54cf987903bcec2f9662b,298.15,101.325,Liquid,2,O,Solvent,1.0,,CC(C)SC(C)C,Solute,,1,-5.06264,2.5104,10.1021/ct050097l


In [3]:
print(len(data_set_initial))
print(data_set_initial.property_types)
print(data_set_initial.substances)

642
{'SolvationFreeEnergy'}
{<Substance Clc1cc(-c2c(Cl)cc(Cl)c(Cl)c2Cl)c(Cl)c(Cl)c1Cl{sol}{n=1}|O{solv}{x=1.000000}>, <Substance FC(F)(F)[C@H](Cl)Br{sol}{n=1}|O{solv}{x=1.000000}>, <Substance CCCC1CCCC1{sol}{n=1}|O{solv}{x=1.000000}>, <Substance CCC(C)(C)CC{sol}{n=1}|O{solv}{x=1.000000}>, <Substance FC(F)(F)CCl{sol}{n=1}|O{solv}{x=1.000000}>, <Substance Oc1ccccc1{sol}{n=1}|O{solv}{x=1.000000}>, <Substance CCC(=O)CC{sol}{n=1}|O{solv}{x=1.000000}>, <Substance CC(C)Cc1cnccn1{sol}{n=1}|O{solv}{x=1.000000}>, <Substance C=C{sol}{n=1}|O{solv}{x=1.000000}>, <Substance CC(C)COC(=O)C(C)C{sol}{n=1}|O{solv}{x=1.000000}>, <Substance CC(C)(C)C{sol}{n=1}|O{solv}{x=1.000000}>, <Substance CCc1cccnc1{sol}{n=1}|O{solv}{x=1.000000}>, <Substance CCOP(=S)(OCC)Oc1cc(C)nc(C(C)C)n1{sol}{n=1}|O{solv}{x=1.000000}>, <Substance Cc1cccc2ccccc12{sol}{n=1}|O{solv}{x=1.000000}>, <Substance CCOc1ccccc1{sol}{n=1}|O{solv}{x=1.000000}>, <Substance COc1ccccc1{sol}{n=1}|O{solv}{x=1.000000}>, <Substance Cc1ccnc(C)c1{sol}{n=1

## Filtering data set

In [4]:
from openff.evaluator.datasets.curation.components.filtering import FilterBySmiles, FilterBySmilesSchema

data_set_sfe= FilterBySmiles.apply(
    data_set_initial, FilterBySmilesSchema(smiles_to_include=['CCCCCC(=O)OC','O']))

print(len(data_set_sfe))

1


### Inspecting and saving new properties

In [5]:
# save for future use
data_set_path = Path('filtered_dataset_sfes.json')
data_set_sfe.json(data_set_path, format=True)

# inspect new properties
pandas_data_set = data_set_sfe.to_pandas()


In [6]:
pandas_data_set

Unnamed: 0,Id,Temperature (K),Pressure (kPa),Phase,N Components,Component 1,Role 1,Mole Fraction 1,Exact Amount 1,Component 2,Role 2,Mole Fraction 2,Exact Amount 2,SolvationFreeEnergy Value (kJ / mol),SolvationFreeEnergy Uncertainty (kJ / mol),Source
0,265c7378309c4355aa01d60af70d3fca,298.15,101.325,Liquid,2,O,Solvent,1.0,,CCCCCC(=O)OC,Solute,,1,-10.41816,2.5104,10.1021/ct050097l


# 2) Estimating Data Sets

### Loading data set and FF parameters

In [7]:
# load data
data_set_path = Path('filtered_dataset_sfes.json')
data_set = PhysicalPropertyDataSet.from_json(data_set_path)

In [8]:
from openff.toolkit.typing.engines.smirnoff import forcefield, ForceField
from openff.evaluator.forcefield import SmirnoffForceFieldSource

In [9]:
# load FF
ff_path = ForceField("openff-2.0.0.offxml")
force_field_source = SmirnoffForceFieldSource.from_object(ff_path)

In [10]:
# ff_path=('openff-2.0.0.offxml', 'opc.offxml')

In [11]:
# load FF
# # Evaluator wants to work with a JSON file for the force field
# force_field = ForceField('openff-2.0.0.offxml', 'opc.offxml')
# # with open("force-field.json", "w") as file:
# #     file.write(SmirnoffForceFieldSource.from_object(force_field).json())

# force_field_source = SmirnoffForceFieldSource.from_json("force-field.json")

### Defining Calculation Schemas

In [12]:
from openff.evaluator.properties import Density, EnthalpyOfMixing,SolvationFreeEnergy
from openff.evaluator.client import RequestOptions

# density_schema = Density.default_simulation_schema(n_molecules=256)
sfe_schema = SolvationFreeEnergy.default_simulation_schema(n_molecules=156)

# Create an options object which defines how the data set should be estimated.
estimation_options = RequestOptions()

# Specify that we only wish to use molecular simulation to estimate the data set.
estimation_options.calculation_layers = ["SimulationLayer"]

# Add our custom schemas, specifying that the should be used by the 'SimulationLayer'
estimation_options.add_schema("SimulationLayer", "SFE", sfe_schema)

## Launching a Server and Client

In [18]:
from openff.evaluator.backends import ComputeResources
from openff.evaluator.backends.dask import DaskLocalCluster
from openff.evaluator.server import EvaluatorServer
from openff.evaluator.client import EvaluatorClient
from openff.evaluator.client import ConnectionOptions

# define client to submit queries
port = 8119
evaluator_client = EvaluatorClient(ConnectionOptions(server_port=port))

# define available / preferred resources
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
resources = ComputeResources(
    number_of_threads=1,
    number_of_gpus=1,
    preferred_gpu_toolkit=ComputeResources.GPUToolkit.CUDA,
)

with DaskLocalCluster(number_of_workers=1, resources_per_worker=resources) as calculation_backend:
    # spin up server
    evaluator_server = EvaluatorServer(calculation_backend=calculation_backend, delete_working_files=False, port=port)
    evaluator_server.start(asynchronous=True)

    # estimate data set by submitting calculation schemas to newly-created server
    request, exception = evaluator_client.request_estimate(
        property_set=data_set,
        force_field_source=force_field_source,
        options=estimation_options,
    )

    # Wait for the results.
    results, exception = request.results(synchronous=True, polling_interval=30)
    assert exception is None

In [14]:
print(len(results.queued_properties))

print(len(results.estimated_properties))

print(len(results.unsuccessful_properties))
print(len(results.exceptions))

0
0
1
1


In [15]:
print(results.exceptions)

[WorkflowException(None)]


In [16]:
results.estimated_properties.json("estimated_dataset_hmix_dens.json", format=True)

'{\n  "@type": "openff.evaluator.datasets.datasets.PhysicalPropertyDataSet",\n  "properties": []\n}'

## 3) Analysing Data Sets

### Loading the Data Sets

In [17]:
experimental_data_set_path = "filtered_dataset_hmix.json"
estimated_data_set_path = "estimated_dataset_hmix.json"

experimental_data_set = PhysicalPropertyDataSet.from_json(experimental_data_set_path)
estimated_data_set = PhysicalPropertyDataSet.from_json(estimated_data_set_path)

FileNotFoundError: [Errno 2] No such file or directory: 'filtered_dataset_hmix.json'

In [None]:
experimental_data_set.to_pandas().head()

Unnamed: 0,Id,Temperature (K),Pressure (kPa),Phase,N Components,Component 1,Role 1,Mole Fraction 1,Exact Amount 1,Component 2,Role 2,Mole Fraction 2,Exact Amount 2,EnthalpyOfMixing Value (kJ / mol),EnthalpyOfMixing Uncertainty (kJ / mol),Source
0,6391,298.15,101.0,Liquid,2,CN(C)CCO,Solvent,0.2052,,O,Solvent,0.7948,,-2.587,,10.1016/j.jct.2007.03.010
1,6392,298.15,101.0,Liquid,2,CN(C)CCO,Solvent,0.5365,,O,Solvent,0.4635,,-2.575,,10.1016/j.jct.2007.03.010
2,6393,298.15,101.0,Liquid,2,CN(C)CCO,Solvent,0.7996,,O,Solvent,0.2004,,-1.247,,10.1016/j.jct.2007.03.010
3,6395,303.15,100.0,Liquid,2,CCN(CC)CCO,Solvent,0.2008,,O,Solvent,0.7992,,-2.185,,10.1016/j.jct.2015.04.030
4,6396,303.15,100.0,Liquid,2,CCN(CC)CCO,Solvent,0.5002,,O,Solvent,0.4998,,-2.504,,10.1016/j.jct.2015.04.030


In [None]:
estimated_data_set.to_pandas().head()