# Core Imports and Setup

In [1]:
import os
import pandas as pd
from pathlib import Path

# Suppress every Python warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides warnings that may matter;
# consider narrowing it to specific categories or modules.
import warnings
warnings.filterwarnings("ignore")

import logging
# Only ERROR-and-above records from the "openff.toolkit" logger are let through.
logging.getLogger("openff.toolkit").setLevel(logging.ERROR)
# Root logger at DEBUG: every other library's debug records are emitted too
# (the recorded output of this cell shows pint and matplotlib DEBUG lines).
# NOTE(review): INFO would probably be enough for the curation messages used
# further down — confirm before changing.
logging.basicConfig(level=logging.DEBUG)

import json
from openff import toolkit, evaluator

DEBUG:pint.util:Changing app registry from <pint.registry.LazyRegistry object at 0x7fb093317e80> to <openff.units.units.UnitRegistry object at 0x7fb09314a950>.
INFO:rdkit:Enabling RDKit 2023.09.4 jupyter extensions
DEBUG:matplotlib:matplotlib data path: /home/bamo6610/miniconda3/envs/evaluator-openff/lib/python3.10/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/home/bamo6610/.config/matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is linux


# 0) Registering Custom ThermoML Properties

In [2]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase
from openff.evaluator.datasets.thermoml import thermoml_property
from openff.evaluator import properties
from openff.units import unit
from openff.evaluator.datasets.thermoml import ThermoMLDataSet

@thermoml_property("Osmotic coefficient", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class OsmoticCoefficient(PhysicalProperty):
    """Osmotic coefficient of a mixture, as a dimensionless physical property.

    The ``thermoml_property`` decorator registers this class under the
    ThermoML property name "Osmotic coefficient", declaring the liquid and
    gas phases as supported.

    NOTE(review): declaring gas-phase support for an osmotic coefficient
    looks unintended — confirm whether ``PropertyPhase.Liquid`` alone is meant.
    """

    @classmethod
    def default_unit(cls):
        """Return the default unit for this property: ``unit.dimensionless``."""
        return unit.dimensionless

# Attach the class to the `openff.evaluator.properties` module under its own
# name. Presumably this lets evaluator code that looks property classes up by
# name in that module find the custom type — TODO confirm.
setattr(properties, OsmoticCoefficient.__name__, OsmoticCoefficient)

In [3]:
ThermoMLDataSet.registered_properties['Osmotic coefficient'].conversion_function

functools.partial(<function _default_mapping at 0x7fb092602d40>, <class '__main__.OsmoticCoefficient'>)

# 1) Loading ThermoML Data Sets

## Extracting data from ThermoML

In [4]:
from openff.evaluator.datasets.thermoml import ThermoMLDataSet

# CSV cache of the extracted ThermoML properties, so the DOIs are only
# fetched when no cached copy exists.
CACHED_PROP_PATH = Path('osmotic_data.csv')

if CACHED_PROP_PATH.exists():
    # Cache hit: rebuild the data set from the CSV written by an earlier run.
    prop_df = pd.read_csv(CACHED_PROP_PATH, index_col=0)
    # Delete rows with undefined thermodynamic state (temperature or pressure)
    # to avoid indexing errors when converting back to a data set.
    prop_df = prop_df.dropna(subset=['Temperature (K)', 'Pressure (kPa)'])
    data_set = ThermoMLDataSet.from_pandas(prop_df)
else:
    # Cache miss: read the list of usable DOIs, then fetch them from ThermoML.
    with open('sorted_dois.json') as f:
        doi_dat = json.load(f)
    # The JSON file is closed before the fetch; only the parsed dict is needed.
    data_set = ThermoMLDataSet.from_doi(*doi_dat['working'])

    prop_df = data_set.to_pandas()
    # to_csv opens and closes the target path itself; no separate handle needed.
    prop_df.to_csv(CACHED_PROP_PATH)
    # NOTE(review): on this branch rows with missing temperature/pressure are
    # NOT dropped, so `data_set` / `prop_df` can differ between the first run
    # and later cached runs — confirm whether the same filter should apply here.


In [5]:
prop_df

Unnamed: 0,Id,Temperature (K),Pressure (kPa),Phase,N Components,Component 1,Role 1,Mole Fraction 1,Exact Amount 1,Component 2,...,Exact Amount 2,Component 3,Role 3,Mole Fraction 3,Exact Amount 3,OsmoticCoefficient Value (),OsmoticCoefficient Uncertainty (),Density Value (g / ml),Density Uncertainty (g / ml),Source
0,cddc9485a39c414a90b3a1b4e4219af0,298.15,101.0,Liquid,3,NCC(=O)O,Solvent,0.008848,,[Cl-].[Na+],...,,O,Solvent,0.982303,,0.910,0.0055,,,10.1016/j.jct.2013.08.018
1,7a612b0d9b7649b5917ed0a414946bb0,298.15,101.0,Liquid,3,NCC(=O)O,Solvent,0.017541,,[Cl-].[Na+],...,,O,Solvent,0.973688,,0.906,0.0055,,,10.1016/j.jct.2013.08.018
2,50bb0c2a55f847c99d3ece35f41de236,298.15,101.0,Liquid,3,NCC(=O)O,Solvent,0.026083,,[Cl-].[Na+],...,,O,Solvent,0.965222,,0.897,0.0055,,,10.1016/j.jct.2013.08.018
3,a59ee6df2abd46c1b290634a0ab20866,298.15,101.0,Liquid,3,NCC(=O)O,Solvent,0.034478,,[Cl-].[Na+],...,,O,Solvent,0.956903,,0.891,0.0055,,,10.1016/j.jct.2013.08.018
4,790f5a6318a14d109c5ae0fa8d48ce6b,298.15,101.0,Liquid,3,NCC(=O)O,Solvent,0.003527,,[Cl-].[Na+],...,,O,Solvent,0.978839,,0.949,0.0050,,,10.1016/j.jct.2013.08.018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5309,61b6618b9cf14bb4b834bdbc8781b778,298.15,100.0,Liquid,2,N[C@@H](CO)C(=O)O,Solvent,0.050150,,O,...,,,,,,0.855,0.0055,,,10.1016/j.jct.2014.11.014
5310,f0c41da8c27b4c8fb3dee9d80d6b92e7,298.15,100.0,Liquid,2,N[C@@H](CO)C(=O)O,Solvent,0.053226,,O,...,,,,,,0.875,0.0055,,,10.1016/j.jct.2014.11.014
5311,2aa873ad04d0463c94e30ebbc67b768a,298.15,100.0,Liquid,2,N[C@@H](CO)C(=O)O,Solvent,0.054518,,O,...,,,,,,0.872,0.0055,,,10.1016/j.jct.2014.11.014
5312,1616e00c3f9d408e839e3103487605f9,298.15,100.0,Liquid,2,N[C@@H](CO)C(=O)O,Solvent,0.058436,,O,...,,,,,,0.861,0.0055,,,10.1016/j.jct.2014.11.014


## Filtering data set

In [6]:
from openff.evaluator.datasets.curation.components.filtering import FilterByPropertyTypes, FilterByPropertyTypesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByTemperature, FilterByTemperatureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByPressure, FilterByPressureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterBySmiles, FilterBySmilesSchema

In [7]:
# Keep only the data points whose property type is "OsmoticCoefficient"
# (the custom property class defined in this notebook).
# To select densities instead, use property_types=["Density"].
# NOTE(review): check the FilterByPropertyTypesSchema documentation for the
# exact effect of strict=False before relying on it.
schema = FilterByPropertyTypesSchema(property_types=["OsmoticCoefficient"], strict=False)

# Apply the filter to the loaded data set and bind the result to data_set_osmotic.
data_set_osmotic = FilterByPropertyTypes.apply(data_set, schema)

# Number of data points left after filtering.
print(len(data_set_osmotic))

INFO:openff.evaluator.datasets.curation.components.components:606 data points were removed after applying the FilterByPropertyTypes component.


2784


In [12]:
df_osmotic=data_set_osmotic.to_pandas()

In [14]:
df_osmotic.head()

Unnamed: 0,Id,Temperature (K),Pressure (kPa),Phase,N Components,Component 1,Role 1,Mole Fraction 1,Exact Amount 1,Component 2,Role 2,Mole Fraction 2,Exact Amount 2,Component 3,Role 3,Mole Fraction 3,Exact Amount 3,OsmoticCoefficient Value (),OsmoticCoefficient Uncertainty (),Source
0,cddc9485a39c414a90b3a1b4e4219af0,298.15,101.0,Liquid,3,NCC(=O)O,Solvent,0.008848,,[Cl-].[Na+],Solvent,0.008848,,O,Solvent,0.982303,,0.91,0.0055,10.1016/j.jct.2013.08.018
1,7a612b0d9b7649b5917ed0a414946bb0,298.15,101.0,Liquid,3,NCC(=O)O,Solvent,0.017541,,[Cl-].[Na+],Solvent,0.008771,,O,Solvent,0.973688,,0.906,0.0055,10.1016/j.jct.2013.08.018
2,50bb0c2a55f847c99d3ece35f41de236,298.15,101.0,Liquid,3,NCC(=O)O,Solvent,0.026083,,[Cl-].[Na+],Solvent,0.008694,,O,Solvent,0.965222,,0.897,0.0055,10.1016/j.jct.2013.08.018
3,a59ee6df2abd46c1b290634a0ab20866,298.15,101.0,Liquid,3,NCC(=O)O,Solvent,0.034478,,[Cl-].[Na+],Solvent,0.008619,,O,Solvent,0.956903,,0.891,0.0055,10.1016/j.jct.2013.08.018
4,790f5a6318a14d109c5ae0fa8d48ce6b,298.15,101.0,Liquid,3,NCC(=O)O,Solvent,0.003527,,[Cl-].[Na+],Solvent,0.017634,,O,Solvent,0.978839,,0.949,0.005,10.1016/j.jct.2013.08.018


In [13]:
df_osmotic.to_csv('osmotic_database.csv',index=False)

In [21]:
df_osmotic.columns.to_list()

['Id',
 'Temperature (K)',
 'Pressure (kPa)',
 'Phase',
 'N Components',
 'Component 1',
 'Role 1',
 'Mole Fraction 1',
 'Exact Amount 1',
 'Component 2',
 'Role 2',
 'Mole Fraction 2',
 'Exact Amount 2',
 'Component 3',
 'Role 3',
 'Mole Fraction 3',
 'Exact Amount 3',
 'OsmoticCoefficient Value ()',
 'OsmoticCoefficient Uncertainty ()',
 'Source']

In [26]:
df_osmotic[['N Components','Component 1']]

Unnamed: 0,N Components,Component 1
0,3,NCC(=O)O
1,3,NCC(=O)O
2,3,NCC(=O)O
3,3,NCC(=O)O
4,3,NCC(=O)O
...,...,...
2779,2,N[C@@H](CO)C(=O)O
2780,2,N[C@@H](CO)C(=O)O
2781,2,N[C@@H](CO)C(=O)O
2782,2,N[C@@H](CO)C(=O)O


In [56]:
# Select binary mixtures measured between 298 K and 300 K (exclusive) at
# 101 kPa whose second component is 'O' (the SMILES for water).
temperature_k = df_osmotic['Temperature (K)']
in_temperature_window = (temperature_k > 298.0) & (temperature_k < 300.0)
# NOTE(review): exact equality on a float column — values such as 100.0 or
# 101.325 kPa are excluded; confirm this is intended.
at_101_kpa = df_osmotic['Pressure (kPa)'] == 101
is_binary = df_osmotic['N Components'] == 2
water_is_component_2 = df_osmotic['Component 2'] == 'O'

df_select = df_osmotic[
    in_temperature_window & at_101_kpa & is_binary & water_is_component_2
]

In [57]:
print(df_select)

                                    Id  Temperature (K)  Pressure (kPa)  \
8     1690923bb58e473aa7ae28cca55e9516           298.15           101.0   
21    6c8b1a072a12461bb44dc735b30d12ed           298.15           101.0   
35    36504db66e8d417094240caa9ff34414           298.15           101.0   
44    73a36cdb4d8c494e97c2b2ca7442b953           298.15           101.0   
48    6f51277e358c474f98026d7d8a2d062d           298.15           101.0   
...                                ...              ...             ...   
2647  73fb5f74bbdd4504ae9da986ce050e69           298.15           101.0   
2648  6bbbb54020f94e03b2c4112d55c74954           298.15           101.0   
2649  560e52bc823b4253a40bf828a0955958           298.15           101.0   
2650  bc436df25a3a4043a3531358ee50d868           298.15           101.0   
2651  22fe6dc3b25a428584fe3da5c5a8478d           298.15           101.0   

       Phase  N Components  Component 1   Role 1  Mole Fraction 1  \
8     Liquid             2  [C

In [58]:
# Write the selection to CSV, keeping only the identifier, state point,
# the two components with their roles and mole fractions, the osmotic
# coefficient with its uncertainty, and the source. The row index is not written.
df_select.to_csv(
    'osm_selection.csv',
    columns=[
        'Id',
        'Temperature (K)',
        'Pressure (kPa)',
        'N Components',
        'Component 1',
        'Role 1',
        'Mole Fraction 1',
        'Component 2',
        'Role 2',
        'Mole Fraction 2',
        'OsmoticCoefficient Value ()',
        'OsmoticCoefficient Uncertainty ()',
        'Source',
    ],
    index=False,
)

In [66]:
# Split the selection by whether Component 1's SMILES carries a charge sign.
# A literal '+' or '-' anywhere in the string marks the solute as ionic.
# NOTE(review): '-' can also denote an explicit single bond in SMILES, so a
# neutral molecule written that way would be classed as ionic — confirm the
# data contains no such strings.
component_1_smiles = df_select['Component 1']
is_ionic_solute = (
    component_1_smiles.str.contains('+', regex=False)
    | component_1_smiles.str.contains('-', regex=False)
)

df_select_ions = df_select[is_ionic_solute]
# Exact complement of the ionic mask, so the two frames partition df_select.
# (The earlier `~has_plus | ~has_minus` form equals `~(has_plus & has_minus)`
# and put rows containing only one of the two signs into both frames.)
df_select_nonions = df_select[~is_ionic_solute]

In [67]:
df_select_ions.shape

(314, 20)

In [68]:
df_select_nonions.shape

(71, 20)

In [65]:
# Write the ionic-solute subset to CSV with the same reduced column set used
# for the full selection: identifier, state point, both components with roles
# and mole fractions, osmotic coefficient with uncertainty, and source.
# The row index is not written.
df_select_ions.to_csv(
    'osm_selection_ions.csv',
    columns=[
        'Id',
        'Temperature (K)',
        'Pressure (kPa)',
        'N Components',
        'Component 1',
        'Role 1',
        'Mole Fraction 1',
        'Component 2',
        'Role 2',
        'Mole Fraction 2',
        'OsmoticCoefficient Value ()',
        'OsmoticCoefficient Uncertainty ()',
        'Source',
    ],
    index=False,
)

In [69]:
# Write the non-ionic-solute subset to CSV with the same reduced column set
# used for the full selection: identifier, state point, both components with
# roles and mole fractions, osmotic coefficient with uncertainty, and source.
# The row index is not written.
df_select_nonions.to_csv(
    'osm_selection_nonions.csv',
    columns=[
        'Id',
        'Temperature (K)',
        'Pressure (kPa)',
        'N Components',
        'Component 1',
        'Role 1',
        'Mole Fraction 1',
        'Component 2',
        'Role 2',
        'Mole Fraction 2',
        'OsmoticCoefficient Value ()',
        'OsmoticCoefficient Uncertainty ()',
        'Source',
    ],
    index=False,
)

In [39]:
# df_select.to_csv('osm_selection.csv', index=False)

In [None]:
# pandas_data_set = data_set.to_pandas()
# pandas_data_set[
#     [
#         "Temperature (K)",
#         "Pressure (kPa)",
#         "Component 1",
#         "OsmoticCoefficient Value ()",
#         "Source",
#     ]
# ].head()

In [9]:

# # Property
# data_set = FilterByPropertyTypes.apply(
#     data_set, FilterByPropertyTypesSchema(property_types=["Density"])
# )

# # Temperature
# data_set = FilterByTemperature.apply(
#     data_set, FilterByTemperatureSchema(minimum_temperature=298.0, maximum_temperature=330.0)
# )

# # Pressure
# data_set = FilterByPressure.apply(
#     data_set, FilterByPressureSchema(minimum_pressure=100.0, maximum_pressure=105.426)
# )

# # Solvent
# data_set = FilterBySmiles.apply(
#     data_set, FilterBySmilesSchema(smiles_to_include=["CCO", "CC(C)O"])
# )

# print(len(data_set))