# Core Imports and Setup

In [1]:
import os
import pandas as pd
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

import logging
# logging.getLogger("openff.toolkit").setLevel(logging.ERROR)
# logging.basicConfig(level=logging.DEBUG)

import json
from openff import toolkit, evaluator

In [2]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw, MolFromSmiles
from rdkit.Chem.Draw import MolsToGridImage, rdMolDraw2D, MolsMatrixToGridImage
import pubchempy
from PIL import Image, ImageDraw, ImageFont
IPythonConsole.ipython_useSVG=False  #< set this to False if you want PNGs instead of SVGs

In [3]:
from rdkit.Chem import rdMolDescriptors
import pubchempy as pcp
import math

In [4]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase
from openff.evaluator.datasets.thermoml import thermoml_property
from openff.evaluator import properties
from openff.units import unit
from openff.evaluator.datasets.thermoml import ThermoMLDataSet
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase, PhysicalPropertyDataSet

In [5]:
from openff.evaluator.datasets.curation.components.filtering import FilterByPropertyTypes, FilterByPropertyTypesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByTemperature, FilterByTemperatureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByPressure, FilterByPressureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterBySmiles, FilterBySmilesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByNComponents, FilterByNComponentsSchema


In [6]:
from rich.progress import track
from collections import defaultdict
from openff.toolkit.utils.exceptions import UndefinedStereochemistryError

# Registering New Properties

In [7]:
@thermoml_property("Osmotic coefficient", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class OsmoticCoefficient(PhysicalProperty):
    """A class representation of a osmotic coeff property"""

    @classmethod
    def default_unit(cls):
        return unit.dimensionless
    
setattr(properties, OsmoticCoefficient.__name__, OsmoticCoefficient)

ThermoMLDataSet.registered_properties['Osmotic coefficient'].conversion_function

functools.partial(<function _default_mapping at 0x7efcbfb9ecb0>, <class '__main__.OsmoticCoefficient'>)

In [8]:
@thermoml_property("Activity coefficient", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class ActivityCoefficient(PhysicalProperty):
    """A class representation of a Activity coeff property"""

    @classmethod
    def default_unit(cls):
        return unit.dimensionless
    
setattr(properties, ActivityCoefficient.__name__, ActivityCoefficient)

ThermoMLDataSet.registered_properties['Activity coefficient'].conversion_function

functools.partial(<function _default_mapping at 0x7efcbfb9ecb0>, <class '__main__.ActivityCoefficient'>)

In [9]:
@thermoml_property("Speed Of Sound (m/s)", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class SpeedOfSound(PhysicalProperty):
    """A class representation of a Speed Of Sound property"""

    @classmethod
    def default_unit(cls):
        return unit.meter / unit.second
    
setattr(properties, SpeedOfSound.__name__, SpeedOfSound)

ThermoMLDataSet.registered_properties['Speed Of Sound (m/s)'].conversion_function

functools.partial(<function _default_mapping at 0x7efcbfb9ecb0>, <class '__main__.SpeedOfSound'>)

# Functions

In [10]:
def filter_database (unfiltered_directory):
    data_set=[]
    # Loop through all files in the directory
    for filename in os.listdir(unfiltered_directory):
        # Check if the file ends with .json
        if filename.endswith('.xml'):
            # Full path to the file
            file_path = os.path.join(unfiltered_directory, filename)
            data_set.append(file_path)
    
    sorted_dois = defaultdict(list)
    for doi in track(data_set, description='Filtering DOIs...'):
        try:
            dataset = ThermoMLDataSet._from_file(doi)
            sorted_dois['working'].append(doi)
        except UndefinedStereochemistryError:
            sorted_dois['stereo_fail'].append(doi)
        except Exception as other_exc:
            sorted_dois[other_exc.__class__.__name__].append(doi)

    print('Amount of failing files: %i/%i' % (len(sorted_dois['stereo_fail']),len(data_set)))
    print('Amount of working files: %i/%i' % (len(sorted_dois['working']),len(data_set)))

    return sorted_dois['working']

In [11]:
def extract_database (database_directory, csv_cached):
    CACHED_PROP_PATH = Path(csv_cached)

    if CACHED_PROP_PATH.exists():
        prop_df = pd.read_csv(CACHED_PROP_PATH, index_col=0)
        # delete rows with underfined thermo params to avoid pesky indexing errors
        prop_df = prop_df.dropna(subset=['Temperature (K)'])
        prop_df = prop_df.dropna(subset=['Pressure (kPa)'])
        data_set = ThermoMLDataSet.from_pandas(prop_df)
    else:
        sorted_files=filter_database(database_directory)
        data_set=ThermoMLDataSet.from_file(*sorted_files)

        initial_df = data_set.to_pandas()
        with CACHED_PROP_PATH.open('w') as file:
            initial_df.to_csv(CACHED_PROP_PATH)
    return data_set

# Import Directory

In [18]:
# Specify the directory containing the CSV files
csv_directory = './'  # Change to your directory path

# List of specific CSV files to read
csv_files_to_read = ['filtered_1007.csv', 'filtered_1016.csv', 'filtered_1021.csv']

# Create an empty dictionary to store DataFrames
dataframes_dict = {}

# Loop through the specified list of files
for filename in csv_files_to_read:
    # Generate the full path to the file
    file_path = os.path.join(csv_directory, filename)
    
    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    
    # Extract the last four numbers before the extension to use as the key
    key = filename.split('_')[-1].split('.')[0]
    dataframes_dict[key] = df

# Merge all DataFrames together without repeating data points
merged_dataframe = pd.concat(dataframes_dict.values()).drop_duplicates().reset_index(drop=True)

# Print the keys of the dictionary to verify
print("Loaded DataFrames:", dataframes_dict.keys())

# Print the merged DataFrame to verify
print("Merged DataFrame:")
print(merged_dataframe)

Loaded DataFrames: dict_keys(['1007', '1016', '1021'])
Merged DataFrame:
        Unnamed: 0                                Id  Temperature (K)  \
0               13  7616bb9f827b4003bf01c20e13f08c1b           298.15   
1               14  41426fd0c07044919d388155321abcac           298.15   
2               15  298aee969c294104aa5c397f73b81e77           298.15   
3               16  39ced9cf9c7543668f0e61db747ab6c9           298.15   
4               17  7aa18c4cd41a40ee80a6b86bf2e61297           298.15   
...            ...                               ...              ...   
158432      324961  837dfe6504134a1482d50cb206fadd5f           298.15   
158433      324962  e360156829d844a7824d971f470dbb08           298.15   
158434      324963  c3cd4aa0d7474690ac64db2a020b93fc           298.15   
158435      324964  db5ee64564e8455fa6b27826e7d61794           298.15   
158436      324965  7df62d582a7646e48d811f2d7602926f           298.15   

        Pressure (kPa)   Phase  N Components  Comp

In [20]:
merged_dataframe.to_csv(f'filtered_merged.csv')

In [None]:
filtered_dataset=ThermoMLDataSet.from_pandas(merged_dataframe)

# Counting with Evaluator

## Filtering properties schemas

In [None]:
hmix_schema = FilterByPropertyTypesSchema(property_types=["EnthalpyOfMixing"], strict=True)
dens_schema = FilterByPropertyTypesSchema(property_types=["Density"], strict=True)
hmix_dens_schema = FilterByPropertyTypesSchema(property_types=["EnthalpyOfMixing","Density"])
hmix_dens_strict_schema = FilterByPropertyTypesSchema(property_types=["EnthalpyOfMixing","Density"], strict=True)

In [None]:
osmotic_schema = FilterByPropertyTypesSchema(property_types=["OsmoticCoefficient"], strict=True)
activity_schema = FilterByPropertyTypesSchema(property_types=["ActivityCoefficient"], strict=True)
oc_ac_schema = FilterByPropertyTypesSchema(property_types=["OsmoticCoefficient","ActivityCoefficient"])
oc_ac_strict_schema = FilterByPropertyTypesSchema(property_types=["OsmoticCoefficient","ActivityCoefficient"], strict=True)

## ThermoML Hmix and Density of binary mixtures

### Filtering

In [None]:
# Hmix + density
tml_hd_usable= FilterByPropertyTypes.apply(filtered_dataset, hmix_dens_strict_schema)
print(len(tml_hd_usable))

# save for future use
tml_hmix_dens_path = Path(f'filtered_merged/tml_filtered_hd_usable.json')
tml_hd_usable.json(tml_hmix_dens_path, format=True)

df_hmix_dens=tml_hd_usable.to_pandas()
df_hmix_dens.to_csv(f'filtered_merged/tml_hd_usable_database.csv',index=False)

3348
3348


In [None]:
# Hmix only
tml_hmix_usable = FilterByPropertyTypes.apply(filtered_dataset, hmix_schema)
print(len(tml_hmix_usable))

# save for future use
tml_hmix_path = Path(f'filtered_merged/tml_filtered_hmix_usable.json')
tml_hmix_usable.json(tml_hmix_path, format=True)

df_hmix=tml_hmix_usable.to_pandas()
df_hmix.to_csv(f'filtered_merged/tml_hmix_usable_database.csv',index=False)

KeyError: 'Pressure (kPa)'

In [None]:
# Density only
tml_dens_usable= FilterByPropertyTypes.apply(filtered_dataset, dens_schema)
print(len(tml_dens_usable))

# save for future use
tml_dens_path = Path(f'filtered_merged/tml_filtered_dens_usable.json')
tml_dens_usable.json(tml_dens_path, format=True)

df_dens=tml_dens_usable.to_pandas()
df_dens.to_csv(f'filtered_merged/tml_dens_usable_database.csv',index=False)

3348


In [None]:
print(f'ThermoML - Number of initial data points: {len(filtered_dataset)}')
print(f'ThermoML - Number of hmix+density data points: {len(tml_hd_usable)}')
print(f'ThermoML - Number of hmix data points: {len(tml_hmix_usable)}')
print(f'ThermoML - Number of density data points: {len(tml_dens_usable)}')

ThermoML - Number of initial data points: 3348
ThermoML - Number of hmix+density data points: 14908
ThermoML - Number of hmix data points: 12789
ThermoML - Number of density data points: 3348


### Get data information

#### Functions

In [None]:
def get_unique_temperature_pressure_for_mixtures(data, component_columns=['Component 1', 'Component 2']):
    """
    Loops through a dataframe and returns a dictionary where each unique
    combination of specified component columns is a key. The value for each key is 
    a dictionary containing a unique list of tuples with temperatures and pressures for that mixture 
    and the count of occurrences of each unique mixture.

    Parameters:
    - data (pd.DataFrame): DataFrame containing the data
    - component_columns (list): List of column names representing the mixture components
                                 Defaults to ['Component 1', 'Component 2', 'Component 3'].

    Returns:
    - dict: A dictionary with unique mixtures as keys and a dictionary with temperature-pressure
            information and counts as values.
    """
    mixtures_dict = {}
    unique_mixtures = set()


    for _, row in data.iterrows():
        # Create a tuple of the mixture components
        mixture_key = tuple(row[component] for component in component_columns)

        # Add to unique mixtures set (ignores duplicates)
        unique_mixtures.add(mixture_key)
        
        # Extract temperature and pressure
        temp_pressure = (row['Temperature (K)'], row['Pressure (kPa)'])
        
        # Only add to the dictionary if both temperature and pressure are valid numbers
        if not (math.isnan(temp_pressure[0]) or math.isnan(temp_pressure[1])):
            if mixture_key not in mixtures_dict:
                mixtures_dict[mixture_key] = {"temp_pressures": set(), "count": 0}
            
            # Add the temperature-pressure pair and increment the count
            mixtures_dict[mixture_key]["temp_pressures"].add(temp_pressure)
            mixtures_dict[mixture_key]["count"] += 1

    # Convert the sets back to lists for readability
    mixtures_dict = {
        key: {"temp_pressures": list(value["temp_pressures"]), "count": value["count"]}
        for key, value in mixtures_dict.items()
    }
    
    # Add deduplicated list of unique mixtures
    mixtures_dict['unique_mixtures'] = list(unique_mixtures)
    
    return mixtures_dict


In [None]:
def get_uniquemixs_counts(dataframe, properties):
    # Get the unique mixtures data with counts
    uniquemixs_temps_press = get_unique_temperature_pressure_for_mixtures(dataframe)

    # Prepare data for saving to a CSV file, excluding the 'unique_mixtures' entry itself
    mixtures_list = []

    for mixture, details in uniquemixs_temps_press.items():
        # Skip the 'unique_mixtures' key as it's not part of the temperature/pressure data
        if mixture == 'unique_mixtures':
            continue
        # Extract temperature-pressure pairs and count for each mixture
        count = details["count"]
        for temp, press in details["temp_pressures"]:
            mixtures_list.append({
                "Mixture": " | ".join([str(comp) for comp in mixture if pd.notna(comp)]),
                "Temperature": temp,
                "Pressure": press,
                "Count": count
            })

    # Convert to DataFrame
    mixtures_df = pd.DataFrame(mixtures_list)

    # Save to CSV with the additional 'Count' column
    output_path = f'filtered_merged/tml_uniqmixs_thermo_{properties}.csv'
    mixtures_df.to_csv(output_path, index=False)

    # Prepare a list to store each unique mixture and its count for the CSV
    umixs_list = [
        {
            "Mixture": " | ".join([str(comp) for comp in mixture if pd.notna(comp)]),
            "Count": uniquemixs_temps_press[mixture]["count"]
        }
        for mixture in uniquemixs_temps_press["unique_mixtures"]
    ]

    # Convert to DataFrame
    hmix_umixtures_df = pd.DataFrame(umixs_list)

    # Save to CSV with the additional 'Count' column
    output_path2 = f'filtered_merged/tml_uniqmixs_{properties}.csv'
    hmix_umixtures_df.to_csv(output_path2, index=False)

    print(f'ThermoML - Number of {properties} mixtures: {len(umixs_list)}')

In [None]:
def get_smiles(data_set_choice):
    print(f'Number of initial data points: {len(data_set_choice)}')
    subs=list(set(data_set_choice.substances))
    print(f'Number of individual mixtures: {len(subs)}')
    smiles_list=[]
    comps=[]

    for i in subs:
        if len(i.components) == 3:
            comps.append(i.components[0].smiles)
            comps.append(i.components[1].smiles)
            comps.append(i.components[2].smiles)
        elif len(i.components) == 2:
            comps.append(i.components[0].smiles)
            comps.append(i.components[1].smiles)
        elif len(i.components) == 1:
            comps.append(i.components[0].smiles)

    # comps.remove('O')

    [smiles_list.append(o) for o in set(comps)]

    print(f'Number of individual components: {len(smiles_list)}')

    return(smiles_list)

In [None]:
def get_indiv_comps(database, properties):
    indiv_compnts=get_smiles(database)
    indiv_compnts_df=pd.DataFrame(indiv_compnts)
    indiv_compnts_df.to_csv(f'filtered_merged/tml_indivcomps_{properties}.csv')

#### Hmix + dens

In [None]:
get_indiv_comps(tml_hd_usable, 'hd')
get_uniquemixs_counts(df_hmix_dens, 'hd')

Number of initial data points: 14908
Number of individual mixtures: 10003
Number of individual components: 97
ThermoML - Number of hd mixtures: 278


In [None]:
get_indiv_comps(tml_hmix_usable, 'hmix')
get_uniquemixs_counts(df_hmix, 'hmix')

Number of initial data points: 12789
Number of individual mixtures: 11206
Number of individual components: 206
ThermoML - Number of hmix mixtures: 585


In [None]:
get_indiv_comps(tml_dens_usable, 'dens')
get_uniquemixs_counts(df_dens, 'dens')

Number of initial data points: 3348
Number of individual mixtures: 972
Number of individual components: 71
ThermoML - Number of dens mixtures: 83


## ThermoML Osmotic and Activity Coefficients

### FIltering

In [None]:
# Osmotic + Activity
tml_ocac_usable= FilterByPropertyTypes.apply(filtered_dataset, oc_ac_strict_schema)
print(len(tml_ocac_usable))

# save for future use
tml_ocac_path = Path(f'filtered_merged/tml_filtered_ocac_usable.json')
tml_ocac_usable.json(tml_ocac_path, format=True)

df_ocac=tml_ocac_usable.to_pandas()
df_ocac.to_csv(f'filtered_merged/tml_ocac_usable_database.csv',index=False)

0


KeyError: 'N Components'

In [None]:
# Osmotic only
tml_oc_usable = FilterByPropertyTypes.apply(filtered_dataset, osmotic_schema)
print(len(tml_oc_usable))

# save for future use
tml_oc_path = Path(f'filtered_merged/tml_filtered_oc_usable.json')
tml_oc_usable.json(tml_oc_path, format=True)

df_oc=tml_oc_usable.to_pandas()
df_oc.to_csv(f'filtered_merged/tml_oc_usable_database.csv',index=False)

KeyError: 'Pressure (kPa)'

In [None]:
# Activity only
tml_ac_usable = FilterByPropertyTypes.apply(filtered_dataset, activity_schema)
print(len(tml_ac_usable))

# save for future use
tml_ac_path = Path(f'filtered_merged/tml_filtered_ac_usable.json')
tml_ac_usable.json(tml_ac_path, format=True)

df_ac=tml_ac_usable.to_pandas()
df_ac.to_csv(f'filtered_merged/tml_ac_usable_database.csv',index=False)

KeyError: 'Pressure (kPa)'

### Get data information

In [None]:
get_indiv_comps(tml_ocac_usable, 'ocac')
get_uniquemixs_counts(df_ocac, 'ocac')

NameError: name 'tml_ocac_usable' is not defined

In [None]:
get_indiv_comps(tml_oc_usable, 'oc')
get_uniquemixs_counts(df_oc, 'oc')

Number of initial data points: 1822
Number of individual mixtures: 1297
Number of individual components: 53
ThermoML - Number of oc mixtures: 96


In [None]:
get_indiv_comps(tml_ac_usable, 'ac')
get_uniquemixs_counts(df_ac, 'ac')

Number of initial data points: 9108
Number of individual mixtures: 333
Number of individual components: 99
ThermoML - Number of ac mixtures: 99
