# Core Imports and Setup

In [1]:
import os
import pandas as pd
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

import logging
# logging.getLogger("openff.toolkit").setLevel(logging.ERROR)
# logging.basicConfig(level=logging.DEBUG)

import json
from openff import toolkit, evaluator

In [2]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw, MolFromSmiles
from rdkit.Chem.Draw import MolsToGridImage, rdMolDraw2D, MolsMatrixToGridImage
import pubchempy
from PIL import Image, ImageDraw, ImageFont
IPythonConsole.ipython_useSVG=False  #< set this to False if you want PNGs instead of SVGs

In [3]:
from rdkit.Chem import rdMolDescriptors
import pubchempy as pcp
import math

In [4]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase
from openff.evaluator.datasets.thermoml import thermoml_property
from openff.evaluator import properties
from openff.units import unit
from openff.evaluator.datasets.thermoml import ThermoMLDataSet
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase, PhysicalPropertyDataSet

In [5]:
from openff.evaluator.datasets.curation.components.filtering import FilterByPropertyTypes, FilterByPropertyTypesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByTemperature, FilterByTemperatureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByPressure, FilterByPressureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterBySmiles, FilterBySmilesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByNComponents, FilterByNComponentsSchema


# Functions

In [6]:
def get_smiles(data_set_choice):
    """Collect the unique component SMILES found in a data set.

    Prints the number of data points, distinct substances and distinct
    components as a quick summary.

    Parameters
    ----------
    data_set_choice
        Object that supports ``len()`` and exposes ``substances``, an iterable
        of hashable substances; each substance has ``components`` whose items
        carry a ``smiles`` attribute.

    Returns
    -------
    list of str
        Unique component SMILES, sorted so repeated runs give the same order.
    """
    print(f'Number of initial data points: {len(data_set_choice)}')
    unique_substances = set(data_set_choice.substances)
    print(f'Number of individual mixtures: {len(unique_substances)}')

    # Walk every component of every substance. The previous 1/2/3-way branch
    # silently dropped substances with four or more components.
    smiles_list = sorted({
        component.smiles
        for substance in unique_substances
        for component in substance.components
    })

    print(f'Number of individual components: {len(smiles_list)}')

    return smiles_list

In [7]:
def smiles_to_figures(smiles_list):
    """Draw a grid of molecules labelled with their PubChem IUPAC names.

    Water (``'O'``) is skipped and duplicate SMILES are drawn once.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to draw.

    Returns
    -------
    The object returned by ``MolsToGridImage`` (called with ``returnPNG=False``
    and 10 molecules per row).
    """
    func_group_mols = []
    matchlegend = []
    # dict.fromkeys de-duplicates while keeping first-seen order, so the grid
    # layout is the same on every run (plain set() order is arbitrary).
    for smiles in dict.fromkeys(smiles_list):
        if smiles == 'O':
            continue
        compounds = pubchempy.get_compounds(smiles, namespace='smiles')
        # PubChem can return no match, or a record without an IUPAC name.
        # Fall back to the SMILES itself instead of raising IndexError or
        # labelling the molecule 'None'.
        iupac_name = compounds[0].iupac_name if compounds else None
        matchlegend.append(str(iupac_name) if iupac_name else smiles)
        func_group_mols.append(Chem.MolFromSmiles(smiles))
    return MolsToGridImage(mols=func_group_mols, legends=matchlegend, molsPerRow=10, returnPNG=False)

In [8]:
def smiles_to_iupac(df, smiles_column, new_column_name='IUPAC'):
    """Add a column of PubChem IUPAC names next to a SMILES column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to annotate. It is modified in place and also returned.
    smiles_column : str
        Name of the column holding SMILES strings.
    new_column_name : str
        Name of the inserted column (default ``'IUPAC'``).

    Returns
    -------
    pd.DataFrame
        ``df`` with the new column inserted directly after ``smiles_column``.
        An entry is ``None`` when the value is not a string, RDKit cannot parse
        it, PubChem returns no match, or the lookup raises.

    Raises
    ------
    ValueError
        From ``DataFrame.insert`` if ``new_column_name`` already exists, e.g.
        when the call is repeated on the same frame.
    """
    def _lookup(smiles):
        """Return the IUPAC name of the first PubChem match, or None."""
        try:
            if Chem.MolFromSmiles(smiles) is None:
                return None
            compounds = pcp.get_compounds(smiles, 'smiles')
            return compounds[0].iupac_name if compounds else None
        except Exception as exc:
            # Best-effort lookup: keep going, but leave a trace of the failure
            # instead of swallowing it silently.
            logging.getLogger(__name__).warning("IUPAC lookup failed for %r: %s", smiles, exc)
            return None

    # Each distinct SMILES is looked up once; PubChem queries are network calls.
    names_by_smiles = {}
    iupac_names = []
    for smiles in df[smiles_column]:
        if not isinstance(smiles, str):
            # Missing values (NaN/None) cannot be parsed as SMILES.
            iupac_names.append(None)
            continue
        if smiles not in names_by_smiles:
            names_by_smiles[smiles] = _lookup(smiles)
        iupac_names.append(names_by_smiles[smiles])

    # Insert the new column immediately to the right of the SMILES column.
    col_position = df.columns.get_loc(smiles_column)
    df.insert(col_position + 1, new_column_name, iupac_names)

    return df



In [9]:
def get_unique_temperature_pressure_strict(data, component_columns=['Component 1', 'Component 2']):
    """
    Summarise the temperature/pressure conditions available for each mixture.

    Parameters:
    - data (pd.DataFrame): DataFrame with the component columns plus
                           'Temperature (K)' and 'Pressure (kPa)' columns.
    - component_columns (list): Column names identifying the mixture components.
                                 Defaults to ['Component 1', 'Component 2'].

    Returns:
    - dict: One entry per mixture, keyed by the tuple of its component values:
            {"temp_pressures": sorted list of unique (temperature, pressure)
            tuples, "count": number of rows with both values defined}.
            The extra key 'unique_mixtures' holds the list of all mixture keys
            in first-seen order. Every mixture in that list has an entry, so
            looking up its "count" never raises KeyError; a mixture with no
            usable row has an empty list and a count of 0.
    """
    conditions_by_mixture = {}

    for _, row in data.iterrows():
        mixture_key = tuple(row[component] for component in component_columns)

        # Register the mixture before checking its conditions so that every
        # mixture reported under 'unique_mixtures' can also be looked up.
        entry = conditions_by_mixture.setdefault(
            mixture_key, {"temp_pressures": set(), "count": 0}
        )

        temperature = row['Temperature (K)']
        pressure = row['Pressure (kPa)']
        # pd.isna also accepts None, where math.isnan would raise TypeError.
        if pd.isna(temperature) or pd.isna(pressure):
            continue

        entry["temp_pressures"].add((temperature, pressure))
        entry["count"] += 1

    # Sets become sorted lists: readable and reproducible between runs.
    mixtures_dict = {
        key: {"temp_pressures": sorted(value["temp_pressures"]), "count": value["count"]}
        for key, value in conditions_by_mixture.items()
    }

    # Dict keys are already unique and keep insertion order.
    mixtures_dict['unique_mixtures'] = list(conditions_by_mixture)

    return mixtures_dict


In [10]:
def get_unique_temperature_pressure_for_mixtures(data, component_columns=['Component 1', 'Component 2', 'Component 3']):
    """
    Summarise the temperature/pressure conditions available for each mixture.

    Parameters:
    - data (pd.DataFrame): DataFrame with the component columns plus
                           'Temperature (K)' and 'Pressure (kPa)' columns.
    - component_columns (list): Column names identifying the mixture components.
                                 Defaults to ['Component 1', 'Component 2', 'Component 3'].

    Returns:
    - dict: One entry per mixture, keyed by the tuple of its component values:
            {"temp_pressures": sorted list of unique (temperature, pressure)
            tuples, "count": number of rows with both values defined}.
            The extra key 'unique_mixtures' holds the list of all mixture keys
            in first-seen order. Every mixture in that list has an entry, so
            looking up its "count" never raises KeyError; a mixture with no
            usable row has an empty list and a count of 0.
    """
    conditions_by_mixture = {}

    for _, row in data.iterrows():
        mixture_key = tuple(row[component] for component in component_columns)

        # Register the mixture before checking its conditions so that every
        # mixture reported under 'unique_mixtures' can also be looked up.
        entry = conditions_by_mixture.setdefault(
            mixture_key, {"temp_pressures": set(), "count": 0}
        )

        temperature = row['Temperature (K)']
        pressure = row['Pressure (kPa)']
        # pd.isna also accepts None, where math.isnan would raise TypeError.
        if pd.isna(temperature) or pd.isna(pressure):
            continue

        entry["temp_pressures"].add((temperature, pressure))
        entry["count"] += 1

    # Sets become sorted lists: readable and reproducible between runs.
    mixtures_dict = {
        key: {"temp_pressures": sorted(value["temp_pressures"]), "count": value["count"]}
        for key, value in conditions_by_mixture.items()
    }

    # Dict keys are already unique and keep insertion order.
    mixtures_dict['unique_mixtures'] = list(conditions_by_mixture)

    return mixtures_dict


In [11]:
def extract_data_dois(dois_json, csv_cached):
    """Load a ThermoML data set, reading from a CSV cache when one exists.

    Parameters
    ----------
    dois_json : str or path-like
        JSON file whose ``'working'`` entry lists the DOIs to fetch.
    csv_cached : str or path-like
        CSV cache location. It is read if present, otherwise it is written
        after the DOIs have been fetched.

    Returns
    -------
    ThermoMLDataSet
        Rebuilt from the cached table, or freshly built from the DOIs.

    NOTE(review): the cached path drops rows without a temperature or pressure,
    while the first (uncached) run returns the unfiltered data set, so the two
    runs can differ in length. Confirm which behaviour is wanted.
    """
    CACHED_PROP_PATH = Path(csv_cached)

    if CACHED_PROP_PATH.exists():
        prop_df = pd.read_csv(CACHED_PROP_PATH, index_col=0)
        # Delete rows with undefined thermodynamic parameters to avoid
        # indexing errors when the data set is rebuilt.
        prop_df = prop_df.dropna(subset=['Temperature (K)', 'Pressure (kPa)'])
        return ThermoMLDataSet.from_pandas(prop_df)

    with open(dois_json) as f:
        doi_dat = json.load(f)
    data_set = ThermoMLDataSet.from_doi(*doi_dat['working'])

    # to_csv opens the target itself; the previous extra open('w') handle on
    # the same path was never used.
    data_set.to_pandas().to_csv(CACHED_PROP_PATH)
    return data_set

# Hmix + Density

## 1) Loading ThermoML Data Sets

In [16]:
# ThermoML: data fetched from the Hmix DOI list, or read from the CSV cache if present
hmix_data_set = extract_data_dois('sorted_hmix_dois.json', 'hmix_data.csv')

In [17]:
# Sage: training set loaded from its JSON file
hmix_dens_data_set_sage = PhysicalPropertyDataSet.from_json(
    "sage-training-set.json"
)

## 2) Filtering data set

### Filtering properties schemas

In [22]:
# Property-type selections
hmix_schema = FilterByPropertyTypesSchema(property_types=["EnthalpyOfMixing"])
dens_schema = FilterByPropertyTypesSchema(property_types=["Density"])
hmix_dens_schema = FilterByPropertyTypesSchema(
    property_types=["EnthalpyOfMixing", "Density"],
)
# Same two property types with strict=True (see the openff-evaluator
# FilterByPropertyTypesSchema documentation for the exact semantics).
hmix_dens_strict_schema = FilterByPropertyTypesSchema(
    property_types=["EnthalpyOfMixing", "Density"],
    strict=True,
)

# Binary mixtures only
ncomp_schema = FilterByNComponentsSchema(n_components=[2])

# Thermodynamic-state windows shared by every filtering route below
pressure_schema = FilterByPressureSchema(
    minimum_pressure=90.0,
    maximum_pressure=110.0,
)
temp_schema = FilterByTemperatureSchema(
    minimum_temperature=273.15,
    maximum_temperature=373.0,
)

### ThermoML DOIs for Hmix of binary mixtures

In [23]:
# Hmix + density: keep only these two property types
data_set_hmix_dens = FilterByPropertyTypes.apply(hmix_data_set, hmix_dens_schema)
print(len(data_set_hmix_dens))

47709


In [24]:
# Keep two-component substances only
data_set_2comps = FilterByNComponents.apply(data_set_hmix_dens, ncomp_schema)
print(len(data_set_2comps))

41394


In [25]:
# Re-apply the property filter in strict mode on the binary mixtures
data_set_hmix_dens_strict = FilterByPropertyTypes.apply(
    data_set_2comps, hmix_dens_strict_schema
)
print(len(data_set_hmix_dens_strict))

20001


In [27]:
# Restrict to the pressure window, then to the temperature window
data_set_hd_strict_usable = FilterByPressure.apply(data_set_hmix_dens_strict, pressure_schema)
tml_hd_usable = FilterByTemperature.apply(data_set_hd_strict_usable, temp_schema)
print(len(tml_hd_usable))

19676


In [28]:
# Save for future use: JSON for the data set, CSV for the tabular view
tml_hmix_dens_path = Path('tml_filtered_hmix_dens_strict_usable.json')
tml_hd_usable.json(tml_hmix_dens_path, format=True)

df_hmix_dens = tml_hd_usable.to_pandas()
df_hmix_dens.to_csv('tml_hmix_dens_strict_usable_database.csv', index=False)

In [29]:
# Hmix only: property type, then pressure window, then temperature window
data_set_hmix = FilterByPropertyTypes.apply(hmix_data_set, hmix_schema)
data_set_hmix_usable = FilterByPressure.apply(data_set_hmix, pressure_schema)
tml_hmix_usable = FilterByTemperature.apply(data_set_hmix_usable, temp_schema)

# Save for future use: JSON for the data set, CSV for the tabular view
tml_hmix_path = Path('tml_filtered_hmix_usable.json')
tml_hmix_usable.json(tml_hmix_path, format=True)

df_hmix = tml_hmix_usable.to_pandas()
df_hmix.to_csv('tml_hmix_usable_database.csv', index=False)
# df_hmix.head()
# df_hmix.columns.to_list()



In [30]:
# Density only: property type, then pressure window, then temperature window
data_set_dens = FilterByPropertyTypes.apply(hmix_data_set, dens_schema)
data_set_dens_usable = FilterByPressure.apply(data_set_dens, pressure_schema)
tml_dens_usable = FilterByTemperature.apply(data_set_dens_usable, temp_schema)

# Save for future use: JSON for the data set, CSV for the tabular view
tml_dens_path = Path('tml_filtered_dens_usable.json')
tml_dens_usable.json(tml_dens_path, format=True)

df_dens = tml_dens_usable.to_pandas()
df_dens.to_csv('tml_dens_usable_database.csv', index=False)

In [37]:
# Data-point counts before filtering and after each ThermoML filtering route
print('\n'.join([
    f'ThermoML - Number of initial data points: {len(hmix_data_set)}',
    f'ThermoML - Number of hmix+density data points: {len(tml_hd_usable)}',
    f'ThermoML - Number of hmix data points: {len(tml_hmix_usable)}',
    f'ThermoML - Number of density data points: {len(tml_dens_usable)}',
]))

ThermoML - Number of initial data points: 51130
ThermoML - Number of hmix+density data points: 19676
ThermoML - Number of hmix data points: 27004
ThermoML - Number of density data points: 16186


In [32]:
# df_hd_select=df_hmix_dens[(df_hmix_dens['Temperature (K)'] > 273.15) & (df_hmix_dens['Temperature (K)'] < 373.0) & (df_hmix_dens['Pressure (kPa)'] > 90 ) & (df_hmix_dens['Pressure (kPa)'] < 110 )]

In [33]:
# df_hd_select.to_csv('tml_usable_hmix_dens.csv', index=False)

In [34]:
# df_select=df_osmotic[(df_osmotic['Temperature (K)'] > 293.0) & (df_osmotic['Temperature (K)'] < 300.0) & (df_osmotic['Pressure (kPa)'] == 101 ) & (df_osmotic['N Components'] == 2) & (df_osmotic['Component 2'] == 'O')]


### Sage hmix+density training set

In [36]:
# Hmix + density: same filter chain as the ThermoML route, applied to the Sage set
sage_hmix_dens = FilterByPropertyTypes.apply(hmix_dens_data_set_sage, hmix_dens_schema)
sage_2comps = FilterByNComponents.apply(sage_hmix_dens, ncomp_schema)
sage_hmix_dens_strict = FilterByPropertyTypes.apply(sage_2comps, hmix_dens_strict_schema)

sage_hd_strict_usable = FilterByPressure.apply(sage_hmix_dens_strict, pressure_schema)
sage_hd_usable = FilterByTemperature.apply(sage_hd_strict_usable, temp_schema)
print(len(sage_hd_usable))

# Save for future use: JSON for the data set, CSV for the tabular view
sage_hmix_dens_path = Path('sage_filtered_hmix_dens_strict_usable.json')
sage_hd_usable.json(sage_hmix_dens_path, format=True)

df_sage_hmix_dens = sage_hd_usable.to_pandas()
df_sage_hmix_dens.to_csv('sage_hmix_dens_strict_usable_database.csv', index=False)

943


In [38]:
# Hmix only: property type, then pressure window, then temperature window
sage_hmix = FilterByPropertyTypes.apply(hmix_dens_data_set_sage, hmix_schema)
sage_hmix_p_usable = FilterByPressure.apply(sage_hmix, pressure_schema)
sage_hmix_usable = FilterByTemperature.apply(sage_hmix_p_usable, temp_schema)

# Save for future use: JSON for the data set, CSV for the tabular view
sage_hmix_path = Path('sage_filtered_hmix_usable.json')
sage_hmix_usable.json(sage_hmix_path, format=True)

df_sage_hmix = sage_hmix_usable.to_pandas()
df_sage_hmix.to_csv('sage_hmix_usable_database.csv', index=False)

In [39]:
# Density only: property type, then pressure window, then temperature window
sage_dens = FilterByPropertyTypes.apply(hmix_dens_data_set_sage, dens_schema)
sage_dens_p_usable = FilterByPressure.apply(sage_dens, pressure_schema)
sage_dens_usable = FilterByTemperature.apply(sage_dens_p_usable, temp_schema)

# Save for future use: JSON for the data set, CSV for the tabular view
sage_dens_path = Path('sage_filtered_dens_usable.json')
sage_dens_usable.json(sage_dens_path, format=True)

df_sage_dens = sage_dens_usable.to_pandas()
df_sage_dens.to_csv('sage_dens_usable_database.csv', index=False)

In [40]:
# Data-point counts before filtering and after each Sage filtering route
print('\n'.join([
    f'Sage - Number of initial data points: {len(hmix_dens_data_set_sage)}',
    f'Sage - Number of hmix+density data points: {len(sage_hd_usable)}',
    f'Sage - Number of hmix data points: {len(sage_hmix_usable)}',
    f'Sage - Number of density data points: {len(sage_dens_usable)}',
]))

Sage - Number of initial data points: 1032
Sage - Number of hmix+density data points: 943
Sage - Number of hmix data points: 477
Sage - Number of density data points: 555


## 3) Get data information

### ThermoML

#### Hmix+dens

In [41]:
# Unique component SMILES of the ThermoML hmix+density set, saved as a one-column CSV
hmix_dens_indiv_compnts = get_smiles(tml_hd_usable)
hmix_dens_indiv_compnts_df = pd.DataFrame(hmix_dens_indiv_compnts)
hmix_dens_indiv_compnts_df.to_csv('tml_hmix_dens_indiv_comps_usable.csv')

Number of initial data points: 19676
Number of individual mixtures: 14201
Number of individual components: 141


In [42]:
# Unique (temperature, pressure) conditions and data-point counts per mixture
hmix_dens_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_strict(df_hmix_dens)

# One CSV row per (mixture, temperature, pressure). The bookkeeping
# 'unique_mixtures' entry is skipped: it is a list, not a per-mixture record.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in hmix_dens_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]
mixtures_df = pd.DataFrame(mixtures_list)

output_path = 'tml_hmix_dens_uniqmix_temp_press_usable.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture. .get() covers a mixture that has no row with
# both temperature and pressure defined; a direct lookup raises KeyError for it.
hmix_dens_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": hmix_dens_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in hmix_dens_unique_mixtures_temperatures_pressures["unique_mixtures"]
]
hmix_umixtures_df = pd.DataFrame(hmix_dens_umix_list)

output_path2 = 'tml_hmix_dens_usable_uniqmixs.csv'
hmix_umixtures_df.to_csv(output_path2, index=False)

print(f'ThermoML - Number of hmix+dens unique mixtures: {len(hmix_dens_umix_list)}')


ThermoML - Number of hmix+dens unique mixtures: 360


#### Hmix

In [43]:
# Unique component SMILES of the ThermoML hmix set, saved as a one-column CSV
hmix_indiv_compnts = get_smiles(tml_hmix_usable)
hmix_indiv_compnts_df = pd.DataFrame(hmix_indiv_compnts)
hmix_indiv_compnts_df.to_csv('tml_hmix_indiv_comps_usable.csv')


Number of initial data points: 27004
Number of individual mixtures: 23547
Number of individual components: 310


In [44]:
# Unique (temperature, pressure) conditions and data-point counts per mixture
hmix_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(df_hmix)

# One CSV row per (mixture, temperature, pressure). The bookkeeping
# 'unique_mixtures' entry is skipped: it is a list, not a per-mixture record.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in hmix_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]
mixtures_df = pd.DataFrame(mixtures_list)

output_path = 'tml_hmix_uniqmix_temp_press_usable.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture. .get() covers a mixture that has no row with
# both temperature and pressure defined; a direct lookup raises KeyError for it.
hmix_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": hmix_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in hmix_unique_mixtures_temperatures_pressures["unique_mixtures"]
]
hmix_umixtures_df = pd.DataFrame(hmix_umix_list)

output_path2 = 'tml_hmix_uniqmixs_usable.csv'
hmix_umixtures_df.to_csv(output_path2, index=False)

print(f'ThermoML - Number of hmix unique mixtures: {len(hmix_umix_list)}')

ThermoML - Number of hmix unique mixtures: 1158


#### Density

In [45]:
# Unique component SMILES of the ThermoML density set, saved as a one-column CSV
dens_indiv_compnts = get_smiles(tml_dens_usable)
dens_indiv_compnts_df = pd.DataFrame(dens_indiv_compnts)
dens_indiv_compnts_df.to_csv('tml_dens_indiv_comps_usable.csv')

Number of initial data points: 16186
Number of individual mixtures: 8809
Number of individual components: 254


In [46]:
# Unique (temperature, pressure) conditions and data-point counts per mixture
dens_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(df_dens)

# One CSV row per (mixture, temperature, pressure). The bookkeeping
# 'unique_mixtures' entry is skipped: it is a list, not a per-mixture record.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in dens_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]
mixtures_df = pd.DataFrame(mixtures_list)

output_path = 'tml_dens_uniqmix_temp_press_usable.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture. .get() covers a mixture that has no row with
# both temperature and pressure defined; a direct lookup raises KeyError for it.
dens_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": dens_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in dens_unique_mixtures_temperatures_pressures["unique_mixtures"]
]
# NOTE(review): the variable name is kept from the original cell although it
# holds density mixtures; it overwrites the hmix frame of the same name.
hmix_umixtures_df = pd.DataFrame(dens_umix_list)

output_path2 = 'tml_dens_uniqmixs_usable.csv'
hmix_umixtures_df.to_csv(output_path2, index=False)

print(f'ThermoML - Number of dens unique mixtures: {len(dens_umix_list)}')

ThermoML - Number of dens unique mixtures: 656


### Sage

#### Hmix + Density

In [47]:
# Unique component SMILES of the Sage hmix+density set, saved as a one-column CSV
sage_hmix_dens_indiv_compnts = get_smiles(sage_hd_usable)
sage_hmix_dens_indiv_compnts_df = pd.DataFrame(sage_hmix_dens_indiv_compnts)
sage_hmix_dens_indiv_compnts_df.to_csv('sage_hmix_dens_indiv_comps_usable.csv')

Number of initial data points: 943
Number of individual mixtures: 927
Number of individual components: 109


In [48]:
# Unique (temperature, pressure) conditions and data-point counts per mixture
sage_hmix_dens_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_strict(
    df_sage_hmix_dens, component_columns=['Component 1', 'Component 2']
)

# One CSV row per (mixture, temperature, pressure). The bookkeeping
# 'unique_mixtures' entry is skipped: it is a list, not a per-mixture record.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in sage_hmix_dens_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]
mixtures_df = pd.DataFrame(mixtures_list)

output_path = 'sage_hmix_dens_uniqmix_temp_press_usable.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture. .get() covers a mixture that has no row with
# both temperature and pressure defined; a direct lookup raises KeyError for it.
# NOTE(review): hmix_dens_umix_list and hmix_umixtures_df are the names used by
# the ThermoML cells too, so this cell overwrites those results.
hmix_dens_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": sage_hmix_dens_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in sage_hmix_dens_unique_mixtures_temperatures_pressures["unique_mixtures"]
]
hmix_umixtures_df = pd.DataFrame(hmix_dens_umix_list)

output_path2 = 'sage_hmix_dens_uniqmixs_usable.csv'
hmix_umixtures_df.to_csv(output_path2, index=False)

print(f'Sage - Number of hmix+dens unique mixtures: {len(hmix_dens_umix_list)}')

Sage - Number of hmix+dens unique mixtures: 159


#### Hmix

In [49]:
# Unique component SMILES of the Sage hmix set, saved as a one-column CSV
sage_hmix_indiv_compnts = get_smiles(sage_hmix_usable)
sage_hmix_indiv_compnts_df = pd.DataFrame(sage_hmix_indiv_compnts)
sage_hmix_indiv_compnts_df.to_csv('sage_hmix_indiv_comps_usable.csv')

Number of initial data points: 477
Number of individual mixtures: 477
Number of individual components: 110


In [50]:
# Unique (temperature, pressure) conditions and data-point counts per mixture
sage_hmix_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(
    df_sage_hmix, component_columns=['Component 1', 'Component 2']
)

# One CSV row per (mixture, temperature, pressure). The bookkeeping
# 'unique_mixtures' entry is skipped: it is a list, not a per-mixture record.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in sage_hmix_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]
mixtures_df = pd.DataFrame(mixtures_list)

output_path = 'sage_hmix_uniqmix_temp_press_usable.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture. .get() covers a mixture that has no row with
# both temperature and pressure defined; a direct lookup raises KeyError for it.
# NOTE(review): hmix_umix_list and hmix_umixtures_df are the names used by the
# ThermoML hmix cell too, so this cell overwrites those results.
hmix_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": sage_hmix_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in sage_hmix_unique_mixtures_temperatures_pressures["unique_mixtures"]
]
hmix_umixtures_df = pd.DataFrame(hmix_umix_list)

output_path2 = 'sage_hmix_uniqmixs_usable.csv'
hmix_umixtures_df.to_csv(output_path2, index=False)

print(f'Sage - Number of hmix unique mixtures: {len(hmix_umix_list)}')

Sage - Number of hmix unique mixtures: 160


#### Density

In [51]:
# Unique component SMILES of the Sage density set, saved as a one-column CSV
sage_dens_indiv_compnts = get_smiles(sage_dens_usable)
sage_dens_indiv_compnts_df = pd.DataFrame(sage_dens_indiv_compnts)
sage_dens_indiv_compnts_df.to_csv('sage_dens_indiv_comps_usable.csv')

Number of initial data points: 555
Number of individual mixtures: 555
Number of individual components: 121


In [52]:
# Unique (temperature, pressure) conditions and data-point counts per mixture
sage_dens_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(
    df_sage_dens, component_columns=['Component 1', 'Component 2']
)

# One CSV row per (mixture, temperature, pressure). The bookkeeping
# 'unique_mixtures' entry is skipped: it is a list, not a per-mixture record.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in sage_dens_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]
mixtures_df = pd.DataFrame(mixtures_list)

output_path = 'sage_dens_uniqmix_temp_press_usable.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture. .get() covers a mixture that has no row with
# both temperature and pressure defined; a direct lookup raises KeyError for it.
# NOTE(review): dens_umix_list is the name used by the ThermoML density cell
# too, so this cell overwrites that result.
dens_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": sage_dens_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in sage_dens_unique_mixtures_temperatures_pressures["unique_mixtures"]
]
dens_umixtures_df = pd.DataFrame(dens_umix_list)

output_path2 = 'sage_dens_uniqmixs_usable.csv'
dens_umixtures_df.to_csv(output_path2, index=False)

print(f'Sage - Number of dens unique mixtures: {len(dens_umix_list)}')

Sage - Number of dens unique mixtures: 234


# Osmotic Coefficients

## 0) Registering Custom ThermoML Properties

In [53]:
@thermoml_property("Osmotic coefficient", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class OsmoticCoefficient(PhysicalProperty):
    """Osmotic coefficient property, expressed as a dimensionless quantity.

    The ``thermoml_property`` decorator associates this class with the
    ThermoML property name "Osmotic coefficient", with ``supported_phases``
    set to the combined flag ``PropertyPhase.Liquid | PropertyPhase.Gas``.
    """

    @classmethod
    def default_unit(cls):
        """Return the default unit of this property (dimensionless)."""
        return unit.dimensionless
    
# Attach the class to the imported openff.evaluator `properties` module under its own name.
setattr(properties, OsmoticCoefficient.__name__, OsmoticCoefficient)

# Display the conversion function registered for the ThermoML name, as a visual
# check that the decorator registration took effect.
ThermoMLDataSet.registered_properties['Osmotic coefficient'].conversion_function

functools.partial(<function _default_mapping at 0x7f20d01decb0>, <class '__main__.OsmoticCoefficient'>)

## 1) Loading ThermoML Data Sets

In [54]:
# Osmotic-coefficient data fetched from the DOI list, or read from the CSV cache if present
oc_data_set = extract_data_dois('sorted_oc_dois.json', 'osmotic_data.csv')

## 2) Filtering data set

In [57]:
# Select the custom OsmoticCoefficient property type (non-strict)
schema = FilterByPropertyTypesSchema(
    property_types=["OsmoticCoefficient"],
    strict=False,
)

In [59]:
# Keep osmotic-coefficient data points only
data_set_osmotic = FilterByPropertyTypes.apply(
    oc_data_set, schema
)

In [60]:
# Restrict to the pressure window shared with the Hmix/density routes
data_set_oc_p = FilterByPressure.apply(data_set_osmotic, pressure_schema)

In [61]:
# Restrict to the temperature window shared with the Hmix/density routes
tml_oc_usable = FilterByTemperature.apply(data_set_oc_p, temp_schema)

In [62]:

# Save for future use: JSON for the data set, CSV for the tabular view
tml_oc_path = Path('tml_filtered_oc_usable.json')
tml_oc_usable.json(tml_oc_path, format=True)

df_osmotic = tml_oc_usable.to_pandas()
df_osmotic.to_csv('osmotic_usable_database.csv', index=False)
# df_osmotic.head()
# df_osmotic.columns.to_list()


In [63]:
# Data-point counts before and after the osmotic-coefficient filtering
print('\n'.join([
    f'Number of initial data points: {len(oc_data_set)}',
    f'Number of osmotic coefficient data points: {len(tml_oc_usable)}',
]))

Number of initial data points: 3390
Number of osmotic coefficient data points: 2711


## 3) Get data information

In [64]:
# Unique component SMILES of the osmotic-coefficient set, saved as a one-column CSV
oc_indiv_compnts = get_smiles(tml_oc_usable)
oc_indiv_compnts_df = pd.DataFrame(oc_indiv_compnts)
oc_indiv_compnts_df.to_csv('osmotic_indiv_comps_usable.csv')


Number of initial data points: 2711
Number of individual mixtures: 1356
Number of individual components: 58


In [65]:
# Unique (temperature, pressure) conditions and data-point counts per mixture
osmotic_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(df_osmotic)

# One CSV row per (mixture, temperature, pressure). The bookkeeping
# 'unique_mixtures' entry is skipped: it is a list, not a per-mixture record.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in osmotic_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]
mixtures_df = pd.DataFrame(mixtures_list)

output_path = 'osmotic_unique_mixtures_temperatures_pressures_usable.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture. .get() covers a mixture that has no row with
# both temperature and pressure defined; a direct lookup raises KeyError for it.
oc_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": osmotic_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in osmotic_unique_mixtures_temperatures_pressures["unique_mixtures"]
]
oc_umixtures_df = pd.DataFrame(oc_umix_list)

output_path2 = 'osmotic_unique_mixs_usable.csv'
oc_umixtures_df.to_csv(output_path2, index=False)

print(f'Number of unique mixtures: {len(oc_umix_list)}')


Number of unique mixtures: 103
