# Core Imports and Setup

In [1]:
import os
import pandas as pd
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

import logging
# logging.getLogger("openff.toolkit").setLevel(logging.ERROR)
# logging.basicConfig(level=logging.DEBUG)

import json
from openff import toolkit, evaluator

In [2]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw, MolFromSmiles
from rdkit.Chem.Draw import MolsToGridImage, rdMolDraw2D, MolsMatrixToGridImage
import pubchempy
from PIL import Image, ImageDraw, ImageFont
IPythonConsole.ipython_useSVG=False  #< set this to False if you want PNGs instead of SVGs

In [3]:
from rdkit.Chem import rdMolDescriptors
import pubchempy as pcp
import math

In [4]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase
from openff.evaluator.datasets.thermoml import thermoml_property
from openff.evaluator import properties
from openff.units import unit
from openff.evaluator.datasets.thermoml import ThermoMLDataSet
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase, PhysicalPropertyDataSet

In [5]:
from openff.evaluator.datasets.curation.components.filtering import FilterByPropertyTypes, FilterByPropertyTypesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByTemperature, FilterByTemperatureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByPressure, FilterByPressureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterBySmiles, FilterBySmilesSchema

# Functions

In [6]:
def get_smiles(data_set_choice):
    """Collect the distinct component SMILES present in a data set.

    Prints three summary lines: the number of data points, the number of
    distinct substances (mixtures) and the number of distinct components.

    Parameters
    ----------
    data_set_choice
        Object that supports ``len()`` and exposes ``substances``, an iterable
        of hashable substances. Each substance must expose ``components``, an
        iterable of objects with a ``smiles`` attribute.

    Returns
    -------
    list
        The distinct SMILES values, sorted so that the result (and any file
        written from it) is reproducible between runs.
    """
    print(f'Number of initial data points: {len(data_set_choice)}')
    unique_substances = set(data_set_choice.substances)
    print(f'Number of individual mixtures: {len(unique_substances)}')

    # Walk every component instead of special-casing 1-, 2- and 3-component
    # substances, so mixtures with more components are no longer dropped.
    unique_smiles = {
        component.smiles
        for substance in unique_substances
        for component in substance.components
    }

    # key=str keeps the sort well-defined even if a non-string value slips in.
    smiles_list = sorted(unique_smiles, key=str)

    print(f'Number of individual components: {len(smiles_list)}')

    return smiles_list

In [7]:
def smiles_to_figures(smiles_list):
    """Draw a grid image of the distinct molecules in ``smiles_list``.

    Each molecule is labelled with the IUPAC name of its first PubChem match;
    when PubChem returns no match or no name, the SMILES itself is used as the
    label. Water (``'O'``), non-string entries (e.g. NaN from an unused
    component column) and SMILES that RDKit cannot parse are skipped.

    Parameters
    ----------
    smiles_list
        Iterable of SMILES strings; duplicates are drawn once.

    Returns
    -------
    The object returned by ``MolsToGridImage`` (called with ``returnPNG=False``).

    Notes
    -----
    One PubChem request is made per distinct SMILES, so network access is needed.
    """
    func_group_mols = []
    matchlegend = []

    # dict.fromkeys de-duplicates while keeping first-seen order, which makes
    # the grid layout reproducible (a set would give an arbitrary order).
    for smiles in dict.fromkeys(smiles_list):
        if not isinstance(smiles, str) or smiles == 'O':
            continue

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparseable SMILES: nothing to draw.
            continue

        compounds = pubchempy.get_compounds(smiles, namespace='smiles')
        # Guard against an empty result instead of indexing blindly.
        iupac_name = compounds[0].iupac_name if compounds else None

        matchlegend.append(str(iupac_name) if iupac_name else smiles)
        func_group_mols.append(mol)

    init_fig = MolsToGridImage(mols=func_group_mols, legends=matchlegend, molsPerRow=10, returnPNG=False)
    return init_fig

In [8]:
def smiles_to_iupac(df, smiles_column, new_column_name='IUPAC'):
    """Insert a column of PubChem IUPAC names next to a SMILES column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to annotate. It is modified in place.
    smiles_column : str
        Name of the column holding SMILES strings.
    new_column_name : str
        Name of the inserted column. ``DataFrame.insert`` raises ``ValueError``
        if a column with this name already exists, so calling this twice on
        the same frame with the same name fails.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with the new column placed immediately after
        ``smiles_column``. Entries are ``None`` for missing/non-string values,
        SMILES that RDKit cannot parse, SMILES without a PubChem match, and
        lookups that raised an error.
    """
    # Successful lookups are remembered so that each distinct SMILES costs one
    # PubChem request rather than one request per row.
    iupac_by_smiles = {}
    iupac_names = []

    for smiles in df[smiles_column]:
        # Missing values (None / NaN) cannot be looked up.
        if not isinstance(smiles, str):
            iupac_names.append(None)
            continue

        if smiles in iupac_by_smiles:
            iupac_names.append(iupac_by_smiles[smiles])
            continue

        iupac_name = None
        try:
            # Only query PubChem for SMILES that RDKit accepts.
            if Chem.MolFromSmiles(smiles) is not None:
                compounds = pcp.get_compounds(smiles, 'smiles')
                if compounds:
                    iupac_name = compounds[0].iupac_name  # first match wins
            iupac_by_smiles[smiles] = iupac_name
        except Exception as error:
            # Best effort: keep going, but report the failure and do not cache
            # it, so a later row with the same SMILES gets another attempt.
            logging.getLogger(__name__).warning(
                "IUPAC lookup failed for SMILES %r: %s", smiles, error
            )

        iupac_names.append(iupac_name)

    # Place the new column directly to the right of the SMILES column.
    col_position = df.columns.get_loc(smiles_column)
    df.insert(col_position + 1, new_column_name, iupac_names)

    return df



In [9]:
def get_unique_temperature_pressure_for_mixtures(data, component_columns=['Component 1', 'Component 2', 'Component 3']):
    """
    Summarise the state points recorded for every unique mixture in a dataframe.

    A mixture is identified by the tuple of values found in ``component_columns``.
    Missing components (None / NaN) are stored as ``None`` in that tuple so that
    the tuple is a reliable dictionary key.

    Parameters:
    - data (pd.DataFrame): DataFrame containing the component columns plus
                           'Temperature (K)' and 'Pressure (kPa)'.
    - component_columns (list): List of column names representing the mixture components.
                                 Defaults to ['Component 1', 'Component 2', 'Component 3'].

    Returns:
    - dict: One entry per unique mixture tuple, each holding
            * "temp_pressures": sorted list of distinct (temperature, pressure) tuples
            * "count": number of rows of that mixture with both values defined
            plus the extra string key 'unique_mixtures', a list of all mixture
            tuples in first-seen order. Every tuple in that list is also a key
            of the dictionary, including mixtures whose rows all lack a
            temperature or pressure (count 0, empty "temp_pressures").
    """
    mixtures = {}

    # Column-wise extraction avoids building a Series per row.
    component_values = [data[column].tolist() for column in component_columns]
    temperatures = data['Temperature (K)'].tolist()
    pressures = data['Pressure (kPa)'].tolist()

    for *components, temperature, pressure in zip(*component_values, temperatures, pressures):
        mixture_key = tuple(None if pd.isna(component) else component for component in components)

        # Register the mixture before checking the state point, so that
        # 'unique_mixtures' never names a key missing from the result.
        entry = mixtures.setdefault(mixture_key, {"temp_pressures": set(), "count": 0})

        # Only rows with both temperature and pressure defined are counted;
        # pd.isna also copes with None, which math.isnan rejects.
        if pd.isna(temperature) or pd.isna(pressure):
            continue

        entry["temp_pressures"].add((temperature, pressure))
        entry["count"] += 1

    # Convert the sets to sorted lists for readability and reproducible output
    mixtures_dict = {
        key: {"temp_pressures": sorted(value["temp_pressures"]), "count": value["count"]}
        for key, value in mixtures.items()
    }

    # Add deduplicated list of unique mixtures
    mixtures_dict['unique_mixtures'] = list(mixtures)

    return mixtures_dict


In [10]:
def extract_data_dois(dois_json,csv_cached):
    """Load a ThermoML data set, using a CSV file as an on-disk cache.

    If ``csv_cached`` exists, the data set is rebuilt from that CSV after
    dropping rows without a temperature or pressure. Otherwise the DOIs listed
    under the ``'working'`` key of ``dois_json`` are fetched with
    ``ThermoMLDataSet.from_doi`` and the result is written to ``csv_cached``.

    Parameters
    ----------
    dois_json : str or path-like
        JSON file whose ``'working'`` entry is a list of DOIs.
    csv_cached : str or path-like
        Location of the CSV cache (read if present, otherwise created).

    Returns
    -------
    The data set built by ``ThermoMLDataSet.from_pandas`` (cache hit) or
    ``ThermoMLDataSet.from_doi`` (cache miss).

    Raises
    ------
    KeyError
        If ``dois_json`` has no ``'working'`` entry (cache miss only).
    """
    cached_prop_path = Path(csv_cached)

    if cached_prop_path.exists():
        prop_df = pd.read_csv(cached_prop_path, index_col=0)
        # delete rows with undefined thermo params to avoid pesky indexing errors
        prop_df = prop_df.dropna(subset=['Temperature (K)', 'Pressure (kPa)'])
        return ThermoMLDataSet.from_pandas(prop_df)

    # Keep the JSON file open only while it is being parsed, not during the
    # download that follows.
    with open(dois_json) as f:
        doi_dat = json.load(f)
    data_set = ThermoMLDataSet.from_doi(*doi_dat['working'])

    # NOTE(review): on a cache miss the returned data set is not filtered for
    # missing temperature/pressure, whereas a cache hit is — confirm whether
    # both paths are meant to return the same data.
    # to_csv opens the target itself; no separate file handle is needed.
    data_set.to_pandas().to_csv(cached_prop_path)
    return data_set

# Osmotic Coefficients

## 0) Registering Custom ThermoML Properties

In [11]:
@thermoml_property("Osmotic coefficient", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class OsmoticCoefficient(PhysicalProperty):
    """Physical property type for ThermoML "Osmotic coefficient" entries.

    The ``thermoml_property`` decorator is applied with the ThermoML property
    name "Osmotic coefficient" and declares the liquid and gas phases as
    supported.
    """

    @classmethod
    def default_unit(cls):
        # Values of this property carry no unit.
        return unit.dimensionless
    
# Expose the class as an attribute of the imported `properties` module under its own name.
setattr(properties, OsmoticCoefficient.__name__, OsmoticCoefficient)

# Display the conversion function registered for the new property (quick registration check).
ThermoMLDataSet.registered_properties['Osmotic coefficient'].conversion_function

functools.partial(<function _default_mapping at 0x7fcd42dc2cb0>, <class '__main__.OsmoticCoefficient'>)

## 1) Loading ThermoML Data Sets

In [12]:
# Load the osmotic-coefficient data: read from 'osmotic_data.csv' when that cache exists,
# otherwise fetched from the DOIs in 'sorted_oc_dois.json' and cached to the CSV.
oc_data_set=extract_data_dois('sorted_oc_dois.json','osmotic_data.csv')

## 2) Filtering data set

In [13]:
# Filter schema selecting the custom "OsmoticCoefficient" property type (non-strict mode).
schema = FilterByPropertyTypesSchema(property_types=["OsmoticCoefficient"], strict=False)

In [14]:
# Apply the property-type filter defined by `schema` to the loaded data set.
data_set_osmotic = FilterByPropertyTypes.apply(oc_data_set, schema)
# save for future use
tml_oc_path = Path('tml_filtered_oc.json')
data_set_osmotic.json(tml_oc_path, format=True)

# Also export the filtered data as a CSV table (row index omitted); `df_osmotic` is reused by later cells.
df_osmotic=data_set_osmotic.to_pandas()
df_osmotic.to_csv('osmotic_database.csv',index=False)
# df_osmotic.head()
# df_osmotic.columns.to_list()


In [15]:
# Report the data set size before and after the property-type filter.
print(f'Number of initial data points: {len(oc_data_set)}')
print(f'Number of osmotic coefficient data points: {len(data_set_osmotic)}')

Number of initial data points: 3390
Number of osmotic coefficient data points: 2784


## 3) Get data information

In [16]:
# Distinct component SMILES of the filtered osmotic data set (get_smiles also prints summary counts).
oc_indiv_compnts=get_smiles(data_set_osmotic)

# Save the component list as a one-column CSV.
# NOTE(review): this file name contains a space, unlike the other '*_indiv_comps.csv' outputs — confirm it is intended.
oc_indiv_compnts_df=pd.DataFrame(oc_indiv_compnts)
oc_indiv_compnts_df.to_csv('osmotic_indiv comps.csv')


Number of initial data points: 2784
Number of individual mixtures: 1429
Number of individual components: 61


In [17]:
# Get the unique mixtures data with counts
osmotic_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(df_osmotic)

# One CSV row per (mixture, temperature, pressure) state point. The 'unique_mixtures'
# bookkeeping entry is skipped because it is not a mixture.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in osmotic_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]

# Explicit columns keep the CSV header even when there are no rows.
mixtures_df = pd.DataFrame(mixtures_list, columns=["Mixture", "Temperature", "Pressure", "Count"])

output_path = 'osmotic_unique_mixtures_temperatures_pressures.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture with its data-point count. A mixture without any valid
# (temperature, pressure) pair may have no entry of its own, so fall back to a count of 0
# instead of raising KeyError.
oc_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": osmotic_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in osmotic_unique_mixtures_temperatures_pressures["unique_mixtures"]
]

oc_umixtures_df = pd.DataFrame(oc_umix_list, columns=["Mixture", "Count"])

output_path2 = 'osmotic_unique_mixs.csv'
oc_umixtures_df.to_csv(output_path2, index=False)

print(f'Number of unique mixtures: {len(oc_umix_list)}')


Number of unique mixtures: 106


### Adding IUPAC names

In [18]:
# df_osmotic[['N Components','Component 1']]

In [19]:
# df_osmotic_new=smiles_to_iupac(df_osmotic, 'Component 1', new_column_name='IUPAC1')
# df_osmotic_new2=smiles_to_iupac(df_osmotic_new, 'Component 2', new_column_name='IUPAC2')
# df_osmotic_new3=smiles_to_iupac(df_osmotic_new2, 'Component 3', new_column_name='IUPAC3')
# df_osmotic_new3.head()

In [20]:
# iupacs=[i for i in set(df_osmotic_new3['IUPAC1'])]

In [21]:
# df_osmotic_new3[
# ['Id',
#  'Temperature (K)',
#  'Pressure (kPa)',
#  'N Components',
#  'Component 1',
#  'IUPAC1',
#  'Role 1',
#  'Mole Fraction 1',
#  'Component 2',
#  'IUPAC2',
#  'Role 2',
#  'Mole Fraction 2',
#  'Component 3',
#  'IUPAC3',
#  'Role 3',
#  'Mole Fraction 3',
#  'OsmoticCoefficient Value ()',
#  'OsmoticCoefficient Uncertainty ()',
#  'Source']
# ].to_csv('osmotic_database_iupac.csv', index=False)

### Advanced filtering

In [22]:
# df_select=df_osmotic[(df_osmotic['Temperature (K)'] > 293.0) & (df_osmotic['Temperature (K)'] < 300.0) & (df_osmotic['Pressure (kPa)'] == 101 ) & (df_osmotic['N Components'] == 2) & (df_osmotic['Component 2'] == 'O')]
# df_select=df_osmotic[(df_osmotic['Temperature (K)'] > 293.0) & (df_osmotic['Temperature (K)'] < 300.0) & (df_osmotic['Pressure (kPa)'] == 101 ) & (df_osmotic['N Components'] == 2)]

In [23]:
# print(df_select)

# df_select_smiles1=df_select['Component 1']
# df_select_smiles2=df_select['Component 2']
# df_select_smiles3=df_select['Component 3']

# smiles_to_figures(df_select_smiles1)
# smiles_to_figures(df_select_smiles2)
# smiles_to_figures(df_select_smiles3)

In [24]:
# df_select[
# ['Id',
#  'Temperature (K)',
#  'Pressure (kPa)',
#  'N Components',
#  'Component 1',
#  'Role 1',
#  'Mole Fraction 1',
#  'Component 2',
#  'Role 2',
#  'Mole Fraction 2',
#  'OsmoticCoefficient Value ()',
#  'OsmoticCoefficient Uncertainty ()',
#  'Source']
# ].to_csv('osm_selection.csv', index=False)

# Hmix + Density

## 1) Loading ThermoML Data Sets

In [25]:
# ThermoML: read from 'hmix_data.csv' when that cache exists, otherwise fetched from the
# DOIs in 'sorted_hmix_dois.json' and cached to the CSV.
hmix_data_set=extract_data_dois('sorted_hmix_dois.json','hmix_data.csv')

In [26]:
# Sage: load the training set from its JSON file.
hmix_dens_data_set_sage = PhysicalPropertyDataSet.from_json("sage-training-set.json")

## 2) Filtering data set

### Filtering properties schemas

In [None]:
# Filter schemas applied below to both the ThermoML and the Sage data sets:
# enthalpy of mixing only, density only, and both property types (strict mode).
hmix_schema = FilterByPropertyTypesSchema(property_types=["EnthalpyOfMixing"])
dens_schema = FilterByPropertyTypesSchema(property_types=["Density"])
hmix_dens_schema = FilterByPropertyTypesSchema(property_types=["EnthalpyOfMixing","Density"], strict=True)

### ThermoML DOIs for Hmix of binary mixtures

In [28]:
# Hmix + density: apply the combined (strict) schema to the ThermoML data set
data_set_hmix_dens= FilterByPropertyTypes.apply(hmix_data_set, hmix_dens_schema)

# save for future use
tml_hmix_dens_path = Path('tml_filtered_hmix_dens.json')
data_set_hmix_dens.json(tml_hmix_dens_path, format=True)

# CSV export of the same filtered data (row index omitted); the frame is reused by later cells
df_hmix_dens=data_set_hmix_dens.to_pandas()
df_hmix_dens.to_csv('tml_hmix_dens_database.csv',index=False)

In [29]:
# Hmix only: apply the enthalpy-of-mixing schema to the ThermoML data set
data_set_hmix = FilterByPropertyTypes.apply(hmix_data_set, hmix_schema)

# save for future use
tml_hmix_path = Path('tml_filtered_hmix.json')
data_set_hmix.json(tml_hmix_path, format=True)

# CSV export of the same filtered data (row index omitted); the frame is reused by later cells
df_hmix=data_set_hmix.to_pandas()
df_hmix.to_csv('tml_hmix_database.csv',index=False)
# df_hmix.head()
# df_hmix.columns.to_list()



KeyboardInterrupt: 

In [None]:
# Density only: apply the density schema to the ThermoML data set
data_set_dens= FilterByPropertyTypes.apply(hmix_data_set, dens_schema)

# save for future use
tml_dens_path = Path('tml_filtered_dens.json')
data_set_dens.json(tml_dens_path, format=True)

# CSV export of the same filtered data (row index omitted); the frame is reused by later cells
df_dens=data_set_dens.to_pandas()
df_dens.to_csv('tml_dens_database.csv',index=False)

In [None]:
# Report the ThermoML data set size before filtering and after each of the three filters.
print(f'ThermoML - Number of initial data points: {len(hmix_data_set)}')
print(f'ThermoML - Number of hmix+density data points: {len(data_set_hmix_dens)}')
print(f'ThermoML - Number of hmix data points: {len(data_set_hmix)}')
print(f'ThermoML - Number of density data points: {len(data_set_dens)}')

ThermoML - Number of initial data points: 51130
ThermoML - Number of hmix+density data points: 47709
ThermoML - Number of hmix data points: 31189
ThermoML - Number of density data points: 16520


### Sage hmix+density training set

In [None]:
# Hmix + density: apply the combined (strict) schema to the Sage training set
sage_hmix_dens= FilterByPropertyTypes.apply(hmix_dens_data_set_sage, hmix_dens_schema)

# save for future use
sage_hmix_dens_path = Path('sage_filtered_hmix_dens.json')
sage_hmix_dens.json(sage_hmix_dens_path, format=True)

# CSV export of the same filtered data (row index omitted); the frame is reused by later cells
df_sage_hmix_dens=sage_hmix_dens.to_pandas()
df_sage_hmix_dens.to_csv('sage_hmix_dens_database.csv',index=False)

In [None]:
# Hmix only: apply the enthalpy-of-mixing schema to the Sage training set
sage_hmix = FilterByPropertyTypes.apply(hmix_dens_data_set_sage, hmix_schema)

# save for future use
sage_hmix_path = Path('sage_filtered_hmix.json')
sage_hmix.json(sage_hmix_path, format=True)

# CSV export of the same filtered data (row index omitted); the frame is reused by later cells
df_sage_hmix=sage_hmix.to_pandas()
df_sage_hmix.to_csv('sage_hmix_database.csv',index=False)

In [None]:
# Density only: apply the density schema to the Sage training set
sage_dens = FilterByPropertyTypes.apply(hmix_dens_data_set_sage, dens_schema)

# save for future use
sage_dens_path = Path('sage_filtered_dens.json')
sage_dens.json(sage_dens_path, format=True)

# CSV export of the same filtered data (row index omitted); the frame is reused by later cells
df_sage_dens=sage_dens.to_pandas()
df_sage_dens.to_csv('sage_dens_database.csv',index=False)

In [None]:
# Report the Sage data set size before filtering and after each of the three filters.
print(f'Sage - Number of initial data points: {len(hmix_dens_data_set_sage)}')
print(f'Sage - Number of hmix+density data points: {len(sage_hmix_dens)}')
print(f'Sage - Number of hmix data points: {len(sage_hmix)}')
print(f'Sage - Number of density data points: {len(sage_dens)}')

Sage - Number of initial data points: 1032
Sage - Number of hmix+density data points: 1032
Sage - Number of hmix data points: 477
Sage - Number of density data points: 555


## 3) Get data information

### ThermoML

#### Hmix+dens

In [None]:
# Distinct component SMILES of the ThermoML hmix+density set, saved as a one-column CSV
# (get_smiles also prints summary counts).
hmix_dens_indiv_compnts=get_smiles(data_set_hmix_dens)
hmix_dens_indiv_compnts_df=pd.DataFrame(hmix_dens_indiv_compnts)
hmix_dens_indiv_compnts_df.to_csv('tml_hmix_dens_indiv_comps.csv')

Number of initial data points: 47709
Number of individual mixtures: 34287
Number of individual components: 349


In [None]:
# Get the unique mixtures data with counts
hmix_dens_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(df_hmix_dens)

# One CSV row per (mixture, temperature, pressure) state point. The 'unique_mixtures'
# bookkeeping entry is skipped because it is not a mixture.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in hmix_dens_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]

# Explicit columns keep the CSV header even when there are no rows.
mixtures_df = pd.DataFrame(mixtures_list, columns=["Mixture", "Temperature", "Pressure", "Count"])

# Save to CSV with the additional 'Count' column
output_path = 'tml_hmix_dens_uniqmix_temp_press.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture with its data-point count. A mixture without any valid
# (temperature, pressure) pair may have no entry of its own, so fall back to a count of 0
# instead of raising KeyError.
hmix_dens_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": hmix_dens_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in hmix_dens_unique_mixtures_temperatures_pressures["unique_mixtures"]
]

# Named after the hmix+density data it holds (it used to be stored as `hmix_umixtures_df`).
hmix_dens_umixtures_df = pd.DataFrame(hmix_dens_umix_list, columns=["Mixture", "Count"])

# Save to CSV with the additional 'Count' column
output_path2 = 'tml_hmix_dens_uniqmixs.csv'
hmix_dens_umixtures_df.to_csv(output_path2, index=False)

print(f'ThermoML - Number of hmix+dens unique mixtures: {len(hmix_dens_umix_list)}')


ThermoML - Number of hmix+dens unique mixtures: 1494


#### Hmix

In [None]:
# Distinct component SMILES of the ThermoML hmix set, saved as a one-column CSV
# (get_smiles also prints summary counts).
hmix_indiv_compnts=get_smiles(data_set_hmix)
hmix_indiv_compnts_df=pd.DataFrame(hmix_indiv_compnts)
hmix_indiv_compnts_df.to_csv('tml_hmix_indiv_comps.csv')


Number of initial data points: 31189
Number of individual mixtures: 25994
Number of individual components: 337


In [None]:
# Get the unique mixtures data with counts
hmix_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(df_hmix)

# One CSV row per (mixture, temperature, pressure) state point. The 'unique_mixtures'
# bookkeeping entry is skipped because it is not a mixture.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in hmix_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]

# Explicit columns keep the CSV header even when there are no rows.
mixtures_df = pd.DataFrame(mixtures_list, columns=["Mixture", "Temperature", "Pressure", "Count"])

# Save to CSV with the additional 'Count' column
output_path = 'tml_hmix_uniqmix_temp_press.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture with its data-point count. A mixture without any valid
# (temperature, pressure) pair may have no entry of its own, so fall back to a count of 0
# instead of raising KeyError.
hmix_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": hmix_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in hmix_unique_mixtures_temperatures_pressures["unique_mixtures"]
]

hmix_umixtures_df = pd.DataFrame(hmix_umix_list, columns=["Mixture", "Count"])

# Save to CSV with the additional 'Count' column
output_path2 = 'tml_hmix_uniqmixs.csv'
hmix_umixtures_df.to_csv(output_path2, index=False)

print(f'ThermoML - Number of hmix unique mixtures: {len(hmix_umix_list)}')

ThermoML - Number of hmix unique mixtures: 1232


#### Density

In [None]:
# Distinct component SMILES of the ThermoML density set, saved as a one-column CSV
# (get_smiles also prints summary counts).
dens_indiv_compnts=get_smiles(data_set_dens)
dens_indiv_compnts_df=pd.DataFrame(dens_indiv_compnts)
dens_indiv_compnts_df.to_csv('tml_dens_indiv_comps.csv')

Number of initial data points: 16520
Number of individual mixtures: 8820
Number of individual components: 254


In [None]:
# Get the unique mixtures data with counts
dens_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(df_dens)

# One CSV row per (mixture, temperature, pressure) state point. The 'unique_mixtures'
# bookkeeping entry is skipped because it is not a mixture.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in dens_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]

# Explicit columns keep the CSV header even when there are no rows.
mixtures_df = pd.DataFrame(mixtures_list, columns=["Mixture", "Temperature", "Pressure", "Count"])

# Save to CSV with the additional 'Count' column
output_path = 'tml_dens_uniqmix_temp_press.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture with its data-point count. A mixture without any valid
# (temperature, pressure) pair may have no entry of its own, so fall back to a count of 0
# instead of raising KeyError.
dens_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": dens_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in dens_unique_mixtures_temperatures_pressures["unique_mixtures"]
]

# Stored under a density-specific name so the hmix frame `hmix_umixtures_df` is not overwritten.
dens_umixtures_df = pd.DataFrame(dens_umix_list, columns=["Mixture", "Count"])

# Save to CSV with the additional 'Count' column
output_path2 = 'tml_dens_uniqmixs.csv'
dens_umixtures_df.to_csv(output_path2, index=False)

print(f'ThermoML - Number of dens unique mixtures: {len(dens_umix_list)}')

ThermoML - Number of dens unique mixtures: 656


### Sage

#### Hmix + Density

In [None]:
# Distinct component SMILES of the Sage hmix+density set, saved as a one-column CSV
# (get_smiles also prints summary counts).
sage_hmix_dens_indiv_compnts=get_smiles(sage_hmix_dens)
sage_hmix_dens_indiv_compnts_df=pd.DataFrame(sage_hmix_dens_indiv_compnts)
sage_hmix_dens_indiv_compnts_df.to_csv('sage_hmix_dens_indiv_comps.csv')

Number of initial data points: 1032
Number of individual mixtures: 1016
Number of individual components: 121


In [None]:
# Get the unique mixtures data with counts (only two component columns are used for the Sage frame)
sage_hmix_dens_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(df_sage_hmix_dens,component_columns=['Component 1', 'Component 2'])

# One CSV row per (mixture, temperature, pressure) state point. The 'unique_mixtures'
# bookkeeping entry is skipped because it is not a mixture.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in sage_hmix_dens_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]

# Explicit columns keep the CSV header even when there are no rows.
mixtures_df = pd.DataFrame(mixtures_list, columns=["Mixture", "Temperature", "Pressure", "Count"])

# Save to CSV with the additional 'Count' column
output_path = 'sage_hmix_dens_uniqmix_temp_press.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture with its data-point count. A mixture without any valid
# (temperature, pressure) pair may have no entry of its own, so fall back to a count of 0
# instead of raising KeyError.
# Sage-specific names keep the ThermoML results (`hmix_dens_umix_list`, `hmix_umixtures_df`) intact.
sage_hmix_dens_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": sage_hmix_dens_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in sage_hmix_dens_unique_mixtures_temperatures_pressures["unique_mixtures"]
]

sage_hmix_dens_umixtures_df = pd.DataFrame(sage_hmix_dens_umix_list, columns=["Mixture", "Count"])

# Save to CSV with the additional 'Count' column
output_path2 = 'sage_hmix_dens_uniqmixs.csv'
sage_hmix_dens_umixtures_df.to_csv(output_path2, index=False)

print(f'Sage - Number of hmix+dens unique mixtures: {len(sage_hmix_dens_umix_list)}')

Sage - Number of hmix+dens unique mixtures: 236


#### Hmix

In [None]:
# Distinct component SMILES of the Sage hmix set, saved as a one-column CSV
# (get_smiles also prints summary counts).
sage_hmix_indiv_compnts=get_smiles(sage_hmix)
sage_hmix_indiv_compnts_df=pd.DataFrame(sage_hmix_indiv_compnts)
sage_hmix_indiv_compnts_df.to_csv('sage_hmix_indiv_comps.csv')

Number of initial data points: 477
Number of individual mixtures: 477
Number of individual components: 110


In [None]:
# Get the unique mixtures data with counts (only two component columns are used for the Sage frame)
sage_hmix_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(df_sage_hmix,component_columns=['Component 1', 'Component 2'])

# One CSV row per (mixture, temperature, pressure) state point. The 'unique_mixtures'
# bookkeeping entry is skipped because it is not a mixture.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in sage_hmix_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]

# Explicit columns keep the CSV header even when there are no rows.
mixtures_df = pd.DataFrame(mixtures_list, columns=["Mixture", "Temperature", "Pressure", "Count"])

# Save to CSV with the additional 'Count' column
output_path = 'sage_hmix_uniqmix_temp_press.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture with its data-point count. A mixture without any valid
# (temperature, pressure) pair may have no entry of its own, so fall back to a count of 0
# instead of raising KeyError.
# Sage-specific names keep the ThermoML results (`hmix_umix_list`, `hmix_umixtures_df`) intact.
sage_hmix_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": sage_hmix_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in sage_hmix_unique_mixtures_temperatures_pressures["unique_mixtures"]
]

sage_hmix_umixtures_df = pd.DataFrame(sage_hmix_umix_list, columns=["Mixture", "Count"])

# Save to CSV with the additional 'Count' column
output_path2 = 'sage_hmix_uniqmixs.csv'
sage_hmix_umixtures_df.to_csv(output_path2, index=False)

print(f'Sage - Number of hmix unique mixtures: {len(sage_hmix_umix_list)}')

Sage - Number of hmix unique mixtures: 160


#### Density

In [None]:
# Distinct component SMILES of the Sage density set, saved as a one-column CSV
# (get_smiles also prints summary counts).
sage_dens_indiv_compnts=get_smiles(sage_dens)
sage_dens_indiv_compnts_df=pd.DataFrame(sage_dens_indiv_compnts)
sage_dens_indiv_compnts_df.to_csv('sage_dens_indiv_comps.csv')

Number of initial data points: 555
Number of individual mixtures: 555
Number of individual components: 121


In [None]:
# Get the unique mixtures data with counts (only two component columns are used for the Sage frame)
sage_dens_unique_mixtures_temperatures_pressures = get_unique_temperature_pressure_for_mixtures(df_sage_dens,component_columns=['Component 1', 'Component 2'])

# One CSV row per (mixture, temperature, pressure) state point. The 'unique_mixtures'
# bookkeeping entry is skipped because it is not a mixture.
mixtures_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Temperature": temp,
        "Pressure": press,
        "Count": details["count"],
    }
    for mixture, details in sage_dens_unique_mixtures_temperatures_pressures.items()
    if mixture != 'unique_mixtures'
    for temp, press in details["temp_pressures"]
]

# Explicit columns keep the CSV header even when there are no rows.
mixtures_df = pd.DataFrame(mixtures_list, columns=["Mixture", "Temperature", "Pressure", "Count"])

# Save to CSV with the additional 'Count' column
output_path = 'sage_dens_uniqmix_temp_press.csv'
mixtures_df.to_csv(output_path, index=False)

# One CSV row per unique mixture with its data-point count. A mixture without any valid
# (temperature, pressure) pair may have no entry of its own, so fall back to a count of 0
# instead of raising KeyError.
# Sage-specific names keep the ThermoML density results (`dens_umix_list`) intact.
sage_dens_umix_list = [
    {
        "Mixture": " | ".join(str(comp) for comp in mixture if pd.notna(comp)),
        "Count": sage_dens_unique_mixtures_temperatures_pressures.get(mixture, {"count": 0})["count"],
    }
    for mixture in sage_dens_unique_mixtures_temperatures_pressures["unique_mixtures"]
]

sage_dens_umixtures_df = pd.DataFrame(sage_dens_umix_list, columns=["Mixture", "Count"])

# Save to CSV with the additional 'Count' column
output_path2 = 'sage_dens_uniqmixs.csv'
sage_dens_umixtures_df.to_csv(output_path2, index=False)

print(f'Sage - Number of dens unique mixtures: {len(sage_dens_umix_list)}')

Sage - Number of dens unique mixtures: 234
