# Core Imports and Setup

In [1]:
import os
import pandas as pd
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

import logging
# logging.getLogger("openff.toolkit").setLevel(logging.ERROR)
# logging.basicConfig(level=logging.DEBUG)

import json
from openff import toolkit, evaluator

In [2]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw, MolFromSmiles
from rdkit.Chem.Draw import MolsToGridImage, rdMolDraw2D, MolsMatrixToGridImage
import pubchempy
from PIL import Image, ImageDraw, ImageFont
IPythonConsole.ipython_useSVG=False  #< set this to False if you want PNGs instead of SVGs

In [3]:
from rdkit.Chem import rdMolDescriptors
import pubchempy as pcp
import math

In [4]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase
from openff.evaluator.datasets.thermoml import thermoml_property
from openff.evaluator import properties
from openff.units import unit
from openff.evaluator.datasets.thermoml import ThermoMLDataSet
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase, PhysicalPropertyDataSet

In [5]:
from openff.evaluator.datasets.curation.components.filtering import FilterByPropertyTypes, FilterByPropertyTypesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByTemperature, FilterByTemperatureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByPressure, FilterByPressureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterBySmiles, FilterBySmilesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByNComponents, FilterByNComponentsSchema


In [6]:
from rich.progress import track
from collections import defaultdict
from openff.toolkit.utils.exceptions import UndefinedStereochemistryError

# Registering New Properties

In [7]:
@thermoml_property("Osmotic coefficient", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class OsmoticCoefficient(PhysicalProperty):
    """A class representation of a osmotic coeff property"""

    @classmethod
    def default_unit(cls):
        return unit.dimensionless
    
setattr(properties, OsmoticCoefficient.__name__, OsmoticCoefficient)

ThermoMLDataSet.registered_properties['Osmotic coefficient'].conversion_function

functools.partial(<function _default_mapping at 0x7f726a9c2cb0>, <class '__main__.OsmoticCoefficient'>)

In [8]:
@thermoml_property("Activity coefficient", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class ActivityCoefficient(PhysicalProperty):
    """A class representation of a Activity coeff property"""

    @classmethod
    def default_unit(cls):
        return unit.dimensionless
    
setattr(properties, ActivityCoefficient.__name__, ActivityCoefficient)

ThermoMLDataSet.registered_properties['Activity coefficient'].conversion_function

functools.partial(<function _default_mapping at 0x7f726a9c2cb0>, <class '__main__.ActivityCoefficient'>)

In [9]:
@thermoml_property("Speed Of Sound (m/s)", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class SpeedOfSound(PhysicalProperty):
    """A class representation of a Speed Of Sound property"""

    @classmethod
    def default_unit(cls):
        return unit.meter / unit.second
    
setattr(properties, SpeedOfSound.__name__, SpeedOfSound)

ThermoMLDataSet.registered_properties['Speed Of Sound (m/s)'].conversion_function

functools.partial(<function _default_mapping at 0x7f726a9c2cb0>, <class '__main__.SpeedOfSound'>)

# Functions

In [10]:
def filter_database (unfiltered_directory):
    data_set=[]
    # Loop through all files in the directory
    for filename in os.listdir(unfiltered_directory):
        # Check if the file ends with .json
        if filename.endswith('.xml'):
            # Full path to the file
            file_path = os.path.join(unfiltered_directory, filename)
            data_set.append(file_path)
    
    sorted_dois = defaultdict(list)
    for doi in track(data_set, description='Filtering DOIs...'):
        try:
            dataset = ThermoMLDataSet._from_file(doi)
            sorted_dois['working'].append(doi)
        except UndefinedStereochemistryError:
            sorted_dois['stereo_fail'].append(doi)
        except Exception as other_exc:
            sorted_dois[other_exc.__class__.__name__].append(doi)

    print('Amount of failing files: %i/%i' % (len(sorted_dois['stereo_fail']),len(data_set)))
    print('Amount of working files: %i/%i' % (len(sorted_dois['working']),len(data_set)))

    return sorted_dois['working']

In [11]:
def extract_database (database_directory, csv_cached):
    CACHED_PROP_PATH = Path(csv_cached)

    if CACHED_PROP_PATH.exists():
        prop_df = pd.read_csv(CACHED_PROP_PATH, index_col=0)
        # delete rows with underfined thermo params to avoid pesky indexing errors
        prop_df = prop_df.dropna(subset=['Temperature (K)'])
        prop_df = prop_df.dropna(subset=['Pressure (kPa)'])
        data_set = ThermoMLDataSet.from_pandas(prop_df)
    else:
        sorted_files=filter_database(database_directory)
        data_set=ThermoMLDataSet.from_file(*sorted_files)

        initial_df = data_set.to_pandas()
        with CACHED_PROP_PATH.open('w') as file:
            initial_df.to_csv(CACHED_PROP_PATH)
    return data_set

# Import Directory

In [12]:
database='merged'

In [13]:
# Specify the directory containing the CSV files
csv_directory = './'  # Change to your directory path

# List of specific CSV files to read
csv_files_to_read = ['filtered_1007.csv', 'filtered_1016.csv', 'filtered_1021.csv']

# Create an empty dictionary to store DataFrames
dataframes_dict = {}

# Loop through the specified list of files
for filename in csv_files_to_read:
    # Generate the full path to the file
    file_path = os.path.join(csv_directory, filename)
    
    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    
    # Extract the last four numbers before the extension to use as the key
    key = filename.split('_')[-1].split('.')[0]
    dataframes_dict[key] = df

# Merge all DataFrames together without repeating data points
merged_dataframe = pd.concat(dataframes_dict.values()).drop_duplicates().reset_index(drop=True)

# Print the keys of the dictionary to verify
print("Loaded DataFrames:", dataframes_dict.keys())

# Print the merged DataFrame to verify
print("Merged DataFrame:")
print(merged_dataframe)
merged_dataframe.to_csv(f'filtered_merged.csv')

Loaded DataFrames: dict_keys(['1007', '1016', '1021'])
Merged DataFrame:
        Unnamed: 0                                Id  Temperature (K)  \
0               13  7616bb9f827b4003bf01c20e13f08c1b           298.15   
1               14  41426fd0c07044919d388155321abcac           298.15   
2               15  298aee969c294104aa5c397f73b81e77           298.15   
3               16  39ced9cf9c7543668f0e61db747ab6c9           298.15   
4               17  7aa18c4cd41a40ee80a6b86bf2e61297           298.15   
...            ...                               ...              ...   
158432      324961  837dfe6504134a1482d50cb206fadd5f           298.15   
158433      324962  e360156829d844a7824d971f470dbb08           298.15   
158434      324963  c3cd4aa0d7474690ac64db2a020b93fc           298.15   
158435      324964  db5ee64564e8455fa6b27826e7d61794           298.15   
158436      324965  7df62d582a7646e48d811f2d7602926f           298.15   

        Pressure (kPa)   Phase  N Components  Comp

In [14]:
filtered_dataset=ThermoMLDataSet.from_pandas(merged_dataframe)

# Counting with Evaluator

## Filtering properties schemas

In [15]:
hmix_schema = FilterByPropertyTypesSchema(property_types=["EnthalpyOfMixing"], strict=True)
dens_schema = FilterByPropertyTypesSchema(property_types=["Density"], strict=True)
hmix_dens_strict_schema = FilterByPropertyTypesSchema(property_types=["EnthalpyOfMixing","Density"], strict=True)

In [16]:
osmotic_schema = FilterByPropertyTypesSchema(property_types=["OsmoticCoefficient"], strict=False)
activity_schema = FilterByPropertyTypesSchema(property_types=["ActivityCoefficient"], strict=False)
oc_ac_strict_schema = FilterByPropertyTypesSchema(property_types=["OsmoticCoefficient","ActivityCoefficient"], strict=True)

## ThermoML Hmix and Density of binary mixtures

### Filtering

In [17]:
# Hmix + density
tml_hd_usable= FilterByPropertyTypes.apply(filtered_dataset, hmix_dens_strict_schema)
print(len(tml_hd_usable))

# save for future use
tml_hmix_dens_path = Path(f'filtered_{database}/tml_filtered_hd_usable.json')
tml_hd_usable.json(tml_hmix_dens_path, format=True)

df_hmix_dens=tml_hd_usable.to_pandas()
df_hmix_dens.to_csv(f'filtered_{database}/tml_hd_usable_database.csv',index=False)

32345


In [18]:
# Hmix only
tml_hmix_usable = FilterByPropertyTypes.apply(filtered_dataset, hmix_schema)
print(len(tml_hmix_usable))

# save for future use
tml_hmix_path = Path(f'filtered_{database}/tml_filtered_hmix_usable.json')
tml_hmix_usable.json(tml_hmix_path, format=True)

df_hmix=tml_hmix_usable.to_pandas()
df_hmix.to_csv(f'filtered_{database}/tml_hmix_usable_database.csv',index=False)

20309


In [19]:
# Density only
tml_dens_usable= FilterByPropertyTypes.apply(filtered_dataset, dens_schema)
print(len(tml_dens_usable))

# save for future use
tml_dens_path = Path(f'filtered_{database}/tml_filtered_dens_usable.json')
tml_dens_usable.json(tml_dens_path, format=True)

df_dens=tml_dens_usable.to_pandas()
df_dens.to_csv(f'filtered_{database}/tml_dens_usable_database.csv',index=False)

129564


In [20]:
print(f'ThermoML - Number of initial data points: {len(filtered_dataset)}')
print(f'ThermoML - Number of hmix+density data points: {len(tml_hd_usable)}')
print(f'ThermoML - Number of hmix data points: {len(tml_hmix_usable)}')
print(f'ThermoML - Number of density data points: {len(tml_dens_usable)}')

ThermoML - Number of initial data points: 158437
ThermoML - Number of hmix+density data points: 32345
ThermoML - Number of hmix data points: 20309
ThermoML - Number of density data points: 129564


### Get data information

In [21]:
import pandas as pd
import math

def get_unique_temperature_pressure_for_mixtures(data, component_columns=['Component 1', 'Component 2']):
    """
    Loops through a dataframe and returns a dictionary where each unique
    combination of specified component columns is a key. The value for each key is 
    a dictionary containing a list of tuples with temperatures, pressures, and their counts.

    Parameters:
    - data (pd.DataFrame): DataFrame containing the data
    - component_columns (list): List of column names representing the mixture components
                                 Defaults to ['Component 1', 'Component 2'].

    Returns:
    - dict: A dictionary with unique mixtures as keys and a dictionary with temperature-pressure
            information and counts as values.
    """
    mixtures_dict = {}
    unique_mixtures = set()

    for _, row in data.iterrows():
        # Create a normalized tuple of the mixture components (sorted for order-independent comparison)
        mixture_key = tuple(sorted([row[component] for component in component_columns if pd.notna(row[component])]))

        # Add to unique mixtures set (ignores duplicates)
        unique_mixtures.add(mixture_key)

        # Extract temperature and pressure
        temp_pressure = (row['Temperature (K)'], row['Pressure (kPa)'])

        # Only add to the dictionary if both temperature and pressure are valid numbers
        if not (math.isnan(temp_pressure[0]) or math.isnan(temp_pressure[1])):
            if mixture_key not in mixtures_dict:
                mixtures_dict[mixture_key] = {"temp_pressures": {}, "count": 0}

            # Increment count for the specific temperature-pressure combination
            if temp_pressure not in mixtures_dict[mixture_key]["temp_pressures"]:
                mixtures_dict[mixture_key]["temp_pressures"][temp_pressure] = 0

            mixtures_dict[mixture_key]["temp_pressures"][temp_pressure] += 1
            mixtures_dict[mixture_key]["count"] += 1

    # Format the temp_pressures dictionary as a list of tuples for compatibility
    mixtures_dict = {
        key: {
            "temp_pressures": [(tp[0], tp[1], count) for tp, count in value["temp_pressures"].items()],
            "count": value["count"]
        }
        for key, value in mixtures_dict.items()
    }

    # Add deduplicated list of unique mixtures
    mixtures_dict['unique_mixtures'] = list(unique_mixtures)

    return mixtures_dict


In [22]:
# import pandas as pd
# import math

# def get_unique_temperature_pressure_for_mixtures(data, component_columns=['Component 1', 'Component 2']):
#     """
#     Loops through a dataframe and returns a dictionary where each unique
#     combination of specified component columns is a key. The value for each key is 
#     a dictionary containing a list of tuples with temperatures, pressures, and their counts.

#     Parameters:
#     - data (pd.DataFrame): DataFrame containing the data
#     - component_columns (list): List of column names representing the mixture components
#                                  Defaults to ['Component 1', 'Component 2'].

#     Returns:
#     - dict: A dictionary with unique mixtures as keys and a dictionary with temperature-pressure
#             information and counts as values.
#     """
#     mixtures_dict = {}
#     unique_mixtures = set()

#     for _, row in data.iterrows():
#         # Create a tuple of the mixture components
#         mixture_key = tuple(row[component] for component in component_columns)

#         # Add to unique mixtures set (ignores duplicates)
#         unique_mixtures.add(mixture_key)

#         # Extract temperature and pressure
#         temp_pressure = (row['Temperature (K)'], row['Pressure (kPa)'])

#         # Only add to the dictionary if both temperature and pressure are valid numbers
#         if not (math.isnan(temp_pressure[0]) or math.isnan(temp_pressure[1])):
#             if mixture_key not in mixtures_dict:
#                 mixtures_dict[mixture_key] = {"temp_pressures": {}, "count": 0}

#             # Increment count for the specific temperature-pressure combination
#             if temp_pressure not in mixtures_dict[mixture_key]["temp_pressures"]:
#                 mixtures_dict[mixture_key]["temp_pressures"][temp_pressure] = 0

#             mixtures_dict[mixture_key]["temp_pressures"][temp_pressure] += 1
#             mixtures_dict[mixture_key]["count"] += 1

#     # Format the temp_pressures dictionary as a list of tuples for compatibility
#     mixtures_dict = {
#         key: {
#             "temp_pressures": [(tp[0], tp[1], count) for tp, count in value["temp_pressures"].items()],
#             "count": value["count"]
#         }
#         for key, value in mixtures_dict.items()
#     }

#     # Add deduplicated list of unique mixtures
#     mixtures_dict['unique_mixtures'] = list(unique_mixtures)

#     return mixtures_dict


In [23]:
# def get_uniquemixs_counts(dataframe, properties):
#     # Get the unique mixtures data with counts
#     uniquemixs_temps_press = get_unique_temperature_pressure_for_mixtures(dataframe)

#     # Prepare data for saving to a CSV file, excluding the 'unique_mixtures' entry itself
#     mixtures_list = []

#     for mixture, details in uniquemixs_temps_press.items():
#         # Skip the 'unique_mixtures' key as it's not part of the temperature/pressure data
#         if mixture == 'unique_mixtures':
#             continue

#         # Extract temperature-pressure pairs and their counts for each mixture
#         for temp, press, temp_count in details["temp_pressures"]:
#             mixtures_list.append({
#                 "Mixture": " | ".join([str(comp) for comp in mixture if pd.notna(comp)]),
#                 "Temperature": temp,
#                 "Pressure": press,
#                 "Count": temp_count  # Use temp_count instead of the overall count
#             })

#     # Convert to DataFrame
#     mixtures_df = pd.DataFrame(mixtures_list)

#     # Save to CSV with the additional 'Count' column
#     output_path = f'filtered_{database}/tml_uniqmixs_thermo_{properties}.csv'
#     mixtures_df.to_csv(output_path, index=False)

#     # Prepare a list to store each unique mixture and its overall count for the CSV
#     umixs_list = [
#         {
#             "Mixture": " | ".join([str(comp) for comp in mixture if pd.notna(comp)]),
#             "Count": uniquemixs_temps_press[mixture]["count"]
#         }
#         for mixture in uniquemixs_temps_press["unique_mixtures"]
#     ]

#     # Convert to DataFrame
#     hmix_umixtures_df = pd.DataFrame(umixs_list)

#     # Save to CSV with the additional 'Count' column
#     output_path2 = f'filtered_{database}/tml_uniqmixs_{properties}.csv'
#     hmix_umixtures_df.to_csv(output_path2, index=False)

#     print(f'ThermoML - Number of {properties} mixtures: {len(umixs_list)}')


In [24]:
def get_uniquemixs_counts(dataframe, properties):
    # Get the unique mixtures data with counts
    uniquemixs_temps_press = get_unique_temperature_pressure_for_mixtures(dataframe)

    # Prepare data for saving to a CSV file, excluding the 'unique_mixtures' entry itself
    mixtures_list = []

    for mixture, details in uniquemixs_temps_press.items():
        # Skip the 'unique_mixtures' key as it's not part of the temperature/pressure data
        if mixture == 'unique_mixtures':
            continue

        # Normalize mixture by sorting components (to handle order-independent comparison)
        normalized_mixture = tuple(sorted([str(comp) for comp in mixture if pd.notna(comp)]))

        # Extract temperature-pressure pairs and their counts for each mixture
        for temp, press, temp_count in details["temp_pressures"]:
            mixtures_list.append({
                "Mixture": " | ".join(normalized_mixture),
                "Temperature": temp,
                "Pressure": press,
                "Count": temp_count  # Use temp_count instead of the overall count
            })

    # Convert to DataFrame
    mixtures_df = pd.DataFrame(mixtures_list)

    # Save to CSV with the additional 'Count' column
    output_path = f'filtered_{database}/tml_uniqmixs_thermo_{properties}.csv'
    mixtures_df.to_csv(output_path, index=False)

    # Prepare a dictionary to aggregate counts for normalized mixtures
    normalized_counts = {}

    for mixture, details in uniquemixs_temps_press.items():
        # Skip the 'unique_mixtures' key as it's not part of the mixture data
        if mixture == 'unique_mixtures':
            continue

        # Normalize mixture by sorting components
        normalized_mixture = tuple(sorted([str(comp) for comp in mixture if pd.notna(comp)]))

        # Aggregate counts for normalized mixtures
        normalized_counts[normalized_mixture] = normalized_counts.get(normalized_mixture, 0) + details["count"]

    # Prepare a list to store each unique normalized mixture and its overall count for the CSV
    umixs_list = [
        {
            "Mixture": " | ".join(normalized_mixture),
            "Count": count
        }
        for normalized_mixture, count in normalized_counts.items()
    ]

    # Convert to DataFrame
    hmix_umixtures_df = pd.DataFrame(umixs_list)

    # Save to CSV with the additional 'Count' column
    output_path2 = f'filtered_{database}/tml_uniqmixs_{properties}.csv'
    hmix_umixtures_df.to_csv(output_path2, index=False)

    print(f'ThermoML - Number of {properties} mixtures: {len(umixs_list)}')


In [25]:
import os
import pandas as pd
import csv

def get_indiv_comps(database1, properties):
    """
    Processes a dataset to extract unique SMILES strings of components,
    taking into account duplicates and counting the occurrences of each SMILES.
    The results are saved to a CSV file.

    Parameters:
    - database1: Dataset containing substances with components.
    - properties: A string representing the properties for naming the output file.

    Returns:
    - list: A list of unique SMILES strings.
    """
    print(f'Number of initial data points: {len(database1)}')

    subs = list(set(database1.substances))
    print(f'Number of individual mixtures: {len(subs)}')

    smiles_counts = {}

    for i in subs:
        if len(i.components) == 3:
            for component in i.components:
                smiles = component.smiles
                if smiles in smiles_counts:
                    smiles_counts[smiles] += 1
                else:
                    smiles_counts[smiles] = 1
        elif len(i.components) == 2:
            for component in i.components:
                smiles = component.smiles
                if smiles in smiles_counts:
                    smiles_counts[smiles] += 1
                else:
                    smiles_counts[smiles] = 1
        elif len(i.components) == 1:
            smiles = i.components[0].smiles
            if smiles in smiles_counts:
                smiles_counts[smiles] += 1
            else:
                smiles_counts[smiles] = 1

    print(f'Number of individual components: {len(smiles_counts)}')

    # Convert to DataFrame and save to CSV
    indiv_compnts_df = pd.DataFrame(list(smiles_counts.items()), columns=['Component', 'Count'])
    
    # Ensure the directory exists
    output_dir = f'filtered_{database1}'
    os.makedirs(output_dir, exist_ok=True)
    
    output_path = os.path.join(output_dir, f'tml_indivcomps_{properties}.csv')
    indiv_compnts_df.to_csv(output_path, index=False)

    print(f'Individual components and counts saved to {output_path}')

    # Return the list of unique SMILES strings
    return list(smiles_counts.keys())


#### Hmix + dens

In [26]:
smiles_hd=get_indiv_comps(tml_hd_usable, 'hd')
get_uniquemixs_counts(df_hmix_dens, 'hd')

Number of initial data points: 32345
Number of individual mixtures: 18945
Number of individual components: 155
Individual components and counts saved to filtered_n_properties=32345 n_substances=18945 n_sources=32345/tml_indivcomps_hd.csv
ThermoML - Number of hd mixtures: 427


In [27]:
smiles_hmix=get_indiv_comps(tml_hmix_usable, 'hmix')
get_uniquemixs_counts(df_hmix, 'hmix')

Number of initial data points: 20309
Number of individual mixtures: 17262
Number of individual components: 258
Individual components and counts saved to filtered_n_properties=20309 n_substances=17262 n_sources=20309/tml_indivcomps_hmix.csv
ThermoML - Number of hmix mixtures: 879


In [28]:
smiles_dens=get_indiv_comps(tml_dens_usable, 'dens')
get_uniquemixs_counts(df_dens, 'dens')

Number of initial data points: 129564
Number of individual mixtures: 40839
Number of individual components: 582
Individual components and counts saved to filtered_n_properties=129564 n_substances=40839 n_sources=129564/tml_indivcomps_dens.csv
ThermoML - Number of dens mixtures: 2358


In [29]:
# def smiles_to_figures(smiles_list):
#     func_group_mols=[]
#     matchlegend=[]
#     for m in set(smiles_list):
#         if m != 'O':
#             compounds=pubchempy.get_compounds(m, namespace='smiles')
#             match = compounds[0]
#             match_name=str(match.iupac_name)
#             matchlegend.append(match)
#             mol=Chem.MolFromSmiles(m)
#             func_group_mols.append(mol)
#     init_fig=MolsToGridImage(mols=func_group_mols, legends=matchlegend, molsPerRow=3, returnPNG=False)
#     return init_fig

In [37]:
from rdkit import Chem
from rdkit.Chem import Draw
from PIL import Image, ImageDraw, ImageFont
import pubchempy
import os

def smiles_to_figures(smiles_list, directory="molecular_images", font_path="aileron_font/Aileron-Regular.otf"):
    """
    Generates molecular structure images with SMILES and IUPAC labels and saves them to a specified directory.

    :param smiles_list: List of SMILES strings to process.
    :param directory: Directory where images will be saved.
    :param font_path: Path to the TTF font file for text rendering.
    """
    images = []

    # Try to load the specified font
    try:
        font = ImageFont.truetype(font_path, 16)  # Adjust size as needed
    except IOError:
        print(f"Font not found at {font_path}, using default font.")
        font = ImageFont.load_default()

    # Create the directory if it does not exist
    if not os.path.exists(directory):
        os.makedirs(directory)

    for smiles in set(smiles_list):
        try:
            # Fetch compound details from PubChem
            compounds = pubchempy.get_compounds(smiles, namespace='smiles')
            if not compounds:
                continue
            match = compounds[0]
            iupac_name = str(match.iupac_name) if match.iupac_name else "Unknown"

            # Generate molecular structure image
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue
            mol_image = Draw.MolToImage(mol)

            # Create an image with additional text (SMILES and IUPAC name)
            width, height = mol_image.size
            text_height = 60
            total_height = height + 2 * text_height

            result_image = Image.new("RGB", (width, total_height), "white")
            result_image.paste(mol_image, (0, 0))

            # Add text below the image
            draw = ImageDraw.Draw(result_image)

            # SMILES string
            smiles_text_position = (10, height + 5)
            draw.text(smiles_text_position, f"SMILES: {smiles}", fill="black", font=font)

            # IUPAC name
            iupac_text_position = (10, height + text_height + 5)
            draw.text(iupac_text_position, f"IUPAC: {iupac_name}", fill="black", font=font)

            # Save the result image to the directory
            filepath = os.path.join(directory, f"molecular_image_{len(images) + 1}.png")
            result_image.save(filepath)
            print(f"Saved: {filepath}")

            # Append the result image to the list
            images.append(result_image)
        except Exception as e:
            print(f"Error processing SMILES {smiles}: {e}")

    return images


In [39]:
def remove_common_elements(list1, list2):
    """
    Removes elements in list2 that also exist in list1.

    :param list1: List of elements to check against.
    :param list2: List of elements to filter.
    :return: A new list with elements from list2 that are not in list1.
    """
    return [item for item in list2 if item not in list1]


In [40]:
def merge_unique(list1, list2):
    """
    Merges two lists without duplicating elements.

    :param list1: First list.
    :param list2: Second list.
    :return: A merged list containing unique elements from both lists.
    """
    return list(set(list1 + list2))

In [None]:
figures_hd=smiles_to_figures(smiles_hd, directory="molecular_images", font_path="aileron_font/Aileron-Regular.otf")

Saved: molecular_images/molecular_image_1.png
Saved: molecular_images/molecular_image_2.png
Saved: molecular_images/molecular_image_3.png
Saved: molecular_images/molecular_image_4.png
Saved: molecular_images/molecular_image_5.png
Saved: molecular_images/molecular_image_6.png
Saved: molecular_images/molecular_image_7.png
Saved: molecular_images/molecular_image_8.png
Saved: molecular_images/molecular_image_9.png
Saved: molecular_images/molecular_image_10.png
Saved: molecular_images/molecular_image_11.png
Saved: molecular_images/molecular_image_12.png
Saved: molecular_images/molecular_image_13.png
Saved: molecular_images/molecular_image_14.png
Saved: molecular_images/molecular_image_15.png
Saved: molecular_images/molecular_image_16.png
Saved: molecular_images/molecular_image_17.png
Saved: molecular_images/molecular_image_18.png
Saved: molecular_images/molecular_image_19.png
Saved: molecular_images/molecular_image_20.png
Saved: molecular_images/molecular_image_21.png
Saved: molecular_image

In [44]:
# Merge lists
smiles_hmix_dens_merged = merge_unique(smiles_hmix, smiles_dens)
print(len(smiles_hmix_dens_merged))

640


In [45]:
# Remove common elements
filtered_list = remove_common_elements(smiles_hd, smiles_hmix_dens_merged)
print(len(filtered_list))

485


In [47]:
import csv

def write_list_to_csv_column(data_list, csv_file, column_name="Column1"):
    """
    Writes a list into a single column of a CSV file.

    :param data_list: The list of elements to write.
    :param csv_file: The path to the CSV file.
    :param column_name: The name of the column (optional).
    """
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        
        # Write the header
        writer.writerow([column_name])
        
        # Write each item in the list as a new row
        for item in data_list:
            writer.writerow([item])

In [49]:
write_list_to_csv_column(filtered_list, 'filtered_smiles_hmix_dens')

In [50]:
print(filtered_list)

['C1COCCOCCOCCOCCOCCO1', 'CCCCCC(=O)O', 'OC[C@H]1O[C@H](OC[C@H]2O[C@H](O[C@]3(CO)O[C@H](CO)[C@@H](O)[C@@H]3O)[C@H](O)[C@@H](O)[C@@H]2O)[C@H](O)[C@@H](O)[C@H]1O', 'CCCN(CCC)CCC', 'Cc1cnccn1', 'CCOCCOCCOCC', 'CN1CCCN(C)C1=O', 'O=[PH](O)O', 'C1CCNC1', 'CCCc1ccccc1', 'CCCCCCOCC', 'CC(=O)OCCOC(C)=O', 'O=CO', 'O=C1CCCO1', 'CCCCCCCC(=O)O', 'OC[C@H]1O[C@@](CO)(O[C@H]2O[C@H](CO)[C@@H](O)[C@H](O)[C@H]2O)[C@@H](O[C@H]2O[C@H](CO)[C@@H](O)[C@H](O)[C@H]2O)[C@@H]1O', 'CC(=O)c1ccc(C)cc1', 'CCCCCCCCl', 'CCCCCCCCCCCC(=O)OC', 'CCC(C)(C)C', 'BrCBr', 'O=C(O)c1cccc(C(=O)O)c1', 'CC(C)(N)CO', 'C=COCCC', 'CCN1CCCC1=O', 'CC(CCO)CCO', 'COC(=O)CCCCC(=O)OC', 'CCCC(C)C', 'CCCCl', 'CCCCCCCCCC=O', 'CC(C)(CO)CO', 'C=CCCCCCC', 'Clc1ccccc1Cl', 'CC(C)(C)O', 'CCOCC', 'CCC(=O)OCCN1CCNCC1', 'CCOC(F)(F)C(F)(C(F)(F)F)C(F)(F)F', 'FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F', 'COC(=O)[C@@H](C)CO', 'CN1CCN(C)CC1', 'CC1=CC(=O)c2ccccc2C1=O', 'O=S(=O)(O)CCCCNC(CO)(CO)CO', 'CC(=O)CC(C)=O', 'CN(C)CC(=O)O', 'CCOC(=O)C(=O)OCC',

In [46]:
figures_hmix_dens=smiles_to_figures(filtered_list, directory="figures_hmix_dens", font_path="aileron_font/Aileron-Regular.otf")

Saved: figures_hmix_dens/molecular_image_1.png
Saved: figures_hmix_dens/molecular_image_2.png
Saved: figures_hmix_dens/molecular_image_3.png
Saved: figures_hmix_dens/molecular_image_4.png
Saved: figures_hmix_dens/molecular_image_5.png
Saved: figures_hmix_dens/molecular_image_6.png
Saved: figures_hmix_dens/molecular_image_7.png
Saved: figures_hmix_dens/molecular_image_8.png
Saved: figures_hmix_dens/molecular_image_9.png
Saved: figures_hmix_dens/molecular_image_10.png
Saved: figures_hmix_dens/molecular_image_11.png
Saved: figures_hmix_dens/molecular_image_12.png
Saved: figures_hmix_dens/molecular_image_13.png
Saved: figures_hmix_dens/molecular_image_14.png
Saved: figures_hmix_dens/molecular_image_15.png
Saved: figures_hmix_dens/molecular_image_16.png
Saved: figures_hmix_dens/molecular_image_17.png
Saved: figures_hmix_dens/molecular_image_18.png
Saved: figures_hmix_dens/molecular_image_19.png
Saved: figures_hmix_dens/molecular_image_20.png
Saved: figures_hmix_dens/molecular_image_21.png
S

## ThermoML Osmotic and Activity Coefficients

### FIltering

In [50]:
# # Osmotic + Activity
# tml_ocac_usable= FilterByPropertyTypes.apply(filtered_dataset, oc_ac_strict_schema)
# print(len(tml_ocac_usable))

# # save for future use
# tml_ocac_path = Path(f'filtered_{database}/tml_filtered_ocac_usable.json')
# tml_ocac_usable.json(tml_ocac_path, format=True)

# df_ocac=tml_ocac_usable.to_pandas()
# df_ocac.to_csv(f'filtered_{database}/tml_ocac_usable_database.csv',index=False)

In [51]:
# # Osmotic only
# tml_oc_usable = FilterByPropertyTypes.apply(filtered_dataset, osmotic_schema)
# print(len(tml_oc_usable))

# # save for future use
# tml_oc_path = Path(f'filtered_{database}/tml_filtered_oc_usable.json')
# tml_oc_usable.json(tml_oc_path, format=True)

# df_oc=tml_oc_usable.to_pandas()
# df_oc.to_csv(f'filtered_{database}/tml_oc_usable_database.csv',index=False)

In [52]:
# # Activity only
# tml_ac_usable = FilterByPropertyTypes.apply(filtered_dataset, activity_schema)
# print(len(tml_ac_usable))

# # save for future use
# tml_ac_path = Path(f'filtered_{database}/tml_filtered_ac_usable.json')
# tml_ac_usable.json(tml_ac_path, format=True)

# df_ac=tml_ac_usable.to_pandas()
# df_ac.to_csv(f'filtered_{database}/tml_ac_usable_database.csv',index=False)

### Get data information

In [53]:
# get_indiv_comps(tml_ocac_usable, 'ocac')
# get_uniquemixs_counts(df_ocac, 'ocac')

In [54]:
# get_indiv_comps(tml_oc_usable, 'oc')
# get_uniquemixs_counts(df_oc, 'oc')

In [55]:
# get_indiv_comps(tml_ac_usable, 'ac')
# get_uniquemixs_counts(df_ac, 'ac')