# Core Imports and Setup

In [1]:
import os
import pandas as pd
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

import logging
# logging.getLogger("openff.toolkit").setLevel(logging.ERROR)
# logging.basicConfig(level=logging.DEBUG)

import json
from openff import toolkit, evaluator

In [2]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw, MolFromSmiles
from rdkit.Chem.Draw import MolsToGridImage, rdMolDraw2D, MolsMatrixToGridImage
import pubchempy
from PIL import Image, ImageDraw, ImageFont
IPythonConsole.ipython_useSVG=False  #< set this to False if you want PNGs instead of SVGs

In [3]:
from rdkit.Chem import rdMolDescriptors
import pubchempy as pcp
import math

In [4]:
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase
from openff.evaluator.datasets.thermoml import thermoml_property
from openff.evaluator import properties
from openff.units import unit
from openff.evaluator.datasets.thermoml import ThermoMLDataSet
from openff.evaluator.datasets import PhysicalProperty, PropertyPhase, PhysicalPropertyDataSet

In [5]:
from openff.evaluator.datasets.curation.components.filtering import FilterByPropertyTypes, FilterByPropertyTypesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByTemperature, FilterByTemperatureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByPressure, FilterByPressureSchema
from openff.evaluator.datasets.curation.components.filtering import FilterBySmiles, FilterBySmilesSchema
from openff.evaluator.datasets.curation.components.filtering import FilterByNComponents, FilterByNComponentsSchema


In [6]:
from rich.progress import track
from collections import defaultdict
from openff.toolkit.utils.exceptions import UndefinedStereochemistryError

# Registering New Properties

In [7]:
@thermoml_property("Osmotic coefficient", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class OsmoticCoefficient(PhysicalProperty):
    """A class representation of a osmotic coeff property"""

    @classmethod
    def default_unit(cls):
        return unit.dimensionless
    
setattr(properties, OsmoticCoefficient.__name__, OsmoticCoefficient)

ThermoMLDataSet.registered_properties['Osmotic coefficient'].conversion_function

functools.partial(<function _default_mapping at 0x7f3fa13e2cb0>, <class '__main__.OsmoticCoefficient'>)

In [8]:
@thermoml_property("Activity coefficient", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class ActivityCoefficient(PhysicalProperty):
    """A class representation of a Activity coeff property"""

    @classmethod
    def default_unit(cls):
        return unit.dimensionless
    
setattr(properties, ActivityCoefficient.__name__, ActivityCoefficient)

ThermoMLDataSet.registered_properties['Activity coefficient'].conversion_function

functools.partial(<function _default_mapping at 0x7f3fa13e2cb0>, <class '__main__.ActivityCoefficient'>)

In [9]:
@thermoml_property("Speed Of Sound (m/s)", supported_phases=PropertyPhase.Liquid | PropertyPhase.Gas)
class SpeedOfSound(PhysicalProperty):
    """A class representation of a Speed Of Sound property"""

    @classmethod
    def default_unit(cls):
        return unit.meter / unit.second
    
setattr(properties, SpeedOfSound.__name__, SpeedOfSound)

ThermoMLDataSet.registered_properties['Speed Of Sound (m/s)'].conversion_function

functools.partial(<function _default_mapping at 0x7f3fa13e2cb0>, <class '__main__.SpeedOfSound'>)

# Functions

In [10]:
def filter_database (unfiltered_directory):
    data_set=[]
    # Loop through all files in the directory
    for filename in os.listdir(unfiltered_directory):
        # Check if the file ends with .json
        if filename.endswith('.xml'):
            # Full path to the file
            file_path = os.path.join(unfiltered_directory, filename)
            data_set.append(file_path)
    
    sorted_dois = defaultdict(list)
    for doi in track(data_set, description='Filtering DOIs...'):
        try:
            dataset = ThermoMLDataSet._from_file(doi)
            sorted_dois['working'].append(doi)
        except UndefinedStereochemistryError:
            sorted_dois['stereo_fail'].append(doi)
        except Exception as other_exc:
            sorted_dois[other_exc.__class__.__name__].append(doi)

    print('Amount of failing files: %i/%i' % (len(sorted_dois['stereo_fail']),len(data_set)))
    print('Amount of working files: %i/%i' % (len(sorted_dois['working']),len(data_set)))

    return sorted_dois['working']

In [11]:
def extract_database (database_directory, csv_cached):
    CACHED_PROP_PATH = Path(csv_cached)

    if CACHED_PROP_PATH.exists():
        prop_df = pd.read_csv(CACHED_PROP_PATH, index_col=0)
        # delete rows with underfined thermo params to avoid pesky indexing errors
        prop_df = prop_df.dropna(subset=['Temperature (K)'])
        prop_df = prop_df.dropna(subset=['Pressure (kPa)'])
        data_set = ThermoMLDataSet.from_pandas(prop_df)
    else:
        sorted_files=filter_database(database_directory)
        data_set=ThermoMLDataSet.from_file(*sorted_files)

        initial_df = data_set.to_pandas()
        with CACHED_PROP_PATH.open('w') as file:
            initial_df.to_csv(CACHED_PROP_PATH)
    return data_set

# Import Directory

In [12]:
# # Ensure database_directory is defined
database_directory = '10.1016'
# database_directory = 'tests'
tml_data_set = extract_database(database_directory, 'tml_database_16.csv')

In [13]:
tml_data_set

<PhysicalPropertyDataSet n_properties=323174 n_substances=74688 n_sources=323174>

In [14]:
tml_df=tml_data_set.to_pandas()

In [15]:
len(tml_df)

323174

# Filtering database dataframe --> Usable

In [16]:
# Define valid and invalid characters
valid_chars = "HCNOSPFBClI"
invalid_chars = r"[.+-]"

# Define the regular expression pattern for valid characters
valid_pattern = f"[{valid_chars}]"

# Apply the filters
filtered_df_final = tml_df[
    (
        (tml_df['Temperature (K)'] >= 273) & (tml_df['Temperature (K)'] <= 373) &
        (tml_df['Pressure (kPa)'] >= 90) & (tml_df['Pressure (kPa)'] <= 110) &
        (tml_df['N Components'] == 2)
    ) &
    tml_df['Component 1'].str.contains(valid_pattern, regex=True, na=False) &
    tml_df['Component 2'].str.contains(valid_pattern, regex=True, na=False) &
    ~tml_df['Component 1'].str.contains(invalid_chars, regex=True, na=False) &
    ~tml_df['Component 2'].str.contains(invalid_chars, regex=True, na=False)
]
print(len(filtered_df_final))

79689


In [17]:
filtered_df_final

Unnamed: 0,Id,Temperature (K),Pressure (kPa),Phase,N Components,Component 1,Role 1,Mole Fraction 1,Exact Amount 1,Component 2,...,ActivityCoefficient Uncertainty (),ExcessMolarVolume Value (cm ** 3 / mol),ExcessMolarVolume Uncertainty (cm ** 3 / mol),DielectricConstant Value (),DielectricConstant Uncertainty (),Density Value (g / ml),Density Uncertainty (g / ml),EnthalpyOfMixing Value (kJ / mol),EnthalpyOfMixing Uncertainty (kJ / mol),Source
2,f0a0939b69b74660916619f191ef7e24,298.15,100.0,Liquid,2,O=C1OCCO1,Solvent,0.022225,,O,...,,,,,,1.02664,0.000600,,,10.1016/j.jct.2016.03.027
3,8652f16c78964edfa313b0bb1a000a7c,298.15,100.0,Liquid,2,O=C1OCCO1,Solvent,0.048655,,O,...,,,,,,1.05671,0.000605,,,10.1016/j.jct.2016.03.027
4,8cddd5632ec547138cf96aa7f4a7a53b,298.15,100.0,Liquid,2,O=C1OCCO1,Solvent,0.080608,,O,...,,,,,,1.08725,0.000610,,,10.1016/j.jct.2016.03.027
5,155d70754afa43889ff6a09724e5b8fc,298.15,100.0,Liquid,2,O=C1OCCO1,Solvent,0.120015,,O,...,,,,,,1.11860,0.000615,,,10.1016/j.jct.2016.03.027
156,3f689bfe845b44c98f4fe77761a4e39b,303.15,101.0,Liquid,2,CCN(CC)CCCN,Solvent,0.050300,,CCCCCCC,...,,,,,,,,0.2700,0.00550,10.1016/j.jct.2008.01.012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323169,f791928658874ee692d8e3ef826b2ef7,303.15,101.0,Liquid,2,CC(C)OC(C)C,Solvent,0.483000,,Cc1cccc(C)c1,...,,,,,,,,0.0453,0.00505,10.1016/j.fluid.2007.03.017
323170,ae03b90301c14c2c850c74f6b4422924,303.15,101.0,Liquid,2,CC(C)OC(C)C,Solvent,0.622000,,Cc1cccc(C)c1,...,,,,,,,,0.0261,0.00500,10.1016/j.fluid.2007.03.017
323171,7a5aa7ff58a945f28f22b5bb01757d87,303.15,101.0,Liquid,2,CC(C)OC(C)C,Solvent,0.656000,,Cc1cccc(C)c1,...,,,,,,,,0.0226,0.00500,10.1016/j.fluid.2007.03.017
323172,3f2b0fb1b28a47cf9efc73c32e554291,303.15,101.0,Liquid,2,CC(C)OC(C)C,Solvent,0.752000,,Cc1cccc(C)c1,...,,,,,,,,0.0115,0.00500,10.1016/j.fluid.2007.03.017


In [18]:
filtered_df_final.to_csv(f'filtered_{database_directory}.csv')

In [19]:
filtered_dataset=ThermoMLDataSet.from_pandas(filtered_df_final)

# Counting from database dataframe

In [20]:
count_hmix = filtered_df_final.dropna(subset=['EnthalpyOfMixing Value (kJ / mol)']).shape[0]
count_dens = filtered_df_final.dropna(subset=['Density Value (g / ml)']).shape[0]
count_hd = filtered_df_final.dropna(subset=['EnthalpyOfMixing Value (kJ / mol)', 'Density Value (g / ml)']).shape[0]
print(f"Number of data points with Hmix: {count_hmix}")
print(f"Number of data points with Density: {count_dens}")
print(f"Number of data points with Hmix and Density: {count_hd}")

# Assuming filtered_df_final is your DataFrame
# Filter rows where both required columns have values
filtered_hmix_dens = filtered_df_final.dropna(subset=['EnthalpyOfMixing Value (kJ / mol)', 'Density Value (g / ml)'])

# Extract unique strings from the specified columns
unique_strings = pd.unique(filtered_hmix_dens[['Component 1', 'Component 2']].values.ravel())

print("Unique strings found in 'Component 1' and 'Component 2' with values for Hmix and Density:")
print(unique_strings)
# unique_strings.to_csv(f'indiv_comps_hd_{database_directory}.csv')

# Create a DataFrame with unique combinations
unique_combinations = filtered_hmix_dens[['Component 1', 'Component 2']].drop_duplicates()

# Convert each combination into a sorted tuple for uniqueness irrespective of order
unique_combinations_set = unique_combinations.apply(lambda row: tuple(sorted(row)), axis=1).drop_duplicates()

print("Unique combinations of strings from 'Component 1' and 'Component 2' with values for Hmix and Density:")
unique_mixtures=unique_combinations_set.tolist()
print(unique_mixtures)
# unique_mixtures.to_csv(f'indiv_mixs_hd_{database_directory}.csv')

Number of data points with Hmix: 12789
Number of data points with Density: 60529
Number of data points with Hmix and Density: 0
Unique strings found in 'Component 1' and 'Component 2' with values for Hmix and Density:
[]
Unique combinations of strings from 'Component 1' and 'Component 2' with values for Hmix and Density:
[]


In [21]:
count_oc = filtered_df_final.dropna(subset=['OsmoticCoefficient Value ()']).shape[0]
count_ac = filtered_df_final.dropna(subset=['ActivityCoefficient Value ()']).shape[0]
count_oa = filtered_df_final.dropna(subset=['OsmoticCoefficient Value ()', 'ActivityCoefficient Value ()']).shape[0]
print(f"Number of data points with OC: {count_oc}")
print(f"Number of data points with AC: {count_ac}")
print(f"Number of data points with OC and AC: {count_oa}")

# Assuming filtered_df_final is your DataFrame
# Filter rows where both required columns have values
filtered_oc_ac = filtered_df_final.dropna(subset=['OsmoticCoefficient Value ()', 'ActivityCoefficient Value ()'])

# Extract unique strings from the specified columns
unique_strings = pd.unique(filtered_oc_ac[['Component 1', 'Component 2']].values.ravel())

print("Unique strings found in 'Component 1' and 'Component 2' with values for OC and AC:")
print(unique_strings)
# unique_strings.to_csv(f'indiv_comps_oa_{database_directory}.csv')

# Create a DataFrame with unique combinations
unique_combinations = filtered_oc_ac[['Component 1', 'Component 2']].drop_duplicates()

# Convert each combination into a sorted tuple for uniqueness irrespective of order
unique_combinations_set = unique_combinations.apply(lambda row: tuple(sorted(row)), axis=1).drop_duplicates()

print("Unique combinations of strings from 'Component 1' and 'Component 2' with values for OC and AC:")
unique_mixtures=unique_combinations_set.tolist()
print(unique_mixtures)
# unique_mixtures.to_csv(f'indiv_mixs_oa_{database_directory}.csv')

Number of data points with OC: 76
Number of data points with AC: 194
Number of data points with OC and AC: 0
Unique strings found in 'Component 1' and 'Component 2' with values for OC and AC:
[]
Unique combinations of strings from 'Component 1' and 'Component 2' with values for OC and AC:
[]


# Counting with Evaluator

## Filtering properties schemas

In [22]:
ncomp_schema = FilterByNComponentsSchema(n_components=[2])
pressure_schema= FilterByPressureSchema(minimum_pressure = 90.0, maximum_pressure = 110.0)
temp_schema=FilterByTemperatureSchema(minimum_temperature = 273.15, maximum_temperature=373.0)

In [23]:
hmix_schema = FilterByPropertyTypesSchema(property_types=["EnthalpyOfMixing"], strict=True)
dens_schema = FilterByPropertyTypesSchema(property_types=["Density"], strict=True)
hmix_dens_schema = FilterByPropertyTypesSchema(property_types=["EnthalpyOfMixing","Density"])
hmix_dens_strict_schema = FilterByPropertyTypesSchema(property_types=["EnthalpyOfMixing","Density"], strict=True)

In [24]:
osmotic_schema = FilterByPropertyTypesSchema(property_types=["OsmoticCoefficient"], strict=True)
activity_schema = FilterByPropertyTypesSchema(property_types=["ActivityCoefficient"], strict=True)
oc_ac_schema = FilterByPropertyTypesSchema(property_types=["OsmoticCoefficient","ActivityCoefficient"])
oc_ac_strict_schema = FilterByPropertyTypesSchema(property_types=["OsmoticCoefficient","ActivityCoefficient"], strict=True)

## ThermoML Hmix and Density of binary mixtures

### Filtering

In [25]:
# Hmix + density
data_set_hmix_dens= FilterByPropertyTypes.apply(filtered_dataset, hmix_dens_schema)
print(len(data_set_hmix_dens))

data_set_2comps= FilterByNComponents.apply(data_set_hmix_dens, ncomp_schema)
print(len(data_set_2comps))

KeyboardInterrupt: 

In [None]:
data_set_hmix_dens_strict= FilterByPropertyTypes.apply(data_set_2comps, hmix_dens_strict_schema)
print(len(data_set_hmix_dens_strict))

0


In [None]:
data_set_hd_strict_usable= FilterByPressure.apply(data_set_hmix_dens_strict, pressure_schema)
tml_hd_usable= FilterByTemperature.apply(data_set_hd_strict_usable, temp_schema)
print(len(tml_hd_usable))

KeyError: 'Pressure (kPa)'

In [None]:
# save for future use
tml_hmix_dens_path = Path(f'filtered_{database_directory}/tml_filtered_hd_usable.json')
tml_hd_usable.json(tml_hmix_dens_path, format=True)

df_hmix_dens=tml_hd_usable.to_pandas()
df_hmix_dens.to_csv(f'filtered_{database_directory}/tml_hd_usable_database.csv',index=False)

In [26]:
# Hmix only
data_set_hmix = FilterByPropertyTypes.apply(filtered_dataset, hmix_schema)
data_set_hmix_usable= FilterByPressure.apply(data_set_hmix, pressure_schema)
tml_hmix_usable= FilterByTemperature.apply(data_set_hmix_usable, temp_schema)
print(len(tml_hmix_usable))

# save for future use
tml_hmix_path = Path(f'filtered_{database_directory}/tml_filtered_hmix_usable.json')
tml_hmix_usable.json(tml_hmix_path, format=True)

df_hmix=tml_hmix_usable.to_pandas()
df_hmix.to_csv(f'filtered_{database_directory}/tml_hmix_usable_database.csv',index=False)
# df_hmix.head()
# df_hmix.columns.to_list()

12789


In [27]:
# Density only
data_set_dens= FilterByPropertyTypes.apply(filtered_dataset, dens_schema)
data_set_dens_usable= FilterByPressure.apply(data_set_dens, pressure_schema)
tml_dens_usable= FilterByTemperature.apply(data_set_dens_usable, temp_schema)
print(len(tml_dens_usable))

# save for future use
tml_dens_path = Path(f'filtered_{database_directory}/tml_filtered_dens_usable.json')
tml_dens_usable.json(tml_dens_path, format=True)

df_dens=tml_dens_usable.to_pandas()
df_dens.to_csv(f'filtered_{database_directory}/tml_dens_usable_database.csv',index=False)

KeyboardInterrupt: 

In [None]:
print(f'ThermoML - Number of initial data points: {len(filtered_df_final)}')
print(f'ThermoML - Number of hmix+density data points: {len(tml_hd_usable)}')
print(f'ThermoML - Number of hmix data points: {len(tml_hmix_usable)}')
print(f'ThermoML - Number of density data points: {len(tml_dens_usable)}')

ThermoML - Number of initial data points: 3348
ThermoML - Number of hmix+density data points: 14908
ThermoML - Number of hmix data points: 12789
ThermoML - Number of density data points: 3348


### Get data information

#### Functions

In [None]:
def get_unique_temperature_pressure_for_mixtures(data, component_columns=['Component 1', 'Component 2']):
    """
    Loops through a dataframe and returns a dictionary where each unique
    combination of specified component columns is a key. The value for each key is 
    a dictionary containing a unique list of tuples with temperatures and pressures for that mixture 
    and the count of occurrences of each unique mixture.

    Parameters:
    - data (pd.DataFrame): DataFrame containing the data
    - component_columns (list): List of column names representing the mixture components
                                 Defaults to ['Component 1', 'Component 2', 'Component 3'].

    Returns:
    - dict: A dictionary with unique mixtures as keys and a dictionary with temperature-pressure
            information and counts as values.
    """
    mixtures_dict = {}
    unique_mixtures = set()


    for _, row in data.iterrows():
        # Create a tuple of the mixture components
        mixture_key = tuple(row[component] for component in component_columns)

        # Add to unique mixtures set (ignores duplicates)
        unique_mixtures.add(mixture_key)
        
        # Extract temperature and pressure
        temp_pressure = (row['Temperature (K)'], row['Pressure (kPa)'])
        
        # Only add to the dictionary if both temperature and pressure are valid numbers
        if not (math.isnan(temp_pressure[0]) or math.isnan(temp_pressure[1])):
            if mixture_key not in mixtures_dict:
                mixtures_dict[mixture_key] = {"temp_pressures": set(), "count": 0}
            
            # Add the temperature-pressure pair and increment the count
            mixtures_dict[mixture_key]["temp_pressures"].add(temp_pressure)
            mixtures_dict[mixture_key]["count"] += 1

    # Convert the sets back to lists for readability
    mixtures_dict = {
        key: {"temp_pressures": list(value["temp_pressures"]), "count": value["count"]}
        for key, value in mixtures_dict.items()
    }
    
    # Add deduplicated list of unique mixtures
    mixtures_dict['unique_mixtures'] = list(unique_mixtures)
    
    return mixtures_dict


In [None]:
def get_uniquemixs_counts(dataframe, properties):
    # Get the unique mixtures data with counts
    uniquemixs_temps_press = get_unique_temperature_pressure_for_mixtures(dataframe)

    # Prepare data for saving to a CSV file, excluding the 'unique_mixtures' entry itself
    mixtures_list = []

    for mixture, details in uniquemixs_temps_press.items():
        # Skip the 'unique_mixtures' key as it's not part of the temperature/pressure data
        if mixture == 'unique_mixtures':
            continue
        # Extract temperature-pressure pairs and count for each mixture
        count = details["count"]
        for temp, press in details["temp_pressures"]:
            mixtures_list.append({
                "Mixture": " | ".join([str(comp) for comp in mixture if pd.notna(comp)]),
                "Temperature": temp,
                "Pressure": press,
                "Count": count
            })

    # Convert to DataFrame
    mixtures_df = pd.DataFrame(mixtures_list)

    # Save to CSV with the additional 'Count' column
    output_path = f'filtered_{database_directory}/tml_uniqmixs_thermo_{properties}.csv'
    mixtures_df.to_csv(output_path, index=False)

    # Prepare a list to store each unique mixture and its count for the CSV
    umixs_list = [
        {
            "Mixture": " | ".join([str(comp) for comp in mixture if pd.notna(comp)]),
            "Count": uniquemixs_temps_press[mixture]["count"]
        }
        for mixture in uniquemixs_temps_press["unique_mixtures"]
    ]

    # Convert to DataFrame
    hmix_umixtures_df = pd.DataFrame(umixs_list)

    # Save to CSV with the additional 'Count' column
    output_path2 = f'filtered_{database_directory}/tml_uniqmixs_{properties}.csv'
    hmix_umixtures_df.to_csv(output_path2, index=False)

    print(f'ThermoML - Number of {properties} mixtures: {len(umixs_list)}')

In [None]:
def get_smiles(data_set_choice):
    print(f'Number of initial data points: {len(data_set_choice)}')
    subs=list(set(data_set_choice.substances))
    print(f'Number of individual mixtures: {len(subs)}')
    smiles_list=[]
    comps=[]

    for i in subs:
        if len(i.components) == 3:
            comps.append(i.components[0].smiles)
            comps.append(i.components[1].smiles)
            comps.append(i.components[2].smiles)
        elif len(i.components) == 2:
            comps.append(i.components[0].smiles)
            comps.append(i.components[1].smiles)
        elif len(i.components) == 1:
            comps.append(i.components[0].smiles)

    # comps.remove('O')

    [smiles_list.append(o) for o in set(comps)]

    print(f'Number of individual components: {len(smiles_list)}')

    return(smiles_list)

In [None]:
def get_indiv_comps(database, properties):
    indiv_compnts=get_smiles(database)
    indiv_compnts_df=pd.DataFrame(indiv_compnts)
    indiv_compnts_df.to_csv(f'filtered_{database_directory}/tml_indivcomps_{properties}.csv')

#### Hmix + dens

In [None]:
get_indiv_comps(tml_hd_usable, 'hd')
get_uniquemixs_counts(df_hmix_dens, 'hd')

Number of initial data points: 14908
Number of individual mixtures: 10003
Number of individual components: 97
ThermoML - Number of hd mixtures: 278


In [None]:
get_indiv_comps(tml_hmix_usable, 'hmix')
get_uniquemixs_counts(df_hmix, 'hmix')

Number of initial data points: 12789
Number of individual mixtures: 11206
Number of individual components: 206
ThermoML - Number of hmix mixtures: 585


In [None]:
get_indiv_comps(tml_dens_usable, 'dens')
get_uniquemixs_counts(df_dens, 'dens')

Number of initial data points: 3348
Number of individual mixtures: 972
Number of individual components: 71
ThermoML - Number of dens mixtures: 83


## ThermoML Osmotic and Activity Coefficients

### FIltering

In [None]:
# Osmotic + Activity
tml_oc_ac= FilterByPropertyTypes.apply(filtered_dataset, oc_ac_schema)
print(len(tml_oc_ac))
tml_2comps= FilterByNComponents.apply(tml_oc_ac, ncomp_schema)
print(len(tml_2comps))

0


KeyError: 'N Components'

In [None]:
tml_ocac_strict= FilterByPropertyTypes.apply(tml_2comps, oc_ac_strict_schema)
print(len(tml_ocac_strict))

0


In [None]:
tml_press_usable= FilterByPressure.apply(tml_ocac_strict, pressure_schema)
tml_ocac_usable= FilterByTemperature.apply(tml_press_usable, temp_schema)
print(len(tml_ocac_usable))

KeyError: 'Pressure (kPa)'

In [None]:
# save for future use
tml_ocac_path = Path(f'filtered_{database_directory}/tml_filtered_ocac_usable.json')
tml_ocac_usable.json(tml_ocac_path, format=True)

df_ocac=tml_ocac_usable.to_pandas()
df_ocac.to_csv(f'filtered_{database_directory}/tml_ocac_usable_database.csv',index=False)

In [None]:
print(len(tml_ocac_usable))

45


In [None]:
# Osmotic only
tml_oc_ds = FilterByPropertyTypes.apply(tml_data_set, osmotic_schema)
tml_oc_press= FilterByPressure.apply(tml_oc_ds, pressure_schema)
tml_oc_usable= FilterByTemperature.apply(tml_oc_press, temp_schema)
print(len(tml_oc_usable))

# save for future use
tml_oc_path = Path(f'filtered_{database_directory}/tml_filtered_oc_usable.json')
tml_oc_usable.json(tml_oc_path, format=True)

df_oc=tml_oc_usable.to_pandas()
df_oc.to_csv(f'filtered_{database_directory}/tml_oc_usable_database.csv',index=False)

KeyError: 'Pressure (kPa)'

In [None]:
# Activity only
tml_ac_ds = FilterByPropertyTypes.apply(tml_data_set, activity_schema)
tml_ac_press= FilterByPressure.apply(tml_ac_ds, pressure_schema)
tml_ac_usable= FilterByTemperature.apply(tml_ac_press, temp_schema)
print(len(tml_ac_usable))

# save for future use
tml_ac_path = Path(f'filtered_{database_directory}/tml_filtered_ac_usable.json')
tml_ac_usable.json(tml_ac_path, format=True)

df_ac=tml_ac_usable.to_pandas()
df_ac.to_csv(f'filtered_{database_directory}/tml_ac_usable_database.csv',index=False)

KeyError: 'Pressure (kPa)'

### Get data information

In [None]:
get_indiv_comps(tml_ocac_usable, 'ocac')
get_uniquemixs_counts(df_ocac, 'ocac')

NameError: name 'tml_ocac_usable' is not defined

In [None]:
get_indiv_comps(tml_oc_usable, 'oc')
get_uniquemixs_counts(df_oc, 'oc')

Number of initial data points: 1822
Number of individual mixtures: 1297
Number of individual components: 53
ThermoML - Number of oc mixtures: 96


In [None]:
get_indiv_comps(tml_ac_usable, 'ac')
get_uniquemixs_counts(df_ac, 'ac')

Number of initial data points: 9108
Number of individual mixtures: 333
Number of individual components: 99
ThermoML - Number of ac mixtures: 99
