In [1]:
from dataclasses import dataclass, field
from pathlib import Path
from typing import Union, Optional

import numpy as np
import pandas as pd

In [2]:
def convert_units(cmc):
    """Convert from units of log(mol per litre) to log(micro mol per litre).

    Mathematically ``log10(10**cmc * 1e6) == cmc + 6``, so the shift is applied
    directly instead of exponentiating and taking the logarithm again.

    Args:
        cmc: A log10 value (scalar or array-like, e.g. a NumPy array or pandas
            Series) expressed in log(mol per litre).

    Returns:
        The same value(s) shifted by 6, i.e. expressed in log(micro mol per
        litre). Scalars come back as NumPy scalars; array-likes keep their
        container type where NumPy supports it.
    """
    # 1 mol = 1e6 micro mol, hence a shift of log10(1e6) = 6 in log space.
    log10_micro_per_unit = 6.0
    # Adding directly avoids three problems of the power/log round-trip:
    # integer inputs raising "Integers to negative integer powers are not
    # allowed", underflow to -inf for very negative values, and rounding noise.
    return np.add(cmc, log10_micro_per_unit)

In [3]:
@dataclass
class DataReader:
    """Read a table from a file, do some unit pre-processing and unit conversions, then get metrics.

    Subclasses provide the actual file parsing by overriding :meth:`read_df`.
    On construction the table is loaded and the converted / residual columns
    are appended to it.

    Attributes:
        fname: Location of the file handed to :meth:`read_df`.
        old_exp_name: Name of the column with the expected values to convert.
        old_pred_name: Name of the column with the predicted values to convert,
            or ``None`` if the table has no predictions (then no prediction or
            residual column is added and :attr:`rmse` is unavailable).
        conv_pred_name: Name of the column that receives the converted predictions.
        conv_exp_name: Name of the column that receives the converted expected values.
        residual_name: Name of the column that receives expected minus predicted
            (both converted).
        df: The loaded table including the derived columns; set in
            ``__post_init__`` and not accepted by the constructor.
    """
    fname: Union[str, Path]
    old_exp_name: str
    old_pred_name: Optional[str] = None
    conv_pred_name: str = "Converted CMC prediction"
    conv_exp_name: str = "Converted CMC expected"
    residual_name: str = "Residual"
    # Excluded from the generated __eq__ and __repr__: ``==`` between two
    # DataFrames returns a DataFrame whose truth value is ambiguous, so comparing
    # two readers would raise ValueError, and the repr would dump the full table.
    df: pd.DataFrame = field(init=False, compare=False, repr=False)

    def __post_init__(self) -> None:
        """Read the file from disk and preprocess."""
        self.df = self.read_df()
        self.df[self.conv_exp_name] = self.df[self.old_exp_name].apply(convert_units)
        if self.old_pred_name is not None:
            self.df[self.conv_pred_name] = self.df[self.old_pred_name].apply(convert_units)
            # Residual is defined as expected minus predicted, in converted units.
            self.df[self.residual_name] = self.df[self.conv_exp_name] - self.df[self.conv_pred_name]

    def read_df(self) -> pd.DataFrame:
        """Read DataFrame from disk.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError()

    @property
    def rmse(self) -> float:
        """Root-mean-square of the residual column.

        Raises:
            ValueError: If no prediction column was configured, in which case
                no residual column exists.
        """
        if self.old_pred_name is None:
            raise ValueError("No prediction column set, cannot calculate RMSE.")
        return np.sqrt(np.mean(np.square(self.df[self.residual_name])))

In [4]:
class COSMOPlexData(DataReader):
    """Reader for a table stored in an HTML file (used here for the COSMOplex results)."""

    def read_df(self) -> pd.DataFrame:
        """Return the first table found in the HTML file ``self.fname``.

        The first row of the table is used as the column header.
        """
        # Decode explicitly as UTF-8: without it, non-ASCII surfactant names
        # (Greek letters such as beta/alpha) showed up as Latin-1 mojibake.
        # NOTE(review): assumes the HTML file is UTF-8 encoded - confirm.
        return pd.read_html(self.fname, header=0, encoding="utf-8")[0]

# Load the COSMOplex table, naming the expected and predicted columns explicitly,
# then display the frame with the derived columns appended.
cosmoplex = COSMOPlexData(
    fname="cosmoplex.html",
    old_exp_name="log10CMC (experimental)",
    old_pred_name="log10CMC (COSMOplex)",
)
cosmoplex.df

Unnamed: 0,Surfactant,Abbreviation,log10CMC (COSMOplex),log10CMC (experimental),Converted CMC expected,Converted CMC prediction,Residual
0,Triethylene glycol monohexyl ether,C6E3,-2.0,-1.0,5.0,4.0,1.0
1,Dimethyl nonylamine oxide,C9C2NO,-1.92,-1.27,4.73,4.08,0.65
2,Octyl β-d-glucoside,C8BG1,-2.92,-1.6,4.4,3.08,1.32
3,Triethylene glycol monooctyl ether,C8E3,-3.03,-2.12,3.88,2.97,0.91
4,Octyl α-glyceryl ether,C8BGLYE,-2.88,-2.24,3.76,3.12,0.64
5,Octyl glycol ether,C8E1,-3.14,-2.31,3.69,2.86,0.83
6,Triethylene glycol monodecyl ether,C10E3,-3.67,-3.22,2.78,2.33,0.45
7,Dodecyl β-d-glucoside,C12BG1,-4.21,-3.72,2.28,1.79,0.49
8,Octoxynol-2,C8(C6H4)E2,-4.15,-3.88,2.12,1.85,0.27
9,Hexaethylene glycol monododecyl ether,C12E6,-5.4,-4.06,1.94,0.6,1.34


In [5]:
# Root-mean-square of the "Residual" column (converted expected minus converted prediction).
cosmoplex.rmse

0.8603836353627374

In [9]:
class GCData(DataReader):
    """Reader for the group contribution (GC) results stored as CSV."""

    def read_df(self) -> pd.DataFrame:
        """Load the Marrero-Gani GC table from ``self.fname``.

        Every "-" is removed from the column labels and the sign of every
        column after the first one is flipped.
        """
        raw = pd.read_csv(self.fname)

        def without_hyphens(label):
            # Drops every "-" in the label, not only a leading one.
            return label.replace("-", "")

        table = raw.rename(columns=without_hyphens)
        # Presumably the CSV stores -log(CMC) under "-"-prefixed headers, so the
        # values are negated to match the renamed labels - confirm against the file.
        flipped = -table.iloc[:, 1:]
        table.iloc[:, 1:] = flipped
        return table

# Load the Marrero-Gani validation table, scoring the reduced-dataset predictions,
# then display the frame with the derived columns appended.
gc_data = GCData(
    fname="marrero-gani-val-cmc.csv",
    old_exp_name="log(CMC)_exp",
    old_pred_name="log(CMC)_pred Reduced Dataset",
)
gc_data.df

Index(['Surfactant Code', 'log(CMC)_exp', 'log(CMC)_pred Total Dataset',
       'log(CMC)_pred Reduced Dataset'],
      dtype='object')


Unnamed: 0,Surfactant Code,log(CMC)_exp,log(CMC)_pred Total Dataset,log(CMC)_pred Reduced Dataset,Converted CMC expected,Converted CMC prediction,Residual
0,C6E4,-1.032,-1.047,-1.053,4.968,4.947,0.021
1,C8E1,-2.31,-2.242,-2.191,3.69,3.809,-0.119
2,C9E8,-2.52,-2.698,-2.649,3.48,3.351,0.129
3,C10E5,-3.1,-3.122,-3.092,2.9,2.908,-0.008
4,C11E8,-3.523,-3.565,-3.544,2.477,2.456,0.021
5,C12E4,-4.194,-3.986,-3.986,1.806,2.014,-0.208
6,C14E9,-5.046,-4.869,-4.889,0.954,1.111,-0.157
7,C15E8,-5.456,-5.23,-5.335,0.544,0.665,-0.121
8,C16E10,-5.699,-5.74,-5.786,0.301,0.214,0.087
9,C8PhE5,-3.824,-3.776,-3.767,2.176,2.233,-0.057


In [10]:
# Root-mean-square of the "Residual" column (converted expected minus converted prediction).
gc_data.rmse

0.13012468379724632