In [None]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

from ai4chem.data import ChemFluorDataset, Deep4ChemDataset
from ai4chem.utils import canonicalize_smiles, smiles_to_inchi, smiles_to_selfies

In [None]:
def process_loader(loader) -> pd.DataFrame:
    """Build a cleaned chromophore/solvent table from a dataset loader.

    A copy of the loader's raw data is reduced to the four columns of
    interest, renamed to a common schema. SMILES strings are canonicalized,
    and SELFIES and InChI representations are then derived from the
    canonical SMILES. Rows containing a missing value are dropped before the
    first conversion and again after every conversion stage.

    Args:
        loader: Object exposing ``raw_data`` plus the attributes
            ``_chromophore_smiles_column``, ``_solvent_smiles_column``,
            ``_emission_max_column`` and ``_absorption_max_column``, which
            name the corresponding columns of ``raw_data``.

    Returns:
        DataFrame with the columns ``CHROMOPHORE_SMILES``, ``SOLVENT_SMILES``,
        ``EMISSION_MAX_NM``, ``ABSORPTION_MAX_NM``, ``CHROMOPHORE_SELFIES``,
        ``SOLVENT_SELFIES``, ``CHROMOPHORE_INCHI`` and ``SOLVENT_INCHI``.
        Surviving rows keep their original index labels.
    """
    # Work on a copy so the loader's own data is never modified.
    table = loader.raw_data.copy()

    # Map the loader-specific column names onto the common schema.
    column_map = {
        loader._chromophore_smiles_column: "CHROMOPHORE_SMILES",
        loader._solvent_smiles_column: "SOLVENT_SMILES",
        loader._emission_max_column: "EMISSION_MAX_NM",
        loader._absorption_max_column: "ABSORPTION_MAX_NM",
    }
    kept_columns = ['CHROMOPHORE_SMILES', 'SOLVENT_SMILES', 'EMISSION_MAX_NM', 'ABSORPTION_MAX_NM']

    # Keep only the columns of interest and discard incomplete rows.
    table = table.rename(columns=column_map)
    table = table[kept_columns].dropna(axis='index')

    # Each stage converts the chromophore SMILES, then the solvent SMILES,
    # and finally drops any row for which a conversion produced a missing
    # value. The first stage overwrites the SMILES columns in place; the
    # later stages add new columns computed from the canonical SMILES.
    stages = (
        (canonicalize_smiles, 'CHROMOPHORE_SMILES', 'SOLVENT_SMILES'),
        (smiles_to_selfies, 'CHROMOPHORE_SELFIES', 'SOLVENT_SELFIES'),
        (smiles_to_inchi, 'CHROMOPHORE_INCHI', 'SOLVENT_INCHI'),
    )
    for convert, chromophore_target, solvent_target in stages:
        table[chromophore_target] = table['CHROMOPHORE_SMILES'].apply(convert)
        table[solvent_target] = table['SOLVENT_SMILES'].apply(convert)
        table = table.dropna(axis='index')

    return table

def get_chemfluor(path: os.PathLike="../data/chemfluor/data.csv") -> pd.DataFrame:
    """Load the ChemFluor dataset and return it in the common cleaned schema.

    Args:
        path: Location of the ChemFluor data file, handed to
            ``ChemFluorDataset``. Defaults to the copy under ``../data``.

    Returns:
        The DataFrame produced by ``process_loader`` with an extra
        ``SOURCE_DB`` column set to ``'chemfluor'`` on every row.
    """
    # Honour the caller-supplied path (previously a hard-coded literal was
    # used here, so the argument had no effect). The loader's own
    # canonicalization is switched off; process_loader applies
    # canonicalize_smiles itself.
    chemfluor = ChemFluorDataset(path, canonicalize_smiles=False)
    df = process_loader(chemfluor)
    # Tag every row with its originating database.
    df['SOURCE_DB'] = 'chemfluor'
    return df

def get_deep4chem(path: os.PathLike = "../data/deep4chem/data.csv") -> pd.DataFrame:
    """Load the Deep4Chem dataset and return it in the common cleaned schema.

    Args:
        path: Location of the Deep4Chem data file, handed to
            ``Deep4ChemDataset``.

    Returns:
        The DataFrame produced by ``process_loader`` with an extra
        ``SOURCE_DB`` column set to ``'deep4chem'`` on every row.
    """
    # The loader's own canonicalization is switched off; process_loader
    # applies canonicalize_smiles itself.
    cleaned = process_loader(Deep4ChemDataset(path, canonicalize_smiles=False))
    # Tag every row with its originating database.
    cleaned['SOURCE_DB'] = 'deep4chem'
    return cleaned

In [None]:
# Stack both sources into a single table. ignore_index=True assigns fresh,
# unique 0..n-1 row labels; without it each source frame keeps its own labels,
# which can collide and make label-based lookups ambiguous.
df = pd.concat([get_chemfluor(), get_deep4chem()], axis=0, ignore_index=True)
df.head()

In [None]:
# First split: 5,000 rows go to train_df; every remaining row is held
# temporarily in validate_df. random_state is fixed so the split is reproducible.
train_df, validate_df = train_test_split(df, train_size=5_000, random_state=42)
# Second split, applied to that remainder: here train_size=100 is the size of
# the held-out test set (used only at the very end); the rows left over become
# validate_df, which is used for validation during training.
test_df, validate_df = train_test_split(validate_df, train_size=100, random_state=42)

In [None]:
# Make sure the output directory exists so the writes below do not fail on a
# fresh checkout.
os.makedirs('../data/combined', exist_ok=True)
# Each split is written with a fresh 0..n-1 index (the old labels are dropped).
train_df.reset_index(drop=True).to_json('../data/combined/train.json')
validate_df.reset_index(drop=True).to_json('../data/combined/validate.json')
test_df.reset_index(drop=True).to_json('../data/combined/test.json')