In [1]:
import pandas as pd
import numpy as np
import dask
import dask.dataframe as dd
import os
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from scipy.spatial.distance import cdist
import pickle

In [4]:
import pandas as pd
import numpy as np
import dask
import dask.dataframe as dd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdmolops import AddHs
from torch_geometric.data import Data
import pickle

def _smiles_to_on_bits(smiles, radius, nBits):
    """Return the indices of the set ECFP bits for one SMILES, or NaN if none can be computed."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None rather than raising.
            return np.nan
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
        # np.nonzero returns a 1-tuple for 1-D input; keep only the index array.
        return np.nonzero(fp)[0]
    except Exception as e:
        print(f"Could not calculate ECFP for '{smiles}' due to the following error:\n'{e}'")
        return np.nan


def _smiles_chunk_to_on_bits(smiles_chunk, radius, nBits):
    """Fingerprint a list of SMILES, returning one entry per input in the same order."""
    return [_smiles_to_on_bits(smiles, radius, nBits) for smiles in smiles_chunk]


def smiles_2_ECFP(df: pd.DataFrame, smiles_col: str = 'smiles', radius: int = 2, nBits: int = 512, 
                 num_processes: int = 4, num_threads: int = 4, chunksize: int = 256):
    
    """
    Calculate the ECFP (Extended Connectivity FingerPrints) from a given set of SMILES.

    The SMILES are split into chunks of `chunksize` rows; each chunk becomes one dask
    delayed task. Results are written back by position, so the caller's row order and
    index are preserved whatever the index looks like.

    Parameters
    ---------------

    df            : pd.DataFrame
                    pandas DataFrame holding the SMILES; it is not modified
    smiles_col    : str 
                    Name of column with 'SMILES'
    radius        : int
                    Radius for which to calculate the ECFP
    nBits         : int
                    Number of bits to be used in ECFP
    num_processes : int
                    Number of processes to start (dask config 'num_workers')
    num_threads   : int
                    Number of threads per process (dask config 'threads_per_worker')
                    NOTE(review): the local 'processes' scheduler may ignore this - confirm
    chunksize     : int
                    Number of SMILES handled by a single dask task
                    
    Returns
    ---------------
    
    df            : pd.DataFrame
                    Copy of `df` with an extra object column f'ECFP_{nBits}' holding, per row,
                    a np.array of the set bit positions, or NaN when the SMILES could not be
                    parsed or fingerprinted

    Raises
    ---------------

    ValueError    : if `chunksize` is smaller than 1
    """
    
    if chunksize < 1:
        raise ValueError(f"chunksize must be a positive integer, got {chunksize}")

    result = df.copy()
    smiles_list = df[smiles_col].tolist()

    # One task per chunk keeps the task graph small compared to one task per molecule.
    tasks = [
        dask.delayed(_smiles_chunk_to_on_bits)(smiles_list[start:start + chunksize], radius, nBits)
        for start in range(0, len(smiles_list), chunksize)
    ]

    # Filled element by element: bulk assignment of equal-length arrays into an object
    # array would make numpy try to broadcast them into a 2-D block.
    fingerprints = np.empty(len(smiles_list), dtype=object)

    if tasks:
        # Scope the scheduler settings to this call so they do not leak into later cells.
        with dask.config.set(scheduler='processes', num_workers=num_processes,
                             threads_per_worker=num_threads):
            computed_chunks = dask.compute(*tasks)

        position = 0
        for chunk in computed_chunks:
            for value in chunk:
                fingerprints[position] = value
                position += 1

    # A plain ndarray is assigned positionally, so no index alignment can scramble rows.
    result[f'ECFP_{nBits}'] = fingerprints
    
    return result

In [6]:
import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import pickle

In [7]:
def similarity_search_1(df: pd.DataFrame, smiles_col: str, radius: int=2, nBits:int=512, n_processes=4, n_threads=4):
    """
    Add the highest Tanimoto similarity to the training fingerprints for every SMILES.

    Each query fingerprint is compared against the training fingerprints one pair at a
    time with DataStructs.TanimotoSimilarity (`similarity_search_2` uses the bulk call).

    Parameters
    ---------------

    df            : pd.DataFrame
                    pandas DataFrame holding the SMILES; modified in place
    smiles_col    : str
                    Name of column with 'SMILES'
    radius        : int
                    Radius of the Morgan fingerprint
    nBits         : int
                    Number of bits of the Morgan fingerprint
    n_processes   : int
                    Unused; kept so existing calls keep working
    n_threads     : int
                    Unused; kept so existing calls keep working

    Returns
    ---------------

    df            : pd.DataFrame
                    The same `df` object with a 'Tanimoto' column; NaN where the SMILES
                    could not be parsed or the similarity could not be computed
    """

    # NOTE: pickle.load can execute arbitrary code - only point this at a trusted file.
    with open('./data/train_data/Train_EBV.pkl', 'rb') as file:
        EBVs = pickle.load(file)['EBV']
    
    def search(smiles, radius = radius, nBits = nBits):
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # Unparsable SMILES: RDKit returns None instead of raising.
                return np.nan
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
            # Return a scalar so that apply() yields a Series instead of building a DataFrame.
            return np.max([DataStructs.TanimotoSimilarity(fp, train_fp) for train_fp in EBVs])
        except Exception as e:
            # Best effort: report which molecule failed and keep going with NaN.
            print(f"Could not calculate Tanimoto similarity for '{smiles}' due to the following error:\n'{e}'")
            return np.nan
    
    df['Tanimoto'] = df[smiles_col].apply(search, radius = radius, nBits = nBits)

    return df

In [8]:
def similarity_search_2(df: pd.DataFrame, smiles_col: str, radius: int=2, nBits:int=512, n_processes=4, n_threads=4):
    """
    Add the highest Tanimoto similarity to the training fingerprints for every SMILES.

    Each query fingerprint is compared against all training fingerprints with a single
    DataStructs.BulkTanimotoSimilarity call.

    Parameters
    ---------------

    df            : pd.DataFrame
                    pandas DataFrame holding the SMILES; modified in place
    smiles_col    : str
                    Name of column with 'SMILES'
    radius        : int
                    Radius of the Morgan fingerprint
    nBits         : int
                    Number of bits of the Morgan fingerprint
    n_processes   : int
                    Unused; kept so existing calls keep working
    n_threads     : int
                    Unused; kept so existing calls keep working

    Returns
    ---------------

    df            : pd.DataFrame
                    The same `df` object with a 'Tanimoto' column; NaN where the SMILES
                    could not be parsed or the similarity could not be computed
    """

    # NOTE: pickle.load can execute arbitrary code - only point this at a trusted file.
    with open('./data/train_data/Train_EBV.pkl', 'rb') as file:
        EBVs = pickle.load(file)['EBV']
    
    def search(smiles, radius = radius, nBits = nBits):
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # Unparsable SMILES: RDKit returns None instead of raising.
                return np.nan
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
            # Return a scalar so that apply() yields a Series instead of building a DataFrame.
            return np.max(DataStructs.BulkTanimotoSimilarity(fp, EBVs))
        except Exception as e:
            # Best effort: report which molecule failed and keep going with NaN.
            print(f"Could not calculate Tanimoto similarity for '{smiles}' due to the following error:\n'{e}'")
            return np.nan
    
    df['Tanimoto'] = df[smiles_col].apply(search, radius = radius, nBits = nBits)

    return df

In [9]:
# Load the prepared training set from the project's data directory.
df = pd.read_parquet('./data/train_data/dataset_prepared.parquet')

In [11]:
%%time
out = similarity_search_1(df.sample(4), smiles_col ='smiles')

CPU times: total: 21.6 s
Wall time: 21.7 s


In [12]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,smiles,selfie,tokens,Mol_ID
0,Cc1ccc(C)n1-c1cccc(C(=O)O)c1,[C][C][=C][C][=C][Branch1][C][C][N][Ring1][=Br...,"[[C], [C], [=C], [C], [=C], [Branch1], [C], [C...",0
1,Cc1nc2c(c(C)c1CC(=O)NCc1ccco1)c(=O)[nH]n2C,[C][C][=N][C][=C][Branch2][Ring1][=Branch1][C]...,"[[C], [C], [=N], [C], [=C], [Branch2], [Ring1]...",1
2,O=C(c1csnn1)N1CCCC2(CCN(c3ncccn3)C2)C1,[O][=C][Branch1][Branch2][C][=C][S][N][=N][Rin...,"[[O], [=C], [Branch1], [Branch2], [C], [=C], [...",2
3,CCC(C(=O)NCc1ccco1)n1nc(C)c2c(C)n(-c3ccc(C)cc3...,[C][C][C][Branch1][=C][C][=Branch1][C][=O][N][...,"[[C], [C], [C], [Branch1], [=C], [C], [=Branch...",3
4,O=S(=O)(NCc1ccc2c(c1)OCO2)c1c[nH]cn1,[O][=S][=Branch1][C][=O][Branch2][Ring1][C][N]...,"[[O], [=S], [=Branch1], [C], [=O], [Branch2], ...",4
