In [18]:
import pandas as pd
import numpy as np
import rdkit
import rdkit.Chem as Chem
import rdkit.Chem.MolStandardize.rdMolStandardize as rdMolStandardize
import rdkit.Chem.rdMolDescriptors as rdMolDescriptors
import rdkit.Chem.rdmolops as rdmolops
import multiprocessing as mp
import queue
import time
import selfies as sf
import copy

In [4]:
# Select the 'hypervalent' semantic-constraint preset of the selfies library.
sf.set_semantic_constraints('hypervalent')

# Read the raw table; the cell's displayed value is its number of rows.
path = './C1.parquet'
df = pd.read_parquet(path)
df.shape[0]

5079

In [24]:
def process_smiles(smiles):
    """Standardise a SMILES string: remove stereochemistry and neutralise charges.

    Parameters
    ----------
    smiles : str
        Input SMILES string.

    Returns
    -------
    str or None
        The SMILES written by ``Chem.MolToSmiles`` for the stereo-free,
        uncharged molecule, or ``None`` when the input is not a string,
        cannot be parsed, or any RDKit step raises. A short message is
        printed in every failure case.
    """
    # Missing values (None / NaN) can never be parsed; reject them explicitly
    # instead of relying on an RDKit argument error being swallowed below.
    if not isinstance(smiles, str):
        print(f'Skipping non-string SMILES: {smiles!r}')
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles reports a parse failure by returning None.
            print(f'Could not parse SMILES: {smiles!r}')
            return None
        # Modifies `mol` in place.
        rdmolops.RemoveStereochemistry(mol)
        mol = rdMolStandardize.Uncharger().uncharge(mol)
        return Chem.MolToSmiles(mol)
    except Exception as e:
        # Deliberate best effort: one bad molecule must not abort the whole
        # column; the caller drops the resulting None values afterwards.
        print(e)
        return None

# Compiled SMARTS patterns keyed by key-file path, so the file is read and
# parsed once per kernel session instead of once per molecule.
# NOTE: edits to the key file are not picked up until this cell is re-run.
_KLEK_PATTERN_CACHE = {}


def _load_klek_patterns(keys_path):
    """Return the SMARTS patterns listed one per line in ``keys_path``, compiled once."""
    patterns = _KLEK_PATTERN_CACHE.get(keys_path)
    if patterns is None:
        # `with` closes the handle; every line is kept (even blank ones) so a
        # pattern's list position stays equal to its line number in the file.
        with open(keys_path) as handle:
            patterns = [Chem.MolFromSmarts(line.strip()) for line in handle]
        _KLEK_PATTERN_CACHE[keys_path] = patterns
    return patterns


def smiles2dense(smiles):
    """Return the indices of the substructure keys that match a molecule.

    The keys are the SMARTS patterns in ``data/KlekFP_keys.txt`` (one per
    line); index ``i`` in the result refers to the ``i``-th line of that file.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to fingerprint.

    Returns
    -------
    numpy.ndarray
        1-D integer array of matching key indices in ascending order
        (empty, but still integer-typed, when nothing matches).

    Raises
    ------
    TypeError
        If ``smiles`` is not a string (e.g. ``None`` / NaN from a failed
        cleaning step).
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    if not isinstance(smiles, str):
        raise TypeError(f'smiles must be a str, got {type(smiles).__name__}')
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f'Could not parse SMILES: {smiles!r}')

    klek_keys_mols = _load_klek_patterns('data/KlekFP_keys.txt')
    fp_list = [i for i, key in enumerate(klek_keys_mols) if mol.HasSubstructMatch(key)]
    # Explicit integer dtype: np.array([]) would otherwise be float64, giving
    # the column a mix of float and int arrays.
    return np.array(fp_list, dtype=int)

def smiles2ECFP(smiles):
    """Return the on-bit indices of a Morgan bit-vector fingerprint.

    The fingerprint is computed with ``radius=2`` and ``nBits=2048``.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to fingerprint.

    Returns
    -------
    numpy.ndarray
        1-D integer array with the positions of the set bits, ascending.

    Raises
    ------
    TypeError
        If ``smiles`` is not a string (e.g. ``None`` / NaN from a failed
        cleaning step).
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    if not isinstance(smiles, str):
        raise TypeError(f'smiles must be a str, got {type(smiles).__name__}')
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f'Could not parse SMILES: {smiles!r}')

    vec = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    # Ask the bit vector for its set bits directly rather than letting NumPy
    # coerce all 2048 positions element by element; np.intp matches the dtype
    # np.nonzero would have produced.
    return np.array(list(vec.GetOnBits()), dtype=np.intp)

In [15]:
# Standardise every input SMILES; process_smiles yields None for rows it
# cannot handle, which then show up in the missing-value counts below.
cleaned_smiles = df['smiles'].apply(process_smiles)
df['cleaned_smiles'] = cleaned_smiles

# Per-column number of missing values.
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [12]:
# Discard rows with a missing value in ANY column (not only the cleaned
# SMILES), then renumber the index from zero.
df = (
    df
    .dropna()
    .reset_index(drop=True)
)

In [25]:
# Add one fingerprint column per featurizer, each computed from the cleaned SMILES.
for column, featurizer in (('Klek', smiles2dense), ('ECFP', smiles2ECFP)):
    df[column] = df['cleaned_smiles'].apply(featurizer)

In [27]:
# Write the cleaned, featurised table to a new parquet file.
df.to_parquet('./C1_cleaned.parquet')