### Author - Ajaya Kumar Sahoo

####  R-group decomposition of chemical structures and computation of their cyclic skeletons

In [1]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import rdRGroupDecomposition
# Print the version string of the imported RDKit package.
print(rdkit.__version__)

2023.03.1


In [2]:
# Path of the tab-separated input file; replace it with the file on your machine.
# Expected column order: chemical id, SMILES, Scaffold.
inp = 'input.tsv'

df = pd.read_csv(inp, sep='\t', encoding='UTF-8')
# Rename the columns positionally; this raises if the file does not have
# exactly three columns.
df.columns = ['iden', 'SMILES', 'Scaffold']

print(df.shape)
df.head()

In [15]:
def Convert_Scaffold_to_CSKFormat(row):
    '''
    Return the cyclic skeleton (CSK) SMILES of the scaffold stored in a row.

    In the CSK every non-hydrogen atom is replaced by carbon and every bond
    by a single bond; aromatic flags, formal charges, chirality tags and
    explicit-hydrogen counts are cleared along the way.

    Adapted from the MurckoScaffold python file in RDKit; the modification is
    that the SMILES is parsed, and hydrogens are removed, with sanitize=False.

    row     : mapping-like object with a 'Scaffold' entry (e.g. a DataFrame row)
    returns : canonical, non-isomeric SMILES string of the skeleton
    '''
    scaffold_smiles = str(row['Scaffold']).strip()
    skeleton = Chem.MolFromSmiles(scaffold_smiles, sanitize=False)

    hydrogen, carbon = 1, 6
    for atm in skeleton.GetAtoms():
        # Hydrogens keep their element here; they are stripped by RemoveHs below.
        if atm.GetAtomicNum() != hydrogen:
            atm.SetAtomicNum(carbon)
        atm.SetIsAromatic(False)
        atm.SetFormalCharge(0)
        atm.SetChiralTag(Chem.ChiralType.CHI_UNSPECIFIED)
        atm.SetNoImplicit(False)
        atm.SetNumExplicitHs(0)

    for bnd in skeleton.GetBonds():
        bnd.SetBondType(Chem.BondType.SINGLE)
        bnd.SetIsAromatic(False)

    without_hs = Chem.RemoveHs(skeleton, sanitize=False)
    return Chem.MolToSmiles(without_hs, isomericSmiles=False, canonical=True)

# Compute the cyclic skeleton row by row and store it in a new column.
df['Cyclic skeleton'] = df.apply(Convert_Scaffold_to_CSKFormat, axis=1)
print(df.shape)
df.tail()

In [16]:
def Rdecompose(row):
    '''
    Perform an R-group decomposition of one molecule against its scaffold.

    row     : mapping-like object (e.g. a DataFrame row) providing the
              molecule under 'SMILES' and the scaffold under 'Scaffold',
              both as SMILES strings
    returns : list with the values of the first decomposition result,
              requested as SMILES (asSmiles=True) - the core followed by
              the R groups
    '''
    molecule = Chem.MolFromSmiles(row['SMILES'])
    scaffold = Chem.MolFromSmiles(row['Scaffold'])
    matches, _unmatched = rdRGroupDecomposition.RGroupDecompose(
        [scaffold], [molecule], asSmiles=True
    )
    # Only one molecule is passed in, so only the first result is used.
    first_match = matches[0]
    return list(first_match.values())

In [18]:
# Get the core and R-groups for all the chemicals.
# Rdecompose is run once per row and its result reused for both columns,
# instead of repeating the decomposition for 'Core' and again for 'R-groups'.
decompositions = [Rdecompose(row) for _, row in df.iterrows()]

df['Core'] = [parts[0] for parts in decompositions]  # first element = core
df['R-groups'] = [','.join(parts[1:]) for parts in decompositions]  # remaining elements, comma-joined

print(df.shape)
df.head()

In [19]:
# Write the annotated table as a tab-separated file without the index column.
# The output name is the input name with '.tsv' replaced by '_out.tsv'.
out_path = inp.replace('.tsv','_out.tsv')
if out_path == inp:
    # inp contains no '.tsv', so replace() changed nothing; append the suffix
    # rather than overwriting the input file.
    out_path = inp + '_out.tsv'
df.to_csv(out_path,sep='\t',index=False) # output file