In [1]:
import rdkit.Chem as Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole # Enables RDKit IPython integration]
import pandas as pd


# preparing scaffold data for drug subset

In [7]:
# Load the drug-like ligand list; the head() preview that follows shows a
# single CHEM_COMP_ID column, which is the join key used throughout.
# NOTE(review): "drugcentarl" in the filename looks like a misspelling of
# "drugcentral"; the literal is left untouched because it must match the
# file on disk - confirm before renaming.
drug_like = pd.read_csv('mcule_drugbank_drugcentarl_chembl.csv')

In [8]:
# Preview the first five rows to confirm the CHEM_COMP_ID column loaded.
drug_like.head()

Unnamed: 0,CHEM_COMP_ID
0,SU0
1,TRI
2,X0T
3,CBL
4,MPT


In [9]:
# Load the scaffold table that every *_scaffold merge in this notebook joins
# against; the preview that follows shows CHEM_COMP_ID and SMILES columns.
df_scaffold = pd.read_csv('all_scaffolds.csv')

In [10]:
# Preview the first five scaffold rows (CHEM_COMP_ID alongside its SMILES).
df_scaffold.head()

Unnamed: 0,CHEM_COMP_ID,SMILES
0,244,c1ccc(-c2cc3ccccc3o2)cc1
1,SM1,O=C(CNC(=O)CNS(=O)(=O)Cc1ccccc1)NCc1ccccc1
2,NVV,O=C(c1ccccc1)[C@@H]1CNC[C@@H]1c1ccc2c(=O)[nH]c...
3,9UO,c1ccc(-c2ncncn2)nc1
4,80R,O=c1[nH]c2ccc(Nc3ccccc3)cc2[nH]1


In [13]:
# Inner-join (pandas' default) the drug-like IDs with the scaffold table on
# the shared CHEM_COMP_ID key.
# NOTE(review): the saved output shows SU0 twice, so at least one input
# repeats that ID; confirm whether duplicate rows are wanted downstream.
drug_like_scaffold = drug_like.merge(df_scaffold, on='CHEM_COMP_ID')

In [19]:
# Show a bounded preview instead of echoing the whole merged frame; ten rows
# matches what the saved output of this cell displays.
drug_like_scaffold.head(10)

Unnamed: 0,CHEM_COMP_ID,SMILES
0,SU0,O=C(Cc1cc(=O)oc2ccccc12)Nc1ccccc1
1,SU0,O=C(Cc1cc(=O)oc2ccccc12)Nc1ccccc1
2,TRI,c1nnc[nH]1
3,X0T,O=c1[nH]c(=O)c2[nH]cnc2[nH]1
4,CBL,c1ccccc1
5,8CL,c1ccccc1
6,617,O=C(COc1ccccc1)Nc1ccccc1
7,245,O=C(Nc1ccccc1)Nc1ccncc1
8,ETY,c1ccccc1
9,LPB,C1CSSC1


In [18]:
# Write the drug-like scaffold table to disk; index=False keeps the
# positional row index out of the CSV.
drug_like_scaffold.to_csv('drug_like_scaffold.csv', index=False)

# preparing fragment data for drug subset

In [23]:
# Load the fragment table; the preview that follows shows CHEM_COMP_ID and
# SMILES columns, with the same CHEM_COMP_ID appearing on several rows.
df_fragment = pd.read_csv('ccd_fragment.csv')

In [24]:
# Preview the first five fragment rows.
df_fragment.head()

Unnamed: 0,CHEM_COMP_ID,SMILES
0,SM1,CC(N)=O
1,SM1,NCC=O
2,244,c1ccoc1
3,244,c1ccccc1
4,SM1,c1ccccc1


In [25]:
# Keep only the fragment rows whose CHEM_COMP_ID is in the drug-like list
# (inner join on the shared key, pandas' default).
drug_like_fragment = drug_like.merge(df_fragment, on='CHEM_COMP_ID')

In [28]:
# Write the drug-like fragment table to disk. index=False matches every other
# export in this notebook and stops the positional row index from being
# written as an extra unnamed first column.
drug_like_fragment.to_csv('drug_like_fragment.csv', index=False)

# preparing cofactor data

In [30]:
# Load the cofactor ligand list; it is joined on CHEM_COMP_ID in the next
# step, so that column must be present.
df_cofactor = pd.read_csv('cofactor.csv')

In [32]:
# Attach scaffold SMILES to each cofactor ID (inner join on CHEM_COMP_ID).
cofactor_scaffold = df_cofactor.merge(df_scaffold, on='CHEM_COMP_ID')

In [34]:
# Preview the first five cofactor/scaffold rows.
cofactor_scaffold.head()

Unnamed: 0,CHEM_COMP_ID,SMILES
0,01A,O=C(CCCO[PH](=O)O[PH](=O)OC[C@H]1CC[C@@H](n2cn...
1,01K,c1ncc2ncn([C@@H]3CCCO3)c2n1
2,0AF,c1ccc2[nH]ccc2c1
3,0ET,c1ncc2ncn([C@H]3CCCO3)c2n1
4,0HG,c1ccccc1


In [35]:
# Write the cofactor scaffold table to disk without the row index.
cofactor_scaffold.to_csv('cofactor_scaffold.csv', index=False)

# Preparing CDK2 kinase data

In [36]:
# Load the CDK2 ligand list; it is joined on CHEM_COMP_ID in the next step.
# NOTE(review): "P24941" in the filename is presumably the UniProt accession
# for CDK2 - confirm.
df_cdk2 = pd.read_csv('ligands_cdk2_P24941.csv')

In [37]:
# Attach scaffold SMILES to each CDK2 ligand ID (inner join on CHEM_COMP_ID).
cdk2_scaffold = df_cdk2.merge(df_scaffold, on='CHEM_COMP_ID')

In [38]:
# Preview the first five CDK2 ligand/scaffold rows.
cdk2_scaffold.head()

Unnamed: 0,CHEM_COMP_ID,SMILES
0,LZC,c1cc2nc(NC3CCCCC3)ccn2n1
1,ZIP,c1ncc2nc[nH]c2n1
2,ES4,c1ncc2nc[nH]c2n1
3,HH5,c1ncc2[nH]cnc2n1
4,JYM,c1nc(OCC2CCCCC2)c2nc[nH]c2n1


In [39]:
# Write the CDK2 scaffold table to disk without the row index.
# NOTE(review): 'cdk2_scaffol.csv' looks like a typo for 'cdk2_scaffold.csv'
# (the other exports end in '_scaffold.csv'); check that nothing reads the
# misspelled name before renaming.
cdk2_scaffold.to_csv('cdk2_scaffol.csv', index=False)

# preparing biopterin cofactor data

In [41]:
# Load the biopterin cofactor ligand list; it is joined on CHEM_COMP_ID in
# the next step.
df_biopterin = pd.read_csv('biopterin_hetcodes.csv')

In [42]:
# Attach scaffold SMILES to each biopterin ligand ID (inner join on
# CHEM_COMP_ID).
biopterin_scaffold = df_biopterin.merge(df_scaffold, on='CHEM_COMP_ID')

In [44]:
# Write the biopterin scaffold table to disk without the row index.
biopterin_scaffold.to_csv('biopterin_scaffold.csv', index=False)

# Preparing physicochemical property data for the drug subset

In [45]:
# Load the per-ligand property table; after the merge below, the preview shows
# Total Molweight, cLogP, H-Acceptors, H-Donors, Druglikeness and Rings.
df_ccd_prop = pd.read_csv('ccd_properties_clean.csv')

In [46]:
# Attach the property columns to each drug-like ID (inner join on
# CHEM_COMP_ID). The preview that follows shows some IDs matching more than
# one property row, including rows where every property is NaN.
ccd_prop_drug = drug_like.merge(df_ccd_prop, on='CHEM_COMP_ID')

In [48]:
# Preview the first five merged property rows.
ccd_prop_drug.head()

Unnamed: 0,CHEM_COMP_ID,Total Molweight,cLogP,H-Acceptors,H-Donors,Druglikeness,Rings
0,SU0,388.399,1.1101,8.0,2.0,-3.3861,3.0
1,SU0,388.399,1.1101,8.0,2.0,-3.3861,3.0
2,TRI,,,,,,
3,TRI,69.0668,-0.5705,3.0,1.0,1.5002,1.0
4,X0T,254.245,-1.4165,8.0,2.0,4.4457,2.0


In [53]:
# Count missing values per column to see how many property rows are empty.
ccd_prop_drug.isna().sum()

CHEM_COMP_ID          0
Total Molweight    1296
cLogP              1296
H-Acceptors        1296
H-Donors           1296
Druglikeness       1296
Rings              1296
dtype: int64

In [55]:
# Discard every row that has a missing value in any column, rebinding the
# same name so the later cells see only complete property rows.
ccd_prop_drug = ccd_prop_drug.dropna(axis=0, how='any')

In [58]:
# Re-check the per-column missing-value counts; all should now be zero.
ccd_prop_drug.isna().sum()

CHEM_COMP_ID       0
Total Molweight    0
cLogP              0
H-Acceptors        0
H-Donors           0
Druglikeness       0
Rings              0
dtype: int64

In [59]:
# Write the NaN-free drug-like property table to disk without the row index.
ccd_prop_drug.to_csv('ccd_drug_prop.csv', index=False)