In [2]:
import os
import sys
import numpy as np
import pandas as pd
import rdkit
import random
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit.ML.Cluster import Butina
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator



In [3]:
df=pd.read_csv('feature_set/poses_data_Lin_F9_Score_betaScore_LigCover.csv')
df.head(2)

Unnamed: 0,ligand_id,compound_stem,compound_name,new_smile,o_index,Lin_F9_Score,betaScore,lig_Cover
0,1,A00366875,,Cc1cccc(NC(=O)N2CCC3(CC2)CC(CC(=O)N(C)C)c2cccc...,1,-9.203915,-5.8,0.613
1,1,A00366875,,Cc1cccc(NC(=O)N2CCC3(CC2)CC(CC(=O)N(C)C)c2cccc...,2,-8.8736,-5.9,0.613


In [4]:
len(df)

14310

In [5]:
Lin_F9 = round(df.Lin_F9_Score*(-0.73349),3)
df['Lin_F9']=Lin_F9

In [6]:
LigandDescriptors = ['HeavyAtomMolWt','NumValenceElectrons','FpDensityMorgan1',
                     'FpDensityMorgan2','FpDensityMorgan3','LabuteASA',
                     'TPSA','NHOHCount','NOCount','MolLogP','MolMR']
LigandDescriptors = [x for x in LigandDescriptors if x not in ['NOCount']]
len(LigandDescriptors)

10

In [7]:
DescCalc = MolecularDescriptorCalculator(LigandDescriptors)
len(LigandDescriptors)

10

In [8]:
def GetRDKitDescriptors(smile):
    mol = Chem.MolFromSmiles(smile)
    mol.UpdatePropertyCache(strict = False)
    Chem.GetSymmSSSR(mol)
    return DescCalc.CalcDescriptors(mol)

In [9]:
Features = []

for i in df.new_smile.tolist():
    Features.append(GetRDKitDescriptors(i))
    
ss = pd.DataFrame(Features, columns=LigandDescriptors)


In [10]:
ss

Unnamed: 0,HeavyAtomMolWt,NumValenceElectrons,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,LabuteASA,TPSA,NHOHCount,MolLogP,MolMR
0,390.293,164,1.129032,1.870968,2.580645,183.727201,61.88,1,4.40612,121.4747
1,390.293,164,1.129032,1.870968,2.580645,183.727201,61.88,1,4.40612,121.4747
2,390.293,164,1.129032,1.870968,2.580645,183.727201,61.88,1,4.40612,121.4747
3,390.293,164,1.129032,1.870968,2.580645,183.727201,61.88,1,4.40612,121.4747
4,390.293,164,1.129032,1.870968,2.580645,183.727201,61.88,1,4.40612,121.4747
...,...,...,...,...,...,...,...,...,...,...
14305,146.081,58,1.454545,2.090909,2.545455,62.886226,83.55,4,0.67260,39.4785
14306,146.081,58,1.454545,2.090909,2.545455,62.886226,83.55,4,0.67260,39.4785
14307,146.081,58,1.454545,2.090909,2.545455,62.886226,83.55,4,0.67260,39.4785
14308,146.081,58,1.454545,2.090909,2.545455,62.886226,83.55,4,0.67260,39.4785


In [11]:
df = pd.concat([df,ss],axis =1, ignore_index = False)

In [12]:
List = ['metal%d'%x for x in range(2,8)] + ['Nbw','Epw','Elw']

In [13]:
def Update(df,List):
    for i in List:
        df[i] = pd.Series([0]*len(df), index=df.index)
    return df

In [14]:
df = Update(df,List)

In [15]:
df.to_csv('feature_set/poses_data_update.csv', index=False)

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, rdFMCS, Descriptors
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

mols=[]
Name=[]
for i,j,m in zip(df['SMILES'],df['ligand_id'],round(df['Lin_F9_Score'],3)):
    mol=Chem.MolFromSmiles(i)
    mols.append(mol)
    Name.append('ligand_id:%d | Lin_F9_Score:%s'%(j,m))

Chem.Draw.MolsToGridImage(mols, molsPerRow=3,subImgSize=(350,350),legends=Name)

In [31]:
ss = pd.read_csv("feature_set/poses_data_feature.csv")
ss=ss.drop_duplicates()
ss

Unnamed: 0,pdb,vina,vina1,vina2,vina3,vina4,vina5,vina6,vina7,vina8,...,vina39,vina40,vina41,vina42,vina43,vina44,vina45,vina46,vina47,vina48
0,1_1,0,0.0,4.6673,48.669,79.268,129.880,211.92,222.87,10.3710,...,29.541,-80.653,-0.01629,-0.00376,3.0,3.0,31.0,16.0,4.0,4.0
1,1_2,0,0.0,3.3743,50.409,87.772,128.440,189.99,216.32,10.4630,...,25.988,-78.127,-0.03233,0.01106,3.0,3.0,31.0,16.0,4.0,4.0
2,1_3,0,0.0,4.6000,42.114,76.679,134.110,222.44,227.13,13.2270,...,28.433,-79.793,0.04649,0.00872,3.0,3.0,31.0,16.0,4.0,4.0
3,1_4,0,0.0,5.2989,50.675,80.235,143.230,222.56,247.98,13.5840,...,29.457,-78.758,-0.14700,-0.03258,3.0,3.0,31.0,16.0,4.0,4.0
4,1_5,0,0.0,4.2632,39.167,73.599,106.110,198.25,238.77,13.2720,...,24.075,-72.715,0.25780,0.04847,3.0,3.0,31.0,16.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14305,2866_1,0,0.0,2.2165,30.748,46.589,67.456,112.41,127.98,8.2070,...,23.375,-32.885,0.49797,0.12416,2.5,2.5,11.0,4.0,4.0,4.0
14306,2866_2,0,0.0,2.7052,26.752,44.869,73.935,100.77,118.91,6.5946,...,22.419,-33.028,0.28756,0.08883,2.5,2.5,11.0,4.0,4.0,4.0
14307,2866_3,0,0.0,2.9894,27.761,46.308,76.973,106.68,125.78,8.7598,...,23.500,-33.541,0.36371,0.07427,2.5,2.5,11.0,4.0,4.0,4.0
14308,2866_4,0,0.0,3.2418,30.031,49.561,70.436,112.15,133.33,9.1956,...,23.115,-34.432,0.40985,0.10372,2.5,2.5,11.0,4.0,4.0,4.0


In [32]:
ligand_id,o_index=[],[]
for ligand in ss.pdb:
    if '_' in ligand:
        ligand_i=ligand.split('_')[0]
        index=ligand.split('_')[1]
    else:
        ligand_i=ligand
        index=ligand
    ligand_id.append(ligand_i)
    o_index.append(index)
ss['ligand_id']=ligand_id
ss['o_index']=o_index

In [33]:
ss.to_csv('feature_set/poses_data_vina.csv')

In [21]:
sa = pd.read_csv('feature_set/poses_data_sasa.csv')
sa=sa.drop_duplicates()

In [22]:
ligand_id,o_index=[],[]
for ligand in sa.pdb:
    if '_' in ligand:
        ligand_i=ligand.split('_')[0]
        index=ligand.split('_')[1]
    else:
        ligand_i=ligand
        index=ligand
    ligand_id.append(ligand_i)
    o_index.append(index)
sa['ligand_id']=ligand_id
sa['o_index']=o_index

In [23]:
sa

Unnamed: 0,pdb,P2.P,P2.N,P2.DA,P2.D,P2.A,P2.AR,P2.H,P2.PL,P2.HA,...,P2dp.DA,P2dp.D,P2dp.A,P2dp.AR,P2dp.H,P2dp.PL,P2dp.HA,P2dp.SA,ligand_id,o_index
0,1_1,0.0,3.29,0.00,0.0,2.15,59.49,5.77,44.45,0.0,...,0,0,2.15,18.71,0,32.79,0,56.94,1,1
1,1_2,0.0,0.00,0.00,0.0,4.65,0.00,3.63,35.62,0.0,...,0,0,0.00,0.00,0,21.55,0,21.55,1,2
2,1_3,0.0,3.48,0.00,0.0,1.65,59.24,4.62,42.91,0.0,...,0,0,1.65,18.33,0,31.65,0,55.11,1,3
3,1_4,0.0,2.31,0.00,0.0,1.75,18.85,25.45,70.69,0.0,...,0,0,1.75,18.85,0,33.84,0,56.76,1,4
4,1_5,0.0,0.00,0.00,0.0,1.14,0.00,9.68,31.06,0.0,...,0,0,0.00,0.00,0,20.47,0,20.47,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14304,2866_1,0.0,0.00,17.41,0.0,8.23,0.00,0.00,5.88,0.0,...,0,0,8.23,0.00,0,5.88,0,14.11,2866,1
14305,2866_2,0.0,216.70,0.00,0.0,164.78,109.91,0.00,92.55,0.0,...,0,0,164.78,109.91,0,92.55,0,583.94,2866,2
14306,2866_3,0.0,216.70,0.00,0.0,164.78,109.91,0.00,92.55,0.0,...,0,0,164.78,109.91,0,92.55,0,583.94,2866,3
14307,2866_4,0.0,216.70,0.00,0.0,164.78,109.91,0.00,92.55,0.0,...,0,0,164.78,109.91,0,92.55,0,583.94,2866,4


In [24]:
sa.to_csv('feature_set/poses_data_sasa.csv')

In [59]:
import pandas as pd
df=pd.read_csv('feature_set/poses_data_update.csv')
ss = pd.read_csv('feature_set/poses_data_vina.csv')
sa=pd.read_csv('feature_set/poses_data_sasa.csv')

In [60]:
df.shape

(14309, 28)

In [61]:
ss.shape

(14309, 53)

In [62]:
Vina=['vina%d'%i for i in range(1,49)]
f_type = ["P","N","DA","D","A","AR","H","PL","HA","SA"]
SASA = ["P2." + i for i in f_type] + ["P2dl." + i for i in f_type] + ["P2dp." + i for i in f_type]

In [63]:
def Get_df(df, ss, sa):
    for i in Vina:
        df[i] = pd.Series(ss[i].tolist(), index=df.index)
    for j in SASA:
        df[j] = pd.Series(sa[j].tolist(), index=df.index) 
    df['LE'] = df['Lin_F9']/df['vina45']
    return df

In [64]:
df = Get_df(df, ss, sa)

In [65]:
df

Unnamed: 0,ligand_id,compound_stem,compound_name,new_smile,o_index,Lin_F9_Score,betaScore,lig_Cover,Lin_F9,HeavyAtomMolWt,...,P2dp.N,P2dp.DA,P2dp.D,P2dp.A,P2dp.AR,P2dp.H,P2dp.PL,P2dp.HA,P2dp.SA,LE
0,1,A00366875,,Cc1cccc(NC(=O)N2CCC3(CC2)CC(CC(=O)N(C)C)c2cccc...,1,-9.203915,-5.8,0.613,6.751,390.293,...,3.29,0,0,2.15,18.71,0,32.79,0,56.94,0.217774
1,1,A00366875,,Cc1cccc(NC(=O)N2CCC3(CC2)CC(CC(=O)N(C)C)c2cccc...,2,-8.873600,-5.9,0.613,6.509,390.293,...,0.00,0,0,0.00,0.00,0,21.55,0,21.55,0.209968
2,1,A00366875,,Cc1cccc(NC(=O)N2CCC3(CC2)CC(CC(=O)N(C)C)c2cccc...,3,-8.815475,-5.8,0.581,6.466,390.293,...,3.48,0,0,1.65,18.33,0,31.65,0,55.11,0.208581
3,1,A00366875,,Cc1cccc(NC(=O)N2CCC3(CC2)CC(CC(=O)N(C)C)c2cccc...,4,-8.577186,-6.1,0.613,6.291,390.293,...,2.31,0,0,1.75,18.85,0,33.84,0,56.76,0.202935
4,1,A00366875,,Cc1cccc(NC(=O)N2CCC3(CC2)CC(CC(=O)N(C)C)c2cccc...,5,-8.576674,-6.3,0.613,6.291,390.293,...,0.00,0,0,0.00,0.00,0,20.47,0,20.47,0.202935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14304,2866,M65097687,PAS,Nc1ccc(C(=O)O)c(O)c1,1,-3.959940,-4.8,1.000,2.905,146.081,...,0.00,0,0,8.23,0.00,0,5.88,0,14.11,0.264091
14305,2866,M65097687,PAS,Nc1ccc(C(=O)O)c(O)c1,2,-3.936136,-5.3,0.727,2.887,146.081,...,216.70,0,0,164.78,109.91,0,92.55,0,583.94,0.262455
14306,2866,M65097687,PAS,Nc1ccc(C(=O)O)c(O)c1,3,-3.892714,-4.8,0.909,2.855,146.081,...,216.70,0,0,164.78,109.91,0,92.55,0,583.94,0.259545
14307,2866,M65097687,PAS,Nc1ccc(C(=O)O)c(O)c1,4,-3.872353,-4.3,0.909,2.840,146.081,...,216.70,0,0,164.78,109.91,0,92.55,0,583.94,0.258182


In [66]:
df.to_csv('feature_set/poses_data_all.csv',index=False)