In [1]:
import pandas as pd
import numpy as np

In [47]:
import os
import pandas as pd
import numpy as np
import random
import pickle
import sklearn.ensemble
from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error

from rdkit import Chem, DataStructs
import rdkit.Chem as rkc
import rdkit.Chem.AllChem as rkac
import rdkit.Chem.Scaffolds.MurckoScaffold as mrks
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.Avalon import pyAvalonTools

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("screening_data/screen_df.csv")

In [4]:
df.columns

Index(['DrugBank ID', 'Name', 'CAS Number', 'Drug Groups', 'InChIKey', 'InChI',
       'SMILES', 'Formula', 'KEGG Compound ID', 'KEGG Drug ID',
       'PubChem Compound ID', 'PubChem Substance ID', 'ChEBI ID', 'ChEMBL ID',
       'HET ID', 'ChemSpider ID', 'BindingDB ID'],
      dtype='object')

In [13]:
selection = ['DrugBank ID', 'SMILES']
df2 = df[selection]
df2

Unnamed: 0,DrugBank ID,SMILES
0,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...
2,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
3,DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
4,DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
...,...,...
12694,DB18711,OC(=O)CCCCC(O)=O.C[C@@H](N)COC1=CC=C(C=C1)C1=C...
12695,DB18712,
12696,DB18715,NC1=C2N(C(=O)N([C@@H]3CCCN(C3)C(=O)C=C)C2=CC=N...
12697,DB18716,[H][C@@]12CC(=O)N1[C@@H](C([O-])=O)[C@](C)(CN1...


In [14]:
df2.isna().sum()

DrugBank ID      0
SMILES         774
dtype: int64

In [16]:
final_df = df2.dropna(subset=['SMILES'])

In [18]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11925 entries, 0 to 12697
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DrugBank ID  11925 non-null  object
 1   SMILES       11925 non-null  object
dtypes: object(2)
memory usage: 279.5+ KB


In [19]:
train_df = pd.read_csv("Data/Final_dataframe_encoded.csv")
train_df

Unnamed: 0,molecule_chembl_id,activity_class,preprocessed_smiles
0,CHEMBL412059,0,Cc1ccccc1Cn1cc(NC(=O)c2noc3c2CC(C(C)(C)C)CC3)cn1
1,CHEMBL438202,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)NCCc...
2,CHEMBL261734,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)NCc5...
3,CHEMBL410040,0,CC(C)(C)OC(=O)c1ccc(Cn2cc(NC(=O)c3noc4c3CC(C(C...
4,CHEMBL409299,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)O)cc...
5,CHEMBL261486,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(F)cc4F)c3...
6,CHEMBL261487,0,COC(=O)c1ccc(Cn2cc(NC(=O)c3noc4c3CC(C(C)(C)C)C...
7,CHEMBL428787,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc5ccccc5c4)...
8,CHEMBL260779,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccccc4I)c3)c2C1
9,CHEMBL258966,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(Cl)cc4)c3...


In [95]:
# filtering out active molecules from training data 
train_active =  train_df[(train_df['activity_class'] == 0)]

In [96]:
train_active 

Unnamed: 0,molecule_chembl_id,activity_class,preprocessed_smiles
0,CHEMBL412059,0,Cc1ccccc1Cn1cc(NC(=O)c2noc3c2CC(C(C)(C)C)CC3)cn1
1,CHEMBL438202,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)NCCc...
2,CHEMBL261734,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)NCc5...
3,CHEMBL410040,0,CC(C)(C)OC(=O)c1ccc(Cn2cc(NC(=O)c3noc4c3CC(C(C...
4,CHEMBL409299,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)O)cc...
5,CHEMBL261486,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(F)cc4F)c3...
6,CHEMBL261487,0,COC(=O)c1ccc(Cn2cc(NC(=O)c3noc4c3CC(C(C)(C)C)C...
7,CHEMBL428787,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc5ccccc5c4)...
8,CHEMBL260779,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccccc4I)c3)c2C1
9,CHEMBL258966,0,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(Cl)cc4)c3...


In [119]:
training_active_smiles = train_active['preprocessed_smiles'].to_list()

In [120]:
training_active_smiles

['Cc1ccccc1Cn1cc(NC(=O)c2noc3c2CC(C(C)(C)C)CC3)cn1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)NCCc5ccccc5)cc4)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)NCc5ccccc5)cc4)c3)c2C1',
 'CC(C)(C)OC(=O)c1ccc(Cn2cc(NC(=O)c3noc4c3CC(C(C)(C)C)CC4)cn2)cc1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)O)cc4)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(F)cc4F)c3)c2C1',
 'COC(=O)c1ccc(Cn2cc(NC(=O)c3noc4c3CC(C(C)(C)C)CC4)cn2)cc1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc5ccccc5c4)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccccc4I)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(Cl)cc4)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccccc4Br)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccccc4)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(F)cc4)c3)c2C1',
 'Cc1nc2sc3ccccc3n2c1C(=O)NNC(=O)c1ccc(Oc2ccccc2)cc1',
 'Cc1nc2sccn2c1C(=O)NNC(=O)c1cccc2ccccc12',
 'Cc1nc2sccn2c1C(=O)N/N=C/c1ccc(C(F)(F)F)cc1',
 'Cc1nc2sccn2c1C(=O)N/N=C/c1ccccc1',
 'Cc1nc2sccn2c1C(=O)Nc1ccc(Br)cc1',
 'Cc1nc2sccn2c1C

In [23]:
len(final_df.SMILES.unique())

11919

In [24]:
final_df.drop_duplicates(['SMILES'], inplace = True, ignore_index = True)

In [25]:
final_df

Unnamed: 0,DrugBank ID,SMILES
0,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...
2,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
3,DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
4,DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
...,...,...
11914,DB18708,CC1=C(COC2=CC=C(N=N2)C(=O)NC2CCOCC2)C(=NO1)C1=...
11915,DB18709,CN1C=C(C=N1)C1=CN2N=CC=C2C(=N1)C1=CN(N=C1)[C@@...
11916,DB18711,OC(=O)CCCCC(O)=O.C[C@@H](N)COC1=CC=C(C=C1)C1=C...
11917,DB18715,NC1=C2N(C(=O)N([C@@H]3CCCN(C3)C(=O)C=C)C2=CC=N...


In [26]:
# Import RDkit packages

from rdkit import Chem
import rdkit.Chem as rkc
import rdkit.Chem.AllChem as rkac
import rdkit.Chem.Scaffolds.MurckoScaffold as mrks

In [30]:
from rdkit import Chem

# Assuming your screening data is stored in a DataFrame called 'screening_data'
# Assuming the SMILES column in the DataFrame is named 'SMILES'

# Define a function to remove stereochemistry from SMILES
def remove_stereochemistry(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    Chem.RemoveStereochemistry(mol)
    return Chem.MolToSmiles(mol)

# Apply the function to the SMILES column
final_df['SMILES'] = final_df['SMILES'].apply(remove_stereochemistry)

# Remove rows with invalid SMILES after removing stereochemistry
final_df = final_df.dropna(subset=['SMILES'])

# Optionally, reset the index of the DataFrame
final_df.reset_index(drop=True, inplace=True)


[00:19:50] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[00:19:50] SMILES Parse Error: syntax error while parsing: OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]
[00:19:50] SMILES Parse Error: Failed parsing SMILES 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]' for input: 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]'
[00:19:51] Explicit valence for atom # 19 O, 3, is greater than permitted
[00:19:51] Explicit valence for atom # 0 O, 3, is greater than permitted
[00:19:51] Unusual charge on atom 0 number of radical electrons set to zero
[00:19:51] Explicit valence for atom # 4 F, 2, is greater than permitted
[

In [31]:
final_df

Unnamed: 0,DrugBank ID,SMILES
0,DB00006,CCC(C)C(NC(=O)C(CCC(=O)O)NC(=O)C(CCC(=O)O)NC(=...
1,DB00007,CCNC(=O)C1CCCN1C(=O)C(CCCNC(=N)N)NC(=O)C(CC(C)...
2,DB00014,CC(C)CC(NC(=O)C(COC(C)(C)C)NC(=O)C(Cc1ccc(O)cc...
3,DB00027,CC(C)CC(NC(=O)CNC(=O)C(NC=O)C(C)C)C(=O)NC(C)C(...
4,DB00035,N=C(N)NCCCC(NC(=O)C1CCCN1C(=O)C1CSSCCC(=O)NC(C...
...,...,...
11904,DB18708,Cc1ccc(-c2noc(C)c2COc2ccc(C(=O)NC3CCOCC3)nn2)cn1
11905,DB18709,Cn1cc(-c2cn3nccc3c(-c3cnn(C4(CC#N)CC(C#N)C4)c3...
11906,DB18711,CC(N)COc1ccc(-c2cnc3ccc(NC(C)c4cccc(F)c4)nn23)...
11907,DB18715,C=CC(=O)N1CCCC(n2c(=O)n(-c3ccc(Oc4ccccc4)cc3)c...


In [33]:
# for this case we also declare compounds that are too large or too small as invalid

def is_invalid(smi):

    global i
    if smi:
        # setting sanitize to false avoids explicit valence error
        # setting it to true gives the error and can be used to count invalid mols
        mol = rkc.MolFromSmiles(smi, sanitize = True)
        if mol is None:
            return int(1)
        elif (len(smi)  > 120) or (len(smi) < 20):
            return int(1)
        else:
            pass
        return int(0)

In [34]:
# create a series of valid/invalid values. 0 = valid, 1 = invalid
invalid_list = list(map(is_invalid, final_df['SMILES']))

[00:32:06] Unusual charge on atom 42 number of radical electrons set to zero


In [35]:
print('The number of invalid SMILES in the dataframe is: ' + str(sum(invalid_list)))

The number of invalid SMILES in the dataframe is: 1561


In [36]:
invalid_series = pd.Series(invalid_list, name = 'Invalid')

final_df = pd.concat([final_df, invalid_series], axis = 1)

In [37]:
final_df['Invalid'].value_counts()

Invalid
0    10348
1     1561
Name: count, dtype: int64

In [38]:
# create anew df where invalid == 0
clean_valid_df = final_df[final_df['Invalid'] == 0]
clean_valid_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10348 entries, 10 to 11908
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DrugBank ID  10348 non-null  object
 1   SMILES       10348 non-null  object
 2   Invalid      10348 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 323.4+ KB


In [39]:
# reset index since 'clean_valid_df' is a subset of 'clean_df'
clean_valid_df.reset_index(drop=True, inplace=True)

#drop 'invalid' column
clean_valid_df = clean_valid_df.drop(['Invalid'], axis = 1)

clean_valid_df.tail()

Unnamed: 0,DrugBank ID,SMILES
10343,DB18708,Cc1ccc(-c2noc(C)c2COc2ccc(C(=O)NC3CCOCC3)nn2)cn1
10344,DB18709,Cn1cc(-c2cn3nccc3c(-c3cnn(C4(CC#N)CC(C#N)C4)c3...
10345,DB18711,CC(N)COc1ccc(-c2cnc3ccc(NC(C)c4cccc(F)c4)nn23)...
10346,DB18715,C=CC(=O)N1CCCC(n2c(=O)n(-c3ccc(Oc4ccccc4)cc3)c...
10347,DB18716,C[n+]1ccn(CC2(C)C(C(=O)[O-])N3C(=O)CC3S2(=O)=O)n1


In [41]:
# remove duplicates
clean_valid_df.drop_duplicates(subset = ['SMILES'], inplace = True)

#reset index, check info
clean_valid_df.reset_index(drop=True, inplace=True)
clean_valid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10072 entries, 0 to 10071
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DrugBank ID  10072 non-null  object
 1   SMILES       10072 non-null  object
dtypes: object(2)
memory usage: 157.5+ KB


In [42]:
screen_clean_df = clean_valid_df

In [43]:
screen_clean_df

Unnamed: 0,DrugBank ID,SMILES
0,DB00114,Cc1ncc(COP(=O)(O)O)c(C=O)c1O
1,DB00116,Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)NC(CCC(=O)...
2,DB00117,NC(Cc1c[nH]cn1)C(=O)O
3,DB00118,C[S+](CCC(N)C(=O)[O-])CC1OC(n2cnc3c(N)ncnc32)C...
4,DB00121,O=C(O)CCCCC1SCC2NC(=O)NC21
...,...,...
10067,DB18708,Cc1ccc(-c2noc(C)c2COc2ccc(C(=O)NC3CCOCC3)nn2)cn1
10068,DB18709,Cn1cc(-c2cn3nccc3c(-c3cnn(C4(CC#N)CC(C#N)C4)c3...
10069,DB18711,CC(N)COc1ccc(-c2cnc3ccc(NC(C)c4cccc(F)c4)nn23)...
10070,DB18715,C=CC(=O)N1CCCC(n2c(=O)n(-c3ccc(Oc4ccccc4)cc3)c...


In [44]:
# Step 1: Extract SMILES sequences from training data
training_smiles = set(train_df['preprocessed_smiles'])

# Step 2: Remove duplicates from the downloaded database
db_df_unique = screen_clean_df[~screen_clean_df['SMILES'].isin(training_smiles)].copy()

# Optionally, you can reset the index of the DataFrame after removing duplicates
db_df_unique.reset_index(drop=True, inplace=True)


In [45]:
db_df_unique

Unnamed: 0,DrugBank ID,SMILES
0,DB00114,Cc1ncc(COP(=O)(O)O)c(C=O)c1O
1,DB00116,Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)NC(CCC(=O)...
2,DB00117,NC(Cc1c[nH]cn1)C(=O)O
3,DB00118,C[S+](CCC(N)C(=O)[O-])CC1OC(n2cnc3c(N)ncnc32)C...
4,DB00121,O=C(O)CCCCC1SCC2NC(=O)NC21
...,...,...
10067,DB18708,Cc1ccc(-c2noc(C)c2COc2ccc(C(=O)NC3CCOCC3)nn2)cn1
10068,DB18709,Cn1cc(-c2cn3nccc3c(-c3cnn(C4(CC#N)CC(C#N)C4)c3...
10069,DB18711,CC(N)COc1ccc(-c2cnc3ccc(NC(C)c4cccc(F)c4)nn23)...
10070,DB18715,C=CC(=O)N1CCCC(n2c(=O)n(-c3ccc(Oc4ccccc4)cc3)c...


In [46]:
screen_db = db_df_unique

In [49]:
screen_db

Unnamed: 0,DrugBank ID,SMILES
0,DB00114,Cc1ncc(COP(=O)(O)O)c(C=O)c1O
1,DB00116,Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)NC(CCC(=O)...
2,DB00117,NC(Cc1c[nH]cn1)C(=O)O
3,DB00118,C[S+](CCC(N)C(=O)[O-])CC1OC(n2cnc3c(N)ncnc32)C...
4,DB00121,O=C(O)CCCCC1SCC2NC(=O)NC21
...,...,...
10067,DB18708,Cc1ccc(-c2noc(C)c2COc2ccc(C(=O)NC3CCOCC3)nn2)cn1
10068,DB18709,Cn1cc(-c2cn3nccc3c(-c3cnn(C4(CC#N)CC(C#N)C4)c3...
10069,DB18711,CC(N)COc1ccc(-c2cnc3ccc(NC(C)c4cccc(F)c4)nn23)...
10070,DB18715,C=CC(=O)N1CCCC(n2c(=O)n(-c3ccc(Oc4ccccc4)cc3)c...


In [50]:
X_train = screen_db['SMILES']

In [51]:
X_train

0                             Cc1ncc(COP(=O)(O)O)c(C=O)c1O
1        Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)NC(CCC(=O)...
2                                    NC(Cc1c[nH]cn1)C(=O)O
3        C[S+](CCC(N)C(=O)[O-])CC1OC(n2cnc3c(N)ncnc32)C...
4                               O=C(O)CCCCC1SCC2NC(=O)NC21
                               ...                        
10067     Cc1ccc(-c2noc(C)c2COc2ccc(C(=O)NC3CCOCC3)nn2)cn1
10068    Cn1cc(-c2cn3nccc3c(-c3cnn(C4(CC#N)CC(C#N)C4)c3...
10069    CC(N)COc1ccc(-c2cnc3ccc(NC(C)c4cccc(F)c4)nn23)...
10070    C=CC(=O)N1CCCC(n2c(=O)n(-c3ccc(Oc4ccccc4)cc3)c...
10071    C[n+]1ccn(CC2(C)C(C(=O)[O-])N3C(=O)CC3S2(=O)=O)n1
Name: SMILES, Length: 10072, dtype: object

In [52]:
# convers a list of SMILEs to a list of mols
def smi_to_mols(smi_list):
    mols_list = [Chem.MolFromSmiles(smile) for smile in smi_list]
    return mols_list


# returns morgan fingerprints as a 2D array for a list of SMILE strings
def get_morgan_fingerprints(smi_list, size, radius, useFeatures, useCounts=True):
        mols = smi_to_mols(smi_list)
        fps = [AllChem.GetMorganFingerprint(mol, radius, useCounts=useCounts, useFeatures=useFeatures) for mol in mols]
        fps_arr = np.zeros((len(fps), size), np.int32)
        for i, fp in enumerate(fps):
            for index, value in fp.GetNonzeroElements().items():
                n_index = index % size
                fps_arr[i, n_index] += int(value)
        return fps_arr

In [53]:
# conver series to a list
X_train_list = X_train.tolist()

In [54]:
# get fingerprints for training and test data
X_train_arr = get_morgan_fingerprints(X_train_list, size=2048, radius=2, useFeatures=True, useCounts=True)

[00:51:54] Unusual charge on atom 42 number of radical electrons set to zero


In [55]:
from sklearn.decomposition import PCA
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

In [56]:
import pickle

# Load the pickled StandardScaler object from file
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)


In [57]:
X_screen = scaler.transform(X_train_arr)

In [58]:
X_screen

array([[-1.05718968,  0.69687315, -0.16568939, ...,  0.        ,
         0.        ,  0.        ],
       [-0.05676186,  1.95660538,  0.26694402, ...,  0.        ,
         0.        ,  0.        ],
       [-1.72414156, -0.56285908, -1.03095621, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.61019002,  0.69687315, -0.16568939, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.61019002, -1.82259131, -0.16568939, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.94366596, -1.82259131,  0.69957743, ...,  0.        ,
         0.        ,  0.        ]])

In [59]:
import pickle

# Load the pickled PCA object from file
with open('pca.pkl', 'rb') as f:
    pca_model = pickle.load(f)


In [60]:
screen_cmpts = pca_model.transform(X_screen)

In [62]:
screen_cmpts.shape

(10072, 25)

# **PREDICTION**

In [65]:
import joblib

In [66]:
import pandas as pd
# from sklearn.externals import joblib

def load_and_predict_models(screen_cmpts):
    # Load KNN model
    knn_model = joblib.load('Models/knn_model_25.pkl')

    # Load SVM model
    svm_model = joblib.load('Models/svm_model_25.pkl')

    # Load XGBoost model
    xgb_model = joblib.load('Models/xgb_model_25.pkl')

    # Load Random Forest model
    rf_model = joblib.load('Models/rf_model_25.pkl')

    # Make predictions using each model
    knn_preds = knn_model.predict(screen_cmpts)
    svm_preds = svm_model.predict(screen_cmpts)
    xgb_preds = xgb_model.predict(screen_cmpts)
    rf_preds = rf_model.predict(screen_cmpts)

    # Create DataFrames for predictions
    knn_df = pd.DataFrame({'Prediction': knn_preds})
    svm_df = pd.DataFrame({'Prediction': svm_preds})
    xgb_df = pd.DataFrame({'Prediction': xgb_preds})
    rf_df = pd.DataFrame({'Prediction': rf_preds})

    # Save predictions to CSV files
    knn_df.to_csv('knn_predictions.csv', index=False)
    svm_df.to_csv('svm_predictions.csv', index=False)
    xgb_df.to_csv('xgb_predictions.csv', index=False)
    rf_df.to_csv('rf_predictions.csv', index=False)


In [67]:
load_and_predict_models(screen_cmpts)


In [74]:
import pandas as pd

# Load each prediction CSV file
knn_preds = pd.read_csv('knn_predictions.csv')
svm_preds = pd.read_csv('svm_predictions.csv')
xgb_preds = pd.read_csv('xgb_predictions.csv')
rf_preds = pd.read_csv('rf_predictions.csv')

# Combine predictions into a single DataFrame with each model's predictions in separate columns
combined_preds = pd.concat([knn_preds, svm_preds, xgb_preds, rf_preds], axis=1)

# Rename columns with model names
combined_preds.columns = ['KNN', 'SVM', 'XGBoost', 'RandomForest']

# Save the combined predictions to a CSV file
combined_preds.to_csv('combined_predictions.csv', index=False)


In [75]:
combined_preds

Unnamed: 0,KNN,SVM,XGBoost,RandomForest
0,1,0,1,1
1,0,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
...,...,...,...,...
10067,0,1,1,1
10068,1,0,1,1
10069,0,1,1,1
10070,0,1,1,1


In [82]:
pred_df = pd.concat([combined_preds, screen_db], axis = 1)

In [83]:
pred_df

Unnamed: 0,KNN,SVM,XGBoost,RandomForest,DrugBank ID,SMILES
0,1,0,1,1,DB00114,Cc1ncc(COP(=O)(O)O)c(C=O)c1O
1,0,1,1,1,DB00116,Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)NC(CCC(=O)...
2,1,1,1,1,DB00117,NC(Cc1c[nH]cn1)C(=O)O
3,1,1,1,1,DB00118,C[S+](CCC(N)C(=O)[O-])CC1OC(n2cnc3c(N)ncnc32)C...
4,1,1,1,1,DB00121,O=C(O)CCCCC1SCC2NC(=O)NC21
...,...,...,...,...,...,...
10067,0,1,1,1,DB18708,Cc1ccc(-c2noc(C)c2COc2ccc(C(=O)NC3CCOCC3)nn2)cn1
10068,1,0,1,1,DB18709,Cn1cc(-c2cn3nccc3c(-c3cnn(C4(CC#N)CC(C#N)C4)c3...
10069,0,1,1,1,DB18711,CC(N)COc1ccc(-c2cnc3ccc(NC(C)c4cccc(F)c4)nn23)...
10070,0,1,1,1,DB18715,C=CC(=O)N1CCCC(n2c(=O)n(-c3ccc(Oc4ccccc4)cc3)c...


In [78]:
combined_preds.shape

(10072, 4)

In [79]:
screen_db.shape

(10072, 2)

In [84]:
zero_predictions = pred_df[(pred_df['KNN'] == 0) & (pred_df['SVM'] == 0) & (pred_df['XGBoost'] == 0) & (pred_df['RandomForest'] == 0)]

# Print or use the filtered DataFrame
zero_predictions

Unnamed: 0,KNN,SVM,XGBoost,RandomForest,DrugBank ID,SMILES
53,0,0,0,0,DB00201,Cn1c(=O)c2c(ncn2C)n(C)c1=O
126,0,0,0,0,DB00279,NC(Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1)C(=O)O
228,0,0,0,0,DB00398,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...
277,0,0,0,0,DB00451,NC(Cc1cc(I)c(Oc2cc(I)c(O)c(I)c2)c(I)c1)C(=O)O
513,0,0,0,0,DB00716,CCCc1c2oc(C(=O)O)cc(=O)c2cc2c(=O)cc(C(=O)O)n(C...
...,...,...,...,...,...,...
9561,0,0,0,0,DB16846,Cc1cnc(CNC(=O)c2c(=O)c3ccc(N4CCCN(C)CC4)nc3n3c...
9651,0,0,0,0,DB17029,Cn1c2ccccc2c2c3c(c4c5ccccc5n(CCC#N)c4c21)CNC3=O
9862,0,0,0,0,DB17930,COc1nc2c(c(=O)n1C)n(C)c(=O)n2C
9933,0,0,0,0,DB18080,Cc1nc2ccc(F)cc2n1-c1nc(N)c(F)c(Nc2ccc(C(F)(F)F...


In [86]:
zero_predictions.to_csv("active_mols/active_molecules.csv", index = False)

In [4]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def calculate_lipinski_properties(smiles):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None
    
    # Calculate descriptors
    mw = Descriptors.MolWt(molecule)
    logp = Descriptors.MolLogP(molecule)
    hbd = Descriptors.NumHDonors(molecule)
    hba = Descriptors.NumHAcceptors(molecule)
    
    # Check Lipinski's Rule of Five
    lipinski_rule = {
        "Molecular Weight": mw,
        "LogP": logp,
        "Hydrogen Bond Donors": hbd,
        "Hydrogen Bond Acceptors": hba
    }
    
    # Check if molecule passes Lipinski's Rule of Five
    passes_rule = all([
        mw <= 500,
        logp <= 5,
        hbd <= 5,
        hba <= 10
    ])
    
    return lipinski_rule, passes_rule

In [5]:
# Apply Lipinski's Rule of Five calculation to each SMILES string
zero_predictions["Lipinski_Properties"] = zero_predictions["SMILES"].apply(lambda x: calculate_lipinski_properties(x))

# Extract results into separate columns
zero_predictions["Molecular_Weight"] = zero_predictions["Lipinski_Properties"].apply(lambda x: x[0]["Molecular Weight"] if x else None)
zero_predictions["LogP"] = zero_predictions["Lipinski_Properties"].apply(lambda x: x[0]["LogP"] if x else None)
zero_predictions["Hydrogen_Bond_Donors"] = zero_predictions["Lipinski_Properties"].apply(lambda x: x[0]["Hydrogen Bond Donors"] if x else None)
zero_predictions["Hydrogen_Bond_Acceptors"] = zero_predictions["Lipinski_Properties"].apply(lambda x: x[0]["Hydrogen Bond Acceptors"] if x else None)
zero_predictions["Passes_Lipinski_Rule"] = zero_predictions["Lipinski_Properties"].apply(lambda x: x[1] if x else None)

# Drop the temporary "Lipinski_Properties" column
zero_predictions.drop(columns=["Lipinski_Properties"], inplace=True)


In [7]:
zero_predictions

Unnamed: 0,KNN,SVM,XGBoost,RandomForest,DrugBank ID,SMILES,Molecular_Weight,LogP,Hydrogen_Bond_Donors,Hydrogen_Bond_Acceptors,Passes_Lipinski_Rule
0,0,0,0,0,DB00201,Cn1c(=O)c2c(ncn2C)n(C)c1=O,194.194,-1.02930,0,6,True
1,0,0,0,0,DB00279,NC(Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1)C(=O)O,650.976,3.95270,3,4,False
2,0,0,0,0,DB00398,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,464.831,5.54970,3,4,False
3,0,0,0,0,DB00451,NC(Cc1cc(I)c(Oc2cc(I)c(O)c(I)c2)c(I)c1)C(=O)O,776.872,4.55730,3,4,False
4,0,0,0,0,DB00716,CCCc1c2oc(C(=O)O)cc(=O)c2cc2c(=O)cc(C(=O)O)n(C...,371.345,2.47670,2,6,True
...,...,...,...,...,...,...,...,...,...,...,...
103,0,0,0,0,DB16846,Cc1cnc(CNC(=O)c2c(=O)c3ccc(N4CCCN(C)CC4)nc3n3c...,513.627,3.23272,1,9,False
104,0,0,0,0,DB17029,Cn1c2ccccc2c2c3c(c4c5ccccc5n(CCC#N)c4c21)CNC3=O,378.435,4.59648,1,4,True
105,0,0,0,0,DB17930,COc1nc2c(c(=O)n1C)n(C)c(=O)n2C,224.220,-1.02070,0,7,True
106,0,0,0,0,DB18080,Cc1nc2ccc(F)cc2n1-c1nc(N)c(F)c(Nc2ccc(C(F)(F)F...,420.345,4.74672,2,6,True


In [8]:
active_mols = zero_predictions[(zero_predictions['Passes_Lipinski_Rule'] == True)]

In [9]:
active_mols.reset_index(inplace=True)

In [10]:
active_mols

Unnamed: 0,index,KNN,SVM,XGBoost,RandomForest,DrugBank ID,SMILES,Molecular_Weight,LogP,Hydrogen_Bond_Donors,Hydrogen_Bond_Acceptors,Passes_Lipinski_Rule
0,0,0,0,0,0,DB00201,Cn1c(=O)c2c(ncn2C)n(C)c1=O,194.194,-1.02930,0,6,True
1,4,0,0,0,0,DB00716,CCCc1c2oc(C(=O)O)cc(=O)c2cc2c(=O)cc(C(=O)O)n(C...,371.345,2.47670,2,6,True
2,5,0,0,0,0,DB00876,CCCCc1ncc(C=C(Cc2cccs2)C(=O)O)n1Cc1ccc(C(=O)O)cc1,424.522,4.74440,2,5,True
3,6,0,0,0,0,DB01097,Cc1oncc1C(=O)Nc1ccc(C(F)(F)F)cc1,270.210,3.25412,1,3,True
4,7,0,0,0,0,DB01393,CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,361.825,3.55450,2,3,True
...,...,...,...,...,...,...,...,...,...,...,...,...
58,102,0,0,0,0,DB16734,O=C(O)CCn1ccc(-c2cc(C(F)(F)F)cc(C(F)(F)F)c2)n1,352.234,4.06240,1,3,True
59,104,0,0,0,0,DB17029,Cn1c2ccccc2c2c3c(c4c5ccccc5n(CCC#N)c4c21)CNC3=O,378.435,4.59648,1,4,True
60,105,0,0,0,0,DB17930,COc1nc2c(c(=O)n1C)n(C)c(=O)n2C,224.220,-1.02070,0,7,True
61,106,0,0,0,0,DB18080,Cc1nc2ccc(F)cc2n1-c1nc(N)c(F)c(Nc2ccc(C(F)(F)F...,420.345,4.74672,2,6,True


In [12]:
active_mols.drop(columns=['index'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  active_mols.drop(columns=['index'], inplace = True)


In [13]:
active_mols

Unnamed: 0,KNN,SVM,XGBoost,RandomForest,DrugBank ID,SMILES,Molecular_Weight,LogP,Hydrogen_Bond_Donors,Hydrogen_Bond_Acceptors,Passes_Lipinski_Rule
0,0,0,0,0,DB00201,Cn1c(=O)c2c(ncn2C)n(C)c1=O,194.194,-1.02930,0,6,True
1,0,0,0,0,DB00716,CCCc1c2oc(C(=O)O)cc(=O)c2cc2c(=O)cc(C(=O)O)n(C...,371.345,2.47670,2,6,True
2,0,0,0,0,DB00876,CCCCc1ncc(C=C(Cc2cccs2)C(=O)O)n1Cc1ccc(C(=O)O)cc1,424.522,4.74440,2,5,True
3,0,0,0,0,DB01097,Cc1oncc1C(=O)Nc1ccc(C(F)(F)F)cc1,270.210,3.25412,1,3,True
4,0,0,0,0,DB01393,CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,361.825,3.55450,2,3,True
...,...,...,...,...,...,...,...,...,...,...,...
58,0,0,0,0,DB16734,O=C(O)CCn1ccc(-c2cc(C(F)(F)F)cc(C(F)(F)F)c2)n1,352.234,4.06240,1,3,True
59,0,0,0,0,DB17029,Cn1c2ccccc2c2c3c(c4c5ccccc5n(CCC#N)c4c21)CNC3=O,378.435,4.59648,1,4,True
60,0,0,0,0,DB17930,COc1nc2c(c(=O)n1C)n(C)c(=O)n2C,224.220,-1.02070,0,7,True
61,0,0,0,0,DB18080,Cc1nc2ccc(F)cc2n1-c1nc(N)c(F)c(Nc2ccc(C(F)(F)F...,420.345,4.74672,2,6,True


In [14]:
active_mols.to_csv("active_mols/lipinski_true_active.csv", index = False)

## **The above 63 molecules will be used for virtual screening**

In [121]:
training_active_smiles

['Cc1ccccc1Cn1cc(NC(=O)c2noc3c2CC(C(C)(C)C)CC3)cn1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)NCCc5ccccc5)cc4)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)NCc5ccccc5)cc4)c3)c2C1',
 'CC(C)(C)OC(=O)c1ccc(Cn2cc(NC(=O)c3noc4c3CC(C(C)(C)C)CC4)cn2)cc1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)O)cc4)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(F)cc4F)c3)c2C1',
 'COC(=O)c1ccc(Cn2cc(NC(=O)c3noc4c3CC(C(C)(C)C)CC4)cn2)cc1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc5ccccc5c4)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccccc4I)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(Cl)cc4)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccccc4Br)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccccc4)c3)c2C1',
 'CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(F)cc4)c3)c2C1',
 'Cc1nc2sc3ccccc3n2c1C(=O)NNC(=O)c1ccc(Oc2ccccc2)cc1',
 'Cc1nc2sccn2c1C(=O)NNC(=O)c1cccc2ccccc12',
 'Cc1nc2sccn2c1C(=O)N/N=C/c1ccc(C(F)(F)F)cc1',
 'Cc1nc2sccn2c1C(=O)N/N=C/c1ccccc1',
 'Cc1nc2sccn2c1C(=O)Nc1ccc(Br)cc1',
 'Cc1nc2sccn2c1C

In [114]:
predicted_active_smiles = active_mols["SMILES"].to_list()

In [115]:
predicted_active_smiles

['Cn1c(=O)c2c(ncn2C)n(C)c1=O',
 'CCCc1c2oc(C(=O)O)cc(=O)c2cc2c(=O)cc(C(=O)O)n(CC)c12',
 'CCCCc1ncc(C=C(Cc2cccs2)C(=O)O)n1Cc1ccc(C(=O)O)cc1',
 'Cc1oncc1C(=O)Nc1ccc(C(F)(F)F)cc1',
 'CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O',
 'O=C(O)c1cccnc1C(=O)O',
 'CNC1CC2OC(C)(C1OC)n1c3ccccc3c3c4c(c5c6ccccc6n2c5c31)C(=O)NC4',
 'O=C(O)Cn1c(=O)n(Cc2ccc(Br)cc2F)c(=O)c2ccc(Cl)cc21',
 'COC(=O)C1(O)CC2OC1(C)n1c3ccccc3c3c4c(c5c6ccccc6n2c5c31)C(=O)NC4',
 'Cc1nc2ccccc2n1Cc1ccc2c(c1)nc(CNc1ccc(C(=N)N)cc1)n2C',
 'O=C(O)COc1cc(F)ccc1C(=O)NCc1nc2c(F)c(F)cc(F)c2s1',
 'Nc1nc(O)c2nc(CNc3ccc(C(=O)O)cc3)cnc2[nH+]1',
 'O=C(O)c1cccnc1Nc1cccc(C(F)(F)F)c1',
 'CC(C)C1(C)N=C(c2nc3ccccc3cc2C(=O)O)NC1=O',
 'C=C(CC(NC(=O)c1ccc(CCc2ccc3nc(N)nc(N)c3c2)cc1)C(=O)O)C(=O)O',
 'Cc1nc2c(N)nc3cccnc3c2n1CC(C)C',
 'Cc1cc2nc3c4cc(Cl)ccc4n(CC(=O)NCCN(C)C)c3nc2cc1C',
 'COc1cccc2c1c(O)c(C(=O)N(C)c1ccc(C(F)(F)F)cc1)c(=O)n2C',
 'CCNC(=O)c1cn2ncnc(Nc3cc(C(=O)NOC)ccc3C)c2c1C',
 'O=C(NN=Cc1cc(Br)c(O)c(Br)c1O)c1cccc(Br)c1',
 'O=C(O)Cn1cc(Cc2nc

In [133]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs import TanimotoSimilarity


# Convert training active molecules to Morgan fingerprints
training_active_fps = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles), 2, nBits=1024) for smiles in training_active_smiles]

# Example DataFrame containing predicted active molecules

# Convert predicted active molecules to Morgan fingerprints
predicted_active_fps = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles), 2, nBits=1024) for smiles in active_mols["SMILES"]]

# Calculate Tanimoto similarity
similar_molecules = []
for pred_fp, pred_smiles in zip(predicted_active_fps, active_mols["SMILES"]):
    for train_fp, train_smiles in zip(training_active_fps, training_active_smiles):
        similarity = TanimotoSimilarity(pred_fp, train_fp)
        if similarity >= 0.3:
            similar_molecules.append((pred_smiles, train_smiles, similarity))

# Convert results to DataFrame
similar_molecules_df = pd.DataFrame(similar_molecules, columns=["Predicted_SMILES", "Training_SMILES", "Tanimoto_Similarity"])

# Print similar molecules
print("Molecules similar to training active molecules:")
similar_molecules_df


Molecules similar to training active molecules:


Unnamed: 0,Predicted_SMILES,Training_SMILES,Tanimoto_Similarity
0,Cc1oncc1C(=O)Nc1ccc(C(F)(F)F)cc1,Cc1nc2sccn2c1C(=O)N/N=C/c1ccc(C(F)(F)F)cc1,0.3125
1,CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)NCCc...,0.3625
2,CCC(Cc1ccc(OC)c(CNC(=O)c2ccc(C(F)(F)F)cc2F)c1)...,COc1ccc2c(c1)cc(C(=O)NS(=O)(=O)Cc1ccc(C(F)(F)F...,0.301205


In [126]:
similar_molecules_df

Unnamed: 0,Predicted_SMILES,Training_SMILES,Tanimoto_Similarity
0,Cc1oncc1C(=O)Nc1ccc(C(F)(F)F)cc1,Cc1nc2sccn2c1C(=O)N/N=C/c1ccc(C(F)(F)F)cc1,0.3125
1,CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,CC(C)(C)C1CCc2onc(C(=O)Nc3cnn(Cc4ccc(C(=O)NCCc...,0.3625
2,CCC(Cc1ccc(OC)c(CNC(=O)c2ccc(C(F)(F)F)cc2F)c1)...,COc1ccc2c(c1)cc(C(=O)NS(=O)(=O)Cc1ccc(C(F)(F)F...,0.301205


In [127]:
similar_molecules_df.to_csv("tani_sim_mol.csv", index = False)

In [130]:
active_mols

Unnamed: 0,index,KNN,SVM,XGBoost,RandomForest,DrugBank ID,SMILES,Molecular_Weight,LogP,Hydrogen_Bond_Donors,Hydrogen_Bond_Acceptors,Passes_Lipinski_Rule
0,53,0,0,0,0,DB00201,Cn1c(=O)c2c(ncn2C)n(C)c1=O,194.194,-1.02930,0,6,True
1,513,0,0,0,0,DB00716,CCCc1c2oc(C(=O)O)cc(=O)c2cc2c(=O)cc(C(=O)O)n(C...,371.345,2.47670,2,6,True
2,649,0,0,0,0,DB00876,CCCCc1ncc(C=C(Cc2cccs2)C(=O)O)n1Cc1ccc(C(=O)O)cc1,424.522,4.74440,2,5,True
3,843,0,0,0,0,DB01097,Cc1oncc1C(=O)Nc1ccc(C(F)(F)F)cc1,270.210,3.25412,1,3,True
4,1045,0,0,0,0,DB01393,CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,361.825,3.55450,2,3,True
...,...,...,...,...,...,...,...,...,...,...,...,...
58,9510,0,0,0,0,DB16734,O=C(O)CCn1ccc(-c2cc(C(F)(F)F)cc(C(F)(F)F)c2)n1,352.234,4.06240,1,3,True
59,9651,0,0,0,0,DB17029,Cn1c2ccccc2c2c3c(c4c5ccccc5n(CCC#N)c4c21)CNC3=O,378.435,4.59648,1,4,True
60,9862,0,0,0,0,DB17930,COc1nc2c(c(=O)n1C)n(C)c(=O)n2C,224.220,-1.02070,0,7,True
61,9933,0,0,0,0,DB18080,Cc1nc2ccc(F)cc2n1-c1nc(N)c(F)c(Nc2ccc(C(F)(F)F...,420.345,4.74672,2,6,True


In [138]:
a = active_mols[active_mols["SMILES"] == "Cc1oncc1C(=O)Nc1ccc(C(F)(F)F)cc1"]

In [139]:
b = active_mols[active_mols["SMILES"] == "CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O"]

In [140]:
c = active_mols[active_mols["SMILES"] == "CCC(Cc1ccc(OC)c(CNC(=O)c2ccc(C(F)(F)F)cc2F)c1)C(=O)O"]

In [143]:
abc = pd.concat([a,b,c], axis = 0)

In [146]:
abc

Unnamed: 0,KNN,SVM,XGBoost,RandomForest,DrugBank ID,SMILES,Molecular_Weight,LogP,Hydrogen_Bond_Donors,Hydrogen_Bond_Acceptors,Passes_Lipinski_Rule
3,0,0,0,0,DB01097,Cc1oncc1C(=O)Nc1ccc(C(F)(F)F)cc1,270.21,3.25412,1,3,True
4,0,0,0,0,DB01393,CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,361.825,3.5545,2,3,True
21,0,0,0,0,DB07070,CCC(Cc1ccc(OC)c(CNC(=O)c2ccc(C(F)(F)F)cc2F)c1)...,427.394,4.4364,2,3,True


In [145]:
abc.drop(columns=['index'], inplace=True)

In [147]:
abc.to_csv("mols_2_doc.csv", index = False)