In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from rdkit.Chem import PandasTools
from rdkit import rdBase
from rdkit import Chem
from rdkit.Chem.rdmolfiles import SmilesWriter
from pathlib import Path
from rdkit.Chem import Descriptors, Draw, PandasTools
from warnings import filterwarnings
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from FP import *

In [2]:
# Locate the input folders relative to this notebook.
# `_dh` is supplied by IPython; its last entry is taken as the notebook directory.
HERE = Path(_dh[-1])
_INPUT_ROOT = Path(HERE).resolve().parents[0]  # parent of the notebook directory

HDAC6 = _INPUT_ROOT / "BioactivityModels/HDAC6/input"
HDAC1 = _INPUT_ROOT / "BioactivityModels/HDAC1/input"
HDAC6vs1 = _INPUT_ROOT / "SelectivityModels/input"

In [3]:
def calculate_ro5_properties(smiles):
    """
    Calculate simple (1D) physicochemical descriptors for a molecule.

    Despite its name, this function does not evaluate Lipinski's rule of
    five; it only returns the descriptor values listed below.

    Parameters
    ----------
    smiles : str
        SMILES for a molecule.

    Returns
    -------
    pandas.Series
        Descriptor values indexed by ``molecular_weight`` (computed with
        ``Descriptors.ExactMolWt``), ``n_rot`` (rotatable bonds),
        ``n_heavy`` (heavy atoms), ``n_hba`` / ``n_hbd`` (hydrogen bond
        acceptors / donors), ``logp``, ``num_ar`` (aromatic rings),
        ``num_sa`` (saturated rings) and ``num_alip`` (aliphatic rings).

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    # RDKit molecule from SMILES; MolFromSmiles returns None when parsing fails
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # Fail with a message that names the offending input instead of the
        # opaque argument error the descriptor functions would raise on None.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    # Key order defines the index order of the returned Series
    descriptors = {
        "molecular_weight": Descriptors.ExactMolWt(molecule),
        "n_rot": Descriptors.NumRotatableBonds(molecule),
        "n_heavy": Descriptors.HeavyAtomCount(molecule),
        "n_hba": Descriptors.NumHAcceptors(molecule),
        "n_hbd": Descriptors.NumHDonors(molecule),
        "logp": Descriptors.MolLogP(molecule),
        "num_ar": Descriptors.NumAromaticRings(molecule),
        "num_sa": Descriptors.NumSaturatedRings(molecule),
        "num_alip": Descriptors.NumAliphaticRings(molecule),
    }
    return pd.Series(descriptors)

#### Generate the fingerprints for the training/test set as well as external test set.
Use different types of fingerprints (FPs) and develop models with each of them for testing.

#### for HDAC6 dataset

In [10]:
# --- HDAC6: 1024-bit fingerprints and MACCS keys for all compounds ---
df6All = pd.read_csv(HDAC6 / "HDAC6_dataset.csv")
smiles_hdac6 = df6All["smiles"]

# `assign` returns a new frame, so df6All itself stays untouched.
compound_df6All = df6All.assign(
    fp_MACCS=smiles_hdac6.apply(smiles_to_maccs),
    fp_Morgan3=smiles_hdac6.apply(smiles_to_morgan3),
    fp_MorganF=smiles_hdac6.apply(smiles_to_morganF),
    fp_MAP4=smiles_hdac6.apply(smiles_to_MAP4),
)

# 1D descriptors (one row per compound), joined column-wise on the index.
oneD6All = smiles_hdac6.apply(calculate_ro5_properties)
compound_df6AllComb = pd.concat([compound_df6All, oneD6All], axis=1)

# Re-position the target column at index 14.
# NOTE(review): unlike the other dataset cells, no column subset is taken here,
# so every column of the input CSV is kept and index 14 is not necessarily the
# last position -- confirm this is intended.
column_to_move = compound_df6AllComb.pop("pChEMBL_HDAC6")
compound_df6AllComb.insert(14, "pChEMBL_HDAC6", column_to_move)

# NOTE: written with to_pickle, so the file is a pickle despite its ".csv" suffix.
compound_df6AllComb.to_pickle(HDAC6 / "HDAC6_1024B.csv")
compound_df6AllComb.shape

(2971, 19)

In [13]:
# --- HDAC6: 2048-bit fingerprints for all compounds ---
# Relies on `df6All` and `oneD6All` created in the HDAC6 1024-bit cell.
smiles_hdac6_2048 = df6All["smiles"]
compound_df6All2 = df6All.assign(
    fp_Morgan3=smiles_hdac6_2048.apply(smiles_to_morgan3_2048),
    fp_MorganF=smiles_hdac6_2048.apply(smiles_to_morganF_2048),
    fp_MAP4=smiles_hdac6_2048.apply(smiles_to_MAP4_2048),
)[["molecule_chembl_id", "fp_Morgan3", "fp_MorganF", "fp_MAP4", "pChEMBL_HDAC6"]]

# Combine with the 1D descriptors (aligned on the shared row index).
compound_df6All2Comb = pd.concat([compound_df6All2, oneD6All], axis=1)

# Move the target to the last column; the position is computed instead of
# hard-coded so it stays correct if the number of descriptor columns changes.
column_to_move = compound_df6All2Comb.pop("pChEMBL_HDAC6")
compound_df6All2Comb.insert(
    len(compound_df6All2Comb.columns), "pChEMBL_HDAC6", column_to_move
)

# NOTE: written with to_pickle, so the file is a pickle despite its ".csv" suffix.
compound_df6All2Comb.to_pickle(HDAC6 / "HDAC6_2048B.csv")
compound_df6All2Comb.shape

(2971, 14)

#### for HDAC1 dataset

In [14]:
# --- HDAC1: 1024-bit fingerprints and MACCS keys ---
df1All = pd.read_csv(HDAC1 / "HDAC1_dataset.csv")
smiles_hdac1 = df1All["smiles"]

# Add the fingerprint columns on a copy of the raw frame, then keep only the
# identifier, the fingerprints and the target.
compound_df1All = df1All.assign(
    fp_MACCS=smiles_hdac1.apply(smiles_to_maccs),
    fp_Morgan3=smiles_hdac1.apply(smiles_to_morgan3),
    fp_MorganF=smiles_hdac1.apply(smiles_to_morganF),
    fp_MAP4=smiles_hdac1.apply(smiles_to_MAP4),
)[["molecule_chembl_id", "fp_MACCS", "fp_Morgan3", "fp_MorganF", "fp_MAP4", "pChEMBL_HDAC1"]]

# Combine with the 1D descriptors (aligned on the shared row index).
oneD1All = smiles_hdac1.apply(calculate_ro5_properties)
compound_df1AllComb = pd.concat([compound_df1All, oneD1All], axis=1)

# Move the target to the last column; the position is computed instead of
# hard-coded so it stays correct if the number of descriptor columns changes.
column_to_move = compound_df1AllComb.pop("pChEMBL_HDAC1")
compound_df1AllComb.insert(
    len(compound_df1AllComb.columns), "pChEMBL_HDAC1", column_to_move
)

# NOTE: written with to_pickle, so the file is a pickle despite its ".csv" suffix.
compound_df1AllComb.to_pickle(HDAC1 / "HDAC1_1024B.csv")
compound_df1AllComb.shape

(4492, 15)

In [16]:
# --- HDAC1: 2048-bit fingerprints for the whole dataset ---
# Relies on `df1All` and `oneD1All` created in the HDAC1 1024-bit cell.
smiles_hdac1_2048 = df1All["smiles"]
compound_df1All2 = df1All.assign(
    fp_Morgan3=smiles_hdac1_2048.apply(smiles_to_morgan3_2048),
    fp_MorganF=smiles_hdac1_2048.apply(smiles_to_morganF_2048),
    fp_MAP4=smiles_hdac1_2048.apply(smiles_to_MAP4_2048),
)[["molecule_chembl_id", "fp_Morgan3", "fp_MorganF", "fp_MAP4", "pChEMBL_HDAC1"]]

# Combine with the 1D descriptors (aligned on the shared row index).
compound_df1All2Comb = pd.concat([compound_df1All2, oneD1All], axis=1)

# 5 selected columns + 9 descriptor columns - 1 popped = 13 remaining,
# so inserting at 13 puts the target last.
column_to_move = compound_df1All2Comb.pop("pChEMBL_HDAC1")
compound_df1All2Comb.insert(13, "pChEMBL_HDAC1", column_to_move)

# NOTE: written with to_pickle, so the file is a pickle despite its ".csv" suffix.
compound_df1All2Comb.to_pickle(HDAC1 / "HDAC1_2048B.csv")
compound_df1All2Comb.head(5)

Unnamed: 0,molecule_chembl_id,fp_Morgan3,fp_MorganF,fp_MAP4,molecular_weight,n_rot,n_heavy,n_hba,n_hbd,logp,num_ar,num_sa,num_alip,pChEMBL_HDAC1
0,CHEMBL327146,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...","[5547132, 3346815, 526260, 7125875, 7332227, 1...",485.206304,10.0,36.0,6.0,4.0,4.4323,4.0,0.0,0.0,9.0
1,CHEMBL116620,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[2593493, 2349897, 526260, 13440577, 19258323,...",415.153206,7.0,31.0,4.0,4.0,3.5662,3.0,0.0,0.0,9.0
2,CHEMBL2093007,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5090223, 4497230, 396665, 4659087, 418066, 30...",569.259328,8.0,38.0,8.0,3.0,2.8512,0.0,1.0,1.0,5.2
3,CHEMBL316457,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[9076848, 10234695, 1615018, 4730798, 2140073,...",444.208279,8.0,31.0,4.0,3.0,5.3762,2.0,0.0,0.0,6.22
4,CHEMBL269692,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5976924, 29775619, 8033062, 11956864, 7698299...",301.106256,5.0,22.0,5.0,3.0,1.627,2.0,0.0,0.0,5.52


#### for HDAC1/6 dataset

In [17]:
# --- HDAC1/6 (with semi-selective compounds): 1024-bit fingerprints and MACCS keys ---
df5 = pd.read_csv(HDAC6vs1 / "HDAC1and6_SemiSel_dataset.csv")
smiles_semisel = df5["smiles"]

# Fingerprints are added on a copy of df5; only the identifier, the
# fingerprints and the target are kept afterwards.
compound_df9 = df5.assign(
    fp_MACCS=smiles_semisel.apply(smiles_to_maccs),
    fp_Morgan3=smiles_semisel.apply(smiles_to_morgan3),
    fp_MorganF=smiles_semisel.apply(smiles_to_morganF),
    fp_MAP4=smiles_semisel.apply(smiles_to_MAP4),
)[["molecule_chembl_id", "fp_MACCS", "fp_Morgan3", "fp_MorganF", "fp_MAP4", "SelectivityWindow"]]

# Combine with the 1D descriptors (aligned on the shared row index).
oneD3 = smiles_semisel.apply(calculate_ro5_properties)
compound_df9Comb = pd.concat([compound_df9, oneD3], axis=1)

# 6 selected columns + 9 descriptor columns - 1 popped = 14 remaining,
# so inserting at 14 puts the target last.
column_to_move = compound_df9Comb.pop("SelectivityWindow")
compound_df9Comb.insert(14, "SelectivityWindow", column_to_move)

# NOTE: written with to_pickle, so the file is a pickle despite its ".csv" suffix.
compound_df9Comb.to_pickle(HDAC6vs1 / "HDAC1and6_SemiSel_1024B.csv")
compound_df9Comb.head(5)

Unnamed: 0,molecule_chembl_id,fp_MACCS,fp_Morgan3,fp_MorganF,fp_MAP4,molecular_weight,n_rot,n_heavy,n_hba,n_hbd,logp,num_ar,num_sa,num_alip,SelectivityWindow
0,CHEMBL4098975,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, ...","[2770663, 83582, 137380, 5467685, 28861, 20399...",504.259737,14.0,37.0,8.0,5.0,4.1081,4.0,0.0,0.0,2.89
1,CHEMBL3912061,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5976924, 3913535, 4142580, 576166, 5215404, 6...",387.125277,7.0,27.0,5.0,3.0,1.9003,3.0,0.0,0.0,3.56
2,CHEMBL4243347,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, ...","[6411541, 3821889, 137380, 4236434, 7332227, 2...",462.212787,11.0,34.0,7.0,6.0,3.8761,4.0,0.0,0.0,1.0
3,CHEMBL4247128,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, ...","[3808044, 3821889, 137380, 2385390, 6940687, 2...",516.259737,14.0,38.0,8.0,5.0,4.9256,4.0,0.0,0.0,2.52
4,CHEMBL4126811,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...","[4928228, 22758228, 8033062, 10725227, 1087409...",452.169584,6.0,33.0,7.0,4.0,3.7376,3.0,0.0,0.0,3.12


In [18]:
# --- HDAC1/6 (with semi-selective compounds): 2048-bit fingerprints ---
# Relies on `df5` and `oneD3` created in the semi-selective 1024-bit cell.
smiles_semisel_2048 = df5["smiles"]
compound_df10 = df5.assign(
    fp_Morgan3=smiles_semisel_2048.apply(smiles_to_morgan3_2048),
    fp_MorganF=smiles_semisel_2048.apply(smiles_to_morganF_2048),
    fp_MAP4=smiles_semisel_2048.apply(smiles_to_MAP4_2048),
)[["molecule_chembl_id", "fp_Morgan3", "fp_MorganF", "fp_MAP4", "SelectivityWindow"]]

# Combine with the 1D descriptors (aligned on the shared row index).
compound_df10Comb = pd.concat([compound_df10, oneD3], axis=1)

# Move the target to the last column; the position is computed instead of
# hard-coded so it stays correct if the number of descriptor columns changes.
column_to_move = compound_df10Comb.pop("SelectivityWindow")
compound_df10Comb.insert(
    len(compound_df10Comb.columns), "SelectivityWindow", column_to_move
)

# NOTE: written with to_pickle, so the file is a pickle despite its ".csv" suffix.
compound_df10Comb.to_pickle(HDAC6vs1 / "HDAC1and6_SemiSel_2048B.csv")
compound_df10Comb.head(5)

Unnamed: 0,molecule_chembl_id,fp_Morgan3,fp_MorganF,fp_MAP4,molecular_weight,n_rot,n_heavy,n_hba,n_hbd,logp,num_ar,num_sa,num_alip,SelectivityWindow
0,CHEMBL4098975,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...","[2770663, 83582, 137380, 5467685, 28861, 20399...",504.259737,14.0,37.0,8.0,5.0,4.1081,4.0,0.0,0.0,2.89
1,CHEMBL3912061,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5976924, 3913535, 4142580, 576166, 5215404, 6...",387.125277,7.0,27.0,5.0,3.0,1.9003,3.0,0.0,0.0,3.56
2,CHEMBL4243347,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...","[6411541, 3821889, 137380, 4236434, 7332227, 2...",462.212787,11.0,34.0,7.0,6.0,3.8761,4.0,0.0,0.0,1.0
3,CHEMBL4247128,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...","[3808044, 3821889, 137380, 2385390, 6940687, 2...",516.259737,14.0,38.0,8.0,5.0,4.9256,4.0,0.0,0.0,2.52
4,CHEMBL4126811,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4928228, 22758228, 8033062, 10725227, 1087409...",452.169584,6.0,33.0,7.0,4.0,3.7376,3.0,0.0,0.0,3.12


#### HDAC1/6 set without Semi-selective compounds

In [19]:
# --- HDAC1/6 (without semi-selective compounds): 1024-bit fingerprints and MACCS keys ---
df7 = pd.read_csv(HDAC6vs1 / "HDAC1and6_dataset.csv")
smiles_hdac1and6 = df7["smiles"]

# Fingerprints are added on a copy of df7; only the identifier, the
# fingerprints and the target are kept afterwards.
compound_df12 = df7.assign(
    fp_MACCS=smiles_hdac1and6.apply(smiles_to_maccs),
    fp_Morgan3=smiles_hdac1and6.apply(smiles_to_morgan3),
    fp_MorganF=smiles_hdac1and6.apply(smiles_to_morganF),
    fp_MAP4=smiles_hdac1and6.apply(smiles_to_MAP4),
)[["molecule_chembl_id", "fp_MACCS", "fp_Morgan3", "fp_MorganF", "fp_MAP4", "SelectivityWindow"]]

# Combine with the 1D descriptors (aligned on the shared row index).
oneD5 = smiles_hdac1and6.apply(calculate_ro5_properties)
compound_df12Comb = pd.concat([compound_df12, oneD5], axis=1)

# 6 selected columns + 9 descriptor columns - 1 popped = 14 remaining,
# so inserting at 14 puts the target last.
column_to_move = compound_df12Comb.pop("SelectivityWindow")
compound_df12Comb.insert(14, "SelectivityWindow", column_to_move)

# NOTE: written with to_pickle, so the file is a pickle despite its ".csv" suffix.
compound_df12Comb.to_pickle(HDAC6vs1 / "HDAC1and6_1024B.csv")
compound_df12Comb.head(5)

Unnamed: 0,molecule_chembl_id,fp_MACCS,fp_Morgan3,fp_MorganF,fp_MAP4,molecular_weight,n_rot,n_heavy,n_hba,n_hbd,logp,num_ar,num_sa,num_alip,SelectivityWindow
0,CHEMBL4098975,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, ...","[2770663, 83582, 137380, 5467685, 28861, 20399...",504.259737,14.0,37.0,8.0,5.0,4.1081,4.0,0.0,0.0,2.89
1,CHEMBL3912061,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5976924, 3913535, 4142580, 576166, 5215404, 6...",387.125277,7.0,27.0,5.0,3.0,1.9003,3.0,0.0,0.0,3.56
2,CHEMBL4243347,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, ...","[6411541, 3821889, 137380, 4236434, 7332227, 2...",462.212787,11.0,34.0,7.0,6.0,3.8761,4.0,0.0,0.0,1.0
3,CHEMBL4247128,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, ...","[3808044, 3821889, 137380, 2385390, 6940687, 2...",516.259737,14.0,38.0,8.0,5.0,4.9256,4.0,0.0,0.0,2.52
4,CHEMBL4126811,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...","[4928228, 22758228, 8033062, 10725227, 1087409...",452.169584,6.0,33.0,7.0,4.0,3.7376,3.0,0.0,0.0,3.12


In [20]:
# --- HDAC1/6 (without semi-selective compounds): 2048-bit fingerprints ---
# Relies on `df7` and `oneD5` created in the HDAC1/6 1024-bit cell.
smiles_hdac1and6_2048 = df7["smiles"]
compound_df13 = df7.assign(
    fp_Morgan3=smiles_hdac1and6_2048.apply(smiles_to_morgan3_2048),
    fp_MorganF=smiles_hdac1and6_2048.apply(smiles_to_morganF_2048),
    fp_MAP4=smiles_hdac1and6_2048.apply(smiles_to_MAP4_2048),
)[["molecule_chembl_id", "fp_Morgan3", "fp_MorganF", "fp_MAP4", "SelectivityWindow"]]

# Combine with the 1D descriptors (aligned on the shared row index).
compound_df13Comb = pd.concat([compound_df13, oneD5], axis=1)

# 5 selected columns + 9 descriptor columns - 1 popped = 13 remaining,
# so inserting at 13 puts the target last.
column_to_move = compound_df13Comb.pop("SelectivityWindow")
compound_df13Comb.insert(13, "SelectivityWindow", column_to_move)

# NOTE: written with to_pickle, so the file is a pickle despite its ".csv" suffix.
compound_df13Comb.to_pickle(HDAC6vs1 / "HDAC1and6_2048B.csv")
compound_df13Comb.head(5)

Unnamed: 0,molecule_chembl_id,fp_Morgan3,fp_MorganF,fp_MAP4,molecular_weight,n_rot,n_heavy,n_hba,n_hbd,logp,num_ar,num_sa,num_alip,SelectivityWindow
0,CHEMBL4098975,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...","[2770663, 83582, 137380, 5467685, 28861, 20399...",504.259737,14.0,37.0,8.0,5.0,4.1081,4.0,0.0,0.0,2.89
1,CHEMBL3912061,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5976924, 3913535, 4142580, 576166, 5215404, 6...",387.125277,7.0,27.0,5.0,3.0,1.9003,3.0,0.0,0.0,3.56
2,CHEMBL4243347,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...","[6411541, 3821889, 137380, 4236434, 7332227, 2...",462.212787,11.0,34.0,7.0,6.0,3.8761,4.0,0.0,0.0,1.0
3,CHEMBL4247128,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...","[3808044, 3821889, 137380, 2385390, 6940687, 2...",516.259737,14.0,38.0,8.0,5.0,4.9256,4.0,0.0,0.0,2.52
4,CHEMBL4126811,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4928228, 22758228, 8033062, 10725227, 1087409...",452.169584,6.0,33.0,7.0,4.0,3.7376,3.0,0.0,0.0,3.12


#### External set molecules

In [21]:
# Load the external set and display a working copy of it
# (no fingerprints are computed in this cell).
dfext = pd.read_csv(HDAC6vs1 / "HDAC1and6_external_420Mols.csv")
exterLabel = dfext.copy()

exterLabel

Unnamed: 0,molecule_chembl_id,smiles,type_HDAC1,pChEMBL_value_HDAC1,standard_value_HDAC1,type_HDAC6,pChEMBL_value_HDAC6,standard_value_HDAC6,document_chembl_id_hdac1,SelectivityWindow,label
0,CHEMBL4442817,CCCCCCCOC(=O)Cc1ccc(NC(=O)CCCCCCC(=O)NO)cc1,IC50,6.508638,310.0,IC50,7.283997,52.0,CHEMBL4304814,0.7754,Semi-selective
1,CHEMBL4455447,O=C(CCCCCCC(=O)Nc1ccc(CC(=O)OCCCC(F)CF)cc1)NO,IC50,7.136677,73.0,IC50,7.431798,37.0,CHEMBL4304814,0.2951,Dual-binder
2,CHEMBL4463358,O=C(CCCCCCC(=O)Nc1ccc(CC(=O)OCCCCCCCC(F)CF)cc1)NO,IC50,6.716699,192.0,IC50,7.455932,35.0,CHEMBL4304814,0.7392,Dual-binder
3,CHEMBL4463948,CCCCCCOC(=O)Cc1ccc(NC(=O)CCCCCCC(=O)NO)cc1,IC50,6.638272,230.0,IC50,7.408935,39.0,CHEMBL4304814,0.7707,Dual-binder
4,CHEMBL4465542,CCCCCCCCCOC(=O)Cc1ccc(NC(=O)CCCCCCC(=O)NO)cc1,IC50,6.047208,897.0,IC50,6.785156,164.0,CHEMBL4304814,0.7379,Semi-selective
...,...,...,...,...,...,...,...,...,...,...,...
415,CHEMBL5220434,Cn1c([C@@H]2C[C@@H](OCc3ccc(C(=O)NO)s3)CN2c2nc...,IC50,7.060000,88.0,IC50,7.890000,13.0,CHEMBL5214954,0.8300,Dual-binder
416,CHEMBL5220776,CC[C@H](Nc1nc(N)nc(N)c1C#N)c1nc2cccc(Cl)c2c(=O...,IC50,6.700000,198.0,IC50,7.350000,45.0,CHEMBL5214954,0.6500,Dual-binder
417,CHEMBL5220934,O=C(NO)c1ccc(Cn2c([C@@H]3CC4(CC4)CN3c3ncnc4[nH...,IC50,6.440000,360.0,IC50,7.550000,28.0,CHEMBL5214954,1.1100,Semi-selective
418,CHEMBL5221043,CC[C@H](Nc1ncnc2[nH]cnc12)c1nc2cccc(F)c2c(=O)n...,IC50,7.070000,85.0,IC50,6.920000,121.0,CHEMBL5214954,-0.1500,Dual-binder


In [22]:
# --- External set: 1024-bit fingerprints and MACCS keys for all compounds ---
# Relies on `dfext` loaded in the previous cell.
# `rename` returns a new frame, so dfext is not modified; a column named
# 'sel6', if present, becomes 'SelectivityWindow'.
exterLabel = dfext.rename(columns={"sel6": "SelectivityWindow"})
smiles_external = exterLabel["smiles"]

exterLabel = exterLabel.assign(
    fp_MACCS=smiles_external.apply(smiles_to_maccs),
    fp_Morgan3=smiles_external.apply(smiles_to_morgan3),
    fp_MorganF=smiles_external.apply(smiles_to_morganF),
    fp_MAP4=smiles_external.apply(smiles_to_MAP4),
)[["molecule_chembl_id", "fp_MACCS", "fp_Morgan3", "fp_MorganF", "fp_MAP4", "SelectivityWindow"]]

# No 1D descriptors are appended for the external set: the saved frame holds
# only the identifier, the four fingerprint columns and the target.
# NOTE: written with to_pickle, so the file is a pickle despite its ".csv" suffix.
exterLabel.to_pickle(HDAC6vs1 / "HDAC1and6_external_420Mols_1024B.csv")
exterLabel.shape

(420, 6)