In [1]:
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem


### Open the molecule file created in part 1

In [2]:
df = pd.read_csv('molecule_pIC50.csv', index_col = [0])
df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
8,CHEMBL217524,O=C(Nc1ccc(F)c(Cl)c1)C1CC(=O)N(C2CCCCC2)C1,4.828859,0
9,CHEMBL384149,COc1ccc(Cl)cc1NC(=O)C1CC(=O)N(C2CCCCC2)C1,5.79588,0
10,CHEMBL216339,O=C(Nc1cc(C(F)(F)F)cc(C(F)(F)F)c1)C1CC(=O)N(C2...,5.435334,0
13,CHEMBL216807,O=C(Nc1cc(Cl)cc(Cl)c1)C1CC(=O)N(C2CCCCC2)C1,6.408935,1
15,CHEMBL217617,Cc1cc([N+](=O)[O-])ccc1NC(=O)C1CC(=O)N(C2CCCCC...,4.503485,0


### The morgan fingerprints will be calculated for each canonical smiles and then put together in a dataframe

In [3]:
def _compute_single_fp_descriptor(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception as E:
        return None

    if mol:
        fp = Chem.AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        return np.array(fp)
    
    return None
    
#Computes ecfp descriptors  
def compute_fp_descriptors(smiles_list):
    
    idx_to_keep = list()
    descriptors = list()
    for i, smiles in enumerate(smiles_list):
        fp = _compute_single_fp_descriptor(smiles)
        if fp is not None:
            idx_to_keep.append(i)
            descriptors.append(fp)

    return np.vstack(descriptors), idx_to_keep

In [4]:
smiles_list = np.array(df.canonical_smiles)

In [5]:
descriptors, idx = compute_fp_descriptors(smiles_list)

In [6]:
df_descriptors = pd.DataFrame(descriptors)

In [7]:
df_descriptors.shape

(282, 2048)

In [8]:
df_descriptors.to_csv('descriptors_molecule.csv')

In [9]:
df_descriptors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
278,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
279,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
