#### This notebook converts the SMILES strings of the non-blinded compounds in the smiles.pdf file to an SDF file using RDKit.
#### The resulting SDF file is saved as sdf/train_and_test_molecules.sdf.
#### The SDF file is then passed to PaDEL (separate software) to generate topological molecular descriptors for the training and test compounds.
#### The resulting molecular descriptors file is available in molecular_descriptors_csv/molecular_descriptors_with_pIC50_values.csv

In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import tabula
import random
import itertools as it
import sys
import numpy as np
from rdkit.Chem import PandasTools

In [2]:
pdf_path = "smiles.pdf"
# Extract the compound table from every page of the PDF.
# NOTE(review): the filtering below indexes `data_df` by column name, so it
# expects read_pdf to hand back a single DataFrame; some tabula-py releases
# reportedly return a list of DataFrames instead -- confirm against the
# installed version.
data_df = tabula.read_pdf(pdf_path, pages="all")
# Despite its name, `blinded_data` keeps the rows that are NOT blinded, i.e.
# every row whose pIC50 cell is something other than the literal 'BLINDED'.
blinded_data = data_df.loc[data_df['pIC50 (IC50 in microM)'].ne('BLINDED')]

In [3]:
# Smiles to sdf
# Module-level list that create_sdf() appends every parsed RDKit molecule to.
mols = []
def create_sdf(sdf_file_path):
    """Convert the SMILES in ``blinded_data`` to molecules and write an SDF file.

    For every row of the module-level ``blinded_data`` frame the ``SMILES``
    value is parsed with RDKit, the molecule's ``_Name`` property is set to
    that SMILES string, and the molecule is both appended to the module-level
    ``mols`` list and written to ``sdf_file_path``.

    Rows whose SMILES value is missing (not a string) or cannot be parsed by
    RDKit are reported on stderr and skipped instead of aborting the run.

    Only the molecules converted by *this* call are written, so calling the
    function again (e.g. re-running the notebook cell) does not duplicate
    earlier molecules in the output file.

    Args:
        sdf_file_path: Path of the SDF file to create.

    Returns:
        None.
    """
    w = Chem.SDWriter(sdf_file_path)
    try:
        for index, row in blinded_data.iterrows():
            smiles = row['SMILES']
            # MolFromSmiles returns None (rather than raising) when the
            # string is not valid SMILES; non-string cells such as NaN are
            # treated the same way so one bad row cannot stop the conversion.
            m = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
            if m is None:
                print(
                    "Skipping row {}: cannot parse SMILES {!r}".format(index, smiles),
                    file=sys.stderr,
                )
                continue
            m.SetProp("_Name", smiles)
            mols.append(m)
            w.write(m)
    finally:
        # Always close the writer so buffered records are flushed to disk.
        w.close()
# Save sdf file in sdf/train_and_test_molecules.sdf
# NOTE(review): presumably the sdf/ directory must already exist -- confirm.
sdf_file_path = 'sdf/train_and_test_molecules.sdf'
# Run the conversion: reads `blinded_data` and writes the SDF file at the path above.
create_sdf(sdf_file_path)