In [1]:
# necessary imports
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Crippen
import glob
import pandas as pd
import os
import re
import json
from bs4 import BeautifulSoup

In [2]:
def get_src_from_image_tag(html):
    soup = BeautifulSoup(html, "html.parser")
    return soup.img['src']

In [3]:
# This works for folders of Shamim Afrose and Rohon

def get_dict_1(path):
    smiles_dict_1 = dict()
    i = 0
    for file in glob.iglob('{}/**//*.sdf'.format(path), recursive=True): # data is saved in subdirectories of current directory
        sdf = Chem.SDMolSupplier(file) # read sdf
        for mol in sdf:
            smiles = Chem.MolToSmiles(mol) # get smiles
            if smiles_dict_1.get(smiles, -1) == -1:
                i += 1
                smiles_dict_1[smiles] = {
                    'molecular_formula': Chem.rdMolDescriptors.CalcMolFormula(mol), # formula
                    'molecular_weight': Chem.rdMolDescriptors.CalcExactMolWt(mol), # weight
                    'hba': Chem.rdMolDescriptors.CalcNumHBA(mol), # h bond acceptor
                    'hbd': Chem.rdMolDescriptors.CalcNumHBD(mol), # h bond donor
                    'molar_refractivity': Chem.Crippen.MolMR(mol), # molar refractivity
                    'tpsa': Chem.rdMolDescriptors.CalcTPSA(mol), # tpsa
                    'logp': Chem.Crippen.MolLogP(mol), # logP
                    'romol': get_src_from_image_tag(str(mol)) # image of the molecule
                }
                parent = os.path.split(os.path.dirname(file))[-1]
                smiles_dict_1[smiles]['plant'] = [re.search('(\w+ \w+)', parent).group(0)]
            else:
                parent = os.path.split(os.path.dirname(file))[-1]
                plant = re.search('(\w+ \w+)', parent).group(0)
                if plant not in smiles_dict_1[smiles]['plant']:
                    smiles_dict_1[smiles]['plant'].append(plant)
            if i % 100 == 0:
                print('{} plants processed'.format(i))
    return smiles_dict_1

In [4]:
# This works for folders of Suvro and Mita

def get_dict_2(path):
    smiles_dict_2 = dict()
    i = 0
    for file in glob.iglob('{}/**//*.sdf'.format(path), recursive=True): # data is saved in subdirectories of current directory
        sdf = Chem.SDMolSupplier(file) # read sdf
        for mol in sdf:
            smiles = Chem.MolToSmiles(mol) # get smiles
            if smiles_dict_2.get(smiles, -1) == -1:
                i += 1
                smiles_dict_2[smiles] = {
                    'molecular_formula': Chem.rdMolDescriptors.CalcMolFormula(mol), # formula
                    'molecular_weight': Chem.rdMolDescriptors.CalcExactMolWt(mol), # weight
                    'hba': Chem.rdMolDescriptors.CalcNumHBA(mol), # h bond acceptor
                    'hbd': Chem.rdMolDescriptors.CalcNumHBD(mol), # h bond donor
                    'molar_refractivity': Chem.Crippen.MolMR(mol), # molar refractivity
                    'tpsa': Chem.rdMolDescriptors.CalcTPSA(mol), # tpsa
                    'logp': Chem.Crippen.MolLogP(mol), # logP
                    'romol': get_src_from_image_tag(str(mol)) # image of the molecule
                }
                parent = os.path.dirname(file).split(os.sep)[-2]
                smiles_dict_2[smiles]['plant'] = [re.search('(\w+ \w+)', parent).group(0)]
            else:
                parent = os.path.dirname(file).split(os.sep)[-2]
                plant = re.search('(\w+ \w+)', parent).group(0)
                if plant not in smiles_dict_2[smiles]['plant']:
                    smiles_dict_2[smiles]['plant'].append(plant)
            if i % 100 == 0:
                print('{} plants processed'.format(i))
    return smiles_dict_2

In [5]:
with open('data/data1.json', 'w') as outfile:
    smiles_dict_1 = get_dict_1('data/Data1')
    json.dump(smiles_dict_1, outfile, indent=4)

100 plants processed
200 plants processed
300 plants processed
400 plants processed
500 plants processed
600 plants processed
700 plants processed
800 plants processed
800 plants processed
800 plants processed
800 plants processed
800 plants processed
800 plants processed
800 plants processed
800 plants processed
900 plants processed
900 plants processed
1000 plants processed
1100 plants processed
1100 plants processed
1100 plants processed
1100 plants processed
1200 plants processed
1200 plants processed
1200 plants processed
1200 plants processed
1200 plants processed
1300 plants processed
1400 plants processed
1500 plants processed
1600 plants processed
1700 plants processed
1800 plants processed
1900 plants processed
1900 plants processed
2000 plants processed
2000 plants processed
2000 plants processed
2000 plants processed
2000 plants processed
2100 plants processed
2200 plants processed
2200 plants processed


In [6]:
with open('data/data2.json', 'w') as outfile:
    smiles_dict_2 = get_dict_2('data/Data2')
    json.dump(smiles_dict_2, outfile, indent=4)

100 plants processed
200 plants processed
300 plants processed
400 plants processed
400 plants processed
500 plants processed
600 plants processed
700 plants processed
800 plants processed
900 plants processed
1000 plants processed
1100 plants processed
1200 plants processed
1300 plants processed
1400 plants processed
1400 plants processed
1400 plants processed
1400 plants processed
1400 plants processed
1500 plants processed
1600 plants processed
1700 plants processed
1800 plants processed
1900 plants processed
2000 plants processed
2100 plants processed
2200 plants processed
2200 plants processed
2200 plants processed
2200 plants processed
2200 plants processed
2300 plants processed
2400 plants processed
2500 plants processed
2500 plants processed
2600 plants processed
2700 plants processed
2700 plants processed
2800 plants processed
2800 plants processed
2900 plants processed
3000 plants processed




3100 plants processed
3100 plants processed
3100 plants processed
3100 plants processed
3100 plants processed
3100 plants processed
3100 plants processed
3100 plants processed
3100 plants processed
3200 plants processed
