In [1]:
import pandas as pd
from __future__ import print_function
import importlib.util
import math
import os
import sys
import numpy as np

from collections import defaultdict
from rdkit import Chem

# If you don't want to draw your molecule these aren't necessary
from rdkit.Chem import Draw
from rdkit.Chem.Draw import DrawingOptions
from rdkit.Chem.Draw import IPythonConsole

from rdkit.Chem import AllChem
from rdkit.Chem import RDConfig
sys.path.append(os.path.join(RDConfig.RDContribDir,'ChiralPairs'))
from ChiralDescriptors import determineAtomSubstituents

In [2]:
from rdkit import Chem
# Open the QM9 SD file as a molecule supplier.
# The path uses forward slashes, which are accepted on Windows as well as on
# POSIX systems; the previous backslash-separated path only resolved on Windows.
# Hydrogens are kept (removeHs=False) and sanitization is skipped
# (sanitize=False), exactly as before.
suppl = Chem.SDMolSupplier("data/qm9/raw_orig/gdb9.sdf", removeHs=False,
                           sanitize=False)

In [3]:
# Show how many records the SD file supplier holds.
len(suppl)

133885

In [6]:
# Column-oriented accumulator: one list of molecule names, one of weights.
mdic = dict(name=[], weight=[])

In [7]:
def getweight(mol):
    """Return the sum of the atomic numbers of all atoms in ``mol``.

    Note that this adds up atomic numbers (``GetAtomicNum``), not atomic
    masses, so the result is an integer and is 0 for a molecule with no atoms.
    """
    return sum(atom.GetAtomicNum() for atom in mol.GetAtoms())

In [8]:
# Record the name and the summed atomic numbers of every molecule.
for mol in suppl:
    # SDMolSupplier yields None for records it cannot parse; skip those
    # instead of failing with AttributeError on mol.GetProp.
    if mol is None:
        continue
    mdic["name"].append(mol.GetProp('_Name'))
    mdic["weight"].append(getweight(mol))


In [9]:
# Build a table with one column per key of mdic ("name" and "weight").
df = pd.DataFrame(mdic)

In [18]:
# Count how many molecule names share each weight and save the table as CSV.
(
    df.groupby("weight")["name"]
    .count()
    .to_frame("count")
    .reset_index()
    .to_csv("wc.csv")
)

In [4]:
# Index every parsed molecule by its '_Name' property for direct lookup.
# SDMolSupplier yields None for records it cannot parse; those are skipped
# instead of raising AttributeError on mol.GetProp. If two records share a
# name, the later one wins.
moldic = {mol.GetProp('_Name'): mol for mol in suppl if mol is not None}

In [5]:
def findmol(name,suppl):
    """Return the first molecule in ``suppl`` whose '_Name' property equals ``name``.

    Parameters:
        name: the molecule name to look for.
        suppl: an iterable of molecules (e.g. an SDMolSupplier). Entries that
            are None — which a supplier yields for records it could not
            parse — are skipped rather than raising AttributeError.

    Returns:
        The matching molecule, or None if no entry has that name.
    """
    for mol in suppl:
        if mol is None:
            # Unparsable record: nothing to compare against.
            continue
        if mol.GetProp('_Name') == name:
            return mol
    return None

In [12]:
# Load the table that the bond-count columns are added to in later cells.
df1=pd.read_csv("pvd-lumo.csv")
# NOTE(review): the df2 load is disabled, yet later cells still reference df2 — confirm where df2 should come from.
#df2=pd.read_csv("lumo.csv")

In [13]:
# Element-pair labels of the bond types that are counted, grouped by first element.
bonds_keys = [
    "CH", "CC", "CN", "CO", "CF",
    "NH", "NN", "NO", "NF",
    "OH", "OO", "OF",
    "FH", "FF",
]

In [14]:
def GetBonds(mol):
    """Count the bonds of ``mol`` by element pair.

    Each bond is labelled with the concatenated symbols of its begin and end
    atoms; if that label is not one of ``bonds_keys`` the reversed order is
    used instead.

    Returns:
        A dict with one entry per label in ``bonds_keys`` (0 for bond types
        that do not occur in ``mol``).

    Raises:
        KeyError: if neither ordering of a bond's symbols is in ``bonds_keys``.
    """
    counts = dict.fromkeys(bonds_keys, 0)
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtom().GetSymbol()
        end = bond.GetEndAtom().GetSymbol()
        label = begin + end
        if label not in counts:
            # Try the pair the other way round, e.g. "HC" -> "CH".
            label = end + begin
        counts[label] += 1
    return counts

In [15]:
# Give df1 one zero-filled column per bond type.
for bond_key in bonds_keys:
    df1[bond_key] = 0

In [16]:
# Fill the bond-count columns of df1 from the molecule named in each row.
#
# The counts are collected as one record per row and assigned in a single
# step. This replaces a loop that wrote every row separately through
# df1.loc[i, ...], which is slow for a table of this size and used the loop
# counter as an index label (only correct for a default 0..n-1 index).
# GetBonds already returns an entry for every label in bonds_keys, so no
# intermediate zero-filled dict is needed.
names = df1["name"]
bond_rows = [GetBonds(moldic[name]) for name in names]  # KeyError if a name is not in moldic
bond_counts = pd.DataFrame(bond_rows, columns=bonds_keys, index=df1.index)
df1[bonds_keys] = bond_counts


In [17]:
# Remove the "Unnamed: 0" column (presumably a saved index picked up by
# read_csv — confirm). errors="ignore" keeps the cell safe to re-run and
# tolerant of input files without that column; previously a second run
# raised KeyError because the column was already gone.
df1 = df1.drop(columns="Unnamed: 0", errors="ignore")

In [18]:
# Save the table, now including the bond-count columns.
# NOTE(review): the index is written as well (no index=False), which presumably
# produces an "Unnamed: 0" column when the file is read back — confirm that is intended.
df1.to_csv("pvd-lumo-bonds.csv")

In [12]:
# One integer per entry of names: 0, 1, ..., len(names) - 1.
complexities = list(range(len(names)))

In [14]:
# Attach the complexities list as a "compl" column on both tables.
df1["compl"]=complexities
# NOTE(review): df2 is not defined in the cells shown here (its read_csv is commented out) — confirm its origin.
df2["compl"]=complexities

In [None]:
# Write both tables to CSV (the index is written too, since index=False is not passed).
df1.to_csv("homo.csv")
# NOTE(review): df2 is not defined in the cells shown here — this line fails on a fresh kernel unless df2 is created elsewhere.
df2.to_csv("lumo.csv")