In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [2]:
# import sys
# !{sys.executable} -m pip install csd-python-api

In [1]:
from rdkit.Chem import Draw

In [2]:
import pandas as pd
import numpy as np
import os

from sklearn import preprocessing
from rdkit.Chem import ChemicalFeatures
from rdkit import RDConfig

pd.set_option('display.max_columns', 500)

In [3]:
# Folder holding the competition CSVs, given relative to the notebook's working directory.
file_folder = '../../data/input'
# Last expression of the cell: displays the folder contents as a quick sanity check.
os.listdir(file_folder)

['sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'train.csv',
 'test.csv',
 'structures.csv',
 'structures']

In [4]:
# Read every CSV table found in `file_folder` into its own module-level DataFrame.
# Each name matches the file it was read from.
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')

In [5]:
class Struct(object):
    """Simple attribute bag: each keyword argument becomes an instance attribute.

    Args:
        is_copy: when anything other than ``False``, values are deep-copied
            before being stored; otherwise they are stored by reference.
        **kwargs: attribute names and their values.
    """

    def __init__(self, is_copy=False, **kwargs):
        self.add(is_copy, **kwargs)

    def add(self, is_copy=False, **kwargs):
        """Attach (or overwrite) attributes from ``kwargs``.

        Args:
            is_copy: same meaning as in the constructor.
            **kwargs: attribute names and their values.
        """
        # Imported locally on purpose: the notebook only imports `copy` in a
        # later cell, and a missing module used to raise NameError inside the
        # `try` below, which was swallowed and silently disabled copying.
        import copy

        # Keep the original `== False` comparison so that values such as None
        # still select the copying branch exactly as before.
        store_by_reference = (is_copy == False)  # noqa: E712

        for key, value in kwargs.items():
            if not store_by_reference:
                try:
                    value = copy.deepcopy(value)
                except Exception:
                    # Best effort: objects that cannot be deep-copied are
                    # stored by reference instead of failing.
                    pass
            setattr(self, key, value)

    def __str__(self):
        # Shows only the attribute names, not their (possibly large) values.
        return str(self.__dict__.keys())

In [6]:
def load_csv(DATA_DIR):
    """Load the structure and coupling tables and group both by molecule.

    Reads ``structures.csv``, ``train.csv``, ``test.csv`` and
    ``scalar_coupling_contributions.csv`` from ``DATA_DIR``. Test rows get a
    placeholder ``scalar_coupling_constant`` of 0, train and test rows are
    concatenated, and the contribution columns are left-joined onto them
    (rows without a contribution entry keep NaN in those columns).

    Args:
        DATA_DIR: directory containing the four CSV files.

    Returns:
        tuple: ``(gb_structure, gb_scalar_coupling)``, two pandas GroupBy
        objects keyed by ``molecule_name``.
    """
    # structure
    df_structure = pd.read_csv(DATA_DIR + '/structures.csv')

    # coupling
    df_train = pd.read_csv(DATA_DIR + '/train.csv')
    df_test = pd.read_csv(DATA_DIR + '/test.csv')
    df_test['scalar_coupling_constant'] = 0  # placeholder target for test rows
    df_scalar_coupling = pd.concat([df_train, df_test])
    df_scalar_coupling_contribution = pd.read_csv(DATA_DIR + '/scalar_coupling_contributions.csv')

    # Each join key is listed exactly once; the original list repeated
    # 'atom_index_0'.
    df_scalar_coupling = pd.merge(
        df_scalar_coupling,
        df_scalar_coupling_contribution,
        how='left',
        on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
    )

    gb_scalar_coupling = df_scalar_coupling.groupby('molecule_name')
    gb_structure = df_structure.groupby('molecule_name')

    return gb_structure, gb_scalar_coupling

In [7]:
##
# Written by Jan H. Jensen based on this paper Yeonjoon Kim and Woo Youn Kim 
# "Universal Structure Conversion Method for Organic Molecules: From Atomic Connectivity
# to Three-Dimensional Geometry" Bull. Korean Chem. Soc. 2015, Vol. 36, 1769-1777 DOI: 10.1002/bkcs.10334
#
from rdkit import Chem
from rdkit.Chem import AllChem
import itertools
from rdkit.Chem import rdmolops
from collections import defaultdict
import copy
import networkx as nx #uncomment if you don't want to use "quick"/install networkx


# Lower-case element symbols in order of atomic number (hydrogen .. plutonium),
# so that a symbol's list position + 1 is its atomic number.
__ATOM_LIST__ = (
    "h he "
    "li be b c n o f ne "
    "na mg al si p s cl ar "
    "k ca sc ti v cr mn fe co ni cu zn ga ge as se br kr "
    "rb sr y zr nb mo tc ru rh pd ag cd in sn sb te i xe "
    "cs ba la ce pr nd pm sm eu gd tb dy ho er tm yb lu hf ta w re os ir pt "
    "au hg tl pb bi po at rn "
    "fr ra ac th pa u np pu"
).split()


def get_atom(atom):
    """Return the atomic number for an element symbol (case-insensitive).

    Raises ValueError (from ``list.index``) when the symbol is not in the table.
    """
    return __ATOM_LIST__.index(atom.lower()) + 1


def getUA(maxValence_list, valence_list):
    """Find atoms whose current valence is below their maximum valence.

    Returns:
        tuple: ``(UA, DU)`` where ``UA`` holds the indices of those atoms and
        ``DU`` the matching positive differences ``maxValence - valence``.
    """
    deficits = [
        (index, cap - used)
        for index, (cap, used) in enumerate(zip(maxValence_list, valence_list))
        if cap - used > 0
    ]
    UA = [index for index, _ in deficits]
    DU = [missing for _, missing in deficits]
    return UA, DU


def get_BO(AC,UA,DU,valences,UA_pairs,quick):
    """Raise bond orders between paired unsaturated atoms until DU stops changing.

    Starts from a copy of ``AC`` and repeatedly increments the bond order of
    every pair in ``UA_pairs``, then recomputes the unsaturated atoms and the
    next set of pairs. The loop ends once two consecutive DU lists are equal.

    Returns:
        The resulting bond-order matrix (``AC`` itself is not modified).
    """
    bond_orders = AC.copy()
    previous_DU = []

    while previous_DU != DU:
        for first, second in UA_pairs:
            # keep the matrix symmetric
            bond_orders[first, second] += 1
            bond_orders[second, first] += 1

        previous_DU = copy.copy(DU)
        current_valences = list(bond_orders.sum(axis=1))
        UA, DU = getUA(valences, current_valences)
        UA_pairs = get_UA_pairs(UA, AC, quick)[0]

    return bond_orders


def valences_not_too_large(BO,valences):
    """Return True when no row sum of ``BO`` exceeds the matching entry of ``valences``."""
    bonds_per_atom = BO.sum(axis=1)
    return all(
        not (bond_count > allowed)
        for allowed, bond_count in zip(valences, bonds_per_atom)
    )


def BO_is_OK(BO,AC,charge,DU,atomic_valence_electrons,atomicNumList,charged_fragments):
    """Check whether a bond-order matrix is acceptable for the given molecule.

    Three conditions must all hold: the bond orders added on top of ``AC``
    sum to ``sum(DU)``, the accumulated charge equals ``charge``, and the
    number of charged atoms does not exceed ``abs(charge)``. Charges are
    only accumulated when ``charged_fragments`` is true; otherwise the
    total stays 0 and no atom counts as charged.
    """
    total_charge = 0
    charged_atoms = []
    if charged_fragments:
        bond_sums = list(BO.sum(axis=1))
        for index, atomic_num in enumerate(atomicNumList):
            atom_charge = get_atomic_charge(atomic_num,
                                            atomic_valence_electrons[atomic_num],
                                            bond_sums[index])
            total_charge += atom_charge
            if atomic_num == 6:
                # special-case adjustments for carbon
                single_bond_count = list(BO[index, :]).count(1)
                if single_bond_count == 2 and bond_sums[index] == 2:
                    total_charge += 1
                    atom_charge = 2
                if single_bond_count == 3 and total_charge + 1 < charge:
                    total_charge += 2
                    atom_charge = 1

            if atom_charge != 0:
                charged_atoms.append(atom_charge)

    if not ((BO - AC).sum() == sum(DU)):
        return False
    if not (charge == total_charge):
        return False
    return len(charged_atoms) <= abs(charge)


def get_atomic_charge(atom,atomic_valence_electrons,BO_valence):
    """Return the formal charge implied by an atom's total bond order.

    Args:
        atom: atomic number.
        atomic_valence_electrons: number of valence electrons of that element.
        BO_valence: sum of the atom's bond orders.
    """
    if atom == 1:
        return 1 - BO_valence
    if atom == 5:
        return 3 - BO_valence
    # atomic numbers 15 and 16 are treated as neutral at valence 5 and 6 respectively
    if (atom == 15 and BO_valence == 5) or (atom == 16 and BO_valence == 6):
        return 0
    return atomic_valence_electrons - 8 + BO_valence

def clean_charges(mol):
    """Rewrite selected charge-separated patterns fragment by fragment.

    The molecule is split into fragments (without sanitization); each
    reaction SMARTS below is applied to a fragment for as long as its
    reactant pattern still matches, and the rewritten fragments are
    recombined into a single molecule that is returned.
    """
# this hack should not be needed any more but is kept just in case
#

    # Each entry is "reactant>>product"; the reactant side doubles as the
    # substructure pattern that decides whether the reaction is run.
    rxn_smarts = ['[N+:1]=[*:2]-[C-:3]>>[N+0:1]-[*:2]=[C-0:3]',
                  '[N+:1]=[*:2]-[O-:3]>>[N+0:1]-[*:2]=[O-0:3]',
                  '[N+:1]=[*:2]-[*:3]=[*:4]-[O-:5]>>[N+0:1]-[*:2]=[*:3]-[*:4]=[O-0:5]',
                  '[#8:1]=[#6:2]([!-:6])[*:3]=[*:4][#6-:5]>>[*-:1][*:2]([*:6])=[*:3][*:4]=[*+0:5]',
                  '[O:1]=[c:2][c-:3]>>[*-:1][*:2][*+0:3]',
                  '[O:1]=[C:2][C-:3]>>[*-:1][*:2]=[*+0:3]']

    fragments = Chem.GetMolFrags(mol,asMols=True,sanitizeFrags=False)

    for i,fragment in enumerate(fragments):
        for smarts in rxn_smarts:
            patt = Chem.MolFromSmarts(smarts.split(">>")[0])
            # keep applying the reaction until the pattern no longer matches
            while fragment.HasSubstructMatch(patt):
                rxn = AllChem.ReactionFromSmarts(smarts)
                ps = rxn.RunReactants((fragment,))
                # take the first product of the first product set
                fragment = ps[0][0]
        # rebuild the molecule: the first fragment replaces `mol`, later ones are appended
        if i == 0:
            mol = fragment
        else:
            mol = Chem.CombineMols(mol,fragment)

    return mol


def BO2mol(mol,BO_matrix, atomicNumList,atomic_valence_electrons,mol_charge,charged_fragments):
    """Add bonds from a bond-order matrix to ``mol`` and set charges or radicals.

    based on code written by Paolo Toscani

    Bond orders 1/2/3 map to single/double/triple bonds; any other non-zero
    order falls back to a single bond. Depending on ``charged_fragments``
    the atoms then receive formal charges or radical-electron counts.

    Raises:
        RuntimeError: if ``BO_matrix`` and ``atomicNumList`` differ in length.
    """
    matrix_size = len(BO_matrix)
    atom_count = len(atomicNumList)
    BO_valences = list(BO_matrix.sum(axis=1))

    if matrix_size != atom_count:
        raise RuntimeError('sizes of adjMat ({0:d}) and atomicNumList '
            '{1:d} differ'.format(matrix_size, atom_count))

    editable = Chem.RWMol(mol)

    order_to_bond_type = {
        1: Chem.BondType.SINGLE,
        2: Chem.BondType.DOUBLE,
        3: Chem.BondType.TRIPLE
    }

    # only the upper triangle is visited, so each bond is added once
    for row in range(matrix_size):
        for col in range(row + 1, matrix_size):
            order = int(round(BO_matrix[row, col]))
            if order != 0:
                editable.AddBond(row, col,
                                 order_to_bond_type.get(order, Chem.BondType.SINGLE))
    mol = editable.GetMol()

    if charged_fragments:
        return set_atomic_charges(mol,atomicNumList,atomic_valence_electrons,BO_valences,BO_matrix,mol_charge)
    return set_atomic_radicals(mol,atomicNumList,atomic_valence_electrons,BO_valences)

def set_atomic_charges(mol,atomicNumList,atomic_valence_electrons,BO_valences,BO_matrix,mol_charge):
    """Assign formal charges to the atoms of ``mol`` from their bond orders.

    For each atom the charge from ``get_atomic_charge`` is added to a running
    total ``q``; carbon atoms (atomic number 6) get two extra adjustments that
    depend on their count of single bonds and on that running total. Only
    non-zero charges are written to the atom. Returns ``mol``.
    """
    q = 0  # running sum of the charges seen so far
    for i,atom in enumerate(atomicNumList):
        a = mol.GetAtomWithIdx(i)
        charge = get_atomic_charge(atom,atomic_valence_electrons[atom],BO_valences[i])
        q += charge
        if atom == 6:
            number_of_single_bonds_to_C = list(BO_matrix[i,:]).count(1)
            # carbon with exactly two single bonds and nothing else: left uncharged
            if number_of_single_bonds_to_C == 2 and BO_valences[i] == 2:
                    q += 1
                    charge = 0
            # carbon with three single bonds while the running total is still
            # more than one below the requested molecular charge: charged +1
            if number_of_single_bonds_to_C == 3 and q + 1 < mol_charge:
                    q += 2
                    charge = 1

        if (abs(charge) > 0):
            a.SetFormalCharge(int(charge))

    # shouldn't be needed anymore but is kept just in case
    #mol = clean_charges(mol)

    return mol


def set_atomic_radicals(mol,atomicNumList,atomic_valence_electrons,BO_valences):
    """Mark atoms as radicals instead of charging them.

    The number of radical electrons = absolute atomic charge; atoms whose
    computed charge is zero are left untouched. Returns ``mol``.
    """
    for index, atomic_num in enumerate(atomicNumList):
        rd_atom = mol.GetAtomWithIdx(index)
        atom_charge = get_atomic_charge(atomic_num,
                                        atomic_valence_electrons[atomic_num],
                                        BO_valences[index])
        if abs(atom_charge) > 0:
            rd_atom.SetNumRadicalElectrons(abs(int(atom_charge)))

    return mol

def get_bonds(UA,AC):
    """List the connected pairs among the atoms in ``UA``.

    Each pair is returned as a sorted 2-tuple of atom indices; pairs are
    produced in the order the atoms appear in ``UA``.
    """
    return [
        tuple(sorted([first, second]))
        for position, first in enumerate(UA)
        for second in UA[position + 1:]
        if AC[first, second] == 1
    ]

def get_UA_pairs(UA,AC,quick):
    """Propose sets of bonded unsaturated-atom pairs whose bond order can be raised.

    Returns:
        list: ``[()]`` when no two atoms of ``UA`` are connected; a
        one-element list holding a networkx matching when ``quick`` is true;
        otherwise every combination of ``len(UA)//2`` bonds that covers the
        largest number of distinct atoms.
    """
    candidate_bonds = get_bonds(UA, AC)
    if not candidate_bonds:
        return [()]

    if quick:
        bond_graph = nx.Graph()
        bond_graph.add_edges_from(candidate_bonds)
        return [list(nx.max_weight_matching(bond_graph))]

    # Exhaustive search: keep every combination tied for the best atom coverage.
    bonds_per_combo = int(len(UA) / 2)
    best_coverage = 0
    best_combos = [()]
    for combo in itertools.combinations(candidate_bonds, bonds_per_combo):
        coverage = len({atom for bond in combo for atom in bond})
        if coverage > best_coverage:
            best_coverage = coverage
            best_combos = [combo]
        elif coverage == best_coverage:
            best_combos.append(combo)

    return best_combos

def AC2BO(AC,atomicNumList,charge,charged_fragments,quick):
    """Search for a bond-order matrix consistent with the connectivity ``AC``.

    Every combination of candidate valences (one per atom, taken from the
    table below) is tried in turn. For each combination the unsaturated
    atoms are paired up and their bond orders raised via ``get_BO``; the
    first matrix accepted by ``BO_is_OK`` is returned immediately. If none
    is accepted, the fallback is the last-seen matrix with the largest total
    bond order that does not exceed the tried valences (initially a copy of
    ``AC``).

    Returns:
        tuple: ``(BO, atomic_valence_electrons)``, the chosen bond-order
        matrix and the valence-electron lookup keyed by atomic number.
    """
    # TODO
    # Candidate valences per atomic number, tried in the listed order.
    # Atomic numbers missing here yield an empty list (defaultdict), which
    # makes the product below empty so the AC copy is returned unchanged.
    atomic_valence = defaultdict(list)
    atomic_valence[1] = [1]
    atomic_valence[6] = [4]
    atomic_valence[7] = [4,3]
    atomic_valence[8] = [2,1]
    atomic_valence[9] = [1]
    atomic_valence[14] = [4]
    atomic_valence[15] = [5,4,3]
    atomic_valence[16] = [6,4,2]
    atomic_valence[17] = [1]
    atomic_valence[32] = [4]
    atomic_valence[35] = [1]
    atomic_valence[53] = [1]


    # Number of valence electrons per atomic number (plain dict: an atomic
    # number missing here raises KeyError where it is looked up).
    atomic_valence_electrons = {}
    atomic_valence_electrons[1] = 1
    atomic_valence_electrons[6] = 4
    atomic_valence_electrons[7] = 5
    atomic_valence_electrons[8] = 6
    atomic_valence_electrons[9] = 7
    atomic_valence_electrons[14] = 4
    atomic_valence_electrons[15] = 5
    atomic_valence_electrons[16] = 6
    atomic_valence_electrons[17] = 7
    atomic_valence_electrons[32] = 4
    atomic_valence_electrons[35] = 7
    atomic_valence_electrons[53] = 7

# make a list of valences, e.g. for CO: [[4],[2,1]]
    valences_list_of_lists = []
    for atomicNum in atomicNumList:
        valences_list_of_lists.append(atomic_valence[atomicNum])

# convert [[4],[2,1]] to [[4,2],[4,1]]
    valences_list = itertools.product(*valences_list_of_lists)

    best_BO = AC.copy()

# implementation of algorithm shown in Figure 2
# UA: unsaturated atoms
# DU: degree of unsaturation (u matrix in Figure)
# best_BO: Bcurr in Figure 
#

    for valences in valences_list:
        AC_valence = list(AC.sum(axis=1))
        UA,DU_from_AC = getUA(valences, AC_valence)

        # nothing is unsaturated: the connectivity itself may already be the answer
        if len(UA) == 0 and BO_is_OK(AC,AC,charge,DU_from_AC,atomic_valence_electrons,atomicNumList,charged_fragments):
            return AC,atomic_valence_electrons
        
        UA_pairs_list = get_UA_pairs(UA,AC,quick) 
        for UA_pairs in UA_pairs_list:
            BO = get_BO(AC,UA,DU_from_AC,valences,UA_pairs,quick)
            if BO_is_OK(BO,AC,charge,DU_from_AC,atomic_valence_electrons,atomicNumList,charged_fragments):
                return BO,atomic_valence_electrons

            # remember the most-bonded candidate that respects the valences (ties keep the latest)
            elif BO.sum() >= best_BO.sum() and valences_not_too_large(BO,valences):
                best_BO = BO.copy()

    return best_BO,atomic_valence_electrons


def AC2mol(mol,AC,atomicNumList,charge,charged_fragments,quick):
    """Turn a connectivity matrix into a bonded molecule.

    The AC matrix is first converted to a bond-order (BO) matrix, then the
    bonds and charge information are added to ``mol``; the result is returned.
    """
    bond_orders, valence_electrons = AC2BO(
        AC, atomicNumList, charge, charged_fragments, quick
    )
    return BO2mol(
        mol, bond_orders, atomicNumList, valence_electrons, charge, charged_fragments
    )


def get_proto_mol(atomicNumList):
    """Build a molecule containing one unbonded atom per atomic number.

    The first atom comes from a SMARTS query for its atomic number; the
    remaining atoms are appended in list order.
    """
    seed = Chem.MolFromSmarts("[#" + str(atomicNumList[0]) + "]")
    editable = Chem.RWMol(seed)
    for position in range(1, len(atomicNumList)):
        editable.AddAtom(Chem.Atom(atomicNumList[position]))

    return editable.GetMol()


def get_atomicNumList(atomic_symbols):
    """Map a sequence of element symbols to their atomic numbers, preserving order."""
    return [get_atom(symbol) for symbol in atomic_symbols]


def read_xyz_file(filename):
    """Parse an XYZ file into atomic numbers, total charge and coordinates.

    Line 1 must hold the atom count, line 2 is a comment that may contain
    ``charge=<int>``, and every following non-blank line is
    ``<symbol> <x> <y> <z>``.

    Args:
        filename: path of the XYZ file.

    Returns:
        tuple: ``(atomicNumList, charge, xyz_coordinates)``; ``charge`` is 0
        when the comment line is missing or carries no ``charge=`` entry.

    Raises:
        ValueError: if the first line is not an integer or an atom line does
            not have exactly four whitespace-separated fields.
    """
    atomic_symbols = []
    xyz_coordinates = []
    # Default set up front so files without a comment line no longer leave
    # `charge` unbound at the return statement.
    charge = 0

    with open(filename, "r") as file:
        for line_number, line in enumerate(file):
            if line_number == 0:
                int(line)  # atom count is not used, but a malformed header still fails loudly
            elif line_number == 1:
                if "charge=" in line:
                    charge = int(line.split("=")[1])
            else:
                # Blank lines (typically trailing ones) used to crash the
                # four-way unpack below.
                if not line.strip():
                    continue
                atomic_symbol, x, y, z = line.split()
                atomic_symbols.append(atomic_symbol)
                xyz_coordinates.append([float(x), float(y), float(z)])

    atomicNumList = get_atomicNumList(atomic_symbols)

    return atomicNumList, charge, xyz_coordinates

def xyz2AC(atomicNumList,xyz):
    """Derive the atom-connectivity matrix from 3D coordinates.

    Two atoms count as connected when their distance is at most the sum of
    their covalent radii, each scaled by 1.30.

    Returns:
        tuple: ``(AC, mol)``, the symmetric 0/1 integer matrix and the
        bond-less molecule carrying the coordinates as a conformer.
    """
    import numpy as np
    mol = get_proto_mol(atomicNumList)
    conformer = Chem.Conformer(mol.GetNumAtoms())
    for index in range(mol.GetNumAtoms()):
        point = xyz[index]
        conformer.SetAtomPosition(index, (point[0], point[1], point[2]))
    mol.AddConformer(conformer)

    distances = Chem.Get3DDistanceMatrix(mol)
    periodic_table = Chem.GetPeriodicTable()

    num_atoms = len(atomicNumList)
    AC = np.zeros((num_atoms,num_atoms)).astype(int)

    # scaled covalent radius of every atom, computed once
    scaled_radii = [
        periodic_table.GetRcovalent(mol.GetAtomWithIdx(index).GetAtomicNum())*1.30
        for index in range(num_atoms)
    ]

    for first in range(num_atoms):
        for second in range(first + 1, num_atoms):
            if distances[first, second] <= scaled_radii[first] + scaled_radii[second]:
                AC[first, second] = 1
                AC[second, first] = 1

    return AC,mol

def chiral_stereo_check(mol):
    """Sanitize ``mol`` and run RDKit's stereochemistry assignment on it.

    The four RDKit calls operate on ``mol`` in place, in this order:
    sanitization, bond-stereo detection, stereochemistry assignment (forced,
    also flagging possible stereo centers), and chiral-tag assignment from
    the structure. The value -1 is passed as the conformer id to the two
    structure-based calls. Returns the same ``mol`` object.
    """
    Chem.SanitizeMol(mol)
    Chem.DetectBondStereochemistry(mol,-1)
    Chem.AssignStereochemistry(mol, flagPossibleStereoCenters=True, force=True)
    Chem.AssignAtomChiralTagsFromStructure(mol,-1)

    return mol

def xyz2mol(atomicNumList,charge,xyz_coordinates,charged_fragments,quick):
    """Build a molecule with bonds, charges and stereo info from atoms + coordinates.

    Steps: derive the atom connectivity (AC) matrix and a bond-less mol from
    the coordinates, convert AC to bond orders and add connectivity/charge
    info to the mol, then check for stereocenters and chiral centers.
    """
    connectivity, bare_mol = xyz2AC(atomicNumList, xyz_coordinates)
    bonded_mol = AC2mol(bare_mol, connectivity, atomicNumList, charge,
                        charged_fragments, quick)
    return chiral_stereo_check(bonded_mol)

# if __name__ == "__main__":

#     import argparse

#     parser = argparse.ArgumentParser(usage='%(prog)s [options] molecule.xyz')
#     parser.add_argument('structure', metavar='structure', type=str)
#     parser.add_argument('-s', '--sdf', action="store_true", help="Dump sdf file")
#     args = parser.parse_args()

    
#     filename = args.structure
#     charged_fragments = True # alternatively radicals are made

#     # quick is faster for large systems but requires networkx
#     # if you don't want to install networkx set quick=False and 
#     # uncomment 'import networkx as nx' at the top of the file 
#     quick = True 

#     atomicNumList, charge, xyz_coordinates = read_xyz_file(filename)

#     mol = xyz2mol(atomicNumList, charge, xyz_coordinates, charged_fragments, quick)

#     if args.sdf:
#         filename = filename.replace(".xyz", "")
#         filename += ".sdf"
#         writer = Chem.SDWriter(filename)
#         writer.write(mol)

#     # Canonical hack
#     smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
#     m = Chem.MolFromSmiles(smiles)
#     smiles = Chem.MolToSmiles(m, isomericSmiles=True)

#     print(smiles)

In [8]:
# gb_structure, gb_scalar_coupling = load_csv(file_folder)

In [9]:
# molecue_names = train.molecule_name.unique()

In [10]:
# m = 'dsgdb9nsd_000002'

In [11]:
# for c, g in gb_structure:
#     if c == m:
#         break

In [12]:
# for c2, g2 in gb_scalar_coupling:
#     if c2==m:
#         break

In [26]:
from multiprocessing import Pool
import multiprocessing as mp
import pickle

def read_pickle_from_file(pickle_file):
    """Unpickle and return the single object stored in ``pickle_file``."""
    with open(pickle_file, 'rb') as handle:
        return pickle.load(handle)

def write_pickle_to_file(pickle_file, x):
    """Pickle ``x`` into ``pickle_file`` using the highest available protocol."""
    with open(pickle_file, 'wb') as handle:
        pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
def do_one(p):
    """Build the graph for one molecule and pickle it to disk.

    Args:
        p: tuple ``(i, molecule_name, gb_structure, gb_scalar_coupling,
           graph_file)``; the leading index ``i`` is not used here.

    Raises:
        Exception: any failure is re-raised with the molecule name and the
            original error message as its arguments.
    """
    _, molecule_name, gb_structure, gb_scalar_coupling, graph_file = p
    try:
        graph = make_graph(molecule_name, gb_structure, gb_scalar_coupling)
        write_pickle_to_file(graph_file, graph)
    except Exception as e:
        raise Exception(molecule_name, e.__str__())
    
def run_convert_to_graph(graph_dir, csv_dir):
    """Convert every molecule in the coupling tables to a pickled graph file.

    Molecules are processed in sorted name order, in segments of roughly
    1/1000 of the total, by a pool of 16 worker processes. One file
    ``<graph_dir>/<molecule_name>.pickle`` is written per molecule and a
    progress line is printed after each segment.

    Args:
        graph_dir: output directory (created if missing).
        csv_dir: directory with the input CSV files, passed to ``load_csv``.
    """
    os.makedirs(graph_dir, exist_ok=True)

    gb_structure, gb_scalar_coupling = load_csv(csv_dir)
    molecule_names = np.sort(list(gb_scalar_coupling.groups.keys()))

    N_ = len(molecule_names)
    # At least 1: with fewer than 1000 molecules `N_//1000` is 0, and a zero
    # step makes range() raise ValueError.
    segment = max(1, N_ // 1000)

    # A single pool for the whole run, closed on exit. Previously a new
    # 16-process pool was created for every segment and never released.
    with mp.Pool(processes=16) as pool:
        for m_sidx in range(0, N_, segment):
            m_eidx = min(m_sidx + segment, N_)

            # Built fresh per segment. The old shared list kept growing, so
            # each segment re-processed every molecule of the earlier ones.
            param = [
                (i, molecule_name, gb_structure, gb_scalar_coupling,
                 graph_dir + '/%s.pickle' % molecule_name)
                for i, molecule_name in enumerate(molecule_names[m_sidx:m_eidx])
            ]

            pool.map(do_one, param)
            print(f'{m_eidx}/{N_} finished')

In [14]:
# atomic_symbols   = g.atom.values.tolist()
# atomicNumList = get_atomicNumList(atomic_symbols)
# xyz = g[['x','y','z']].values
# charge  = 0
# charged_fragments = True
# quick   =  True
# mol = xyz2mol(atomicNumList, charge, xyz, charged_fragments, quick)

In [15]:
# Draw.MolToImage(mol)

In [16]:
# Element symbols handled by the featurizer and their integer codes
# (a symbol's code is its position in SYMBOL).
SYMBOL = ['H', 'C', 'N', 'O', 'F']
SYMBOL_num = dict(zip(SYMBOL, range(len(SYMBOL))))

# Hybridization states used for one-hot encoding
# (S, SP3D and SP3D2 are intentionally left out).
HYBRIDIZATION = [
    getattr(Chem.rdchem.HybridizationType, state)
    for state in ('SP', 'SP2', 'SP3')
]

# Bond types used for one-hot encoding.
BOND_TYPE = [
    getattr(Chem.rdchem.BondType, kind)
    for kind in ('SINGLE', 'DOUBLE', 'TRIPLE', 'AROMATIC')
]

# def one_hot_encoding(x, set):
#     one_hot = [int(x == s) for s in set]
#     if 0:
#         if sum(one_hot)==0: print('one_hot_encoding() return NULL!', x, set)
#     return one_hot

# factory = ChemicalFeatures.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
# feature = factory.GetFeaturesForMol(mol)

# num_atom = mol.GetNumAtoms()
# symbol   = np.zeros((num_atom,1),np.uint8) #category
# acceptor = np.zeros((num_atom,1),np.uint8)
# donor    = np.zeros((num_atom,1),np.uint8)
# aromatic = np.zeros((num_atom,1),np.uint8)
# hybridization = np.zeros((num_atom,1),np.uint8)
# num_h  = np.zeros((num_atom,1),np.float32)#real
# atomic = np.zeros((num_atom,1),np.float32)

# atom_map_num = np.zeros((num_atom,1),np.uint8)
# atomic_num = np.zeros((num_atom,1),np.uint8)
# chiral_tag = np.zeros((num_atom,1),np.uint8)
# degree = np.zeros((num_atom,1),np.uint8)
# explicit_valence = np.zeros((num_atom,1),np.uint8)
# formal_charge = np.zeros((num_atom,1),np.uint8)
# implicit_valence = np.zeros((num_atom,1),np.uint8)
# isotope = np.zeros((num_atom,1),np.uint8)
# mass = np.zeros((num_atom,1),np.float32)
# degree = np.zeros((num_atom,1),np.uint8)
# no_implicit = np.zeros((num_atom,1),np.uint8)
# num_explicit_hs = np.zeros((num_atom,1),np.uint8)
# num_implicit_hs = np.zeros((num_atom,1),np.uint8)
# num_radical_electrons = np.zeros((num_atom,1),np.uint8)
# total_degree = np.zeros((num_atom,1),np.uint8)
# total_num_hs = np.zeros((num_atom,1),np.uint8)
# total_valence = np.zeros((num_atom,1),np.uint8)
# has_query = np.zeros((num_atom,1),np.uint8)
# neighbor_h_count = np.zeros((num_atom,1),np.uint8)
# neighbor_c_count = np.zeros((num_atom,1),np.uint8)
# neighbor_n_count = np.zeros((num_atom,1),np.uint8)
# neighbor_o_count = np.zeros((num_atom,1),np.uint8)
# neighbor_f_count = np.zeros((num_atom,1),np.uint8)

# for i in range(num_atom):
#     atom = mol.GetAtomWithIdx(i)
#     atom_map_num[i] = atom.GetAtomMapNum()
#     atomic_num[i] = atom.GetAtomicNum()
#     chiral_tag[i] = atom.GetChiralTag()
#     degree[i] = atom.GetDegree()
#     explicit_valence[i] = atom.GetExplicitValence()
#     formal_charge[i] = atom.GetFormalCharge()
# #     degree[i] = atom.GetHybridization()
#     implicit_valence[i] = atom.GetImplicitValence()
# #     degree[i] = atom.GetIsAromatic()
#     isotope[i] = atom.GetIsotope()
#     mass[i] = atom.GetMass()
#     atom_neighbors = atom.GetNeighbors()
#     neighbor_symbol_list = [n_i.GetSymbol() for n_i in list(atom_neighbors)]
#     neighbor_h_count[i] = neighbor_symbol_list.count('H')
#     neighbor_c_count[i] = neighbor_symbol_list.count('H')
#     neighbor_n_count[i] = neighbor_symbol_list.count('H')
#     neighbor_o_count[i] = neighbor_symbol_list.count('H')
#     neighbor_f_count[i] = neighbor_symbol_list.count('H')
#     degree[i] = atom.GetDegree()
#     no_implicit[i] = int(atom.GetNoImplicit())
#     num_explicit_hs[i] = atom.GetNumExplicitHs()
#     num_implicit_hs[i] = atom.GetNumImplicitHs()
#     num_radical_electrons[i] = atom.GetNumRadicalElectrons()
# #     degree[i] = atom.GetSymbol()
#     total_degree[i] = atom.GetTotalDegree()
#     total_num_hs[i] = atom.GetTotalNumHs()
#     total_valence[i] = atom.GetTotalValence()
#     has_query[i] = int(atom.HasQuery())
#     is_in_ring[i] = int(atom.IsInRing())
#     symbol[i]        = SYMBOL_num[atom.GetSymbol()]#one_hot_encoding(atom.GetSymbol(),SYMBOL)
#     aromatic[i]      = atom.GetIsAromatic()
#     hybridization[i] = atom.GetHybridization()#one_hot_encoding(atom.GetHybridization(),HYBRIDIZATION)
#     num_h[i]  = atom.GetTotalNumHs(includeNeighbors=True)
#     atomic[i] = atom.GetAtomicNum()

# #[f.GetFamily() for f in feature]
# for t in range(0, len(feature)):
#     if feature[t].GetFamily() == 'Donor':
#         for i in feature[t].GetAtomIds():
#             donor[i] = 1
#     elif feature[t].GetFamily() == 'Acceptor':
#         for i in feature[t].GetAtomIds():
#             acceptor[i] = 1

# ## ** edge **
# num_edge = num_atom*num_atom - num_atom
# edge_index = np.zeros((num_edge,2), np.uint8)
# bond_type  = np.zeros((num_edge,1), np.uint8)#category
# distance   = np.zeros((num_edge,1),np.float32) #real
# cos_angle      = np.zeros((num_edge,1),np.float32) #real
# angle      = np.zeros((num_edge,1),np.float32) #real
# angle2center = np.zeros((num_edge,1),np.float32)

# bond_dir = np.zeros((num_edge,1),np.float32)
# is_aromatic = np.zeros((num_edge,1),np.float32)
# is_conjugated = np.zeros((num_edge,1),np.float32)
# stereo = np.zeros((num_edge,1),np.float32)
# has_query = np.zeros((num_edge,1),np.float32)
# is_in_ring = np.zeros((num_edge,1),np.float32)

# norm_xyz = preprocessing.normalize(xyz, norm='l2')
# norm_xyz_center = np.mean(norm_xyz, axis=0)

# ij=0
# for i in range(num_atom):
#     for j in range(num_atom):
#         if i==j: continue
#         edge_index[ij] = [i,j]

#         bond = mol.GetBondBetweenAtoms(i, j)
#         if bond is not None:
#             bond_type[ij] = bond.GetBondType()#one_hot_encoding(bond.GetBondType(),BOND_TYPE)
#             bond_dir[ij] = bond.GetBondDir()
#             is_aromatic[ij] = int(bond.GetIsAromatic())
#             is_conjugated[ij] = int(bond.GetIsConjugated())
#             stereo[ij] = bond.GetStereo()
#             has_query[ij] = int(bond.HasQuery())
#             is_in_ring[ij] = int(bond.IsInRing())

#         distance[ij] = ((xyz[i] - xyz[j])**2).sum()**0.5
#         cos_angle[ij] = (norm_xyz[i]*norm_xyz[j]).sum()
#         angle[ij] = np.arccos(cos_angle[ij])
#         cos_angle2center = ((norm_xyz[i]-norm_xyz_center)*(norm_xyz[j]-norm_xyz_center)).sum()
#         angle2center[ij] = np.arccos(cos_angle2center - int(cos_angle2center))
        
#         ij+=1
##-------------------

# node_list = [symbol, acceptor, donor, aromatic, hybridization, num_h, atomic, atom_map_num,atomic_num,chiral_tag,degree,explicit_valence,formal_charge,implicit_valence,isotope,mass,degree,no_implicit,num_explicit_hs,num_implicit_hs,num_radical_electrons,total_degree,total_num_hs,total_valence,has_query,neighbor_h_count,neighbor_c_count,neighbor_n_count,neighbor_o_count,neighbor_f_count]
# edge_list = [bond_type, distance, angle, angle2center, bond_dir, is_aromatic, is_conjugated, stereo, has_query, is_in_ring]
# graph = Struct(
#     molecule_name = molecule_name,
#     smiles = Chem.MolToSmiles(mol),
#     axyz = [a,xyz],

#     node = node_list,
#     edge = edge_list,
#     edge_index = edge_index,

#     coupling = coupling,
# )



In [17]:
# Names of atom accessor methods selected for feature extraction.
# Commented-out entries are accessors that were considered but excluded.
# NOTE(review): presumably these are RDKit `Atom` method names - confirm against the caller.
atom_func_name = [
#     'DescribeQuery',
 'GetAtomMapNum',
 'GetAtomicNum',
 'GetBonds',
#  'GetBoolProp',
 'GetChiralTag',
 'GetDegree',
#  'GetDoubleProp',
 'GetExplicitValence',
 'GetFormalCharge',
 'GetHybridization',
#  'GetIdx',
 'GetImplicitValence',
#  'GetIntProp',
 'GetIsAromatic',
 'GetIsotope',
 'GetMass',
#  'GetMonomerInfo',
 'GetNeighbors',
 'GetNoImplicit',
 'GetNumExplicitHs',
 'GetNumImplicitHs',
 'GetNumRadicalElectrons',
#  'GetOwningMol',
#  'GetPDBResidueInfo',
#  'GetProp',
#  'GetPropNames',
#  'GetPropsAsDict',
#  'GetSmarts',
 'GetSymbol',
 'GetTotalDegree',
 'GetTotalNumHs',
 'GetTotalValence',
#  'GetUnsignedProp',
#  'HasProp',
 'HasQuery',
#  'InvertChirality',
 'IsInRing',
#  'IsInRingSize',
#  'Match',
            ]

In [18]:
# Names of bond accessor methods selected for feature extraction.
# Commented-out entries are accessors that were considered but excluded.
# NOTE(review): presumably these are RDKit `Bond` method names - confirm against the caller.
bond_func_name = [
# 'ClearProp',
#  'DescribeQuery',
#  'GetBeginAtom',
 'GetBeginAtomIdx',
 'GetBondDir',
 'GetBondType',
 'GetBondTypeAsDouble',
#  'GetBoolProp',
#  'GetDoubleProp',
#  'GetEndAtom',
 'GetEndAtomIdx',
#  'GetIdx',
#  'GetIntProp',
 'GetIsAromatic',
 'GetIsConjugated',
#  'GetOtherAtom',
#  'GetOtherAtomIdx',
#  'GetOwningMol',
#  'GetProp',
#  'GetPropNames',
#  'GetPropsAsDict',
#  'GetSmarts',
 'GetStereo',
#  'GetStereoAtoms',
#  'GetUnsignedProp',
#  'GetValenceContrib',
#  'HasProp',
 'HasQuery',
 'IsInRing',
#  'IsInRingSize',
#  'Match',
            ]

In [19]:
from rdkit import Chem

In [20]:
import pybel

In [21]:
# Flat table with five entries per coupling type:
#   type, mean, std, min, max
COUPLING_TYPE_STATS = [
    '1JHC',  94.9761528641869,   18.27722399839607,   66.6008,   204.8800,
    '2JHC',  -0.2706244378832,    4.52360876732858,  -36.2186,    42.8192,
    '3JHC',   3.6884695895355,    3.07090647005439,  -18.5821,    76.0437,
    '1JHN',  47.4798844844683,   10.92204561670947,   24.3222,    80.4187,
    '2JHN',   3.1247536134185,    3.67345877025737,   -2.6209,    17.7436,
    '3JHN',   0.9907298624944,    1.31538940138001,   -3.1724,    10.9712,
    '2JHH', -10.2866051639817,    3.97960190019757,  -35.1761,    11.8542,
    '3JHH',   4.7710233597359,    3.70498129755812,   -3.0205,    17.4841,
]
NUM_COUPLING_TYPE = len(COUPLING_TYPE_STATS)//5

# Column views of the table: every fifth entry, starting at the column offset.
COUPLING_TYPE_MEAN = COUPLING_TYPE_STATS[1:NUM_COUPLING_TYPE*5:5]
COUPLING_TYPE_STD  = COUPLING_TYPE_STATS[2:NUM_COUPLING_TYPE*5:5]
COUPLING_TYPE      = COUPLING_TYPE_STATS[0:NUM_COUPLING_TYPE*5:5]


def one_hot_encoding(x, set):
    """Encode ``x`` as a 0/1 list with one slot per candidate in ``set``.

    A slot is 1 where the candidate equals ``x``; the result is all zeros
    when ``x`` matches none of the candidates.
    """
    encoded = []
    for candidate in set:
        encoded.append(int(x == candidate))
    return encoded

def mol_from_axyz(symbol, xyz):
    """Build a molecule from element symbols and coordinates.

    Uses fixed settings: total charge 0, charged fragments enabled and the
    quick pairing mode enabled.
    """
    return xyz2mol(
        get_atomicNumList(symbol),
        0,      # charge
        xyz,
        True,   # charged_fragments
        True,   # quick
    )

def make_graph2(molecule_name, gb_structure, gb_scalar_coupling, ):
    """Build the one-hot-encoded graph representation of one molecule.

    Args:
        molecule_name: key used to look the molecule up in both group-bys.
        gb_structure: structure table grouped by molecule name.
        gb_scalar_coupling: coupling table grouped by molecule name.

    Returns:
        Struct with attributes ``molecule_name``, ``smiles``, ``axyz``
        (``[symbols, coordinates]``), ``node`` (list of per-atom arrays),
        ``edge`` (list of per-pair arrays), ``edge_index`` and ``coupling``.

    Raises:
        AssertionError: if the atom symbols of the rebuilt molecule differ
            from those in the structure table.
    """
    #https://stackoverflow.com/questions/14734533/how-to-access-pandas-groupby-dataframe-by-key

    #----
    df = gb_scalar_coupling.get_group(molecule_name)
    # ['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
    #        'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'],

    # Coupling targets; `type` is stored as the index into COUPLING_TYPE.
    coupling = Struct(
        id = df.id.values,
        contribution = df[['fc', 'sd', 'pso', 'dso']].values,
        index = df[['atom_index_0', 'atom_index_1']].values,
        #type = np.array([ one_hot_encoding(t,COUPLING_TYPE) for t in df.type.values ], np.uint8)
        type = np.array([ COUPLING_TYPE.index(t) for t in df.type.values ], np.int32),
        value = df.scalar_coupling_constant.values,
    )

    #----
    df = gb_structure.get_group(molecule_name)
    # sort so that row position equals atom_index
    df = df.sort_values(['atom_index'], ascending=True)
    # ['molecule_name', 'atom_index', 'atom', 'x', 'y', 'z']
    a   = df.atom.values.tolist()
    xyz = df[['x','y','z']].values
    mol = mol_from_axyz(a, xyz)

    #---
    # the rebuilt molecule must list the atoms in the same order as the table
    assert( #check
       a == [ mol.GetAtomWithIdx(i).GetSymbol() for i in range(mol.GetNumAtoms())]
    )

    #---
    # NOTE(review): the feature factory is rebuilt on every call - consider hoisting if this is slow.
    factory = ChemicalFeatures.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
    feature = factory.GetFeaturesForMol(mol)

    ## ** node **
    #[ a.GetSymbol() for a in mol.GetAtoms() ]

    # one row per atom; symbol and hybridization are one-hot, the rest single columns
    num_atom = mol.GetNumAtoms()
    symbol   = np.zeros((num_atom,len(SYMBOL)),np.uint8) #category
    acceptor = np.zeros((num_atom,1),np.uint8)
    donor    = np.zeros((num_atom,1),np.uint8)
    aromatic = np.zeros((num_atom,1),np.uint8)
    hybridization = np.zeros((num_atom,len(HYBRIDIZATION)),np.uint8)
    num_h  = np.zeros((num_atom,1),np.float32)#real
    atomic = np.zeros((num_atom,1),np.float32)

    for i in range(num_atom):
        atom = mol.GetAtomWithIdx(i)
        symbol[i]        = one_hot_encoding(atom.GetSymbol(),SYMBOL)
        aromatic[i]      = atom.GetIsAromatic()
        hybridization[i] = one_hot_encoding(atom.GetHybridization(),HYBRIDIZATION)

        num_h[i]  = atom.GetTotalNumHs(includeNeighbors=True)
        atomic[i] = atom.GetAtomicNum()

    #[f.GetFamily() for f in feature]
    # flag every atom that belongs to a 'Donor' or 'Acceptor' feature
    for t in range(0, len(feature)):
        if feature[t].GetFamily() == 'Donor':
            for i in feature[t].GetAtomIds():
                donor[i] = 1
        elif feature[t].GetFamily() == 'Acceptor':
            for i in feature[t].GetAtomIds():
                acceptor[i] = 1

    ## ** edge **
    # one row per ordered atom pair (i, j) with i != j
    num_edge = num_atom*num_atom - num_atom
    # NOTE(review): uint8 cannot represent atom indices above 255
    edge_index = np.zeros((num_edge,2), np.uint8)
    bond_type  = np.zeros((num_edge,len(BOND_TYPE)), np.uint8)#category
    distance   = np.zeros((num_edge,1),np.float32) #real
    angle      = np.zeros((num_edge,1),np.float32) #real

    norm_xyz = preprocessing.normalize(xyz, norm='l2')

    ij=0
    for i in range(num_atom):
        for j in range(num_atom):
            if i==j: continue
            edge_index[ij] = [i,j]

            # unbonded pairs keep an all-zero bond_type row
            bond = mol.GetBondBetweenAtoms(i, j)
            if bond is not None:
                bond_type[ij] = one_hot_encoding(bond.GetBondType(),BOND_TYPE)

            # Euclidean distance between the two atoms
            distance[ij] = ((xyz[i] - xyz[j])**2).sum()**0.5
            # dot product of the normalized coordinate rows, despite the name `angle`
            angle[ij] = (norm_xyz[i]*norm_xyz[j]).sum()

            ij+=1
    ##-------------------

    graph = Struct(
        molecule_name = molecule_name,
        smiles = Chem.MolToSmiles(mol),
        axyz = [a,xyz],

        node = [symbol, acceptor, donor, aromatic, hybridization, num_h, atomic,],
        edge = [bond_type, distance, angle],
        edge_index = edge_index,

        coupling = coupling,
    )
    return graph

In [22]:
def make_graph(molecule_name, gb_structure, gb_scalar_coupling, ):
    """Build the graph (node features, edge features, coupling targets) of one molecule.

    Parameters
    ----------
    molecule_name :
        Group key used to select this molecule's rows from both groupby objects.
    gb_structure :
        Groupby whose groups provide the columns
        'atom_index', 'atom', 'x', 'y', 'z'.
    gb_scalar_coupling :
        Groupby whose groups provide the columns 'id', 'atom_index_0',
        'atom_index_1', 'type', 'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'.

    Returns
    -------
    Struct
        With attributes ``molecule_name``, ``smiles``, ``axyz`` ([symbols, xyz]),
        ``node`` (list of (num_atom, 1) arrays), ``edge`` (list of (num_edge, 1)
        arrays), ``edge_index`` ((num_edge, 2) array of directed atom pairs) and
        ``coupling`` (Struct of id / contribution / index / type / value).

    Raises
    ------
    AssertionError
        If the atom order of the molecule built by ``mol_from_axyz`` differs from
        the atom order of the structure rows.
    """
    #https://stackoverflow.com/questions/14734533/how-to-access-pandas-groupby-dataframe-by-key

    #---- coupling targets
    df = gb_scalar_coupling.get_group(molecule_name)
    # ['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
    #        'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'],
    coupling = Struct(
        id = df.id.values,
        contribution = df[['fc', 'sd', 'pso', 'dso']].values,
        index = df[['atom_index_0', 'atom_index_1']].values,
        # coupling type is stored as its position in COUPLING_TYPE (not one-hot)
        type = np.array([ COUPLING_TYPE.index(t) for t in df.type.values ], np.int32),
        value = df.scalar_coupling_constant.values,
    )

    #---- geometry
    df = gb_structure.get_group(molecule_name)
    df = df.sort_values(['atom_index'], ascending=True)
    # ['molecule_name', 'atom_index', 'atom', 'x', 'y', 'z']
    a   = df.atom.values.tolist()
    xyz = df[['x','y','z']].values
    mol = mol_from_axyz(a, xyz)

    num_atom = mol.GetNumAtoms()

    #--- check: rdkit atom order must match the structure rows
    assert a == [ mol.GetAtomWithIdx(i).GetSymbol() for i in range(num_atom) ], \
        f'atom order mismatch for {molecule_name}'

    #---
    factory = ChemicalFeatures.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
    feature = factory.GetFeaturesForMol(mol)

    ## ** node **
    def node_array(dtype=np.uint8):
        # one column vector per node feature
        return np.zeros((num_atom,1), dtype)

    symbol        = node_array() #category
    acceptor      = node_array()
    donor         = node_array()
    aromatic      = node_array()
    hybridization = node_array()
    num_h         = node_array(np.float32) #real
    atomic        = node_array(np.float32)

    atom_map_num          = node_array()
    atomic_num            = node_array()
    chiral_tag            = node_array()
    degree                = node_array()
    explicit_valence      = node_array()
    formal_charge         = node_array(np.int8)  # signed: formal charges can be negative
    implicit_valence      = node_array()
    isotope               = node_array()
    mass                  = node_array(np.float32)
    no_implicit           = node_array()
    num_explicit_hs       = node_array()
    num_implicit_hs       = node_array()
    num_radical_electrons = node_array()
    total_degree          = node_array()
    total_num_hs          = node_array()
    total_valence         = node_array()
    atom_has_query        = node_array()
    neighbor_h_count      = node_array()
    neighbor_c_count      = node_array()
    neighbor_n_count      = node_array()
    neighbor_o_count      = node_array()
    neighbor_f_count      = node_array()

    for i in range(num_atom):
        atom = mol.GetAtomWithIdx(i)

        symbol[i]        = SYMBOL_num[atom.GetSymbol()]
        aromatic[i]      = atom.GetIsAromatic()
        hybridization[i] = int(atom.GetHybridization())
        num_h[i]         = atom.GetTotalNumHs(includeNeighbors=True)
        atomic[i]        = atom.GetAtomicNum()

        atom_map_num[i]          = atom.GetAtomMapNum()
        atomic_num[i]            = atom.GetAtomicNum()
        chiral_tag[i]            = int(atom.GetChiralTag())
        degree[i]                = atom.GetDegree()
        explicit_valence[i]      = atom.GetExplicitValence()
        formal_charge[i]         = atom.GetFormalCharge()
        implicit_valence[i]      = atom.GetImplicitValence()
        isotope[i]               = atom.GetIsotope()
        mass[i]                  = atom.GetMass()
        no_implicit[i]           = int(atom.GetNoImplicit())
        num_explicit_hs[i]       = atom.GetNumExplicitHs()
        num_implicit_hs[i]       = atom.GetNumImplicitHs()
        num_radical_electrons[i] = atom.GetNumRadicalElectrons()
        total_degree[i]          = atom.GetTotalDegree()
        total_num_hs[i]          = atom.GetTotalNumHs()
        total_valence[i]         = atom.GetTotalValence()
        atom_has_query[i]        = int(atom.HasQuery())

        # count bonded neighbours per element (each counter uses its own symbol)
        neighbor_symbols = [ n.GetSymbol() for n in atom.GetNeighbors() ]
        neighbor_h_count[i] = neighbor_symbols.count('H')
        neighbor_c_count[i] = neighbor_symbols.count('C')
        neighbor_n_count[i] = neighbor_symbols.count('N')
        neighbor_o_count[i] = neighbor_symbols.count('O')
        neighbor_f_count[i] = neighbor_symbols.count('F')

    #[f.GetFamily() for f in feature]
    for f in feature:
        family = f.GetFamily()
        if family == 'Donor':
            for i in f.GetAtomIds():
                donor[i] = 1
        elif family == 'Acceptor':
            for i in f.GetAtomIds():
                acceptor[i] = 1

    ## ** edge **  (every ordered pair i != j)
    num_edge = num_atom*num_atom - num_atom

    def edge_array(dtype=np.float32):
        # one column vector per edge feature
        return np.zeros((num_edge,1), dtype)

    # NOTE: uint8 can only represent atom indices up to 255
    edge_index   = np.zeros((num_edge,2), np.uint8)
    bond_type    = edge_array(np.uint8) #category
    distance     = edge_array() #real
    angle        = edge_array() #real
    angle2center = edge_array() #real

    # bond-level arrays get their own names so they cannot clobber node features
    bond_dir         = edge_array()
    is_aromatic      = edge_array()
    is_conjugated    = edge_array()
    stereo           = edge_array()
    bond_has_query   = edge_array()
    bond_is_in_ring  = edge_array()

    # unit position vectors (direction of each atom as seen from the origin)
    norm_xyz = preprocessing.normalize(xyz, norm='l2')
    norm_xyz_center = np.mean(norm_xyz, axis=0)
    # unit vectors relative to the centre, so that their dot product is a true cosine
    unit_from_center = preprocessing.normalize(norm_xyz - norm_xyz_center, norm='l2')

    ij=0
    for i in range(num_atom):
        for j in range(num_atom):
            if i==j: continue
            edge_index[ij] = [i,j]

            bond = mol.GetBondBetweenAtoms(i, j)
            if bond is not None:
                bond_type[ij]       = int(bond.GetBondType())
                bond_dir[ij]        = int(bond.GetBondDir())
                is_aromatic[ij]     = int(bond.GetIsAromatic())
                is_conjugated[ij]   = int(bond.GetIsConjugated())
                stereo[ij]          = int(bond.GetStereo())
                bond_has_query[ij]  = int(bond.HasQuery())
                bond_is_in_ring[ij] = int(bond.IsInRing())

            distance[ij] = ((xyz[i] - xyz[j])**2).sum()**0.5

            # clip guards arccos against rounding slightly outside [-1, 1] (-> NaN)
            cos_angle = np.clip((norm_xyz[i]*norm_xyz[j]).sum(), -1.0, 1.0)
            angle[ij] = np.arccos(cos_angle)

            cos_angle2center = np.clip((unit_from_center[i]*unit_from_center[j]).sum(), -1.0, 1.0)
            angle2center[ij] = np.arccos(cos_angle2center)

            ij+=1
    ##-------------------

    # `degree` is intentionally listed twice to keep the existing node feature layout.
    node_list = [
        symbol, acceptor, donor, aromatic, hybridization, num_h, atomic,
        atom_map_num, atomic_num, chiral_tag, degree, explicit_valence,
        formal_charge, implicit_valence, isotope, mass, degree, no_implicit,
        num_explicit_hs, num_implicit_hs, num_radical_electrons, total_degree,
        total_num_hs, total_valence, atom_has_query,
        neighbor_h_count, neighbor_c_count, neighbor_n_count,
        neighbor_o_count, neighbor_f_count,
    ]
    edge_list = [
        bond_type, distance, angle, angle2center, bond_dir,
        is_aromatic, is_conjugated, stereo, bond_has_query, bond_is_in_ring,
    ]
    graph = Struct(
        molecule_name = molecule_name,
        smiles = Chem.MolToSmiles(mol),
        axyz = [a,xyz],

        node = node_list,
        edge = edge_list,
        edge_index = edge_index,

        coupling = coupling,
    )
    return graph

In [None]:
# Run the graph conversion. The first argument is presumably the output directory
# and `file_folder` the folder holding the input CSVs — TODO confirm against the
# definition of run_convert_to_graph. The captured output below is its progress
# log ("<done>/<total> finished").
run_convert_to_graph('../../data/temp/pytorch_geometric3', file_folder)

130/130775 finished
260/130775 finished
390/130775 finished
520/130775 finished
650/130775 finished
780/130775 finished
910/130775 finished
1040/130775 finished
1170/130775 finished
1300/130775 finished
1430/130775 finished
1560/130775 finished
1690/130775 finished
1820/130775 finished
1950/130775 finished
2080/130775 finished
2210/130775 finished
2340/130775 finished
2470/130775 finished
2600/130775 finished
2730/130775 finished
2860/130775 finished
2990/130775 finished
3120/130775 finished
3250/130775 finished
3380/130775 finished
3510/130775 finished
3640/130775 finished
3770/130775 finished
3900/130775 finished
4030/130775 finished
4160/130775 finished
4290/130775 finished
4420/130775 finished
4550/130775 finished
4680/130775 finished
4810/130775 finished
4940/130775 finished
5070/130775 finished
5200/130775 finished
5330/130775 finished
5460/130775 finished
5590/130775 finished
5720/130775 finished
5850/130775 finished
5980/130775 finished
6110/130775 finished
6240/130775 finished