In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [21]:
import copy
from tqdm import tqdm_notebook

import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from ase import Atoms  # 3D mouse clickable funky visualization
import ase.visualize
import networkx as nx # for cycle detection

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import os
print("Modules loading Done!")

Modules loading Done!


In [3]:
csv_files = [ "train.csv", "test.csv", "structures.csv" ]
csv_vars  = [ filename[:-4] for filename in csv_files ]
for filename, var in zip( csv_files, csv_vars ):
    eval_str = f"{var:32s} = pd.read_csv('../../data/input/{filename}')"
    print(eval_str)
    exec(eval_str)
    print(f"{'nb of cols ':32s}= "+str(eval(f"len({var})")))
    exec(f"display({var}.head())")

train                            = pd.read_csv('../../data/input/train.csv')
nb of cols                      = 4658147


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


test                             = pd.read_csv('../../data/input/test.csv')
nb of cols                      = 2505542


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type
0,4658147,dsgdb9nsd_000004,2,0,2JHC
1,4658148,dsgdb9nsd_000004,2,1,1JHC
2,4658149,dsgdb9nsd_000004,2,3,3JHH
3,4658150,dsgdb9nsd_000004,3,0,1JHC
4,4658151,dsgdb9nsd_000004,3,1,2JHC


structures                       = pd.read_csv('../../data/input/structures.csv')
nb of cols                      = 2358657


Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [4]:
from scipy.spatial import cKDTree as KDTree
from tqdm import tqdm_notebook as tqdm


# "usual" valence of atoms
VALENCE_MAX = { 'C': 4, 'H' : 1, 'N' : 4, 'O' : 2, 'F': 1 }
VALENCE_STD = { 'C': 4, 'H' : 1, 'N' : 3, 'O' : 2, 'F': 1 }

# expected distances in [A] for covalence 1 bond
BOND_DIST_C1 = { 'C' : 0.77, 'H' : 0.38, 'N' : 0.75, 'O' : 0.73, 'F' : 0.71 }

# order used for finding bonds by atom type
BOND_ORDER = { 'H' : 0, 'F' : 0, 'O' : 1, 'N' : 2, 'C' : 3}

def add_bond(n_avail, nbond, a0, a1, d1=None):
    key = tuple(sorted((a0, a1)))
    if key in nbond:
        nbond[key][0] += 1.0
    elif d1 is not None:
        nbond[key] = [1.0, d1]
    else:
        raise Exception(f"{a0},{a1} added after phase 1")
    n_avail[a0] -= 1
    n_avail[a1] -= 1
    #display(f"{a0},{a1} => {n_avail}, {nbond}")

def get_bonded_atoms(atoms, nbond, i):
    """returns: [sorted atoms list], [sorted atom index] )"""
    bonded = []
    for (a0, a1), (n, _) in nbond.items():
        if a0 == i:
            bonded.append((a1, atoms[a1]))
        elif a1 == i:
            bonded.append((a0, atoms[a0]))
    bonded = sorted(bonded, key=lambda b: b[1])
    return "".join([b[1] for b in bonded]), [b[0] for b in bonded]


def search_bonds(kdt, n_avail, nbond, connected, isleaf, coords, atoms, atoms_idx, a0, 
                 connect_once=True, VALENCE=VALENCE_STD):
    atom0 = atoms[a0]
    if n_avail[a0] == 0:
        return

    # select closest atoms ORDERED BY DISTANCE: closest first
    # note: the first answer is the atom itself and must be removed
    next_dist, next_i = kdt.query(coords[a0], min(1+VALENCE[atom0], len(atoms)))
    next_dist = next_dist[1:]; # remove a0 from list
    next_i = next_i[1:]

    # for each #VALENCE closest atoms
    found = False
    for d1,a1 in zip(next_dist, next_i):
        if connect_once and (a1 in connected[a0]):
            continue  # enforce 1-bond only in STEP 1
        atom1 = atoms[a1]
        predicted_bond = BOND_DIST_C1[atom0] + BOND_DIST_C1[atom1]
        if abs(d1 / predicted_bond) < 1.2: # keep only atoms in the 20% expected distance or closer
            if n_avail[a1] > 0:
                add_bond(n_avail, nbond, a0, a1, d1)
                connected[a0][a1] = 1
                connected[a1][a0] = 1
                if (n_avail[a0] == 0) or (n_avail[a1] == 0):
                    isleaf[a0] = 1
                    isleaf[a1] = 1
                found = True
                #print("leaF/Trunk & avail: "+ ", ".join([f"{i}:{atoms[i]}={leaflabel[isleaf[i]]}{n_avail[i]}"
                #                 for i in ordered_atoms_index]))

        else:
            #print(f"-- match failure in molecule_name={name} {a0}:{atom0}-{a1}:{atoms[a1]}={d1} predicted={predicted_bond}")
            pass
    return found
    
def compute_bonds(structures, molecules):
    out_name = []; out_a0 = []; out_a1 = []; out_n = []; out_dist = []; out_error = []; out_type = []
    cycle_name = []; cycle_index = []; cycle_seq = [];  cycle_atom_index = []
    charge_name = []; charge_atom_index = []; charge_value = [];
    for imol,name in tqdm(list(enumerate(molecules))):
        molecule = structures.loc[name ]
        error    = 0
        atoms    = molecule.atom.values
        atoms_idx= molecule.atom_index.values
        
        n_avail  = np.asarray([VALENCE_STD[a] for a in atoms])
        n_charge = np.zeros(len(atoms), dtype=np.float16)
        isleaf   = np.zeros(len(atoms), dtype=np.bool) # is the atom in the leafs of connection tree?
        coords   = molecule[['x', 'y', 'z']].values
        kdt = KDTree(coords) # use an optimized structure for closest match query
        nbond = {}
        connected = { i:{} for i in atoms_idx}
        
        # select Hydrogen first to avoid butadyne-like ordering failures (molecule_name=dsgdb9nsd_000023)
        ordered_atoms_index = list(atoms_idx)
        ordered_atoms_index.sort(key = lambda i: BOND_ORDER[atoms[i]])
        ordered_atoms_index = np.asarray(ordered_atoms_index)
        
        # STEP 1: 1-bond connect each atom with closest match
        #         only one bond for each atom pair is done in step 1
        for a0 in ordered_atoms_index:
            search_bonds(kdt, n_avail, nbond, connected, isleaf, coords, atoms, atoms_idx, a0,
                         connect_once=True, VALENCE=VALENCE_STD)
     
        # STEP 2: greedy connect n-bonds, progressing from leafs of connection tree
        while (((n_avail > 0).sum() > 0) and isleaf).sum() > 0:
            progress = False
            for a0 in ordered_atoms_index:
                #print("leaF/Trunk & avail: " + ", ".join([f"{i}:{atoms[i]}={leaflabel[leaf[i]]}{n_avail[i]}"
                #                                          for i in ordered_atoms_index]))
                if (n_avail[a0] > 0) and isleaf[a0]:
                    for a1 in connected[a0]:
                        if (n_avail[a0] > 0) and (n_avail[a1] > 0):
                            add_bond(n_avail, nbond, a0, a1) 
                            progress = True
                            if (n_avail[a0] == 0) or (n_avail[a1] == 0):
                                isleaf[a0] = 1
                                isleaf[a1] = 1
            if not progress:
                break
        
        # gather remaining multiple bonds
        if n_avail.sum() > 0:
            for key in nbond.keys():
                a0, a1 = key
                while (n_avail[a0] > 0) and (n_avail[a1] > 0):
                    add_bond(n_avail, nbond, a0, a1)
    
        # STEP 3: search for known ionized radicals
        if n_avail.sum() > 0:
            for (i,a) in zip(atoms_idx, atoms):
                if a == 'N':
                    # NH3+
                    bonded_str, bonded_idx = get_bonded_atoms(atoms, nbond, i)
                    if (bonded_str == "HHH") and (n_avail[i] == 0):
                        # add a valence unit and search a dangling bond nearby
                        n_avail[i] += 1; n_charge[i] += 1
                        if search_bonds(kdt, n_avail, nbond, connected, isleaf, coords, atoms, atoms_idx,
                                        i, connect_once=False, VALENCE=VALENCE_MAX):
                            print(f"++ NH3+ found for {name} atom_index={i}")
                        else:
                            print(f"** NH3+ bonding failure for {name} atom_index={i}")
                            
                elif (a == 'O') and (n_avail[i] == 1):
                    # COO-
                    bonded_str, bonded_idx = get_bonded_atoms(atoms, nbond, i)
                    if bonded_str == "C":
                        C_i = bonded_idx[0]
                        C_bonded_str, C_bonded_idx = get_bonded_atoms(atoms, nbond, C_i)
                        if ("OO" in C_bonded_str):
                            has_2CO = False 
                            #print (C_bonded_str, C_bonded_idx, nbond, name)
                            for a1, i1 in zip(C_bonded_str, C_bonded_idx):
                                key = tuple(sorted((C_i, i1)))
                                if (a1 == 'O') and  (nbond[key][0] == 2):
                                    has_2CO = True
                            if (len(C_bonded_idx) == 3) and has_2CO:
                                # found carboxyle!
                                n_avail[i] -= 1
                                print(f"**  COO- found for {name} C_atom_index={C_i}")
                                for a1, i1 in zip(C_bonded_str, C_bonded_idx):
                                    if a1 == 'O':
                                        n_charge[i1] = -0.5
                                        key = tuple(sorted((C_i, i1)))
                                        nbond[key][0] = 1.5                    
                                    
        # detect cycles : algo complexity in O(m^2 * n)
        #    paper : https://link.springer.com/article/10.1007/s00453-007-9064-z
        #    nx doc: https://networkx.github.io/documentation/latest/reference/algorithms/generated/networkx.algorithms.cycles.minimum_cycle_basis.html
        graph = nx.Graph([bond for bond in nbond.keys()])
        unordered_cycles = nx.minimum_cycle_basis(graph)
        
        # index atoms by their sequential order in the cycle: i.e follow bonds
        # Note: this code can be written in a much cleaner way!
        if len(unordered_cycles) > 0:
            for icycle, c in enumerate(unordered_cycles):
                available = {i:1 for i in c}
                a0 = c[0]
                cycle = [ a0 ]
                del(available[a0])
                for index in range(1, len(c)):
                    # get atoms bonded to a0
                    bonded = [ b for b in nbond.keys() if a0 in b ]
                    bonded = list(map(lambda b: b[0] if b[1] == a0 else b[1], bonded))
                    
                    # get next atom and remove it from cycle
                    assert(len(bonded) > 0)
                    found = False
                    for a1 in bonded:
                        if (a1 in bonded) and (a1 in available):
                            cycle.append(a1)
                            del(available[a1])
                            a0 = a1
                            found = True
                            break
                    assert(found)
                    
                # and add cycles found to the cycle dataframe lists
                cycle_name.extend([name] * len(cycle))
                cycle_index.extend([icycle] * len(cycle))
                cycle_seq.extend(np.arange(len(cycle)))
                cycle_atom_index.extend(cycle)
        
        # display info on failed molecules
        if n_avail.sum() > 0:
            error = 1
            print(f"   Remaining bondings={n_avail.sum()} for molecule_name={name}, atoms: " +
                  ", ".join([f"{i}:{atoms[i]}" for i in atoms_idx if n_avail[i] > 0]))

        # inputs for DataFrame bonds
        for (a0, a1), (n, dist) in nbond.items():
            # append to python lists which is 7x faster than toa pd.DataFrame
            out_name.append(name); out_a0.append(a0); out_a1.append(a1); out_n.append(n)
            out_dist.append(dist); out_error.append(error)
            out_type.append(f"{n:0.1f}" + "".join(sorted(f"{atoms[a0]}{atoms[a1]}")) )
    
        # inputs for DataFrame charges
        charge_name.extend(        [name] * len(atoms) )
        charge_atom_index.extend(  molecule.atom_index.values )
        charge_value.extend(       n_charge )
        
    bonds = pd.DataFrame({'molecule_name':out_name, 'atom_index_0':out_a0,'atom_index_1':out_a1, 'nbond':out_n, 
                          'L2dist':out_dist, 'error':out_error, 'bond_type':out_type})
    charges = pd.DataFrame({'molecule_name':charge_name, 'atom_index':charge_atom_index, 
                            'charge': charge_value})
    cycles = pd.DataFrame({'molecule_name' : cycle_name, 'cycle_index' : cycle_index, 
                           'cycle_seq' : cycle_seq, 'atom_index' : cycle_atom_index})
    return bonds, charges, cycles

In [5]:
train_bonds, train_charges , train_cycles =  compute_bonds(structures.set_index('molecule_name'),  train.molecule_name.unique())

HBox(children=(IntProgress(value=0, max=85003), HTML(value='')))

   Remaining bondings=2 for molecule_name=dsgdb9nsd_000202, atoms: 2:C, 4:C
++ NH3+ found for dsgdb9nsd_000271 atom_index=0
**  COO- found for dsgdb9nsd_000271 C_atom_index=6
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000731, atoms: 2:C, 4:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000761, atoms: 2:C, 4:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000840, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000842, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000854, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000855, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000910, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000912, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000982, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_001006, atoms: 0:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_001007, atoms

   Remaining bondings=2 for molecule_name=dsgdb9nsd_005208, atoms: 1:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005217, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005229, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005234, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005250, atoms: 4:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005252, atoms: 4:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005262, atoms: 4:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005264, atoms: 4:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005266, atoms: 4:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005268, atoms: 4:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005286, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005305, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_005306, atoms: 3:C, 6:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_020316, atoms: 4:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020344, atoms: 3:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020347, atoms: 3:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020352, atoms: 4:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020439, atoms: 3:N, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020441, atoms: 3:N, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020447, atoms: 4:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020470, atoms: 4:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020495, atoms: 1:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020497, atoms: 1:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020596, atoms: 2:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020614, atoms: 4:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_020616, atoms: 4:N, 7:O
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_024115, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024116, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024120, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024121, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024127, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024136, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024140, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024142, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024156, atoms: 5:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024158, atoms: 5:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024173, atoms: 1:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024175, atoms: 1:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024177, atoms: 1:C, 7:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_024959, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024960, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024964, atoms: 4:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024968, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_025107, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_025113, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_025116, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_025119, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_025125, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_025232, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_025233, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_025244, atoms: 5:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_025250, atoms: 0:O, 7:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_026096, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026158, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026160, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026181, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026182, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026183, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026185, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026186, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026206, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026208, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026209, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026210, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026226, atoms: 3:C, 6:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_027178, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027180, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027181, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027184, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027185, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027186, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027189, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027193, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027208, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027210, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027232, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027282, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_027284, atoms: 2:C, 7:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_028041, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028045, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028095, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028097, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028100, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028101, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028103, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028109, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028113, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028114, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028119, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028122, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028124, atoms: 3:C, 7:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_029458, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029459, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029460, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029463, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029471, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029472, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029473, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029501, atoms: 5:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029540, atoms: 0:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029544, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029546, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029628, atoms: 3:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_029630, atoms: 3:O, 7:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_031232, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031234, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031241, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031249, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031253, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031264, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031315, atoms: 5:C, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031321, atoms: 5:C, 8:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031323, atoms: 5:C, 8:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031333, atoms: 5:C, 8:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031335, atoms: 5:C, 8:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031337, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031343, atoms: 3:C, 7:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_032186, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032188, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032189, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032296, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032298, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032299, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032300, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032306, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032308, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032310, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032311, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032335, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_032338, atoms: 3:C, 7:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_043232, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043233, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043234, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043236, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043237, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043238, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043239, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043240, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043242, atoms: 0:O, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043244, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043245, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043247, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043249, atoms: 0:O, 3:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_043409, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043411, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043412, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043413, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043418, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043419, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043423, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043425, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043437, atoms: 0:O, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_044525, atoms: 0:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_045297, atoms: 0:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_045298, atoms: 0:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_045538, atoms: 0:O, 7:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_064832, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_064836, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_066499, atoms: 5:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_066506, atoms: 5:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_068157, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_068159, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_069204, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_070175, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_070182, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_072803, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_074122, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_074123, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_074124, atoms: 4:O, 6:C
   Remaining

++ NH3+ found for dsgdb9nsd_093941 atom_index=0
**  COO- found for dsgdb9nsd_093941 C_atom_index=9
   Remaining bondings=2 for molecule_name=dsgdb9nsd_093986, atoms: 2:C, 8:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_094182, atoms: 8:O, 9:O
++ NH3+ found for dsgdb9nsd_094604 atom_index=0
**  COO- found for dsgdb9nsd_094604 C_atom_index=9
**  COO- found for dsgdb9nsd_095438 C_atom_index=8
   Remaining bondings=1 for molecule_name=dsgdb9nsd_095438, atoms: 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_095662, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_095671, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_095673, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_095675, atoms: 6:O, 8:C
**  COO- found for dsgdb9nsd_096612 C_atom_index=7
   Remaining bondings=1 for molecule_name=dsgdb9nsd_096612, atoms: 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_096613, atoms: 8:O, 9:O
**  COO- found for dsgdb9n

   Remaining bondings=2 for molecule_name=dsgdb9nsd_123607, atoms: 4:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123608, atoms: 4:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123661, atoms: 3:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123664, atoms: 3:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123670, atoms: 5:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123708, atoms: 3:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123714, atoms: 5:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123716, atoms: 5:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123760, atoms: 5:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123776, atoms: 5:N, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123834, atoms: 5:N, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123840, atoms: 5:N, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123856, atoms: 5:N, 8:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_126515, atoms: 4:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126583, atoms: 5:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126631, atoms: 4:N, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126632, atoms: 4:N, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126636, atoms: 4:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126834, atoms: 3:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126838, atoms: 3:N, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126839, atoms: 3:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126840, atoms: 3:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126847, atoms: 3:N, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126848, atoms: 3:N, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126849, atoms: 3:N, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_126851, atoms: 3:N, 6:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_129017, atoms: 3:C, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129020, atoms: 1:N, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129021, atoms: 1:N, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129022, atoms: 5:C, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129119, atoms: 4:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129186, atoms: 3:N, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129187, atoms: 3:N, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129202, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129231, atoms: 3:N, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129232, atoms: 3:N, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129233, atoms: 3:N, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129236, atoms: 3:N, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_129258, atoms: 3:N, 6:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_131203, atoms: 2:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131207, atoms: 2:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131235, atoms: 2:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131241, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131244, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131245, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131374, atoms: 5:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131437, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131438, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131448, atoms: 4:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131527, atoms: 5:N, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131537, atoms: 4:N, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131538, atoms: 4:N, 7:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_132171, atoms: 4:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132182, atoms: 4:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132183, atoms: 4:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132201, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132202, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132209, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132223, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132248, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132249, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132250, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132259, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132260, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_132261, atoms: 3:C, 6:C
   Remaining

In [6]:
test_bonds, test_charges, test_cycles = compute_bonds(structures.set_index('molecule_name'),  test.molecule_name.unique())

HBox(children=(IntProgress(value=0, max=45772), HTML(value='')))

   Remaining bondings=2 for molecule_name=dsgdb9nsd_000204, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000206, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000208, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000210, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000211, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000282, atoms: 6:O, 7:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000724, atoms: 2:N, 5:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000782, atoms: 2:C, 4:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000843, atoms: 2:C, 4:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000909, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000911, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000968, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_000974, atoms: 2:C, 5:C
   Remaining

**  COO- found for dsgdb9nsd_008398 C_atom_index=1
   Remaining bondings=1 for molecule_name=dsgdb9nsd_008398, atoms: 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_010016, atoms: 1:C, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_011382, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_011384, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_012586, atoms: 3:O, 7:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_012588, atoms: 3:O, 7:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_013357, atoms: 1:C, 4:C
**  COO- found for dsgdb9nsd_014121 C_atom_index=1
   Remaining bondings=1 for molecule_name=dsgdb9nsd_014121, atoms: 9:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_015512, atoms: 6:C, 9:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_016190, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_016192, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_016193, atoms: 0:O, 

   Remaining bondings=2 for molecule_name=dsgdb9nsd_024438, atoms: 3:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024466, atoms: 3:C, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024471, atoms: 3:C, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024472, atoms: 3:C, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024476, atoms: 3:C, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024478, atoms: 3:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024481, atoms: 3:C, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024483, atoms: 3:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024543, atoms: 3:N, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024585, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024596, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024608, atoms: 5:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_024616, atoms: 5:C, 7:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_026476, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026481, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026494, atoms: 4:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026504, atoms: 5:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026524, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026542, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026553, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026561, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026572, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026575, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026585, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026610, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_026612, atoms: 2:C, 7:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_028271, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028287, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028294, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028296, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028298, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028302, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028304, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028329, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028773, atoms: 3:N, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028805, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028806, atoms: 6:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028811, atoms: 4:N, 6:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_028815, atoms: 6:O, 8:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_031114, atoms: 3:C, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031177, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031182, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031199, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031200, atoms: 2:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031243, atoms: 3:C, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031252, atoms: 2:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031289, atoms: 5:C, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031290, atoms: 5:C, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031293, atoms: 5:C, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031294, atoms: 5:C, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031311, atoms: 5:C, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_031314, atoms: 5:C, 8:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_043299, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043300, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043303, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043304, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043305, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043306, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043308, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043309, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043310, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043314, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043318, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043319, atoms: 0:O, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_043325, atoms: 0:O, 3:C
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_064835, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_064840, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_064842, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_070173, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_070180, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_074089, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_074126, atoms: 4:O, 6:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_074140, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_074242, atoms: 1:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_074313, atoms: 1:C, 6:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_075095, atoms: 5:O, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_075419, atoms: 6:O, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_075736, atoms: 3:O, 8:O
**  COO- fou

++ NH3+ found for dsgdb9nsd_123129 atom_index=0
**  COO- found for dsgdb9nsd_123129 C_atom_index=9
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123274, atoms: 2:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123294, atoms: 0:C, 3:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123303, atoms: 0:C, 5:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123330, atoms: 5:N, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123341, atoms: 5:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123495, atoms: 5:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123497, atoms: 5:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123498, atoms: 5:N, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123502, atoms: 5:N, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123508, atoms: 1:C, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123511, atoms: 3:N, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_123514, atoms

   Remaining bondings=2 for molecule_name=dsgdb9nsd_127793, atoms: 2:N, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_127794, atoms: 2:N, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_127795, atoms: 2:N, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_127798, atoms: 2:N, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_127804, atoms: 2:N, 5:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_127952, atoms: 4:N, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_127954, atoms: 4:N, 8:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_127957, atoms: 4:N, 7:C
   Remaining bondings=2 for molecule_name=dsgdb9nsd_128008, atoms: 4:N, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_128009, atoms: 4:N, 8:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_128162, atoms: 2:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_128269, atoms: 2:C, 6:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_128278, atoms: 2:C, 5:N
   Remaining

   Remaining bondings=2 for molecule_name=dsgdb9nsd_131645, atoms: 3:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131661, atoms: 5:O, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131674, atoms: 3:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131804, atoms: 5:C, 7:N
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131839, atoms: 4:N, 7:C
   Remaining bondings=4 for molecule_name=dsgdb9nsd_131883, atoms: 2:C, 4:C, 7:O, 8:O
   Remaining bondings=4 for molecule_name=dsgdb9nsd_131885, atoms: 2:C, 4:C, 7:O, 8:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131886, atoms: 7:O, 8:O
   Remaining bondings=4 for molecule_name=dsgdb9nsd_131887, atoms: 2:C, 4:C, 7:O, 8:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131888, atoms: 7:O, 8:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131890, atoms: 7:O, 8:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131894, atoms: 7:O, 8:O
   Remaining bondings=2 for molecule_name=dsgdb9nsd_131897

In [22]:
def _worker(item, edges, bonds):
    
    molecule_name = item
    list_ = []
    for edges_idx, edges_item in  edges[edges['molecule_name']==molecule_name].iterrows():
        try:
            is_found_bond = False
            for bonds_idx, bonds_item in  bonds[bonds['molecule_name']==molecule_name].iterrows():
                if (bonds_item['atom_index_0']==edges_item['atom_index_0']) & (bonds_item['atom_index_1']==edges_item['atom_index_1']):
                    is_found_bond = True
                    break
                elif (bonds_item['atom_index_0']==edges_item['atom_index_1']) & (bonds_item['atom_index_1']==edges_item['atom_index_0']):
                    is_found_bond = True
                    break
            if is_found_bond:
                dict_ = {'id':edges_item['id'], 'molecule_name':molecule_name, 'atom_index_0':edges_item['atom_index_0'], 'atom_index_1':edges_item['atom_index_1'], 
                         'is_found_bond':1, 'nbond':bonds_item['nbond'], 'L2dist': bonds_item['L2dist'], 'error':bonds_item['error'], 'bond_type':bonds_item['bond_type']}
            else:
                dict_ = {'id':edges_item['id'], 'molecule_name':molecule_name, 'atom_index_0':edges_item['atom_index_0'], 'atom_index_1':edges_item['atom_index_1'], 
                         'is_found_bond':0, 'nbond':0, 'L2dist': 0, 'error':0, 'bond_type':'none'}
            list_.append(dict_)
        except :
            raise Exception(edges_idx, bonds_idx, bonds_item)
    return list_

def map_bond(edges, bonds, molecule_names=None):
    
    if type(molecule_names)==type(None):
        molecule_names = list(edges['molecule_name'].unique())
    N_ = len(molecule_names)
    segment = N_//100
    df_ = pd.DataFrame()
    for start in tqdm_notebook(range(0, N_, segment)):
        end = start+segment if start+segment < N_ else N_
        res = Parallel(_worker,{'edges':edges, 'bonds':bonds}).run(molecule_names[start:end], n_jobs=16)
        df_ = pd.concat([df_, pd.DataFrame(res)], axis=0)
    df_ = df_.sort_values(by=['index'])
    df_ = df_.reset_index(drop=True)
    return df_

In [32]:
def _worker(item, cycles):
    
    index = item[0]
    row = item[1]
    molecule_name_ = row['molecule_name']
    atom_index_ = row['atom_index']
    cycle_index_list_ = cycles[(cycles['molecule_name']==molecule_name_)&(cycles['atom_index']==atom_index_)].cycle_index.unique().tolist()
    n_cycle = len(cycle_index_list_)
    cycle_size_mean = np.mean([cycles[(cycles['molecule_name']==molecule_name_) & (cycles['cycle_index']==cycle_index)].shape[0] for cycle_index in cycle_index_list_])
    dict_ = {'index':index, 'molecule_name':molecule_name_, 'atom_index':atom_index_, 'n_cycle':n_cycle, 'cycle_size_mean':cycle_size_mean}
    return dict_


def arrange_cycles(cycles):
    N_ = len(cycles)
    segment = N_//100
    df_ = pd.DataFrame()
    for start in tqdm_notebook(range(0, N_, segment)):
        end = start+segment if start+segment < N_ else N_
        res = Parallel(_worker,{'cycles':cycles}).run(cycles.loc[start:end-1].iterrows(), n_jobs=12)
        df_ = pd.concat([df_, pd.DataFrame(res)], axis=0)
    
    df_ = df_.sort_values(by=['index'])
    df_ = df_.reset_index(drop=True)
    df_ = df_[['molecule_name', 'atom_index', 'n_cycle', 'cycle_size_mean']].drop_duplicates()
    return df_

In [None]:
def _map_atom_charges(df, charges, atom_idx):
    df = pd.merge(df, charges, how = 'left', left_on  = ['molecule_name', f'atom_index_{atom_idx}'], right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'charge': f'atom_index_{atom_idx}_charge'})
    return df

In [24]:
df_train = map_bond(train, train_bonds)
df_test = map_bond(test, test_bonds)

In [None]:
df_train = _map_atom_charges(df_train, train_charges,0)
df_train = _map_atom_charges(df_train, train_charges,1)

df_test = _map_atom_charges(df_test, test_charges,0)
df_test = _map_atom_charges(df_test, test_charges,1)

In [None]:
test_cycles_sample = arrange_cycles(test_cycles)

HBox(children=(IntProgress(value=0, max=101), HTML(value='')))

In [13]:
print(df_train.shape)
df_train.head()

(4658147, 10)


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,nbond,L2dist,error,bond_type
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,,,,
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,,,,
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,,,,
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,,,,
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,,,,


In [16]:
df_train[~df_train['L2dist'].isna()]

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,nbond,L2dist,error,bond_type
1600734,1600734,dsgdb9nsd_051932,0,1,1JHN,33.7798,1.0,1.017917,0.0,1.0HN


In [7]:
def _map_atom_charges(df, charges, atom_idx):
    df = pd.merge(df, charges, how = 'left', left_on  = ['molecule_name', f'atom_index_{atom_idx}'], right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'charge': f'atom_index_{atom_idx}_charge'})
    return df

In [8]:
train_bonds = _map_atom_charges(train_bonds, train_charges, 0)
train_bonds = _map_atom_charges(train_bonds, train_charges, 1)

In [9]:
test_bonds = _map_atom_charges(test_bonds, test_charges, 0)
test_bonds = _map_atom_charges(test_bonds, test_charges, 1)

In [10]:
lambda_count_cycle = lambda x : x.unique().shape[0]
train_cycles['in_n_cycle'] = train_cycles.groupby(['molecule_name', 'atom_index'])["cycle_index"].transform(lambda_count_cycle)
test_cycles['in_n_cycle'] = test_cycles.groupby(['molecule_name', 'atom_index'])["cycle_index"].transform(lambda_count_cycle)

In [11]:
train_cycles = train_cycles[['molecule_name','atom_index','in_n_cycle']].drop_duplicates()
test_cycles = test_cycles[['molecule_name','atom_index','in_n_cycle']].drop_duplicates()

In [12]:
def _map_atom_cycles(df, cycles, atom_idx):
    df = pd.merge(df, cycles, how = 'left', left_on  = ['molecule_name', f'atom_index_{atom_idx}'], right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'cycles': f'atom_index_{atom_idx}_cycles'})
    return df

In [13]:
train_bonds = _map_atom_charges(train_bonds, train_cycles, 0)
train_bonds = _map_atom_charges(train_bonds, train_cycles, 1)

In [14]:
test_bonds = _map_atom_charges(test_bonds, train_cycles, 0)
test_bonds = _map_atom_charges(test_bonds, train_cycles, 1)

In [15]:
train_bonds.shape, test_bonds.shape

((1586228, 11), (853433, 11))

In [16]:
train_bonds.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,nbond,L2dist,error,bond_type,atom_index_0_charge,atom_index_1_charge,in_n_cycle_x,in_n_cycle_y
0,dsgdb9nsd_000001,0,1,1.0,1.091953,0,1.0CH,0.0,0.0,,
1,dsgdb9nsd_000001,0,2,1.0,1.091952,0,1.0CH,0.0,0.0,,
2,dsgdb9nsd_000001,0,3,1.0,1.091946,0,1.0CH,0.0,0.0,,
3,dsgdb9nsd_000001,0,4,1.0,1.091948,0,1.0CH,0.0,0.0,,
4,dsgdb9nsd_000002,0,1,1.0,1.01719,0,1.0HN,0.0,0.0,,


In [18]:
train_bonds = train_bonds.fillna(0)
test_bonds = test_bonds.fillna(0)

In [17]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [19]:
train_bonds = reduce_mem_usage(train_bonds)
test_bonds = reduce_mem_usage(test_bonds)

Mem. usage decreased to 59.00 Mb (59.4% reduction)
Mem. usage decreased to 31.74 Mb (59.4% reduction)


In [22]:
train_bonds = pd.merge(train_bonds, train[['id', 'molecule_name', 'atom_index_0', 'atom_index_1']], how = 'left', left_on  = ['molecule_name', 'atom_index_0', 'atom_index_1'], right_on = ['molecule_name', 'atom_index_0', 'atom_index_1'])

In [23]:
train_bonds.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,nbond,L2dist,error,bond_type,atom_index_0_charge,atom_index_1_charge,in_n_cycle_x,in_n_cycle_y,id
0,dsgdb9nsd_000001,0,1,1.0,1.091797,0,1.0CH,0.0,0.0,0.0,0.0,
1,dsgdb9nsd_000001,0,2,1.0,1.091797,0,1.0CH,0.0,0.0,0.0,0.0,
2,dsgdb9nsd_000001,0,3,1.0,1.091797,0,1.0CH,0.0,0.0,0.0,0.0,
3,dsgdb9nsd_000001,0,4,1.0,1.091797,0,1.0CH,0.0,0.0,0.0,0.0,
4,dsgdb9nsd_000002,0,1,1.0,1.017578,0,1.0HN,0.0,0.0,0.0,0.0,


In [24]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074
