In [1]:
%%HTML
<style>
   /* Override the widths of the notebook, menubar and toolbar containers
      (presumably so wide DataFrame output has more horizontal room). */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel

In [2]:
import os
import pandas as pd
import numpy as np
import math

from tqdm import tqdm_notebook, tqdm

import openbabel

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    touched. For each of them the column minimum and maximum are compared
    (strictly) against the limits of progressively wider dtypes and the column
    is cast to the first dtype that contains both.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float columns whose range fits float16 are stored as float16, which
    reduces their precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    mem_before = df.memory_usage().sum() / 1024**2
    for name in df.columns:
        dtype = df[name].dtypes
        if dtype not in numerics:
            continue

        lowest = df[name].min()
        highest = df[name].max()
        if str(dtype)[:3] == 'int':
            # First integer type whose open range contains the data; if none
            # does, the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max is NaN): use float64.
                df[name] = df[name].astype(np.float64)
    mem_after = df.memory_usage().sum() / 1024**2

    if verbose:
        saving = 100 * (mem_before - mem_after) / mem_before
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, saving))
    return df

In [4]:
# Directory holding the input CSV files (relative to this notebook); the
# cell's last expression lists its contents as a sanity check.
file_folder =  '../../data/input'
os.listdir(file_folder)

['structures',
 'magnetic_shielding_parameters.csv',
 'train.csv',
 'dipole_moments.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'sample_submission.csv',
 'scalar_coupling_contributions.csv',
 'structures.csv',
 'test.csv']

In [5]:
def _read_input_csv(table_name):
    """Load ``<file_folder>/<table_name>.csv`` into a DataFrame."""
    return pd.read_csv(f'{file_folder}/{table_name}.csv')


# One DataFrame per input table, bound to the same name as the file stem.
train = _read_input_csv('train')
test = _read_input_csv('test')
magnetic_shielding_tensors = _read_input_csv('magnetic_shielding_tensors')
dipole_moments = _read_input_csv('dipole_moments')
mulliken_charges = _read_input_csv('mulliken_charges')
potential_energy = _read_input_csv('potential_energy')
scalar_coupling_contributions = _read_input_csv('scalar_coupling_contributions')
structures = _read_input_csv('structures')

In [6]:
# Per-element radii with a constant fudge factor added to every base value;
# the resulting 'rad' column feeds the bond detection further down.
fudge_factor = 0.05
atomic_radius = {
    symbol: base_radius + fudge_factor
    for symbol, base_radius in {'H': 0.38, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.71}.items()
}
print(atomic_radius)

electronegativity = dict(H=2.2, C=2.55, N=3.04, O=3.44, F=3.98)

# Translate each atom symbol to its electronegativity / radius, showing a
# progress bar for each pass over the atoms.
atoms = structures['atom'].values
atoms_en = list(map(electronegativity.__getitem__, tqdm(atoms)))
atoms_rad = list(map(atomic_radius.__getitem__, tqdm(atoms)))

structures['EN'] = atoms_en
structures['rad'] = atoms_rad

display(structures.head())

 18%|█▊        | 413660/2358657 [00:00<00:00, 2068060.37it/s]

{'H': 0.43, 'C': 0.8200000000000001, 'N': 0.8, 'O': 0.78, 'F': 0.76}


100%|██████████| 2358657/2358657 [00:00<00:00, 2596070.70it/s]
100%|██████████| 2358657/2358657 [00:00<00:00, 2782265.72it/s]


Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.82
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.43
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.43
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.43
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.43


In [7]:
# Bond detection. Two atoms of the same molecule are treated as bonded when
# their distance is positive and smaller than the sum of their 'rad' values.
# Instead of a pairwise loop, the coordinate/name/radius arrays are rolled by
# 1, 2, ... rows so that every row is compared with the row `i + 1` below it.
# NOTE(review): this assumes the rows of a molecule are contiguous and ordered
# by atom_index, so a row offset equals an atom_index offset — confirm.
i_atom = structures['atom_index'].values
p = structures[['x', 'y', 'z']].values
p_compare = p
m = structures['molecule_name'].values
m_compare = m
r = structures['rad'].values
r_compare = r

source_row = np.arange(len(structures))
# Size the bond tables from the data instead of a hard-coded 28: column
# `max_atoms` is the dummy column, so with a fixed 28 any atom whose
# atom_index is >= 28 was silently dropped from its partners' bond lists and
# pairs more than 27 rows apart were never compared.
max_atoms = int(i_atom.max()) + 1 if len(i_atom) else 0

# One extra row and column act as a dummy target for invalid pairs.
bonds = np.zeros((len(structures)+1, max_atoms+1), dtype=np.int8)
bond_dists = np.zeros((len(structures)+1, max_atoms+1), dtype=np.float32)

print('Calculating bonds')

for i in tqdm(range(max_atoms-1)):
    p_compare = np.roll(p_compare, -1, axis=0)
    m_compare = np.roll(m_compare, -1, axis=0)
    r_compare = np.roll(r_compare, -1, axis=0)

    mask = np.where(m == m_compare, 1, 0) #Are we still comparing atoms in the same molecule?
    dists = np.linalg.norm(p - p_compare, axis=1) * mask
    r_bond = r + r_compare

    # dists is zeroed for atoms of different molecules, so the lower bound
    # also rejects those pairs.
    bond = np.where(np.logical_and(dists > 0.0001, dists < r_bond), 1, 0)

    target_row = source_row + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_row = np.where(np.logical_or(target_row > len(structures), mask==0), len(structures), target_row) #If invalid target, write to dummy row

    source_atom = i_atom
    target_atom = i_atom + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom) #If invalid target, write to dummy col

    # Record the pair symmetrically: once in the source atom's row and once
    # in the target atom's row.
    bonds[(source_row, target_atom)] = bond
    bonds[(target_row, source_atom)] = bond
    bond_dists[(source_row, target_atom)] = dists
    bond_dists[(target_row, source_atom)] = dists

bonds = np.delete(bonds, axis=0, obj=-1) #Delete dummy row
bonds = np.delete(bonds, axis=1, obj=-1) #Delete dummy col
bond_dists = np.delete(bond_dists, axis=0, obj=-1) #Delete dummy row
bond_dists = np.delete(bond_dists, axis=1, obj=-1) #Delete dummy col

print('Counting and condensing bonds')

# Condense the dense tables into per-atom lists: partner atom indices, the
# matching distances, and the number of partners.
bonds_numeric = [[i for i,x in enumerate(row) if x] for row in tqdm(bonds)]
bond_lengths = [[dist for i,dist in enumerate(row) if i in bonds_numeric[j]] for j,row in enumerate(tqdm(bond_dists))]
n_bonds = [len(x) for x in bonds_numeric]

bond_data = {'bonds':bonds_numeric, 'n_bonds':n_bonds, 'bond_lengths':bond_lengths}
bond_df = pd.DataFrame(bond_data)
# Drop bond columns left over from a previous run of this cell so the join
# does not fail on overlapping column names when the cell is re-executed.
structures = structures.drop(columns=list(bond_data), errors='ignore').join(bond_df)
display(structures.head(20))

  0%|          | 0/27 [00:00<?, ?it/s]

Calculating bonds


100%|██████████| 27/27 [00:07<00:00,  3.63it/s]
  1%|          | 16979/2358657 [00:00<00:13, 169749.69it/s]

Counting and condensing bonds


100%|██████████| 2358657/2358657 [00:14<00:00, 159598.47it/s]
100%|██████████| 2358657/2358657 [00:19<00:00, 123033.13it/s]


Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad,bonds,n_bonds,bond_lengths
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.82,"[1, 2, 3, 4]",4,"[1.091953, 1.0919516, 1.0919464, 1.0919476]"
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.43,[0],1,[1.091953]
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.43,[0],1,[1.0919516]
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.43,[0],1,[1.0919464]
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.43,[0],1,[1.0919476]
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,3.04,0.8,"[1, 2, 3]",3,"[1.01719, 1.0171872, 1.0172079]"
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,2.2,0.43,[0],1,[1.01719]
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,2.2,0.43,[0],1,[1.0171872]
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,2.2,0.43,[0],1,[1.0172079]
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,3.44,0.78,"[1, 2]",2,"[0.9621068, 0.9621068]"


In [8]:
# Sanity check: bond_df and structures should have the same number of rows.
bond_df.shape,structures.shape

((2358657, 3), (2358657, 11))

In [9]:
def _bond_length_stat(lengths, reducer):
    """Reduce one atom's bond lengths with ``reducer``; NaN when it has none.

    np.min / np.max raise ValueError on an empty sequence and np.mean / np.std
    emit a RuntimeWarning, so an atom with no detected bond is mapped to NaN
    explicitly instead of aborting the whole cell.
    """
    if len(lengths) == 0:
        return np.nan
    return reducer(lengths)


# Summary statistics over each atom's list of bond lengths.
structures['bond_lengths_mean'] = structures['bond_lengths'].apply(_bond_length_stat, args=(np.mean,))
structures['bond_lengths_min'] = structures['bond_lengths'].apply(_bond_length_stat, args=(np.min,))
structures['bond_lengths_max'] = structures['bond_lengths'].apply(_bond_length_stat, args=(np.max,))
structures['bond_lengths_std'] = structures['bond_lengths'].apply(_bond_length_stat, args=(np.std,))

In [10]:
# Atomic properties per element symbol; each table becomes a per-atom feature
# column of `structures` in the next cell.
# https://www.lenntech.com/periodic-chart-elements/
# NOTE(review): units are not stated here — vanderwaalsradius/covalenzradius
# look like picometres, atomic_radius like angstroms and ionization_energy
# like eV; TODO confirm against the source above.
# NOTE: `atomic_radius` and `electronegativity` rebind names that were already
# assigned earlier in the notebook; this `atomic_radius` has no fudge factor.
atomic_radius = {'H': 0.38, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.71}
atomic_number = {'H': 1, 'C': 6, 'N': 7, 'O': 8, 'F': 9}
atomic_mass = {'H': 1.0079, 'C': 12.0107, 'N': 14.0067, 'O': 15.9994, 'F': 18.9984}
vanderwaalsradius = {'H': 120, 'C': 185, 'N': 154, 'O': 140, 'F': 135}
covalenzradius = {'H': 30, 'C': 77, 'N': 70, 'O': 66, 'F': 58}
electronegativity = {'H': 2.2, 'C': 2.55, 'N': 3.04, 'O': 3.44, 'F': 3.98}
ionization_energy = {'H': 13.5984, 'C': 11.2603, 'N': 14.5341, 'O': 13.6181, 'F': 17.4228}

In [11]:
# Add one feature column per property table by looking up each row's element
# symbol. The column name is the same as the table's variable name.
atom_property_tables = (
    ('atomic_radius', atomic_radius),
    ('atomic_number', atomic_number),
    ('atomic_mass', atomic_mass),
    ('vanderwaalsradius', vanderwaalsradius),
    ('covalenzradius', covalenzradius),
    ('electronegativity', electronegativity),
    ('ionization_energy', ionization_energy),
)
for column_name, lookup_table in atom_property_tables:
    # Bind the table as a default argument so each lambda keeps its own table.
    structures[column_name] = structures['atom'].apply(
        lambda symbol, table=lookup_table: table[symbol]
    )

In [12]:
# https://stackoverflow.com/questions/20305272/dihedral-torsion-angle-from-four-points-in-cartesian-coordinates-in-python
def dihedral_angle(data):
    """Compute one dihedral (torsion) angle per consecutive group of four rows.

    Parameters
    ----------
    data : np.ndarray
        2-D array whose column 0 holds the molecule name and whose columns
        3, 4 and 5 hold the x, y, z coordinates of an atom. Rows are consumed
        in consecutive groups of four, so the caller should pass exactly four
        rows per molecule.

    Returns
    -------
    np.ndarray
        Object array of shape ``(len(data), 2)``. All four rows of a complete
        group hold ``[molecule_name, angle_in_degrees]``, with the name taken
        from the group's first row. Rows of a trailing incomplete group keep
        their initial ``[0, 0]``.
    """
    vals = np.array(data[:, 3:6], dtype=np.float64)
    # `np.str` was only an alias of the builtin `str` and was removed in
    # NumPy 1.24, so use the builtin directly.
    mol_names = np.array(data[:, 0], dtype=str)

    result = np.zeros((data.shape[0], 2), dtype=object)
    # Use every complete group of 4 rows to compute the dihedral angle. The
    # stop bound is `n - 3` so that the final group is included; the earlier
    # bound of `n - 4` skipped the last group (and a 4-row input entirely).
    for idx in range(0, vals.shape[0] - 3, 4):

        a0 = vals[idx]
        a1 = vals[idx + 1]
        a2 = vals[idx + 2]
        a3 = vals[idx + 3]

        b0 = a0 - a1
        b1 = a2 - a1
        b2 = a3 - a2

        # normalize b1 so that it does not influence magnitude of vector
        # rejections that come next
        # (if a1 and a2 coincide the norm is 0 and the angle comes out as NaN)
        b1 /= np.linalg.norm(b1)

        # vector rejections
        # v = projection of b0 onto plane perpendicular to b1
        #   = b0 minus component that aligns with b1
        # w = projection of b2 onto plane perpendicular to b1
        #   = b2 minus component that aligns with b1

        v = b0 - np.dot(b0, b1) * b1
        w = b2 - np.dot(b2, b1) * b1

        # angle between v and w in a plane is the torsion angle
        # v and w may not be normalized but that's fine since tan is y/x
        x = np.dot(v, w)
        y = np.dot(np.cross(b1, v), w)

        # We want all 4 first rows for every molecule to have the same value
        # (in order to have the same length as the dataframe)
        result[idx:idx + 4] = [mol_names[idx], np.degrees(np.arctan2(y, x))]

    return result

In [13]:
# Take the first four atoms of every molecule that has at least four atoms and
# compute one dihedral angle per molecule from them.
# NOTE(review): dihedral_angle reads coordinates from positions 3:6 of each
# row, so this depends on the current column order of `structures` — confirm.
atom_counts = structures.groupby('molecule_name')['atom_index'].transform('count')
first_four_atoms = structures[atom_counts.ge(4)].groupby('molecule_name').head(4)
dihedral = dihedral_angle(first_four_atoms.values)

# Rows that were not filled in keep a falsy name (0) and are skipped here.
themap = {name: angle for name, angle in dihedral if name}
structures['dihedral'] = structures['molecule_name'].map(themap)

In [14]:
def map_atom_info(df, structures, atom_idx):
    """Left-join per-atom structure columns onto ``df`` for one atom of a pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``molecule_name`` and ``atom_index_<atom_idx>`` columns.
    structures : pandas.DataFrame
        Per-atom frame with ``molecule_name``, ``atom_index`` and any number
        of feature columns.
    atom_idx : int
        Which atom of the pair to look up (selects ``atom_index_<atom_idx>``).

    Returns
    -------
    pandas.DataFrame
        ``df`` plus every feature column of ``structures``, renamed to
        ``atom_<atom_idx>_<column>``. Rows without a match get NaN.
    """
    key_columns = ['molecule_name', 'atom_index']
    merged = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=key_columns,
    )
    # The join key coming from `structures` duplicates atom_index_<atom_idx>.
    merged = merged.drop(columns='atom_index')

    prefixed_names = {
        column: f'atom_{atom_idx}_{column}'
        for column in structures.columns
        if column not in key_columns
    }
    return merged.rename(columns=prefixed_names)

In [15]:
# Show every column currently on `structures` before choosing which to merge.
structures.columns.tolist()

['molecule_name',
 'atom_index',
 'atom',
 'x',
 'y',
 'z',
 'EN',
 'rad',
 'bonds',
 'n_bonds',
 'bond_lengths',
 'bond_lengths_mean',
 'bond_lengths_min',
 'bond_lengths_max',
 'bond_lengths_std',
 'atomic_radius',
 'atomic_number',
 'atomic_mass',
 'vanderwaalsradius',
 'covalenzradius',
 'electronegativity',
 'ionization_energy',
 'dihedral']

In [16]:
# Columns of `structures` that are merged onto train/test: the two join keys
# plus the scalar per-atom features. The element symbol, the raw x/y/z
# coordinates and the list-valued 'bonds' / 'bond_lengths' columns are left out.
columns_ = ['molecule_name',
 'atom_index',
 'EN',
 'rad',
 'n_bonds',
 'bond_lengths_mean',
 'bond_lengths_min',
 'bond_lengths_max',
 'bond_lengths_std',
 'atomic_radius',
 'atomic_number',
 'atomic_mass',
 'vanderwaalsradius',
 'covalenzradius',
 'electronegativity',
 'ionization_energy',
 'dihedral']

In [20]:
# Attach the selected structure features to train, once for atom_index_0 and
# once for atom_index_1.
for atom_idx in (0, 1):
    train = map_atom_info(train, structures[columns_], atom_idx)

In [21]:
# Attach the selected structure features to test, once for atom_index_0 and
# once for atom_index_1.
for atom_idx in (0, 1):
    test = map_atom_info(test, structures[columns_], atom_idx)

In [22]:
# Downcast numeric columns to cut memory before saving. reduce_mem_usage
# modifies the frame in place and returns it; float columns whose range fits
# are stored as float16, so the saved features carry reduced precision.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 377.60 Mb (71.3% reduction)
Mem. usage decreased to 198.33 Mb (71.2% reduction)


In [23]:
# Inspect the merged columns: the original train columns followed by the
# atom_0_* and atom_1_* features.
train.columns.tolist()

['id',
 'molecule_name',
 'atom_index_0',
 'atom_index_1',
 'type',
 'scalar_coupling_constant',
 'atom_0_EN',
 'atom_0_rad',
 'atom_0_n_bonds',
 'atom_0_bond_lengths_mean',
 'atom_0_bond_lengths_min',
 'atom_0_bond_lengths_max',
 'atom_0_bond_lengths_std',
 'atom_0_atomic_radius',
 'atom_0_atomic_number',
 'atom_0_atomic_mass',
 'atom_0_vanderwaalsradius',
 'atom_0_covalenzradius',
 'atom_0_electronegativity',
 'atom_0_ionization_energy',
 'atom_0_dihedral',
 'atom_1_EN',
 'atom_1_rad',
 'atom_1_n_bonds',
 'atom_1_bond_lengths_mean',
 'atom_1_bond_lengths_min',
 'atom_1_bond_lengths_max',
 'atom_1_bond_lengths_std',
 'atom_1_atomic_radius',
 'atom_1_atomic_number',
 'atom_1_atomic_mass',
 'atom_1_vanderwaalsradius',
 'atom_1_covalenzradius',
 'atom_1_electronegativity',
 'atom_1_ionization_energy',
 'atom_1_dihedral']

In [24]:
# Persist only `id` and the engineered atom_0_* / atom_1_* columns; the keys,
# the coupling type and the target are dropped.
train_features = train.drop(
    columns=['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'scalar_coupling_constant']
)
train_features.to_pickle('../../data/feature/bonds-from-structure-data_train.pkl')

In [25]:
# Persist only `id` and the engineered atom_0_* / atom_1_* columns; the keys
# and the coupling type are dropped.
test_features = test.drop(
    columns=['molecule_name', 'atom_index_0', 'atom_index_1', 'type']
)
test_features.to_pickle('../../data/feature/bonds-from-structure-data_test.pkl')