In [1]:
%%HTML
<style>
   /* Override the page layout widths of the notebook container, menu bar and main toolbar. */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import copy
from tqdm import tqdm_notebook

import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel

In [2]:
import os
import numpy as np
import pandas as pd
import networkx as nx
import tensorly as tl
from sklearn.preprocessing import LabelEncoder
from tensorly.decomposition import tucker, parafac, non_negative_tucker

Using numpy backend.


In [3]:
# import sys
# !{sys.executable} -m pip install tensorly

In [4]:
# Relative path of the directory that holds the input CSV files read below.
file_folder =  '../../data/input'
# Show the directory contents so the available input files appear in the cell output.
os.listdir(file_folder)

['sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'train.csv',
 'test.csv',
 'structures.csv',
 'structures']

In [5]:
# Load every input table from `file_folder` into its own DataFrame.
# NOTE(review): `train` and `test` are read again from the same files by the
# later cell that derives the `type0` / `type1` columns, so these two reads are
# redundant when the notebook is run top to bottom.
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')

In [6]:
# Per-element lookup tables, each keyed by the atom symbol ('H', 'C', 'N', 'O', 'F').
# NOTE(review): units are not stated anywhere in this notebook. The two integer
# radius tables look like picometres and `atomic_radius` like angstroms — TODO confirm.
atomic_radius = {'H': 0.38, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.71}
atomic_number = {'H': 1, 'C': 6, 'N': 7, 'O': 8, 'F': 9}
atomic_mass = {'H': 1.0079, 'C': 12.0107, 'N': 14.0067, 'O': 15.9994, 'F': 18.9984}
vanderwaalsradius = {'H': 120, 'C': 185, 'N': 154, 'O': 140, 'F': 135}
covalenzradius = {'H': 30, 'C': 77, 'N': 70, 'O': 66, 'F': 58}
electronegativity = {'H': 2.2, 'C': 2.55, 'N': 3.04, 'O': 3.44, 'F': 3.98}
ionization_energy = {'H': 13.5984, 'C': 11.2603, 'N': 14.5341, 'O': 13.6181, 'F': 17.4228}

In [7]:
# (Re)load the coupling tables and the submission template from disk.
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')

# Split every `type` label into its leading digit (`type0`, as int) and the
# remaining suffix (`type1`), for train and test alike.
for coupling_frame in (train, test):
    coupling_frame['type0'] = coupling_frame['type'].apply(lambda label: int(label[0]))
    coupling_frame['type1'] = coupling_frame['type'].apply(lambda label: label[1:])

# The encoder is fitted on the training suffixes only; test reuses that mapping.
le = LabelEncoder()
le.fit(train['type1'].tolist())
train['int_type1'] = le.transform(train['type1'].tolist())

# `int_bond_type` is left holding the encoded test suffixes.
int_bond_type = le.transform(test['type1'].tolist())
test['int_type1'] = int_bond_type

In [9]:
le.classes_

array(['JHC', 'JHH', 'JHN'], dtype='<U3')

In [10]:
# Element property tables keyed by atom symbol (same values as the earlier
# constants cell; kept here so this cell can run on its own).
atomic_radius = {'H': 0.38, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.71}
atomic_number = {'H': 1, 'C': 6, 'N': 7, 'O': 8, 'F': 9}
atomic_mass = {'H': 1.0079, 'C': 12.0107, 'N': 14.0067, 'O': 15.9994, 'F': 18.9984}
vanderwaalsradius = {'H': 120, 'C': 185, 'N': 154, 'O': 140, 'F': 135}
covalenzradius = {'H': 30, 'C': 77, 'N': 70, 'O': 66, 'F': 58}
electronegativity = {'H': 2.2, 'C': 2.55, 'N': 3.04, 'O': 3.44, 'F': 3.98}
ionization_energy = {'H': 13.5984, 'C': 11.2603, 'N': 14.5341, 'O': 13.6181, 'F': 17.4228}

# Add one column per table to `structures`, named after the table. A symbol
# that is missing from a table raises KeyError.
for property_name, property_table in (
    ('atomic_radius', atomic_radius),
    ('atomic_number', atomic_number),
    ('atomic_mass', atomic_mass),
    ('vanderwaalsradius', vanderwaalsradius),
    ('covalenzradius', covalenzradius),
    ('electronegativity', electronegativity),
    ('ionization_energy', ionization_energy),
):
    structures[property_name] = structures['atom'].apply(property_table.__getitem__)



In [11]:
# Integer-encode the atom symbols into `structures['int_atom']`.
# NOTE: this rebinds `le`, so the encoder fitted on the coupling-type suffixes
# is no longer reachable through that name after this cell.
le = LabelEncoder()
int_atom = le.fit_transform(structures['atom'].tolist())
structures['int_atom'] = int_atom

In [12]:
le.classes_

array(['C', 'F', 'H', 'N', 'O'], dtype='<U1')

In [112]:
def molecule_graph(molecule_name, structures, bonds):
    """Build an undirected graph for one molecule.

    Nodes are the molecule's rows in ``structures``, keyed by ``atom_index`` and
    carrying the position (``pos``) plus the per-atom property columns as node
    attributes. Edges are the molecule's rows in ``bonds``, connecting
    ``atom_index_0`` to ``atom_index_1`` with ``weight`` taken from ``type0``
    and ``bond_type`` taken from ``int_type1``.

    Parameters
    ----------
    molecule_name : value compared against the ``molecule_name`` column.
    structures : DataFrame with ``molecule_name``, ``atom_index``, ``x``, ``y``,
        ``z`` and the property columns listed in ``node_attribute_columns``.
    bonds : DataFrame with ``molecule_name``, ``atom_index_0``,
        ``atom_index_1``, ``type0`` and ``int_type1``.

    Returns
    -------
    networkx.Graph

    Raises
    ------
    AssertionError
        If either table has no rows for ``molecule_name``.
    """
    node_attribute_columns = (
        'int_atom', 'atomic_radius', 'atomic_number', 'atomic_mass',
        'vanderwaalsradius', 'covalenzradius', 'electronegativity',
        'ionization_energy',
    )

    # Filter each table once and reuse the result for both the check and the
    # loop; every filter is a boolean scan over the whole table.
    molecule_atoms = structures[structures['molecule_name'] == molecule_name]
    assert molecule_atoms.shape[0] > 0, 'no info in structures'
    molecule_bonds = bonds[bonds['molecule_name'] == molecule_name]
    assert molecule_bonds.shape[0] > 0, 'no info in bonds'

    g = nx.Graph()
    for _, row in molecule_atoms.iterrows():
        g.add_node(
            row['atom_index'],
            pos=(row['x'], row['y'], row['z']),
            **{column: row[column] for column in node_attribute_columns},
        )
    for _, row in molecule_bonds.iterrows():
        g.add_edge(
            row['atom_index_0'],
            row['atom_index_1'],
            weight=row['type0'],
            bond_type=row['int_type1'],
        )
    return g

In [114]:
# `structures` re-indexed by molecule name, for label-based lookup of one molecule.
structures_idx = structures.set_index('molecule_name')

# Coordinates of every atom row, in `structures` row order.
xyz = structures[['x','y','z']].values

# Running total of atoms per molecule, indexed by molecule name (groupby order).
ss = structures.groupby('molecule_name').size().cumsum()

# Same totals with a leading 0, so ssx[k]:ssx[k+1] delimits the k-th molecule.
# NOTE(review): this only lines up with `xyz` if the rows of `structures` are
# stored in the same molecule order that groupby produces — TODO confirm.
ssx = np.concatenate(([0], ss.values)).astype('int')

In [186]:
from numba import jit
from math import sqrt

# nopython is requested explicitly: on numba releases where a bare @jit may fall
# back to object mode, a typing problem would otherwise only show up as slowness.
@jit(nopython=True)
def numba_dist_matrix(xyz, ssx, molecule_id):
    """Return the symmetric matrix of squared distances for one molecule.

    Parameters
    ----------
    xyz : 2-D array whose columns 0, 1, 2 are the coordinates of each atom row.
    ssx : 1-D integer array of row offsets; rows ``ssx[molecule_id]`` up to (not
        including) ``ssx[molecule_id + 1]`` of ``xyz`` belong to the molecule.
    molecule_id : position of the molecule in ``ssx``.

    Returns
    -------
    (num_atoms, num_atoms) float array. Entry ``[i, j]`` is the SQUARED
    Euclidean distance between atoms ``i`` and ``j`` (no square root is taken);
    the diagonal is zero.
    """
    start_molecule, end_molecule = ssx[molecule_id], ssx[molecule_id+1]
    locs = xyz[start_molecule:end_molecule]
    num_atoms = end_molecule - start_molecule
    dmat = np.zeros((num_atoms, num_atoms))
    # Fill the upper triangle and mirror it into the lower one.
    for i in range(num_atoms):
        for j in range(i+1, num_atoms):
            d = (locs[i,0] - locs[j,0])**2 + (locs[i,1] - locs[j,1])**2 + (locs[i,2] - locs[j,2])**2
            dmat[i,j] = d
            dmat[j,i] = d
    return dmat

# def numba_dist_matrices(xyz, ssx):
#     for molecule_id in range(structures.molecule_name.nunique()):
#         numba_dist_matrix(xyz, ssx, molecule_id)
        
        
def get_dist_matrix(df_structures_idx, molecule):
    """Return the matrix of squared Euclidean distances between a molecule's atoms.

    Parameters
    ----------
    df_structures_idx : DataFrame indexed by molecule name, with ``x``, ``y``
        and ``z`` columns.
    molecule : index label of the molecule to look up.

    Returns
    -------
    (num_atoms, num_atoms) float array. Entry ``[i, j]`` is the SQUARED distance
    between atoms ``i`` and ``j`` (no square root is taken).

    Raises
    ------
    KeyError
        If ``molecule`` is not present in the index.
    """
    df_temp = df_structures_idx.loc[molecule]
    # A label that matches exactly one row makes .loc return a Series, so the
    # coordinates come back 1-D; force a (num_atoms, 3) float array so a
    # single-atom molecule yields a 1x1 matrix.
    locs = np.atleast_2d(np.asarray(df_temp[['x','y','z']].values, dtype=float))
    # Pairwise coordinate differences via broadcasting: shape (n, n, 3).
    deltas = locs[:, np.newaxis, :] - locs[np.newaxis, :, :]
    dist_mat = (deltas ** 2).sum(axis=-1)
    return dist_mat

In [200]:
# Scratch check: row range in `xyz` covered by the molecule at position 465 of `ssx`.
molecule_id=465
start_molecule, end_molecule = ssx[molecule_id], ssx[molecule_id+1]

In [201]:
end_molecule - start_molecule

14

In [195]:
%%time
dist_m = numba_dist_matrix(xyz, ssx, 465)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 26 µs


In [194]:
%%time
get_dist_matrix(structures_idx, 'dsgdb9nsd_000055').shape

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.51 ms


(15, 15)

In [202]:
structures_idx.shape

(2358657, 13)

In [203]:
def _worker(item, xyz, ssx, structures, structures_idx, bonds, verbose=0):
    """Compute rank-1 decomposition features for every atom of one molecule.

    The squared-distance matrix (``get_dist_matrix``) and the graph Laplacian
    (``molecule_graph`` + ``nx.laplacian_matrix``) of the molecule are each
    decomposed with ``tucker`` and ``parafac`` at rank 1; one record per atom
    row of ``structures`` is produced from the results.

    Parameters
    ----------
    item : sequence whose first element is the molecule name. Further elements
        are not used.
    xyz, ssx : accepted for interface compatibility; not used.
    structures : atom table passed to ``molecule_graph`` and iterated for output.
    structures_idx : ``structures`` indexed by molecule name, passed to
        ``get_dist_matrix``.
    bonds : pair table passed to ``molecule_graph``.
    verbose : when > 0, print the shapes of the distance decompositions.

    Returns
    -------
    list of dict, one per atom, with the ``t_lap_*`` / ``p_lap_*`` /
    ``t_dist_*`` / ``p_dist_*`` values followed by ``molecule_name`` and
    ``atom_index``.

    Raises
    ------
    Exception
        Any failure is re-raised as ``Exception(item, message)`` chained to the
        original error.

    Notes
    -----
    - The two-name unpacking of ``parafac`` and the ``[atom_index][0]`` indexing
      assume it returns one factor matrix per mode for a 2-D input — TODO
      confirm against the installed tensorly version.
    - Indexing the factors by ``atom_index`` assumes matrix rows follow
      ``atom_index`` order 0..n-1 — TODO confirm for the Laplacian, whose row
      order follows the graph's node order.
    """
    molecule_name = item[0]
    list_ = []

    try:
        dist_m = get_dist_matrix(structures_idx, molecule_name)

        g = molecule_graph(molecule_name, structures, bonds)
        laplacian_m = nx.laplacian_matrix(g).toarray().astype(float)

        t_dist_core, t_dist_factors = tucker(dist_m, rank=1)
        p_dist_core, p_dist_factors = parafac(dist_m, rank=1)
        t_lap_core, t_lap_factors = tucker(laplacian_m, rank=1)
        p_lap_core, p_lap_factors = parafac(laplacian_m, rank=1)

        if verbose > 0:
            print(t_dist_core.shape, t_dist_factors[0].shape, t_dist_factors[1].shape)
            print(p_dist_core.shape, p_dist_factors.shape)

        molecule_atoms = structures[structures['molecule_name']==molecule_name]
        for atom_index in molecule_atoms['atom_index']:
            # Key order is kept (Laplacian, distance, identifiers) because it
            # determines the column order of the DataFrame built from these records.
            list_.append({
                't_lap_core': t_lap_core[0][0],
                't_lap_factors_0': t_lap_factors[0][atom_index][0],
                't_lap_factors_1': t_lap_factors[1][atom_index][0],
                'p_lap_core': p_lap_core[atom_index][0],
                'p_lap_factors': p_lap_factors[atom_index][0],
                't_dist_core': t_dist_core[0][0],
                't_dist_factors_0': t_dist_factors[0][atom_index][0],
                't_dist_factors_1': t_dist_factors[1][atom_index][0],
                'p_dist_core': p_dist_core[atom_index][0],
                'p_dist_factors': p_dist_factors[atom_index][0],
                # Use the local name; a bare `molecule` is not defined in this function.
                'molecule_name': molecule_name,
                'atom_index': atom_index,
            })
    except Exception as e:
        # Chain the cause so the original traceback stays visible.
        raise Exception(item, e.__str__()) from e
    return list_


def generate_datalist(ss, xyz, ssx, structures, structures_idx, bonds):
    """Run ``_worker`` for every entry of ``ss`` and collect the records in a DataFrame.

    Parameters
    ----------
    ss : Series indexed by molecule name; each ``(index label, value)`` pair is
        handed to ``_worker`` as ``item``.
    xyz, ssx, structures, structures_idx, bonds : forwarded unchanged to
        ``_worker``.

    Returns
    -------
    pandas.DataFrame with one row per record returned by ``_worker``.
    """
    data_list = []
    for item in tqdm_notebook(zip(ss.index, ss), total=len(ss)):
        # extend() appends in place; rebuilding the list with `+` on every
        # iteration would copy all previously collected records each time.
        data_list.extend(
            _worker(
                item,
                xyz=xyz,
                ssx=ssx,
                structures=structures,
                structures_idx=structures_idx,
                bonds=bonds,
            )
        )
    df_ = pd.DataFrame(data_list)
    return df_

In [204]:
# Restrict `ss` to the molecules that occur in `train`; entries follow the order
# of `train.molecule_name.unique()`, and names absent from `ss` are skipped.
ss_train = ss[[i for i in train.molecule_name.unique().tolist() if i in ss.index]]

In [None]:
# Build the per-atom feature table for the training molecules; `train` is
# passed as the `bonds` table. This loops over every entry of `ss_train`.
df_train = generate_datalist(ss_train, xyz, ssx, structures, structures_idx, train)

HBox(children=(IntProgress(value=0, max=85003), HTML(value='')))

