In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import copy
from tqdm import tqdm_notebook

import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel

In [2]:
import os
import numpy as np
import pandas as pd
import warnings
import networkx as nx
import tensorly as tl
from sklearn.preprocessing import LabelEncoder
from tensorly.decomposition import tucker, parafac, non_negative_tucker

Using numpy backend.


In [6]:
# import sys
# !{sys.executable} -m pip install tensorly

In [3]:
# Directory holding the CSV inputs, given relative to this notebook's location.
file_folder =  '../../data/input'
# Show the directory contents so the available input files are visible.
os.listdir(file_folder)

['test.csv',
 'structures',
 'sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'structures.csv',
 'train.csv']

In [4]:
# Load each input table from `file_folder` into its own DataFrame.
# In the code visible in this notebook only `train`, `test` and `structures`
# are used afterwards; `train` and `test` are also read again in a later cell.
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')

In [5]:
# Per-element lookup tables keyed by atom symbol (H, C, N, O, F).
# The same tables are defined again, with identical values, in a later cell
# where they are mapped onto `structures`.
# NOTE(review): units are not stated anywhere in this notebook - confirm them
# before combining these values with other features.
atomic_radius = {'H': 0.38, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.71}
atomic_number = {'H': 1, 'C': 6, 'N': 7, 'O': 8, 'F': 9}
atomic_mass = {'H': 1.0079, 'C': 12.0107, 'N': 14.0067, 'O': 15.9994, 'F': 18.9984}
vanderwaalsradius = {'H': 120, 'C': 185, 'N': 154, 'O': 140, 'F': 135}
covalenzradius = {'H': 30, 'C': 77, 'N': 70, 'O': 66, 'F': 58}
electronegativity = {'H': 2.2, 'C': 2.55, 'N': 3.04, 'O': 3.44, 'F': 3.98}
ionization_energy = {'H': 13.5984, 'C': 11.2603, 'N': 14.5341, 'O': 13.6181, 'F': 17.4228}

In [6]:
# Read train/test again (both were already loaded in an earlier cell) and load
# the sample submission.
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
# Split the coupling type string: the first character becomes the integer
# `type0` and the remaining characters become `type1` (e.g. '1JHC' -> 1, 'JHC').
train['type0'] = train['type'].apply(lambda x: int(x[0]))
train['type1'] = train['type'].apply(lambda x: x[1:])

# Fit the label encoder on the train `type1` values only, then encode them.
le = LabelEncoder()
le.fit(train.type1.tolist())
int_bond_type = le.transform(train.type1.tolist()) 
train['int_type1']= int_bond_type

# Apply the same split to test and encode with the encoder fitted on train.
test['type0'] = test['type'].apply(lambda x: int(x[0]))
test['type1'] = test['type'].apply(lambda x: x[1:])
int_bond_type = le.transform(test.type1.tolist()) 
test['int_type1']= int_bond_type

In [7]:
le.classes_

array(['JHC', 'JHH', 'JHN'], dtype='<U3')

In [8]:
# Per-element lookup tables keyed by atom symbol. These repeat, value for
# value, the tables defined in an earlier cell.
atomic_radius = {'H': 0.38, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.71}
atomic_number = {'H': 1, 'C': 6, 'N': 7, 'O': 8, 'F': 9}
atomic_mass = {'H': 1.0079, 'C': 12.0107, 'N': 14.0067, 'O': 15.9994, 'F': 18.9984}
vanderwaalsradius = {'H': 120, 'C': 185, 'N': 154, 'O': 140, 'F': 135}
covalenzradius = {'H': 30, 'C': 77, 'N': 70, 'O': 66, 'F': 58}
electronegativity = {'H': 2.2, 'C': 2.55, 'N': 3.04, 'O': 3.44, 'F': 3.98}
ionization_energy = {'H': 13.5984, 'C': 11.2603, 'N': 14.5341, 'O': 13.6181, 'F': 17.4228}

# Add one column per property to `structures` by looking up each row's atom
# symbol. The direct dict indexing raises KeyError for a symbol that is not in
# the table.
structures['atomic_radius'] = structures['atom'].apply(lambda x: atomic_radius[x])
structures['atomic_number'] = structures['atom'].apply(lambda x: atomic_number[x])
structures['atomic_mass'] = structures['atom'].apply(lambda x: atomic_mass[x])
structures['vanderwaalsradius'] = structures['atom'].apply(lambda x: vanderwaalsradius[x])
structures['covalenzradius'] = structures['atom'].apply(lambda x: covalenzradius[x])
structures['electronegativity'] = structures['atom'].apply(lambda x: electronegativity[x])
structures['ionization_energy'] = structures['atom'].apply(lambda x: ionization_energy[x])


In [9]:
# Integer-encode the atom symbol into `structures['int_atom']`.
# This rebinds `le`, so the encoder fitted on the coupling types in an earlier
# cell is no longer reachable under that name.
le = LabelEncoder()
le.fit(structures.atom.tolist())
int_atom = le.transform(structures.atom.tolist()) 
structures['int_atom'] = int_atom

In [10]:
le.classes_

array(['C', 'F', 'H', 'N', 'O'], dtype='<U1')

In [11]:
def molecule_graph(molecule_name, structures, bonds):
    """Build an undirected graph for one molecule.

    Nodes are the molecule's atoms (keyed by ``atom_index``) and carry the
    position and per-element properties found in ``structures``. Edges are
    the atom pairs listed for the molecule in ``bonds``.

    Parameters
    ----------
    molecule_name : value matched against the ``molecule_name`` column.
    structures : DataFrame with columns ``molecule_name``, ``atom_index``,
        ``x``, ``y``, ``z``, ``int_atom`` and the element property columns
        read below.
    bonds : DataFrame with columns ``molecule_name``, ``atom_index_0``,
        ``atom_index_1``, ``type0`` and ``int_type1``. In this notebook the
        callers pass the train/test coupling tables, so an edge is a listed
        atom pair rather than necessarily a chemical bond.

    Returns
    -------
    networkx.Graph with edge attributes ``weight`` (``type0``) and
    ``bond_type`` (``int_type1``).

    Raises
    ------
    AssertionError if the molecule has no rows in ``structures`` or ``bonds``.
    """
    # Filter each frame once and reuse the result for both the sanity check
    # and the loop, instead of recomputing the boolean mask.
    atoms = structures[structures['molecule_name'] == molecule_name]
    assert atoms.shape[0] > 0, 'no info in structures'
    pairs = bonds[bonds['molecule_name'] == molecule_name]
    assert pairs.shape[0] > 0, 'no info in bonds'

    node_attrs = (
        'int_atom', 'atomic_radius', 'atomic_number', 'atomic_mass',
        'vanderwaalsradius', 'covalenzradius', 'electronegativity',
        'ionization_energy',
    )

    g = nx.Graph()
    for _, row in atoms.iterrows():
        g.add_node(
            row['atom_index'],
            pos=(row['x'], row['y'], row['z']),
            **{name: row[name] for name in node_attrs},
        )
    for _, row in pairs.iterrows():
        g.add_edge(
            row['atom_index_0'],
            row['atom_index_1'],
            weight=row['type0'],
            bond_type=row['int_type1'],
        )
    return g

In [12]:
# Coordinates of every atom as a plain array, in the row order of `structures`.
xyz = structures[['x','y','z']].values
# Same table indexed by molecule name, for per-molecule lookups with `.loc`.
structures_idx = structures.set_index('molecule_name')
# Number of atoms per molecule, then converted to a running total.
ss = structures.groupby('molecule_name').size()
ss = ss.cumsum()
# Running totals with a leading zero.
# NOTE(review): presumably ssx[k]:ssx[k+1] is meant to delimit the rows of the
# k-th molecule in `xyz`; that only holds if `structures` is ordered the same
# way as the groupby result - confirm.
ssx = np.zeros(len(ss) + 1, 'int')
ssx[1:] = ss

In [13]:
# from numba import jit
# from math import sqrt

# @jit
# def numba_dist_matrix(xyz, ssx, molecule_id):
#     start_molecule, end_molecule = ssx[molecule_id], ssx[molecule_id+1]
#     locs = xyz[start_molecule:end_molecule]     
#    # return locs
#     num_atoms = end_molecule - start_molecule
#     dmat = np.zeros((num_atoms, num_atoms))
#     for i in range(num_atoms):
#         for j in range(i+1, num_atoms):
#             d = (locs[i,0] - locs[j,0])**2 + (locs[i,1] - locs[j,1])**2 + (locs[i,2] - locs[j,2])**2
#             dmat[i,j] = d
#             dmat[j,i] = d
#     return dmat

# def numba_dist_matrices(xyz, ssx):
#     for molecule_id in range(structures.molecule_name.nunique()):
#         numba_dist_matrix(xyz, ssx, molecule_id)
        
        
def get_dist_matrix(df_structures_idx, molecule):
    """Return the matrix of squared pairwise distances between a molecule's atoms.

    Parameters
    ----------
    df_structures_idx : DataFrame indexed by molecule name with coordinate
        columns ``x``, ``y`` and ``z``.
    molecule : index label selecting the molecule's rows.

    Returns
    -------
    numpy.ndarray of shape (num_atoms, num_atoms). Entry ``[i, j]`` is the
    squared Euclidean distance between atoms ``i`` and ``j``; no square root
    is taken.

    Raises
    ------
    KeyError if ``molecule`` is not present in the index.
    """
    df_temp = df_structures_idx.loc[molecule]
    locs = df_temp[['x','y','z']].values
    if locs.ndim == 1:
        # A label that matches exactly one row makes .loc return a Series, so
        # the coordinates arrive as a flat (3,) vector. Restore the
        # (1 atom, 3 coordinates) layout so num_atoms is 1 rather than 3.
        locs = locs.astype(float).reshape(1, -1)
    num_atoms = len(locs)
    # loc_tile[i, k, j] is coordinate k of atom j; its transpose swaps i and j,
    # so the difference holds every pairwise coordinate offset.
    loc_tile = np.tile(locs.T, (num_atoms,1,1))
    dist_mat = ((loc_tile - loc_tile.T)**2).sum(axis=1)
    return dist_mat

In [14]:
def _worker(item, xyz, ssx, structures, structures_idx, bonds, verbose=0):
    """Compute rank-1 decomposition features for every atom of one molecule.

    The squared-distance matrix (``get_dist_matrix``) and the graph Laplacian
    (``molecule_graph`` + ``nx.laplacian_matrix``) of the molecule are each
    decomposed with ``tucker`` and ``parafac`` at rank 1, and one record per
    atom is produced from the results.

    Parameters
    ----------
    item : sequence whose first element is the molecule name. Further
        elements are not used.
    xyz, ssx : accepted for interface compatibility; not used here.
    structures : DataFrame of atoms, see ``molecule_graph``.
    structures_idx : ``structures`` indexed by molecule name.
    bonds : DataFrame of atom pairs, see ``molecule_graph``.
    verbose : when greater than 0, print the shapes of the distance
        decomposition results.

    Returns
    -------
    list of dict, one per atom row of the molecule.

    Raises
    ------
    Exception carrying ``(item, message)`` for any failure, chained to the
    original exception so its traceback stays available.
    """
    molecule_name = item[0]
    list_ = []

    try:
        dist_m = get_dist_matrix(structures_idx, molecule_name)

        g = molecule_graph(molecule_name, structures, bonds)
        laplacian_m = nx.laplacian_matrix(g).toarray().astype(float)

        # NOTE(review): the parafac results are unpacked as (core, factors) but
        # both parts are indexed per atom below, like two factor matrices. What
        # parafac returns depends on the installed tensorly version - confirm.
        t_dist_core, t_dist_factors = tucker(dist_m, rank=1)
        p_dist_core, p_dist_factors = parafac(dist_m, rank=1)
        t_lap_core, t_lap_factors = tucker(laplacian_m, rank=1)
        p_lap_core, p_lap_factors = parafac(laplacian_m, rank=1)

        if verbose > 0:
            print(t_dist_core.shape, t_dist_factors[0].shape, t_dist_factors[1].shape)
            print(p_dist_core.shape, p_dist_factors.shape)

        atoms = structures[structures['molecule_name']==molecule_name]
        for _, row in atoms.iterrows():
            # atom_index is used as the row position in the factor matrices,
            # which assumes atoms are numbered 0..n-1 in matrix row order.
            atom_index = row['atom_index']
            dist_feats = {
                't_dist_core': t_dist_core[0][0],
                't_dist_factors_0': t_dist_factors[0][atom_index][0],
                't_dist_factors_1': t_dist_factors[1][atom_index][0],
                'p_dist_core': p_dist_core[atom_index][0],
                'p_dist_factors': p_dist_factors[atom_index][0],
            }
            lap_feats = {
                't_lap_core': t_lap_core[0][0],
                't_lap_factors_0': t_lap_factors[0][atom_index][0],
                't_lap_factors_1': t_lap_factors[1][atom_index][0],
                'p_lap_core': p_lap_core[atom_index][0],
                'p_lap_factors': p_lap_factors[atom_index][0],
            }
            # Key order (Laplacian, distance, identifiers) matches the record
            # layout produced before this rewrite.
            list_.append({
                **lap_feats,
                **dist_feats,
                'molecule_name': molecule_name,
                'atom_index': atom_index,
            })
    except Exception as e:
        # Attach the offending item and chain explicitly to the root cause.
        raise Exception(item, e.__str__()) from e
    return list_


def generate_datalist(ss, xyz, ssx, structures, structures_idx, bonds):
    """Run ``_worker`` for every molecule in ``ss`` and collect the records.

    Parameters
    ----------
    ss : Series indexed by molecule name; each (index, value) pair is passed
        to ``_worker`` as ``item``.
    xyz, ssx, structures, structures_idx, bonds : forwarded unchanged to
        ``_worker``.

    Returns
    -------
    DataFrame with one row per record returned by ``_worker``.
    """
    data_list = []
    for item in tqdm_notebook(zip(ss.index, ss), total=len(ss)):
        # extend() appends in place; rebuilding the list with `+` on every
        # iteration would copy all previously collected records each time.
        data_list.extend(
            _worker(
                item,
                xyz=xyz,
                ssx=ssx,
                structures=structures,
                structures_idx=structures_idx,
                bonds=bonds,
            )
        )
    df_ = pd.DataFrame(data_list)
    return df_

In [15]:
# Keep only the molecules that occur in train and are present in `ss`, in the
# order in which they first appear in train.
ss_train = ss[[i for i in train.molecule_name.unique().tolist() if i in ss.index]]

In [16]:
# Compute the per-atom decomposition features for the train molecules.
# Warnings are silenced only for the duration of this call; the context manager
# restores the previous warning filters afterwards instead of replacing them
# with a blanket 'default' filter.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    df_train = generate_datalist(ss_train, xyz, ssx, structures, structures_idx, train)

HBox(children=(IntProgress(value=0, max=85003), HTML(value='')))




In [17]:
df_train.head()

Unnamed: 0,atom_index,molecule_name,p_dist_core,p_dist_factors,p_lap_core,p_lap_factors,t_dist_core,t_dist_factors_0,t_dist_factors_1,t_lap_core,t_lap_factors_0,t_lap_factors_1
0,0,dsgdb9nsd_000001,-2.320915,-0.229753,1.795439e-16,7.709882e-18,10.101786,-0.229753,-0.229753,9.0,0.0,0.0
1,1,dsgdb9nsd_000001,-4.915758,-0.486623,-6.806524,-0.7562804,10.101786,-0.486623,-0.486623,9.0,-0.822925,-0.822925
2,2,dsgdb9nsd_000001,-4.91576,-0.486623,5.787065,0.6430072,10.101786,-0.486623,-0.486623,9.0,0.020415,0.020415
3,3,dsgdb9nsd_000001,-4.915794,-0.486626,1.084822,0.1205358,10.101786,-0.486626,-0.486626,9.0,0.414768,0.414768
4,4,dsgdb9nsd_000001,-4.915794,-0.486626,-0.06536377,-0.007262641,10.101786,-0.486626,-0.486626,9.0,0.387743,0.387743


In [18]:
# Keep only the molecules that occur in test and are present in `ss`, in the
# order in which they first appear in test.
ss_test = ss[[i for i in test.molecule_name.unique().tolist() if i in ss.index]]

In [19]:
# Compute the per-atom decomposition features for the test molecules.
# Warnings are silenced only for the duration of this call; the context manager
# restores the previous warning filters afterwards instead of replacing them
# with a blanket 'default' filter.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    df_test = generate_datalist(ss_test, xyz, ssx, structures, structures_idx, test)

HBox(children=(IntProgress(value=0, max=45772), HTML(value='')))




In [20]:
df_train.shape, df_test.shape

((1533537, 12), (825120, 12))

In [21]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are examined; all other columns are left untouched. Integer columns are
    cast to the first of int8/int16/int32/int64 whose range contains the
    column's min and max. Float columns are cast to float16 or float32 when
    the min and max lie inside that type's finite range, otherwise float64.

    Note that the float check looks at range only: casting to float16 or
    float32 reduces precision.

    Parameters
    ----------
    df : DataFrame, modified in place.
    verbose : when true, print the resulting memory usage and the reduction.

    Returns
    -------
    The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the extreme values of a type are
                # representable, so e.g. 127 still fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max not comparable (NaN).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting size to avoid dividing by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [22]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,type0,type1,int_type1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,1,JHC,0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,2,JHH,1
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,2,JHH,1
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,2,JHH,1
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,1,JHC,0


In [23]:
def map_atom_info(df, structures, atom_idx):
    """Attach per-atom columns from ``structures`` to one side of each pair.

    ``df`` is left-joined to ``structures`` on ``molecule_name`` and on
    ``atom_index_{atom_idx}`` (left) against ``atom_index`` (right). Every
    column of ``structures`` other than the two join keys is then renamed to
    ``atom_index_{atom_idx}_<column>``.

    The right-hand ``atom_index`` key column is kept in the result, so calling
    this twice on the same frame yields pandas' ``_x``/``_y`` suffixed copies.

    Returns the merged and renamed DataFrame; the inputs are not modified.
    """
    side_key = f'atom_index_{atom_idx}'
    merged = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', side_key],
        right_on=['molecule_name', 'atom_index'],
    )

    feature_cols = structures.columns.drop(['molecule_name', 'atom_index'])
    renames = {name: f'atom_index_{atom_idx}_{name}' for name in feature_cols}
    return merged.rename(columns=renames)



In [24]:
# Join the per-atom features onto each train pair, once for atom_index_0 and
# once for atom_index_1. The cell reassigns `train`, so running it a second
# time would merge the feature columns again.
train = map_atom_info(train, df_train, 0)
train = map_atom_info(train, df_train, 1)


In [27]:
df_train.columns.tolist()

['atom_index',
 'molecule_name',
 'p_dist_core',
 'p_dist_factors',
 'p_lap_core',
 'p_lap_factors',
 't_dist_core',
 't_dist_factors_0',
 't_dist_factors_1',
 't_lap_core',
 't_lap_factors_0',
 't_lap_factors_1']

In [25]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,type0,type1,int_type1,atom_index_x,...,atom_index_1_p_dist_core,atom_index_1_p_dist_factors,atom_index_1_p_lap_core,atom_index_1_p_lap_factors,atom_index_1_t_dist_core,atom_index_1_t_dist_factors_0,atom_index_1_t_dist_factors_1,atom_index_1_t_lap_core,atom_index_1_t_lap_factors_0,atom_index_1_t_lap_factors_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,1,JHC,0,1,...,-2.320915,-0.229753,1.795439e-16,7.709882e-18,10.101786,-0.229753,-0.229753,9.0,0.0,0.0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,2,JHH,1,1,...,-4.91576,-0.486623,5.787065,0.6430072,10.101786,-0.486623,-0.486623,9.0,0.020415,0.020415
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,2,JHH,1,1,...,-4.915794,-0.486626,1.084822,0.1205358,10.101786,-0.486626,-0.486626,9.0,0.414768,0.414768
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,2,JHH,1,1,...,-4.915794,-0.486626,-0.06536377,-0.007262641,10.101786,-0.486626,-0.486626,9.0,0.387743,0.387743
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,1,JHC,0,2,...,-2.320915,-0.229753,1.795439e-16,7.709882e-18,10.101786,-0.229753,-0.229753,9.0,0.0,0.0


In [29]:
# Keep only the row id and the decomposition features of both atoms.
# .copy() makes the selection an independent frame, so the in-place dtype
# assignments done by reduce_mem_usage do not trigger SettingWithCopyWarning.
train = train[['id',
 'atom_index_0_p_dist_core',
 'atom_index_0_p_dist_factors',
 'atom_index_0_p_lap_core',
 'atom_index_0_p_lap_factors',
 'atom_index_0_t_dist_core',
 'atom_index_0_t_dist_factors_0',
 'atom_index_0_t_dist_factors_1',
 'atom_index_0_t_lap_core',
 'atom_index_0_t_lap_factors_0',
 'atom_index_0_t_lap_factors_1',
 'atom_index_1_p_dist_core',
 'atom_index_1_p_dist_factors',
 'atom_index_1_p_lap_core',
 'atom_index_1_p_lap_factors',
 'atom_index_1_t_dist_core',
 'atom_index_1_t_dist_factors_0',
 'atom_index_1_t_dist_factors_1',
 'atom_index_1_t_lap_core',
 'atom_index_1_t_lap_factors_0',
 'atom_index_1_t_lap_factors_1']].copy()

In [31]:
train = reduce_mem_usage(train)

Mem. usage decreased to 231.00 Mb (70.5% reduction)


In [32]:
# Join the per-atom features onto each test pair, once for atom_index_0 and
# once for atom_index_1. The cell reassigns `test`, so running it a second
# time would merge the feature columns again.
test = map_atom_info(test, df_test, 0)
test = map_atom_info(test, df_test, 1)

In [33]:
# Keep only the row id and the decomposition features of both atoms.
# .copy() makes the selection an independent frame, so the in-place dtype
# assignments done by reduce_mem_usage do not trigger SettingWithCopyWarning.
test = test[['id',
 'atom_index_0_p_dist_core',
 'atom_index_0_p_dist_factors',
 'atom_index_0_p_lap_core',
 'atom_index_0_p_lap_factors',
 'atom_index_0_t_dist_core',
 'atom_index_0_t_dist_factors_0',
 'atom_index_0_t_dist_factors_1',
 'atom_index_0_t_lap_core',
 'atom_index_0_t_lap_factors_0',
 'atom_index_0_t_lap_factors_1',
 'atom_index_1_p_dist_core',
 'atom_index_1_p_dist_factors',
 'atom_index_1_p_lap_core',
 'atom_index_1_p_lap_factors',
 'atom_index_1_t_dist_core',
 'atom_index_1_t_dist_factors_0',
 'atom_index_1_t_dist_factors_1',
 'atom_index_1_t_lap_core',
 'atom_index_1_t_lap_factors_0',
 'atom_index_1_t_lap_factors_1']].copy()

In [34]:
test = reduce_mem_usage(test)

Mem. usage decreased to 124.25 Mb (70.5% reduction)


In [23]:
# Persist the per-pair train features. `train` (keyed by `id`) is the frame
# that was merged, column-selected and downcast above; the per-atom `df_train`
# has none of the pair columns listed here, so dropping them from it raises
# KeyError. errors='ignore' keeps the drop harmless when the columns have
# already been removed by the column selection.
# NOTE(review): the file name refers to angles and distances, while this
# notebook computes decomposition features - confirm the intended output path.
train.drop(columns=['molecule_name','atom_index_0','atom_index_1','type','scalar_coupling_constant'], errors='ignore').to_pickle('../../data/feature/angles-and-distances_train.pkl')

In [24]:
# Persist the per-pair test features. `test` (keyed by `id`) is the frame that
# was merged, column-selected and downcast above; the per-atom `df_test` has
# none of the pair columns listed here, so dropping them from it raises
# KeyError. errors='ignore' keeps the drop harmless when the columns have
# already been removed by the column selection.
# NOTE(review): the file name refers to angles and distances, while this
# notebook computes decomposition features - confirm the intended output path.
test.drop(columns=['molecule_name','atom_index_0','atom_index_1','type'], errors='ignore').to_pickle('../../data/feature/angles-and-distances_test.pkl')