In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.spatial import distance_matrix

from sklearn import preprocessing
import os


In [2]:
# Directory layout, relative to this notebook.
datadir = "../../data/input"             # train.csv / test.csv / structures.csv
featdir = '../../data/feature'           # gzip-pickled feature frames
tempmpnn = '../../data/temp/mpnn_keras'  # bond and angle CSV tables

# The feature pickles are read through this alias.
file_folder = featdir


In [3]:
# Read the three base tables from the input directory.
train, test, structures = (
    pd.read_csv(f'{datadir}/{table}.csv')
    for table in ('train', 'test', 'structures')
)

In [4]:
# Bond tables, one CSV per split.
train_bonds, test_bonds = (
    pd.read_csv(f'{tempmpnn}/{split}_bonds.csv')
    for split in ('train', 'test')
)

In [5]:
# Atom-pair angle table; it is split into train / test by molecule name further down.
angles_csv = f'{tempmpnn}/angles.csv'
angs = pd.read_csv(angles_csv)

In [6]:
# Quick look at the bond-table schema.
list(train_bonds.columns)

['molecule_name',
 'atom_index_0',
 'atom_index_1',
 'nbond',
 'L2dist',
 'error',
 'bond_type']

In [7]:
# Quick look at the angle-table schema.
list(angs.columns)

['molecule_name',
 'atom_index_0',
 'atom_index_1',
 'shortest_path_atoms',
 'shortest_path_n_bonds',
 'cosinus',
 'dihedral']

In [8]:

# Feature-column names, grouped into eight lists.
# NOTE(review): presumably one list per coupling type ('1JHC' ... '3JHN' — there are eight
# of each) — confirm. This notebook only uses the union of all lists, which the next cell
# flattens and de-duplicates into `unique_cols`.
# Some names contain a literal space (e.g. 'atom_index_1_ hybridization'); they are used
# verbatim as column labels when selecting from the feature frames, so keep the spacing.
type_columns = [['tertiary_angle_0', 'inv_dist1R', 'd_4_3', 'yukawa_H.y', 'mulliken_atom_0', 'dist_to_type_mean', 'dist_O_0_x', 'atom_1_n_bonds', 'dist_to_type_1_mean', 'atom_1_bond_lengths_mean', 'dist_xyz', 'dist_C_0_y', 'd_3_2', 'atom_index_1_ hybridization', 'atom_index_1_cycle_size_mean', 'dist_O_0_y', 'eem_1', 'inv_distPE', 'd_4_2', 'inv_distPR', 'dist_no_bond_min_y', 'dist_H_2_x', 'dist_H_1_x', 'tertiary_distance_2', 'dist_C_3_x', 'dist_O_1_x', 'atom_1_bond_lengths_std', 'dist_C_2_y', 'dist_C_2_x', 'mulliken_atom_1', 'cos_center1', 'dist_O_1_y', 'tertiary_angle_3', 'dist_H_2_y', 'dist_N_0_y', 'dist_C_1_y', 'inv_dist1E', 'distance_y', 'tertiary_angle_2', 'dist_N_0_x', 'd_2_1', 'molecule_atom_index_0_dist_max_div', 'adC1', 'adN1', 'd_4_0', 'dist_C_3_y', 'atom_3', 'distC0', 'tertiary_distance_4', 'tertiary_angle_5', 'd_5_1', 'molecule_atom_index_1_dist_min_diff', 'dist_C_4_y', 'dist_H_0_y', 'cos_f0', 'd_5_0', 'tertiary_distance_3', 'd_5_2', 'tertiary_atom_1', 'dist_C_4_x', 'cos_c0_f0', 'atom_index_0_sv_3', 'rc_C', 'cos_f1', 'tertiary_angle_8', 'dist_O_2_y', 'max_molecule_atom_1_dist_xyz', 'dist_F_0_y', 'atom_index_1_ aromatic', 'tertiary_angle_26', 'type_0'],
['dist_H_0_y', 'd_3_2', 'dist_C_0_y', 'atom_index_1_ aromatic', 'atom_1_bond_lengths_mean', 'bond_atom', 'inv_dist1R', 'd_3_1', 'mulliken_atom_0', 'dist_H_0_x', 'dist_O_0_y', 'dist_C_1_x', 'tertiary_angle_0', 'dist_C_1_y', 'vander_C.y', 'dist_H_1_y', 'mulliken_atom_1', 'inv_dist0R', 'd_1_0', 'tertiary_distance_0', 'tertiary_angle_2', 'atom_index_1_explicit_valence', 'dist_N_0_y', 'inv_distPR', 'dist_C_2_x', 'vander_H.x', 'd_4_2', 'atom_index_0_eigv_max', 'tertiary_distance_2', 'dist_H_1_x', 'dist_N_1_x', 'dist_C_3_x', 'cos_f0', 'atom_index_1_sv_2', 'max_molecule_atom_0_dist_xyz', 'd_2_1'],
['tertiary_atom_0', 'inv_dist0', 'dist_no_bond_min_x', 'atom_index_1_ hybridization', 'tertiary_angle_0', 'tertiary_angle_1', 'dist_O_0_x', 'cos_c0', 'd_5_2', 'tertiary_atom_1', 'cos_f0', 'dist_H_0_x', 'd_3_1', 'atom_index_1_degree', 'dist_C_0_y', 'adC2', 'dist_C_3_x', 'vander_O.y', 'mulliken_atom_1', 'atom_7', 'tertiary_angle_2', 'd_2_1', 'atom_3', 'd_5_1', 'd_6_2', 'd_4_1', 'tertiary_atom_2', 'molecule_atom_index_1_dist_min_diff', 'd_4_2', 'dist_C_2_x', 'cos_c0_f0', 'd_6_0', 'dist_O_0_y', 'd_4_3', 'd_3_0', 'd_7_0', 'd_3_2', 'inv_dist0R', 'atom_8', 'dist_C_1_x', 'd_6_1', 'd_2_0', 'd_8_1', 'mulliken_atom_0', 'dist_N_0_x', 'atom_4', 'tertiary_distance_2', 'd_7_2', 'dist_C_0_x', 'atom_1_bond_lengths_mean', 'dist_C_1_y', 'bond_atom', 'd_7_1', 'd_4_0', 'distC0', 'atom_index_1_cycle_size_mean', 'cos_c0_c1', 'tertiary_angle_3', 'dist_O_1_x', 'atom_index_1_n_cycle', 'max_molecule_atom_0_dist_xyz', 'molecule_atom_index_0_dist_max_div', 'atom_5', 'gap', 'cos_c1', 'dist_N_0_y', 'd_6_3', 'dist_C_3_y', 'inv_distP', 'dist_C_4_y'],
['cos_c0', 'd_4_3', 'cos_c0_c1', 'molecule_atom_index_0_dist_min_diff', 'tertiary_atom_1', 'd_3_2', 'd_1_0', 'dist_H_0_y', 'mulliken_atom_0', 'mulliken_atom_1', 'dist_N_0_x', 'link0', 'tertiary_atom_2', 'dist_C_1_y', 'dist_C_1_x', 'cos_f0', 'dist_C_0_y', 'cos_f1', 'd_3_1', 'tertiary_distance_1', 'dist_O_0_y', 'cos_f0_f1', 'adC1', 'd_5_3', 'inv_distP', 'edge_4', 'd_6_2', 'dist_N_0_y', 'tertiary_distance_2', 'dist_O_0_x', 'cos_c1_f1', 'd_3_0', 'd_5_2', 'dist_C_0_x', 'adN1', 'cos_c0_f0', 'd_4_1', 'max_distance_y', 'dist_C_2_y', 'atom_5', 'adC3', 'dist_to_type_1_mean', 'vander_H.x', 'dist_C_3_y', 'dist_H_3_x', 'molecule_atom_index_0_dist_max_div', 'atom_7', 'dist_C_3_x', 'd_5_1', 'dist_H_3_y', 'atom_index_0_eigv_max', 'atom_6', 'dist_H_2_x', 'atom_index_1_sv_0', 'molecule_atom_index_1_dist_std_div', 'link1'],
['d_3_1', 'dist_H_1_x', 'd_5_0', 'd_4_0', 'yukawa_H.x', 'inv_dist0', 'd_6_0', 'd_4_1', 'cos_c0', 'atom_3', 'dist_C_0_y', 'molecule_atom_index_0_dist_std_div', 'cos_c0_c1', 'd_4_2', 'min_molecule_atom_0_dist_xyz', 'sd_molecule_atom_0_dist_xyz', 'd_2_1', 'adC2', 'd_3_0', 'dist_C_1_y', 'd_4_3', 'dist_H_0_x', 'vander_C.x', 'd_5_3', 'dist_H_1_y', 'tertiary_distance_3', 'd_2_0', 'dist_O_0_x', 'd_5_1', 'dist_O_0_y', 'adC3', 'inv_dist0R', 'dist_C_3_y', 'atom_index_1_ hybridization', 'cos_f0', 'dist_C_2_x', 'd_5_2', 'd_6_1', 'dist_C_0_x', 'atom_1_bond_lengths_min', 'mulliken_atom_1', 'distance_farthest_0', 'tertiary_distance_1', 'min_molecule_atom_1_dist_xyz', 'yukawa_O.y', 'atom_0_bond_lengths_max'],
['tertiary_angle_0', 'd_2_1', 'cos_c0', 'atom_1_bond_lengths_mean', 'd_3_1', 'd_2_0', 'tertiary_distance_1', 'd_3_2', 'tertiary_angle_1', 'cos_f0', 'tertiary_distance_2', 'dist_C_0_x', 'dist_H_0_x', 'dist_C_2_x', 'dist_O_0_y', 'd_4_1', 'd_4_3', 'atom_index_1_cycle_size_mean', 'molecule_atom_index_0_dist_min_diff', 'tertiary_atom_2', 'atom_4', 'cos_c0_f0', 'tertiary_distance_3', 'd_3_0', 'dist_median_bond_y', 'd_5_2', 'adC3', 'atom_5', 'dist_H_1_x', 'molecule_atom_index_0_dist_min_div', 'gap', 'molecule_atom_index_1_dist_min_div', 'dist_O_0_x', 'cos_c1', 'dist_C_0_y', 'd_5_1', 'dist_N_0_y', 'dist_C_3_y', 'dist_no_bond_min_y', 'd_4_0', 'dist_N_0_x', 'd_4_2', 'max_molecule_atom_0_dist_xyz', 'cos_c0_c1', 'adC2', 'atom_index_1_n_cycle', 'd_5_0', 'd_6_1', 'dist_C_4_y', 'dist_O_1_y', 'd_7_2', 'tertiary_angle_2', 'd_6_2', 'mulliken_atom_1', 'atom_6', 'd_7_3', 'dist_O_1_x'],
['cos_c0_c1', 'atom_4', 'atom_5', 'molecule_atom_index_0_dist_min_diff', 'cos_c1', 'max_molecule_atom_1_dist_xyz', 'dist_to_type_std', 'd_3_2', 'cos_c0', 'dist_O_0_x', 'd_4_3', 'atom_6', 'dist_O_0_y', 'tertiary_atom_1', 'dist_C_2_y', 'd_4_2', 'dist_C_1_y', 'atom_7', 'tertiary_angle_1', 'dist_H_0_y', 'dist_no_bond_min_y', 'distance_c1', 'dist_C_2_x', 'linkM0', 'd_6_2', 'dist_C_0_y', 'd_5_2', 'd_7_2', 'dist_C_3_y', 'd_6_0', 'dihedral', 'max_molecule_atom_0_dist_xyz', 'd_7_3', 'd_6_1', 'dist_H_1_y', 'tertiary_atom_2', 'd_4_0', 'tertiary_atom_0', 'tertiary_angle_3', 'dist_C_0_x', 'dist_to_type_0_mean', 'dist_N_0_y', 'd_4_1', 'cos_c1_f1', 'cos_f0', 'dist_xyz', 'adC2', 'd_5_3', 'cos_f0_f1', 'gap', 'd_7_0', 'cos_f1', 'tertiary_distance_1', 'molecule_atom_index_0_dist_max_diff', 'd_2_1'],
['cos_c0', 'tertiary_distance_1', 'cos_c1', 'd_3_2', 'tertiary_angle_1', 'tertiary_angle_0', 'atom_1_n_bonds', 'tertiary_distance_2', 'd_2_1', 'tertiary_angle_2', 'd_4_0', 'molecule_atom_index_0_dist_min_div', 'd_2_0', 'dist_H_0_x', 'd_3_1', 'cos_c0_c1', 'mulliken_atom_1', 'd_8_3', 'd_4_1', 'dist_C_0_y', 'd_3_0', 'atom_index_1_cycle_size_mean', 'dist_C_1_x', 'dist_C_2_x', 'adC2', 'adC1', 'atom_1_bond_lengths_std', 'atom_index_1_n_cycle', 'd_4_2', 'cos_f0', 'd_5_2', 'dist_to_type_0_mean', 'dist_O_0_x', 'molecule_atom_index_0_dist_std_diff', 'd_5_1', 'tertiary_angle_3', 'd_6_2', 'd_7_3']]

In [10]:
# Union of every feature name listed in `type_columns`.
# Sorted on purpose: this list later defines the layout of the edge-feature axis, and the
# iteration order of a bare set of strings changes between kernel sessions (str hashing is
# randomized), which would make the saved arrays' channel order irreproducible.
unique_cols = sorted({col for cols in type_columns for col in cols})

# The columns of the "plus" frame decide which pickle each feature is loaded from.
# NOTE: the pickle is opened here only for its column names. It is a local project file;
# never unpickle data from an untrusted source.
plus_cols = pd.read_pickle(f'{featdir}/df_train2_plus.gzde', compression='gzip').columns.tolist()
plus_col_set = set(plus_cols)  # O(1) membership tests

unique_cols_plus = [col for col in unique_cols if col in plus_col_set]
unique_cols_data2 = [col for col in unique_cols if col not in plus_col_set]
len(unique_cols_data2), len(unique_cols_plus)

(105, 48)

In [11]:
def _load_feature_frames(split, base_cols):
    """Load the main and "plus" feature pickles of one split and join them.

    Returns ``(merged, plus)``: ``plus`` is the "plus" frame restricted to
    ``['id'] + unique_cols_plus`` with 'id' renamed to 'index'; ``merged`` is the main frame
    restricted to ``base_cols + unique_cols_data2``, left-joined with ``plus`` on 'index',
    with every missing value replaced by 0.
    """
    main = pd.read_pickle(f'{file_folder}/df_{split}2.gzde', compression='gzip')
    main = main[base_cols + unique_cols_data2]

    plus = pd.read_pickle(f'{file_folder}/df_{split}2_plus.gzde', compression='gzip')
    plus = plus[['id'] + unique_cols_plus].rename(columns={'id': 'index'})

    merged = pd.merge(main, plus, how='left', on='index').fillna(0)
    return merged, plus


# The training frame additionally keeps the target and its four contribution columns.
df_train2, df_train2_plus = _load_feature_frames(
    'train',
    ['index', 'type', 'group', 'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'],
)

df_test2, df_test2_plus = _load_feature_frames('test', ['index', 'type', 'group'])

In [12]:
# Target scaling is currently disabled:
# scale_min  = train['scalar_coupling_constant'].min()
# scale_max  = train['scalar_coupling_constant'].max()
# scale_mid = (scale_max + scale_min)/2
# scale_norm = scale_max - scale_mid

# train['scalar_coupling_constant'] = (train['scalar_coupling_constant'] - scale_mid)/scale_norm

# One-hot encode the coupling type (arrays are stored as .npz, so the wider encoding is fine).
j_type_cols = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
for pairs_df in (train, test):
    # Fail loudly on labels we have no column for instead of dropping them silently.
    unexpected = set(pairs_df['type'].dropna().unique()) - set(j_type_cols)
    if unexpected:
        raise ValueError(f'unexpected coupling type(s): {sorted(unexpected)}')
    # reindex pins both the order and the number of columns by label, so a split that lacks
    # one of the types still gets all eight columns (the absent one is all zeros) and a
    # column can never end up under the wrong name.
    pairs_df[j_type_cols] = (
        pd.get_dummies(pairs_df['type']).reindex(columns=j_type_cols, fill_value=0)
    )

In [16]:
# One-hot encode the element symbol of every atom. get_dummies emits its columns in sorted
# label order, which is the order of the names on the left-hand side.
atom_onehot = pd.get_dummies(structures['atom'])
structures[['C', 'F', 'H', 'N', 'O']] = atom_onehot
# Coordinate rescaling is currently disabled:
# structures[['x', 'y', 'z']] = structures[['x', 'y', 'z']]/10.

In [13]:
# One-hot encode the bond order (alternative that was tried: scaling, nbond / 3).
nbond_levels = [1.0, 1.5, 2.0, 3.0]
nbond_cols = ['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']
for bonds_df in (test_bonds, train_bonds):
    # Fail loudly on bond orders we have no column for instead of dropping them silently.
    unexpected = set(bonds_df['nbond'].dropna().unique()) - set(nbond_levels)
    if unexpected:
        raise ValueError(f'unexpected bond order(s): {sorted(unexpected)}')
    # reindex pins the column order and count by label: a split that happens to contain no
    # bond of some order still gets all four columns (the absent one is all zeros), so
    # train and test always share the same layout.
    bonds_df[nbond_cols] = (
        pd.get_dummies(bonds_df['nbond']).reindex(columns=nbond_levels, fill_value=0)
    )

In [14]:
# Possible normalisations, currently disabled (open question: one-hot the path length instead?):
# angs['dihedral'] = angs['dihedral']/np.pi
# angs['shortest_path_n_bonds'] = angs['shortest_path_n_bonds']/6.0

# Replace every missing entry of the angle table with 0.
angs = angs.fillna(value=0)

In [15]:
# Molecule names present in each split.
train_mol_names = train['molecule_name'].unique()
test_mol_names  = test['molecule_name'].unique()

# `structures` and `angs` cover both splits, so restrict them by molecule name.
train_structures = structures.loc[structures['molecule_name'].isin(train_mol_names)]
test_structures = structures.loc[structures['molecule_name'].isin(test_mol_names)]

train_angs = angs.loc[angs['molecule_name'].isin(train_mol_names)]
test_angs = angs.loc[angs['molecule_name'].isin(test_mol_names)]

# One group per molecule for every table. groupby sorts its keys, so the groups of the
# different tables line up as long as each table contains the same set of molecules.
train_struct_group = train_structures.groupby('molecule_name')
test_struct_group  = test_structures.groupby('molecule_name')

train_group = train.groupby('molecule_name')
test_group  = test.groupby('molecule_name')

train_bond_group = train_bonds.groupby('molecule_name')
test_bond_group  = test_bonds.groupby('molecule_name')

train_angs_group = train_angs.groupby('molecule_name')
test_angs_group  = test_angs.groupby('molecule_name')

# Max nodes in a graph. The same padding size is used for the train AND test arrays, so it
# must cover the largest molecule of either split; taking it from train alone would make
# a larger test molecule overflow its padded matrices.
max_size = max(train_struct_group.size().max(), test_struct_group.size().max())

In [17]:
# Columns taken over from the pair tables. Besides the molecule name (the later group-by
# key) this must include both atom indices: the feature frames were cut down to the selected
# feature columns when they were loaded, and the array builder needs the indices to place
# each row's features in the per-molecule matrices.
pair_key_cols = ['molecule_name', 'atom_index_0', 'atom_index_1']


def _attach_pair_keys(features, pairs):
    """Return `features` keyed by 'id' with `pair_key_cols` joined in from `pairs`.

    Safe to run more than once: 'index' is renamed only if still present, and previously
    attached key columns are dropped before the join so no *_x / *_y duplicates appear.
    """
    features = features.rename(columns={'index': 'id'})
    features = features.drop(columns=pair_key_cols, errors='ignore')
    return pd.merge(features, pairs[['id'] + pair_key_cols], how='left', on='id')


df_train2 = _attach_pair_keys(df_train2, train)
df_test2 = _attach_pair_keys(df_test2, test)

In [18]:
# Per-molecule groups of the extra pair-feature frames.
train_bonds2_group, test_bonds2_group = (
    feature_frame.groupby('molecule_name')
    for feature_frame in (df_train2, df_test2)
)

In [None]:
# Node features: the one-hot element columns (the coordinates 'x', 'y', 'z' are left out).
node_vals = ['C', 'F', 'H', 'N', 'O']

# Edge feature groups, in the order they are concatenated. The distance channel is not
# listed here; it is accounted for by `add_edge_dim` below.
j_coup_vals = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
bond_vals = ['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']
bond2_vals = unique_cols
ang_vals = ['shortest_path_n_bonds', 'cosinus', 'dihedral']
edge_vals = j_coup_vals + bond_vals + bond2_vals + ang_vals

# Number of molecules per split.
n_train_mols, n_test_mols = len(train_mol_names), len(test_mol_names)

# Width of every feature group.
node_dim = len(node_vals)
j_coup_dim, bond_dim, bond2_dim, ang_dim = map(
    len, (j_coup_vals, bond_vals, bond2_vals, ang_vals)
)
edge_dim = j_coup_dim + bond_dim + bond2_dim + ang_dim

# One extra edge channel for the inter-atomic distance.
add_edge_dim = 1

In [None]:
def _zeros(n_mols, *per_mol_shape):
    """float32 zero array with one leading row per molecule."""
    return np.zeros((n_mols,) + per_mol_shape, dtype=np.float32)


# Every molecule is padded to `max_size` atoms.
edge_channels = edge_dim + add_edge_dim

train_nodes_array = _zeros(n_train_mols, max_size, node_dim)
train_in_edges_array = _zeros(n_train_mols, max_size, max_size, edge_channels)
train_out_edges_array = _zeros(n_train_mols, max_size, max_size, 1)

test_nodes_array = _zeros(n_test_mols, max_size, node_dim)
test_in_edges_array = _zeros(n_test_mols, max_size, max_size, edge_channels)

In [None]:
def _pair_matrix(frame, cols):
    """Scatter `frame[cols]` into a symmetric (max_size, max_size, len(cols)) array.

    Row r of `frame` is written at [atom_index_0[r], atom_index_1[r]] and mirrored to the
    transposed position; every other entry stays 0.
    """
    mat = np.zeros((max_size, max_size, len(cols)))
    idx = frame[['atom_index_0', 'atom_index_1']].values
    mat[idx[:, 0], idx[:, 1]] = frame[cols].values
    mat[idx[:, 1], idx[:, 0]] = mat[idx[:, 0], idx[:, 1]]
    return mat


def make_arrs(val_group, struct_group, bond_group, bond2_group, ang_group, test):
    """Fill the pre-allocated node / edge arrays, one molecule per leading index.

    The five group arguments are iterated in lockstep, so the i-th group of each must
    describe the same molecule. Results are written into the module-level arrays
    (`train_*_array` when `test` is false, `test_*_array` otherwise); nothing is returned.

    Parameters
    ----------
    val_group : groups of the pair table; supplies 'atom_index_0', 'atom_index_1', the
        `j_coup_vals` columns and, for training, 'scalar_coupling_constant'.
    struct_group : groups of the structures table; supplies 'x', 'y', 'z' and `node_vals`.
    bond_group : groups of the bond table; supplies the atom indices and `bond_vals`.
    bond2_group : groups of the extra feature frame; supplies the atom indices and
        `bond2_vals`.
    ang_group : groups of the angle table; supplies the atom indices and `ang_vals`.
    test : bool
        False -> fill the training arrays (including targets); True -> fill the test arrays.

    Raises
    ------
    ValueError
        If the groups differ in length or the i-th groups refer to different molecules.
        Continuing would silently combine features of unrelated molecules.
    """
    groups = (val_group, struct_group, bond_group, bond2_group, ang_group)

    # zip() stops at the shortest input, so a size mismatch would otherwise go unnoticed.
    n_groups = [len(group) for group in groups]
    if len(set(n_groups)) != 1:
        raise ValueError(f'groups differ in number of molecules: {n_groups}')

    for i, members in enumerate(zip(*groups)):
        if (not i%1000):
            print(i)

        names = [name for name, _ in members]
        if len(set(names)) != 1:
            raise ValueError(f'groups are not aligned at position {i}: {names}')
        values, structs, bonds, bonds2, angles = (frame for _, frame in members)

        # Pairwise distances, zero-padded to max_size.
        coords = structs[['x','y','z']].values
        dists  = distance_matrix(coords, coords)
        distances = np.zeros((max_size, max_size, 1))
        distances[:dists.shape[0], :dists.shape[1], 0] = dists

        # Node features, zero-padded to max_size rows.
        mol_info = structs[node_vals].values
        nodes = np.zeros((max_size, node_dim))
        nodes[:mol_info.shape[0], :mol_info.shape[1]] = mol_info

        # Edge feature groups, each scattered by atom-index pair and mirrored.
        in_feats  = _pair_matrix(values, j_coup_vals)
        in_bonds  = _pair_matrix(bonds, bond_vals)
        in_bonds2 = _pair_matrix(bonds2, bond2_vals)
        ang_mat   = _pair_matrix(angles, ang_vals)

        # Channel order: coupling type, bond order, extra features, angles, distance.
        in_edges = np.concatenate((in_feats, in_bonds, in_bonds2, ang_mat, distances), axis=2)

        if not test:
            # Targets, laid out like the edge features.
            out_edges = _pair_matrix(values, ['scalar_coupling_constant'])

            train_nodes_array[i]      = nodes
            train_in_edges_array[i]   = in_edges
            train_out_edges_array[i]  = out_edges
        else:
            test_nodes_array[i]      = nodes
            test_in_edges_array[i]   = in_edges

In [None]:
# make_arrs takes five groups (pairs, structures, bonds, extra pair features, angles);
# the extra-feature group was missing, which shifted the angle group into its slot.
make_arrs(train_group, train_struct_group, train_bond_group, train_bonds2_group,
          train_angs_group, test=False)

In [None]:
# make_arrs takes five groups (pairs, structures, bonds, extra pair features, angles);
# the extra-feature group was missing, which shifted the angle group into its slot.
make_arrs(test_group, test_struct_group, test_bond_group, test_bonds2_group,
          test_angs_group, test=True)

In [None]:
# Write every array to its own compressed .npz archive in the working directory.
for npz_name, array in (
    ("nodes_train.npz", train_nodes_array),
    ("in_edges_train.npz", train_in_edges_array),
    ("out_edges_train.npz", train_out_edges_array),
    ("nodes_test.npz", test_nodes_array),
    ("in_edges_test.npz", test_in_edges_array),
):
    np.savez_compressed(npz_name, array)