In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.spatial import distance_matrix

from sklearn import preprocessing
import os




In [2]:
datadir = "../../data/input"
print(os.listdir(datadir))

['test.csv', 'structures', 'magnetic_shielding_tensors.csv', 'mulliken_charges.csv', 'potential_energy.csv', 'scalar_coupling_contributions.csv', 'dipole_moments.csv', 'structures.csv', 'train.csv']


In [3]:
tmpdir = "../../data/temp/mpnn_keras"
print(os.listdir(tmpdir))

['nodes_test.npz', 'test_bonds.csv', 'in_edges_train.npz', 'nodes_train.npz', 'out_edges_train.npz', 'angles.csv', 'train_bonds.csv', 'in_edges_test.npz']


In [4]:
train = pd.read_csv(datadir + '/train.csv')
test = pd.read_csv(datadir + '/test.csv')
structures = pd.read_csv(datadir + '/structures.csv')

In [5]:
train.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant'],
      dtype='object')

In [6]:
train_bonds = pd.read_csv(tmpdir + '/train_bonds.csv')
test_bonds = pd.read_csv(tmpdir + '/test_bonds.csv')

In [7]:
angs = pd.read_csv(tmpdir + '/angles.csv')

In [8]:
scale_min  = train['scalar_coupling_constant'].min()
scale_max  = train['scalar_coupling_constant'].max()
scale_mid = (scale_max + scale_min)/2
scale_norm = scale_max - scale_mid

train['scalar_coupling_constant'] = (train['scalar_coupling_constant'] - scale_mid)/scale_norm

# One hot encoding gets  too big for Kaggle, let's try label
# use npz now, back to OH
train[['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']] =  pd.get_dummies(train['type'])
test[['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']]  =  pd.get_dummies(test['type'])


In [9]:
structures[['C', 'F' ,'H', 'N', 'O']] = pd.get_dummies(structures['atom'])
structures[['x', 'y', 'z']] = structures[['x', 'y', 'z']]/10.

In [10]:
test_bonds[['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']] = pd.get_dummies(test_bonds['nbond'])#test_bonds['nbond']/3
train_bonds[['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']] = pd.get_dummies(train_bonds['nbond'])#train_bonds['nbond']/3

In [11]:
angs['dihedral'] = angs['dihedral']/np.pi
# Should I rather one-hot this?
angs['shortest_path_n_bonds'] = angs['shortest_path_n_bonds']/6.0
angs = angs.fillna(0)

In [12]:
train_mol_names = train['molecule_name'].unique()
test_mol_names  = test['molecule_name'].unique()

train_structures = structures.loc[structures['molecule_name'].isin(train_mol_names)]
test_structures = structures.loc[structures['molecule_name'].isin(test_mol_names)]

train_struct_group = train_structures.groupby('molecule_name')
test_struct_group  = test_structures.groupby('molecule_name')

train_group = train.groupby('molecule_name')
test_group  = test.groupby('molecule_name')

train_bond_group = train_bonds.groupby('molecule_name')
test_bond_group  = test_bonds.groupby('molecule_name')

train_angs = angs.loc[angs['molecule_name'].isin(train_mol_names)]
test_angs = angs.loc[angs['molecule_name'].isin(test_mol_names)]

train_angs_group = train_angs.groupby('molecule_name')
test_angs_group  = test_angs.groupby('molecule_name')

# Find max nodes in graph:
max_size = train_struct_group.size().max()

In [13]:
# Values our nodes will have
node_vals = ['C', 'F' ,'H', 'N', 'O']#, 'x', 'y', 'z']
#Values our edges will have (minus distance, for now)
bond_vals = ['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']#['nbond']
j_coup_vals = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']#'l_type']
ang_vals = ['shortest_path_n_bonds','cosinus','dihedral']
edge_vals = j_coup_vals + bond_vals + ang_vals

# Find amount of training molecules
n_train_mols = len(train_mol_names)
n_test_mols = len(test_mol_names)

# Find dim of edges and nodes
bond_dim  = len(bond_vals)
j_coup_dim= len(j_coup_vals)
ang_dim   = len(ang_vals)
node_dim  = len(node_vals)
edge_dim  = len(edge_vals) 

# Additional edge dims for distances 
add_edge_dim = 1

In [14]:
file_folder =  '../../data/feature'
df_data = pd.read_pickle(f'{file_folder}/df_train.gzde', compression='gzip')
df_data = df_data.rename(columns={'index':'id'})
const_col = ['id','group', 'type', 'scalar_coupling_constant', 'fc', 'sd','pso','dso']
atom_feats = []
bond_feats = []
for col in df_data.drop(columns=const_col).columns.tolist():
    if '0' in col:
        if col.replace('0','1') in df_data.columns.tolist():
            atom_feats.append(col)
        else:
            bond_feats.append(col)
    elif '1' in col:
        if col.replace('1','0') in df_data.columns.tolist():
            atom_feats.append(col)
        else:
            bond_feats.append(col)
    else:
        bond_feats.append(col)
        
bond2_dim = len(bond_feats)
df_data = pd.merge(df_data, train[['id', 'molecule_name']], on='id')
train_bond_group2 = df_data.groupby('molecule_name')

In [15]:
# Additional edge dims for bonds2 
add_edge_dim2 = bond2_dim

In [16]:
train_nodes_array     = np.zeros((n_train_mols, max_size, node_dim), dtype=np.float32) 
train_in_edges_array  = np.zeros((n_train_mols, max_size, max_size, edge_dim + add_edge_dim + add_edge_dim2),dtype=np.float32) 
train_out_edges_array = np.zeros((n_train_mols, max_size, max_size, 1),dtype=np.float32) 

test_nodes_array     = np.zeros((n_test_mols, max_size, node_dim), dtype=np.float32) 
test_in_edges_array  = np.zeros((n_test_mols, max_size, max_size, edge_dim + add_edge_dim),dtype=np.float32) 

MemoryError: 

In [29]:
def make_arrs(val_group, struct_group, bond_group, ang_group, train_bond_group2, test):
    i = 0
    for values, structs, bonds, angles, bonds2 in zip(val_group, struct_group, bond_group, ang_group, train_bond_group2):
        if (not i%1000):
            print(i)

        # Calculate distances
        distances = np.zeros((max_size, max_size, 1))
        coords = structs[1][['x','y','z']].values
        dists  = distance_matrix(coords, coords)
        distances[:dists.shape[0],:dists.shape[1], 0] = dists 
        print(distances.shape, dists.shape)
        
        # Create nodes
        mol_info = structs[1][node_vals].values
        nodes = np.zeros((max_size, node_dim))
        nodes[:mol_info.shape[0], :mol_info.shape[1]] = mol_info
        

        # Create edges
        in_feats = np.zeros((max_size, max_size, j_coup_dim))
        ind = values[1][['atom_index_0', 'atom_index_1' ]].values
        in_feats[ind[:,0], ind[:,1], 0:j_coup_dim] = values[1][j_coup_vals].values
        in_feats[ind[:,1], ind[:,0], 0:j_coup_dim] = in_feats[ind[:,0], ind[:,1], 0:j_coup_dim]
        print(values[1].columns)
        
        
        # Create bonds
        in_bonds = np.zeros((max_size, max_size, bond_dim))
        ind_bonds = bonds[1][['atom_index_0', 'atom_index_1' ]].values
        in_bonds[ind_bonds[:,0], ind_bonds[:,1]] = bonds[1][bond_vals].values
        in_bonds[ind_bonds[:,1], ind_bonds[:,0]] = in_bonds[ind_bonds[:,0], ind_bonds[:,1]]
        
        # Create bonds2
        in_bonds2 = np.zeros((max_size, max_size, bond2_dim))
        ind_bonds = bonds2[1][['atom_index_0', 'atom_index_1' ]].values
        in_bonds2[ind_bonds[:,0], ind_bonds[:,1]] = bonds2[1][bond_feats].values
        in_bonds2[ind_bonds[:,1], ind_bonds[:,0]] = in_bonds2[ind_bonds[:,0], ind_bonds[:,1]]
        
        # Create angles
        ind_angs = angles[1][['atom_index_0', 'atom_index_1' ]].values
        ang_mat  = np.zeros((max_size, max_size, ang_dim))
        ang_mat[ind_angs[:,0], ind_angs[:,1]]  = angles[1][ang_vals]
        ang_mat[ind_angs[:,1], ind_angs[:,0]]  = ang_mat[ind_angs[:,0], ind_angs[:,1]]
        print(angles[1][ang_vals].columns, angles[1][ang_vals].shape)
        
        
        # concat all edge values 
        in_edges = np.concatenate((in_feats, in_bonds, ang_mat, distances, in_bonds2),axis=2)
        print('in_edges shape:', in_edges.shape)



        
        if not test:           
            out_edges = np.zeros((max_size, max_size, 1))
            out_edges[ind[:,0], ind[:,1], 0] = values[1]['scalar_coupling_constant' ].values
            out_edges[ind[:,1], ind[:,0], 0] = out_edges[ind[:,0], ind[:,1], 0]
        

            train_nodes_array[i]      = nodes
            train_in_edges_array[i]   = in_edges
            train_out_edges_array[i]  = out_edges
        else:
            test_nodes_array[i]      = nodes
            test_in_edges_array[i]   = in_edges
        i = i + 1
        break

In [30]:
make_arrs(train_group, train_struct_group, train_bond_group, train_angs_group, train_bond_group2, test = False)

0
(29, 29, 1) (5, 5)
Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', '1JHC', '1JHN', '2JHC', '2JHH', '2JHN',
       '3JHC', '3JHH', '3JHN'],
      dtype='object')
Index(['shortest_path_n_bonds', 'cosinus', 'dihedral'], dtype='object') (10, 3)
in_edges shape: (29, 29, 244)


ValueError: could not broadcast input array from shape (29,29,244) into shape (29,29,16)

In [None]:
make_arrs(test_group, test_struct_group, test_bond_group, test_angs_group, test = True)

In [None]:
np.savez_compressed("nodes_train.npz" , train_nodes_array)
np.savez_compressed("in_edges_train.npz" , train_in_edges_array)
np.savez_compressed("out_edges_train.npz" , train_out_edges_array)

np.savez_compressed("nodes_test.npz" , test_nodes_array)
np.savez_compressed("in_edges_test.npz" , test_in_edges_array)