In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [4]:
# !{sys.executable} -m pip install torch_sparse

In [2]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel

In [3]:
import pandas as pd
import numpy as np
import os
import os.path as osp
import datetime

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

from torch_geometric.data import Data, DataLoader
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import QM9
from torch_geometric.datasets import TUDataset
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, ChebConv  # noqa
from torch.nn import Sequential, Linear, ReLU, GRU
import torch_geometric.transforms as T
from torch_geometric.datasets import QM9
from torch_geometric.nn import NNConv, Set2Set
from torch_geometric.data import DataLoader
from torch_geometric.utils import remove_self_loops
from torch_geometric.data import Data

In [4]:
file_folder = '../../data/input'
os.listdir(file_folder)

['sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'train.csv',
 'test.csv',
 'structures.csv',
 'structures']

In [9]:
atomic_radius = {'H': 0.38, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.71}
atomic_number = {'H': 1, 'C': 6, 'N': 7, 'O': 8, 'F': 9}
atomic_mass = {'H': 1.0079, 'C': 12.0107, 'N': 14.0067, 'O': 15.9994, 'F': 18.9984}
vanderwaalsradius = {'H': 120, 'C': 185, 'N': 154, 'O': 140, 'F': 135}
covalenzradius = {'H': 30, 'C': 77, 'N': 70, 'O': 66, 'F': 58}
electronegativity = {'H': 2.2, 'C': 2.55, 'N': 3.04, 'O': 3.44, 'F': 3.98}
ionization_energy = {'H': 13.5984, 'C': 11.2603, 'N': 14.5341, 'O': 13.6181, 'F': 17.4228}

In [10]:
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
train['type0'] = train['type'].apply(lambda x: int(x[0]))
train['type1'] = train['type'].apply(lambda x: x[1:])

le = LabelEncoder()
le.fit(train.type1.tolist())
int_bond_type = le.transform(train.type1.tolist()) 
train['int_type1']= int_bond_type

test['type0'] = test['type'].apply(lambda x: int(x[0]))
test['type1'] = test['type'].apply(lambda x: x[1:])
int_bond_type = le.transform(test.type1.tolist()) 
test['int_type1']= int_bond_type

In [11]:
le.classes_

array(['JHC', 'JHH', 'JHN'], dtype='<U3')

In [12]:
magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')

In [13]:
x = magnetic_shielding_tensors.columns.values[2:]
x = magnetic_shielding_tensors[x].values
x = x.reshape(-1,3,3)
x = x + np.transpose(x,(0,2,1))
x = 0.5 * x
w, v = np.linalg.eigh(x)

sigma_iso = np.sum(w, axis=1)/3 
omega = w[:,2] - w[:,0]
kappa = 3 * (sigma_iso - w[:,1])/omega

magnetic_shielding_parameters = magnetic_shielding_tensors[magnetic_shielding_tensors.columns.values[:2]]
magnetic_shielding_parameters = pd.DataFrame(magnetic_shielding_parameters)
magnetic_shielding_parameters["sigma_iso"] = sigma_iso
magnetic_shielding_parameters["omega"] = omega
magnetic_shielding_parameters["kappa"] = kappa

In [14]:
atomic_radius = {'H': 0.38, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.71}
atomic_number = {'H': 1, 'C': 6, 'N': 7, 'O': 8, 'F': 9}
atomic_mass = {'H': 1.0079, 'C': 12.0107, 'N': 14.0067, 'O': 15.9994, 'F': 18.9984}
vanderwaalsradius = {'H': 120, 'C': 185, 'N': 154, 'O': 140, 'F': 135}
covalenzradius = {'H': 30, 'C': 77, 'N': 70, 'O': 66, 'F': 58}
electronegativity = {'H': 2.2, 'C': 2.55, 'N': 3.04, 'O': 3.44, 'F': 3.98}
ionization_energy = {'H': 13.5984, 'C': 11.2603, 'N': 14.5341, 'O': 13.6181, 'F': 17.4228}

structures['atomic_radius'] = structures['atom'].apply(lambda x: atomic_radius[x])
structures['atomic_number'] = structures['atom'].apply(lambda x: atomic_number[x])
structures['atomic_mass'] = structures['atom'].apply(lambda x: atomic_mass[x])
structures['vanderwaalsradius'] = structures['atom'].apply(lambda x: vanderwaalsradius[x])
structures['covalenzradius'] = structures['atom'].apply(lambda x: covalenzradius[x])
structures['electronegativity'] = structures['atom'].apply(lambda x: electronegativity[x])
structures['ionization_energy'] = structures['atom'].apply(lambda x: ionization_energy[x])


In [15]:
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,atomic_radius,atomic_number,atomic_mass,vanderwaalsradius,covalenzradius,electronegativity,ionization_energy
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,0.77,6,12.0107,185,77,2.55,11.2603
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,0.38,1,1.0079,120,30,2.2,13.5984
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,0.38,1,1.0079,120,30,2.2,13.5984
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,0.38,1,1.0079,120,30,2.2,13.5984
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,0.38,1,1.0079,120,30,2.2,13.5984


In [16]:
le = LabelEncoder()
le.fit(structures.atom.tolist())
int_atom = le.transform(structures.atom.tolist()) 
structures['int_atom'] = int_atom

In [17]:
le.classes_

array(['C', 'F', 'H', 'N', 'O'], dtype='<U1')

In [15]:
magnetic_shielding_parameters.head()

Unnamed: 0,molecule_name,atom_index,sigma_iso,omega,kappa
0,dsgdb9nsd_000001,0,195.316333,0.002708,0.032697
1,dsgdb9nsd_000001,1,31.460567,9.082838,0.999975
2,dsgdb9nsd_000001,2,31.460567,9.082793,0.999984
3,dsgdb9nsd_000001,3,31.460633,9.082804,0.999977
4,dsgdb9nsd_000001,4,31.4606,9.082853,0.999971


In [16]:
mulliken_charges.head()

Unnamed: 0,molecule_name,atom_index,mulliken_charge
0,dsgdb9nsd_000001,0,-0.535689
1,dsgdb9nsd_000001,1,0.133921
2,dsgdb9nsd_000001,2,0.133922
3,dsgdb9nsd_000001,3,0.133923
4,dsgdb9nsd_000001,4,0.133923


In [25]:
dipole_moments['rms'] = np.sqrt(dipole_moments['X']**2 + dipole_moments['Y']**2 + dipole_moments['Z']**2 )

In [18]:
dipole_moments.head()#MAE 3.x without scaled y, 2.x with scaled y

Unnamed: 0,molecule_name,X,Y,Z,rms
0,dsgdb9nsd_000001,0.0,0.0,0.0,0.0
1,dsgdb9nsd_000002,-0.0002,0.0,1.6256,1.6256
2,dsgdb9nsd_000003,0.0,0.0,-1.8511,1.8511
3,dsgdb9nsd_000005,0.0,0.0,-2.8937,2.8937
4,dsgdb9nsd_000007,0.0,0.0,0.0,0.0


In [19]:
potential_energy.head()#MAE 30.x with scaled y

Unnamed: 0,molecule_name,potential_energy
0,dsgdb9nsd_000001,-40.52368
1,dsgdb9nsd_000002,-56.56025
2,dsgdb9nsd_000003,-76.42608
3,dsgdb9nsd_000005,-93.42849
4,dsgdb9nsd_000007,-79.83869


In [26]:
molecule_info = pd.merge(dipole_moments, potential_energy, how = 'left', left_on  = ['molecule_name'], right_on = ['molecule_name'])

In [27]:
molecule_info.head()

Unnamed: 0,molecule_name,X,Y,Z,rms,potential_energy
0,dsgdb9nsd_000001,0.0,0.0,0.0,0.0,-40.52368
1,dsgdb9nsd_000002,-0.0002,0.0,1.6256,1.6256,-56.56025
2,dsgdb9nsd_000003,0.0,0.0,-1.8511,1.8511,-76.42608
3,dsgdb9nsd_000005,0.0,0.0,-2.8937,2.8937,-93.42849
4,dsgdb9nsd_000007,0.0,0.0,0.0,0.0,-79.83869


In [20]:
for col in molecule_info.columns.tolist():
    print(np.where(molecule_info[col].isna())[0].shape)

(0,)
(0,)
(0,)
(0,)
(0,)


In [21]:
atom_info = pd.merge(magnetic_shielding_parameters, mulliken_charges, how = 'left', left_on  = ['molecule_name','atom_index'], right_on = ['molecule_name','atom_index'])

In [22]:
atom_info.head()

Unnamed: 0,molecule_name,atom_index,sigma_iso,omega,kappa,mulliken_charge
0,dsgdb9nsd_000001,0,195.316333,0.002708,0.032697,-0.535689
1,dsgdb9nsd_000001,1,31.460567,9.082838,0.999975,0.133921
2,dsgdb9nsd_000001,2,31.460567,9.082793,0.999984,0.133922
3,dsgdb9nsd_000001,3,31.460633,9.082804,0.999977,0.133923
4,dsgdb9nsd_000001,4,31.4606,9.082853,0.999971,0.133923


In [23]:
for col in atom_info.columns.tolist():
    print(np.where(atom_info[col].isna())[0].shape)

(0,)
(0,)
(0,)
(0,)
(0,)
(0,)


In [28]:
y_std = np.std(molecule_info[['potential_energy','rms']].values, axis=0)
y_mean = np.mean(molecule_info[['potential_energy','rms']].values, axis=0)
y_std, y_mean

(array([39.82863665,  1.49279267]), array([-410.87405548,    2.67236092]))

In [29]:
def _worker(item, df_bonds, df_structures, molecule_info, has_y):
    
    idx = item[0]
    molecule_name = item[1]
    
    # point attribute
    x = torch.tensor(df_structures[df_structures['molecule_name']==molecule_name].sort_values(by=['atom_index'])[['int_atom','atomic_radius', 'atomic_number', 'atomic_mass', 'vanderwaalsradius', 'covalenzradius', 'electronegativity', 'ionization_energy']].values, dtype=torch.float)
    # position
    pos = torch.tensor(df_structures[df_structures['molecule_name']==molecule_name].sort_values(by=['atom_index'])[['x','y','z']].values, dtype=torch.float)

    edge_index = []
    edge_attr = []

    df_molecule_i = df_bonds[df_bonds['molecule_name']==molecule_name]
    molecule_info_i = molecule_info[molecule_info['molecule_name']==molecule_name]

    if has_y:
        y = (molecule_info_i[['potential_energy','rms']].values-y_mean)/y_std
        y = torch.tensor(y, dtype=torch.float)

    for idx, row in df_molecule_i.iterrows():
        edge_index.append([row['atom_index_0'], row['atom_index_1']])
        edge_index.append([row['atom_index_1'], row['atom_index_0']])
        edge_attr.append([row['type0'], row['int_type1']])
        edge_attr.append([row['type0'], row['int_type1']])

    edge_index = torch.tensor(edge_index, dtype=torch.long)
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)
    if has_y:
        data = Data(x=x, edge_index=edge_index.t().contiguous(), pos=pos, edge_attr=edge_attr, y=y)
    else:
        data = Data(x=x, edge_index=edge_index.t().contiguous(), pos=pos, edge_attr=edge_attr)
    d_ =  {'idx':idx, 'molecule_name':molecule_name, 'data':data}
#     print(d_)
    return d_


def generate_datalist(df_bonds, df_structures, molecule_info, molecule_names, has_y = True):
    N_ = len(molecule_names)
    segments = N_//100
    data_list = []
    
#     data_list = Parallel(_worker,{'df_bonds':df_bonds, 'df_structures':df_structures, 'molecule_info':molecule_info, 'has_y':has_y}).run(zip(np.arange(N_), molecule_names), n_jobs=4)
    
#     for start in tqdm_notebook(range(0, N_, segments)):
#         end = start + segments if start + segments < N_ else N_
#         res = Parallel(_worker,{'df_bonds':df_bonds, 'df_structures':df_structures, 'molecule_info':molecule_info, 'has_y':has_y}).run(zip(np.arange(start,end,1), molecule_names[start:end]), n_jobs=4)
    for item in tqdm_notebook(enumerate(molecule_names), total=N_):
        d_ = _worker(item, **{'df_bonds':df_bonds, 'df_structures':df_structures, 'molecule_info':molecule_info, 'has_y':has_y})
        data_list.append(d_)
    df_ = pd.DataFrame(data_list)
    df_ = df_.sort_values(by=['idx']).reset_index(drop=True)
    return df_
    

In [None]:
molecule_names = train.molecule_name.unique().tolist()

In [36]:
df_train = generate_datalist(train, structures, molecule_info, molecule_names=molecule_names)

HBox(children=(IntProgress(value=0, max=85003), HTML(value='')))




In [31]:
molecule_names = test.molecule_name.unique().tolist()

In [None]:
test = generate_datalist(test, structures, molecule_info, molecule_names=molecule_names)

HBox(children=(IntProgress(value=0, max=45772), HTML(value='')))

In [37]:
# df_train.to_pickle('../../data/feature/graph_train.gzip', compression='gzip')

In [5]:
df_train = pd.read_pickle('../../data/feature/graph_train.gzip', compression='gzip')

In [6]:
df_train.head()

Unnamed: 0,data,idx,molecule_name
0,"[(edge_attr, [tensor([1., 0.]), tensor([1., 0....",9,dsgdb9nsd_000001
1,"[(edge_attr, [tensor([1., 2.]), tensor([1., 2....",15,dsgdb9nsd_000002
2,"[(edge_attr, [tensor([2., 1.]), tensor([2., 1....",16,dsgdb9nsd_000003
3,"[(edge_attr, [tensor([1., 0.]), tensor([1., 0....",18,dsgdb9nsd_000005
4,"[(edge_attr, [tensor([1., 0.]), tensor([1., 0....",45,dsgdb9nsd_000007


In [7]:
df_train.data.values[0]

Data(edge_attr=[20, 2], edge_index=[2, 20], pos=[5, 3], x=[5, 8], y=[1, 2])

In [8]:
def generate_dataLoader(df, molecule_names, shuffle=True, batch_size=32, **kwargs):
    data_list = df[df['molecule_name'].isin(molecule_names)]['data'].tolist()
    return DataLoader(data_list, batch_size=batch_size, shuffle=shuffle, **kwargs)

In [40]:
for i in train_molecule_names
train_molecule_names, valid_molecule_names =  train_test_split(molecule_names, train_size=.8, shuffle=True)



In [42]:
# test_loader = generate_dataLoader(df_data_train, test_molecule_names)
val_loader = generate_dataLoader(df_train, valid_molecule_names)
train_loader = generate_dataLoader(df_train, train_molecule_names)

In [43]:
gnn_param = {
    'node_expand_dim':64,
    'link_expand_dim':128,
    'node_dim':8,
    'link_dim':2,
    'conv_layers':3
}
# node_expand_dim = 64
# link_expand_dim = 128
# node_dim = 8
# link_dim = 2
# conv_layers = 3

class Net(torch.nn.Module):
    
    def __init__(self, node_expand_dim, link_expand_dim, node_dim, link_dim, conv_layers):
        super(Net, self).__init__()
        
        self.node_expand_dim = node_expand_dim
        self.link_expand_dim = link_expand_dim
        self.node_dim = node_dim
        self.link_dim = link_dim
        self.conv_layers = conv_layers
        
        self.lin0 = torch.nn.Linear(node_dim, node_expand_dim)
        nn = Sequential(Linear(link_dim, link_expand_dim), ReLU(), Linear(link_expand_dim, node_expand_dim * node_expand_dim))
        self.conv = NNConv(node_expand_dim, node_expand_dim, nn, aggr='mean', root_weight=False)
        self.gru = GRU(node_expand_dim, node_expand_dim)
        self.set2set = Set2Set(node_expand_dim, processing_steps=3)
        self.lin3 = torch.nn.Linear(node_expand_dim, 2 * node_expand_dim)
        self.lin1 = torch.nn.Linear(2 * node_expand_dim, node_expand_dim)
        self.lin2 = torch.nn.Linear(node_expand_dim, 1)

    def forward(self, data):
        out = F.relu(self.lin0(data.x))
        h = out.unsqueeze(0)
        for i in range(self.conv_layers):
            m = F.relu(self.conv(out, data.edge_index, data.edge_attr))
            out, h = self.gru(m.unsqueeze(0), h)
            out = out.squeeze(0)
            
        out = self.set2set(out, data.batch)
#         out = F.relu(self.lin3(out))
        out = F.relu(self.lin1(out))
        out = self.lin2(out)
#         print(out.shape)
#         out = out.view(-1)
        return out



class GNN(object):
    
    def __init__(self, node_expand_dim, link_expand_dim, node_dim, link_dim, conv_layers):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = Net(node_expand_dim, link_expand_dim, node_dim, link_dim, conv_layers).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, mode='min', factor=0.7, patience=5, min_lr=0.00001)
        return

    def fit(self, train_loader, val_loader, epochs, y_std, y_mean, y_index=0, verbose=0):

        his = []
        for epoch in range(1, epochs+1):
            lr = self.scheduler.optimizer.param_groups[0]['lr']
            loss = self._train(epoch, train_loader, y_index, self.model, self.device, self.optimizer)
            val_error = self._test(val_loader, y_index, self.model, self.device, self.optimizer, y_std, y_mean)
            self.scheduler.step(val_error)
            his.append({'epoch':epoch, 'lr':lr, 'loss':loss, 'val_error':val_error})
            if verbose > 0:
#                 if epoch % (epochs//10) == 0:
                print('Epoch: {:03d}, LR: {:7f}, Loss: {:.7f}, Validation MAE: {:.7f}'.format(epoch, lr, loss, val_error))
        return his
    
    def predict(self, loader, y_std, y_mean, y_index=0):
        model.eval()
        y_pred_list  = []
        for data in loader:
            data = data.to(device)
            std = torch.tensor(y_std[y_index:y_index+1].reshape(1,1), dtype=torch.float)
            std = std.to(device)
            y_pred = model(data)
            y_pred_list.append(y_pred.item())
        y_pred = np.array(y_pred_list)
        y_pred = y_pred*y_std + y_mean
        return y_pred
    
    def _train(self, epoch, loader, y_index, model, device, optimizer):
        model.train()
        loss_all = 0
        for data in loader:
            data = data.to(device)
            optimizer.zero_grad()
            y_pred = model(data)
            loss = F.mse_loss(y_pred, data.y[:,y_index:y_index+1])
            loss.backward()
            loss_all += loss.item() * data.num_graphs
            optimizer.step()
        return loss_all / len(loader.dataset)


    def _test(self, loader, y_index, model, device, optimizer, y_std, y_mean):
        model.eval()
        error = 0
        for data in loader:
            data = data.to(device)
            std = torch.tensor(y_std[y_index:y_index+1].reshape(1,1), dtype=torch.float)
            std = std.to(device)
            y_pred = model(data)
#             print(y_pred.shape)            
            error += (y_pred.mm(std) - data.y[:,y_index:y_index+1].mm(std)).abs().sum().item()  # MAE
        return error / len(loader.dataset)

In [44]:
gnn = GNN(**gnn_param)

In [None]:
his  = gnn.fit(train_loader, val_loader, 100, y_std, y_mean, y_index=1, verbose=1)

Epoch: 001, LR: 0.001000, Loss: 0.8995454, Validation MAE: 1.0764222
Epoch: 002, LR: 0.001000, Loss: 0.8312789, Validation MAE: 0.9917343
Epoch: 003, LR: 0.001000, Loss: 0.8373433, Validation MAE: 1.0545565
Epoch: 004, LR: 0.001000, Loss: 0.8354087, Validation MAE: 1.0621529
Epoch: 005, LR: 0.001000, Loss: 0.8302558, Validation MAE: 0.9906469
Epoch: 006, LR: 0.001000, Loss: 0.7646985, Validation MAE: 0.9733653
Epoch: 007, LR: 0.001000, Loss: 0.7212076, Validation MAE: 0.9276138
Epoch: 008, LR: 0.001000, Loss: 0.7138798, Validation MAE: 0.9443923
Epoch: 009, LR: 0.001000, Loss: 0.7097099, Validation MAE: 0.9370616
Epoch: 010, LR: 0.001000, Loss: 0.7077996, Validation MAE: 0.9285092
Epoch: 011, LR: 0.001000, Loss: 0.7070415, Validation MAE: 0.9407990
Epoch: 012, LR: 0.001000, Loss: 0.7046720, Validation MAE: 0.9240429
Epoch: 013, LR: 0.001000, Loss: 0.7047568, Validation MAE: 0.9255332
Epoch: 014, LR: 0.001000, Loss: 0.7055413, Validation MAE: 0.9225434
Epoch: 015, LR: 0.001000, Loss: 0.

In [175]:
y_pred  = gnn.predict(val_loader, y_std, y_mean, y_index=1)

TypeError: predict() missing 2 required positional arguments: 'y_std' and 'y_mean'