In [448]:
# import necessary packages
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.utils import shuffle 
from sklearn.metrics import classification_report
import re

np.random.seed(1)


In [449]:
#pd.set_option('display.max_rows',None)
#pd.set_option('display.max_columns',None)

In [450]:
# import dataset
dataset = pd.read_csv('cages.csv')
dataset.head()

Unnamed: 0,name,reaction,topology,fingerprint,collapsed,cavity_size,max_diameter,windows,window_diff,window_std
0,0,thiol2thiol3,FourPlusSix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0,0.0,29.078668,,,
1,1,thiol2thiol3,FourPlusSix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0,0.0,32.572897,,,
2,2,thiol2thiol3,FourPlusSix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0,0.0,35.421492,,,
3,3,thiol2thiol3,FourPlusSix,"[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0,1.049308,25.011154,"[2.168335046395502, 1.97799126890146]",,
4,4,thiol2thiol3,FourPlusSix,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",1.0,1.124412,28.322324,[2.9717479377190696],,


In [451]:
# take out the reaction type 'amine2aldehyde3' and topology type 'FourPlusSix'
subdataset = dataset[dataset['reaction'] == 'amine2aldehyde3']
subdataset1 = subdataset[subdataset['topology'] == 'FourPlusSix']

# eliminate the nan data
subdataset2 = subdataset1[subdataset1['collapsed'].notnull()]
subdataset3 = subdataset2['fingerprint']


# build the overall training set
fingerprints =[]
single_fingerprint = []

# split the string by commas
# transfer string to float32
for i in range(0,len(subdataset3)):
    single_fingerprint = subdataset3.iloc[i,].split(',')
    for i in range(0, len(single_fingerprint)):
        single_fingerprint[i] = re.findall("\d+", single_fingerprint[i])
        single_fingerprint[i] = [float(x) for x in single_fingerprint[i]]
    fingerprints.append(single_fingerprint)

fingerprints = np.squeeze(np.array(fingerprints)) # to array and reduce the dimension
#print(fingerprints)

# build the labels
labels_all = subdataset2['collapsed'].values
#print(labels)


In [452]:
num_dataset = len(fingerprints)
#print(num_dataset)
dataset_index = np.arange(num_dataset) 

# divide the dataset by labels
labels_zero_index = list(np.where(labels_all == 0))
labels_one_index = list(np.where(labels_all == 1))
#print(np.array(labels_zero_index).size)
#print(np.array(labels_one_index).size)

fingerprints_zero = fingerprints[labels_zero_index]
fingerprints_one = fingerprints[labels_one_index]
labels_zero = labels_all[labels_zero_index]
labels_one = labels_all[labels_one_index]

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  del sys.path[0]
  


In [453]:
# shuffle the datasets
num_dataset_zero = len(fingerprints_zero)
num_dataset_one = len(fingerprints_one)
fingerprints_zero_index = np.arange(num_dataset_zero)
fingerprints_one_index = np.arange(num_dataset_one)

shuffle_fingerprints_zero_index = shuffle(fingerprints_zero_index)
shuffle_fingerprints_one_index = shuffle(fingerprints_one_index)

In [454]:
proportion_test = 0.2 # 80% train_dev set, 20% test set
num_fingerprints_zero_test = int(proportion_test * num_dataset_zero)
num_fingerprints_one_test = int(proportion_test * num_dataset_one)

In [455]:
# build the test set and test label
index_fingerprints_zero_test = shuffle_fingerprints_zero_index[:num_fingerprints_zero_test]
index_fingerprints_one_test = shuffle_fingerprints_one_index[: num_fingerprints_one_test]

fingerprints_zero_test = fingerprints_zero[index_fingerprints_zero_test]
fingerprints_one_test = fingerprints_one[index_fingerprints_one_test]
label_zero_test = labels_zero[index_fingerprints_zero_test]
label_one_test = labels_one[index_fingerprints_one_test]

test_set = np.concatenate((fingerprints_zero_test, fingerprints_one_test), axis = 0)
test_label = np.concatenate((label_zero_test, label_one_test), axis = 0)
#print(test_set.shape)
#print(test_label.shape)

In [456]:
# build the train_dev set and train_dev label
index_train_dev_zero = shuffle_fingerprints_zero_index[num_fingerprints_zero_test:]
index_train_dev_one = shuffle_fingerprints_one_index[num_fingerprints_one_test:]
fingerprints_zero_train_dev = fingerprints_zero[index_train_dev_zero]
fingerprints_one_train_dev = fingerprints_one[index_train_dev_one]
label_zero_train_dev = labels_zero[index_train_dev_zero]
label_one_train_dev = labels_one[index_train_dev_one]

train_dev_set = np.concatenate((fingerprints_zero_train_dev, fingerprints_one_train_dev), axis = 0)
train_dev_label = np.concatenate((label_zero_train_dev, label_one_train_dev), axis = 0)
#print(train_dev_set.shape)
#print(train_dev_label.shape)


In [457]:
# evenly distribute data in test and train_dev set
num_test = len(test_set)
num_train_dev = len(train_dev_set)
test_index = np.arange(num_test)
train_dev_index = np.arange(num_train_dev)

shuffle_num_test = shuffle(test_index)
shuffle_num_train_dev = shuffle(train_dev_index)

In [458]:
# final version of test set and train_dev set
test_set = test_set[shuffle_num_test]
test_label = test_label[shuffle_num_test][:, np.newaxis]    # shuffle and add one dimension

train_dev_set = train_dev_set[shuffle_num_train_dev]
train_dev_label = train_dev_label[shuffle_num_train_dev][:, np.newaxis]   # shuffle and add one dimension


In [459]:
# transfer test set and train_dev set from array to tensor
test_set = torch.from_numpy(test_set).float()
test_label = torch.from_numpy(test_label).float()

train_dev_set = torch.from_numpy(train_dev_set).float() 
train_dev_label = torch.from_numpy(train_dev_label).float()

print(test_set.shape)
print(test_label.shape)
print(train_dev_set.shape)
print(train_dev_label.shape)

torch.Size([915, 1024])
torch.Size([915, 1])
torch.Size([3668, 1024])
torch.Size([3668, 1])


In [460]:
# construct the neural network
class collapse_model(nn.Module):
    def __init__(self, x_size, hidden1_size, hidden2_size, hidden3_size, y_size):
        super(collapse_model, self).__init__()
        self.hidden1 = nn.Linear(x_size, hidden1_size)
        #self.batch1 = nn.BatchNorm1d(hidden1_size)   # add batch norm 1
        self.hidden2 = nn.Linear(hidden1_size, hidden2_size)
        #self.batch2 = nn.BatchNorm1d(hidden2_size)  # add batch norm 2
        self.hidden3 = nn.Linear(hidden2_size, hidden3_size)
        self.predict = nn.Linear(hidden3_size, y_size)
    def forward(self, input):
        result = self.hidden1(input)
        #result = self.batch1(result)   # add batch norm 1
        result = F.leaky_relu(result)
        result = self.hidden2(result)
        #result = self.batch2(result)   # add batch norm 2
        result = F.leaky_relu(result)
        result = self.hidden3(result)
        result = torch.sigmoid(result)
        result = self.predict(result)
        
        return result

net = collapse_model(1024, 512, 256, 64, 1)
print(net)

collapse_model(
  (hidden1): Linear(in_features=1024, out_features=512, bias=True)
  (hidden2): Linear(in_features=512, out_features=256, bias=True)
  (hidden3): Linear(in_features=256, out_features=64, bias=True)
  (predict): Linear(in_features=64, out_features=1, bias=True)
)


In [461]:
# set optimizer and loss function
#optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0) # 87.10%
#optimizer = optim.Adam(net.parameters(), lr=0.001, eps=1e-08, weight_decay=0) # 87.32
#optimizer = optim.Adam(net.parameters(), lr=0.002, eps=1e-08, weight_decay=0) # 87.21
optimizer = optim.Adam(net.parameters(), lr=0.00001, eps=1e-08, weight_decay=0)
loss_func = nn.BCEWithLogitsLoss() 

In [462]:
# use GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_dev_set = train_dev_set.to(device)
train_dev_label = train_dev_label.to(device)
test_set = test_set.to(device)
test_label = test_label.to(device)

net = net.to(device)
loss_func = loss_func.to(device)

In [463]:
# training and evaluation
def evaluate_accuracy(y, pred):
    num_set = len(y)

    predicted = pred.ge(0.5).view(-1)  
    performance = (y == predicted).sum().float() / num_set

    
    return performance

In [464]:
def train(net, train_x, train_y, test_x, test_y, loss, num_epochs, optimizer):

    for epoch in range(num_epochs):

        pred = net(train_x)
        pred = torch.squeeze(pred)
        l = loss_func(pred, train_y)
        #optimizer.zero_grad()
        #l.backward()
        #optimizer.step()
        train_loss = l.item()


        if epoch == 0 or (epoch + 1) % 100 == 0:
            train_acc = evaluate_accuracy(train_y, pred)

            test_pred = net(test_x)
            test_pred = torch.squeeze(test_pred)
            test_loss = loss_func(test_pred, test_y)
            test_acc = evaluate_accuracy(test_y, test_pred)
            test_loss += test_loss.item()

            print('epoch %d ,train loss %.4f'%(epoch + 1,train_loss)+', train accuracy {:.2f}%'.format(train_acc*100))
            print('test loss %.4f'% (test_loss)+' test accuracy {:.2f}%'.format(test_acc*100))
        
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        

In [465]:
num_epochs = 3000
train(net, train_dev_set, torch.squeeze(train_dev_label), test_set, torch.squeeze(test_label), loss_func, num_epochs, optimizer)

epoch 1 ,train loss 0.6979, train accuracy 50.49%
test loss 1.3955 test accuracy 50.49%
epoch 100 ,train loss 0.6782, train accuracy 50.49%
test loss 1.3605 test accuracy 50.49%
epoch 200 ,train loss 0.6302, train accuracy 51.36%
test loss 1.2739 test accuracy 51.26%
epoch 300 ,train loss 0.5417, train accuracy 71.76%
test loss 1.1153 test accuracy 72.13%
epoch 400 ,train loss 0.4610, train accuracy 80.18%
test loss 0.9806 test accuracy 78.47%
epoch 500 ,train loss 0.4071, train accuracy 83.29%
test loss 0.8931 test accuracy 80.33%
epoch 600 ,train loss 0.3737, train accuracy 85.69%
test loss 0.8397 test accuracy 81.75%
epoch 700 ,train loss 0.3516, train accuracy 86.83%
test loss 0.8072 test accuracy 83.72%
epoch 800 ,train loss 0.3354, train accuracy 87.43%
test loss 0.7865 test accuracy 84.70%
epoch 900 ,train loss 0.3220, train accuracy 87.92%
test loss 0.7716 test accuracy 84.81%
epoch 1000 ,train loss 0.3102, train accuracy 88.50%
test loss 0.7601 test accuracy 84.81%
epoch 1100 

In [466]:
#evaluation
classes = ['collapsed: 0','collapsed: 1']
y_pred = net(test_set)
y_pred = y_pred.ge(.5).view(-1).cpu()
y_test = torch.squeeze(test_label).cpu()
print(classification_report(y_test, y_pred, target_names=classes))

              precision    recall  f1-score   support

collapsed: 0       0.87      0.87      0.87       462
collapsed: 1       0.87      0.87      0.87       453

    accuracy                           0.87       915
   macro avg       0.87      0.87      0.87       915
weighted avg       0.87      0.87      0.87       915



In [467]:
#from google.colab import drive
#drive.mount('/drive')

In [468]:
#%cd ..
#%cd drive/MyDrive/Colab_Notebooks/kimjelfs