In [1]:
import torch
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import math
from sklearn.model_selection import train_test_split
from torch.nn import Module, Conv2d, MaxPool2d, Linear
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [2]:
# define parameters for training and device
starting_features = 5
batchsize = 16
num_epochs = 10
# If a GPU is available, then send it to that GPU rather than train on CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda', index=0)

In [4]:
# Load the dataset. 
# molecule_features: First col is a string containing name of molecule. Rest are floats containing its features
# eutectic_compilation: First 2 cols are strings containing molecule names, third col is eutectic proportion, fourth is eutectic temperature

molecule_features = pd.read_csv("D:\\Research\\UConn_ML\\Code\\data\\eutectic_mixtures-main\\single_components.csv").drop(["xlogp"], axis=1)
eutectic_compilation = pd.read_csv("D:\\Research\\UConn_ML\\Code\\data\\eutectic_mixtures-main\\eutectic_compilation.csv")

molecule_features[molecule_features.columns[1:]] = molecule_features[molecule_features.columns[1:]].astype(float)
eutectic_compilation[eutectic_compilation.columns[3]] = eutectic_compilation[eutectic_compilation.columns[3]].astype(float)

# Some indices in eutectic_compilation do not exist in single_components. They will need to be removed.
# Some of the eutectic proportions are inconvertible to floats from strings. They will also need to be removed.
drops = np.array([]).astype(int)
for i in range(len(eutectic_compilation)):
    ec = eutectic_compilation.iloc[i]
    m1 = ec[0]
    m2 = ec[1]
    xe = ec[2]
    
    m1f = molecule_features.loc[molecule_features.mol == ec[0]]
    m2f = molecule_features.loc[molecule_features.mol == ec[1]]
    
    try:
        xe = float(xe)
    except ValueError:
        drops = np.append(drops, i)
    
    if(len(m1f) == 0 or len(m2f) == 0):
        drops = np.append(drops, i)

eutectic_compilation = eutectic_compilation.drop(eutectic_compilation.index[drops])

In [5]:
molecule_features

Unnamed: 0,mol,molecular_weight,complexity,rotatable_bond_count,heavy_atom_count,topological_area
0,BaO,153.3300,2.0,0.0,2.0,17.1
1,CrCl3,158.3500,0.0,0.0,4.0,0.0
2,PBr5,430.4900,37.1,0.0,6.0,0.0
3,WCl5,361.1000,37.1,0.0,6.0,0.0
4,HgI2,454.4000,2.8,0.0,3.0,0.0
...,...,...,...,...,...,...
672,PBr3,270.6900,8.0,0.0,4.0,0.0
673,PbSiO3,283.0000,18.8,0.0,5.0,63.2
674,LiI,133.9000,2.0,0.0,2.0,0.0
675,EuF3,208.9590,8.0,0.0,4.0,0.0


In [6]:
eutectic_compilation

Unnamed: 0,molA,molB,xe,Te
0,BF3,N2O,76.6,-138.0
1,BF3,SO2,95.2,-128.6
2,BCl3,GeCl4,76,-116.0
3,BCl3,PCl3,94,-110.0
4,BCl3,PCl3,20,-99.0
...,...,...,...,...
4463,Sc2O3,ZrO2,52.2,2450.0
4464,UO2,ZrO2,47.5,2550.0
4465,UO2,ZrO2,50,2550.0
4466,UN,W,100,2700.0


In [7]:
print(molecule_features.columns)
print(eutectic_compilation.columns)

Index(['mol', 'molecular_weight', 'complexity', 'rotatable_bond_count',
       'heavy_atom_count', 'topological_area'],
      dtype='object')
Index(['molA', 'molB', 'xe', 'Te'], dtype='object')


In [8]:
print(molecule_features.dtypes)
print(eutectic_compilation.dtypes)

mol                      object
molecular_weight        float64
complexity              float64
rotatable_bond_count    float64
heavy_atom_count        float64
topological_area        float64
dtype: object
molA     object
molB     object
xe       object
Te      float64
dtype: object


In [9]:
molecule_features.iloc[0]

mol                        BaO
molecular_weight        153.33
complexity                 2.0
rotatable_bond_count       0.0
heavy_atom_count           2.0
topological_area          17.1
Name: 0, dtype: object

In [10]:
len(eutectic_compilation)

3980

In [11]:
# This is how the __getitem__() will be made when making a DataLoader from the dataset
# Take a eutectic compilation and print it
ec = eutectic_compilation.iloc[0]
print(ec)
print("\n")

# Obtain the 2 molecules involved
print("Molecule A: ", ec[0])
print("Molecule B: ", ec[1])
print("\n")

# Find their molecular features from the molecule_features dataframe, then convert the features into a NumPy array
m1f = torch.tensor(np.array(molecule_features.loc[molecule_features.mol == ec[0]])[0][1:starting_features+1].astype(float))
m2f = torch.tensor(np.array(molecule_features.loc[molecule_features.mol == ec[1]])[0][1:starting_features+1].astype(float))

print("Molecule A Features: ", m1f)
print("Molecule B Features: ", m2f)

molA      BF3
molB      N2O
xe       76.6
Te     -138.0
Name: 0, dtype: object


Molecule A:  BF3
Molecule B:  N2O


Molecule A Features:  tensor([67.8100,  8.0000,  0.0000,  4.0000,  0.0000], dtype=torch.float64)
Molecule B Features:  tensor([44.0130, 25.8000,  0.0000,  3.0000, 19.1000], dtype=torch.float64)


In [12]:
# SNN class with model
class SiameseNeuralNetwork(Module):
    def __init__(self, start_features):
        super(SiameseNeuralNetwork, self).__init__()
        
        self.fc1 = Linear(in_features=start_features, out_features=16)
        self.fc2 = Linear(in_features=16, out_features=32)
        self.fc3 = Linear(in_features=32, out_features=64)
        
        # create the feature vector for the input data
        self.fc4 = Linear(in_features=64, out_features=256)
        
        # final stage where prediction is made
        self.fc5 = Linear(in_features=256, out_features=1)

    def forward_on_input(self, x):
        return torch.tanh(self.fc4(F.relu(self.fc3(F.relu(self.fc2(F.relu(self.fc1(x))))))))

    def forward(self, x1, x2):
        y1 = self.forward_on_input(x1)
        y2 = self.forward_on_input(x2)
        d = y1 - y2                      # difference for xe
        # d = torch.abs(y1 - y2)           # difference for Te
        p = torch.sigmoid(self.fc5(d))
        return p

In [13]:
# To be fixed so all code below this has not been run yet. Will throw error!!!
class MakeDataset(Dataset):
    def __init__(self, ec_df, mf_df, num_features):
        self.ecdf = ec_df
        self.mfdf = mf_df
        self.nf = num_features

    def __len__(self):
        return len(self.ecdf)
  
    def __getitem__(self, idx):
        ec = self.ecdf.iloc[idx]
        
        m1f = torch.tensor(np.array(self.mfdf.loc[self.mfdf.mol == ec[0]])[0][1:self.nf+1].astype(float))
        m2f = torch.tensor(np.array(self.mfdf.loc[self.mfdf.mol == ec[1]])[0][1:self.nf+1].astype(float))
        xe = torch.tensor(float(ec[2]))
        Te = torch.tensor(ec[3])
        
        return m1f, m2f, xe, Te

In [14]:
ec_dataset = MakeDataset(eutectic_compilation, molecule_features, starting_features)
ec_dl = DataLoader(ec_dataset, shuffle=True, batch_size=batchsize)

# set the model
model = SiameseNeuralNetwork(starting_features).to(device)

# Set optimizer
opt = torch.optim.Adam(model.parameters(), lr=0.005)

In [15]:
# Compute the accuracy of the model at each epoch
def accuracy(output, target, batch_size):
    corrects = 0
    for x in range(len(output)):
        errorband = 0.25 * target[x]
        if output[x] > target[x] - errorband and output[x] < target[x] + errorband:
            corrects = corrects + 1
    accuracy = 100.0 * (corrects / batch_size)
    return accuracy

In [16]:
# training
model = model.train()
for epoch in range(num_epochs):
    train_running_loss = 0.0
    train_accuracy = 0.0
    model = model.train()

    # training step: iterate through the batch and get the images and labels at each x
    for x, (m1, m2, xe, Te) in enumerate(ec_dl):   
        # sending images and labels to device (GPU or CPU)
        m1 = m1.to(device)
        m2 = m2.to(device)
        xe = (xe/100.0).to(device)
        # Te = Te.to(device)

        # pass 2 sets of inputs into the snn and gives p, the output
        output = model(m1.float(), m2.float())
        loss = torch.abs(output - xe).sum() / xe.shape[0]
        # loss = Te*math.log(output) + (1-Te)*math.log(1-output)

        opt.zero_grad()
        loss.backward()
        opt.step()

        train_running_loss += loss.detach().item()
        train_accuracy += accuracy(output, xe, batchsize)
        # train_accuracy += accuracy(output, Te, batchsize)

    model.eval()
    print('Epoch %d | Loss: %.4f | Train Accuracy: %.2f'%(epoch+1, train_running_loss / x, train_accuracy / x))

Epoch 1 | Loss: 3.5235 | Train Accuracy: 33.95
Epoch 2 | Loss: 3.5186 | Train Accuracy: 33.57
Epoch 3 | Loss: 3.5167 | Train Accuracy: 33.52
Epoch 4 | Loss: 3.5171 | Train Accuracy: 33.67
Epoch 5 | Loss: 3.5167 | Train Accuracy: 33.92
Epoch 6 | Loss: 3.5173 | Train Accuracy: 34.00
Epoch 7 | Loss: 3.5166 | Train Accuracy: 33.54
Epoch 8 | Loss: 3.5171 | Train Accuracy: 33.95
Epoch 9 | Loss: 3.5172 | Train Accuracy: 33.77
Epoch 10 | Loss: 3.5158 | Train Accuracy: 34.00


In [17]:
test_accuracy = 0.0
for y, (m1, m2, xe, Te) in enumerate(ec_dl):
    m1 = m1.to(device)
    m2 = m2.to(device)
    xe = (xe/100.0).to(device)
    # Te = Te.to(device)

    outputs = model(m1.float(), m2.float())
    test_accuracy += accuracy(output, xe, batchsize)
    # test_accuracy += accuracy(output, Te, batchsize)
print('Test Accuracy: %.2f'%(test_accuracy / y))

Test Accuracy: 26.21
