In [1]:
import torch
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from torch.nn import Module, Conv2d, MaxPool2d, Linear, Dropout, BatchNorm1d
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [2]:
# define parameters for training and device
starting_features = 5
batchsize = 20
num_epochs = 10
# If a GPU is available, then send it to that GPU rather than train on CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda', index=0)

In [4]:
# Load the dataset. 
# molecule_features: First col is a string containing name of molecule. Rest are floats containing its features
# eutectic_compilation: First 2 cols are strings containing molecule names, third col is eutectic proportion, fourth is eutectic temperature

molecule_features = pd.read_csv("D:\\Research\\UConn_ML\\Code\\data\\eutectic_mixtures-main\\single_components.csv").drop(["xlogp"], axis=1)
eutectic_compilation = pd.read_csv("D:\\Research\\UConn_ML\\Code\\data\\eutectic_mixtures-main\\eutectic_compilation.csv")

molecule_features[molecule_features.columns[1:]] = molecule_features[molecule_features.columns[1:]].astype(float)
eutectic_compilation[eutectic_compilation.columns[3]] = eutectic_compilation[eutectic_compilation.columns[3]].astype(float)

In [5]:
# Some indices in eutectic_compilation do not exist in single_components. They will need to be removed.
# Some of the eutectic proportions are inconvertible to floats from strings. They will also need to be removed.
# missing_molecules will store all molecules that are missing features from eutectic_compilation
drops = np.array([]).astype(int)
missing_molecules = np.array([]).astype(str)
for i in range(len(eutectic_compilation)):
    ec = eutectic_compilation.iloc[i]
    m1 = ec[0]
    m2 = ec[1]
    xe = ec[2]
    
    m1f = molecule_features.loc[molecule_features.mol == ec[0]]
    m2f = molecule_features.loc[molecule_features.mol == ec[1]]
    
    try:
        xe = float(xe)
    except ValueError:
        drops = np.append(drops, i)
    
    if(len(m1f) == 0 or len(m2f) == 0):
        drops = np.append(drops, i)
        
        if(len(m1f) == 0):
            missing_molecules = np.append(missing_molecules, m1)
        elif(len(m2f) == 0):
            missing_molecules = np.append(missing_molecules, m2)

eutectic_compilation = eutectic_compilation.drop(eutectic_compilation.index[drops])
# np.savetxt("missing_molecules.csv", missing_molecules, delimiter=",", fmt="%s")

In [6]:
# Split the eutectic compilation dataframe by a ratio into training and testing sets
split = 0.85
train_ec = eutectic_compilation.sample(frac=split)
test_ec = eutectic_compilation.drop(train_ec.index)

In [7]:
molecule_features

Unnamed: 0,mol,molecular_weight,complexity,rotatable_bond_count,heavy_atom_count,topological_area
0,BaO,153.3300,2.0,0.0,2.0,17.1
1,CrCl3,158.3500,0.0,0.0,4.0,0.0
2,PBr5,430.4900,37.1,0.0,6.0,0.0
3,WCl5,361.1000,37.1,0.0,6.0,0.0
4,HgI2,454.4000,2.8,0.0,3.0,0.0
...,...,...,...,...,...,...
672,PBr3,270.6900,8.0,0.0,4.0,0.0
673,PbSiO3,283.0000,18.8,0.0,5.0,63.2
674,LiI,133.9000,2.0,0.0,2.0,0.0
675,EuF3,208.9590,8.0,0.0,4.0,0.0


In [8]:
train_ec

Unnamed: 0,molA,molB,xe,Te
3926,P2O5,SrO,48,970.0
2000,KI,NaCl,57,510.0
2023,CrCl2,CsCl,19.1,515.0
1269,KCl,KClO3,13.1,345.0
1923,CaCl,MnCl2,79.5,499.0
...,...,...,...,...
1556,CaCl2,TlCl,7.5,420.0
1992,BaCl2,LiCl,33,510.0
305,AlOCl,NbCl5,42.9,135.0
4115,BaO,TiO2,32,1315.0


In [9]:
test_ec

Unnamed: 0,molA,molB,xe,Te
8,TiCl4,VOCl3,18,-88.0
21,SiCl4,TeCl4,99,-68.0
23,AsBr3,S2Br2,13.5,-56.0
28,GeCl4,SnCl4,91,-51.3
33,SbCl5,TiCl4,35,-47.5
...,...,...,...,...
4450,Sc2O3,UO2,82,2280.0
4454,SrO,ZrO2,25,2300.0
4458,CaO,MgO,59.3,2370.0
4459,CeO2,ZrO2,53,2390.0


In [10]:
print(molecule_features.columns)
print(eutectic_compilation.columns)

Index(['mol', 'molecular_weight', 'complexity', 'rotatable_bond_count',
       'heavy_atom_count', 'topological_area'],
      dtype='object')
Index(['molA', 'molB', 'xe', 'Te'], dtype='object')


In [11]:
print(molecule_features.dtypes)
print(eutectic_compilation.dtypes)

mol                      object
molecular_weight        float64
complexity              float64
rotatable_bond_count    float64
heavy_atom_count        float64
topological_area        float64
dtype: object
molA     object
molB     object
xe       object
Te      float64
dtype: object


In [12]:
molecule_features.iloc[0]

mol                        BaO
molecular_weight        153.33
complexity                 2.0
rotatable_bond_count       0.0
heavy_atom_count           2.0
topological_area          17.1
Name: 0, dtype: object

In [13]:
print("Train Size: ", len(train_ec))
print("Test Size: ", len(test_ec))

Train Size:  3383
Test Size:  597


In [14]:
# This is how the __getitem__() will be made when making a DataLoader from the dataset
# Take a eutectic compilation and print it
ec = eutectic_compilation.iloc[0]
print(ec)
print("\n")

# Obtain the 2 molecules involved
print("Molecule A: ", ec[0])
print("Molecule B: ", ec[1])
print("\n")

# Find their molecular features from the molecule_features dataframe, then convert the features into a NumPy array
m1f = torch.tensor(np.array(molecule_features.loc[molecule_features.mol == ec[0]])[0][1:starting_features+1].astype(float))
m2f = torch.tensor(np.array(molecule_features.loc[molecule_features.mol == ec[1]])[0][1:starting_features+1].astype(float))

print("Molecule A Features: ", m1f)
print("Molecule B Features: ", m2f)

molA      BF3
molB      N2O
xe       76.6
Te     -138.0
Name: 0, dtype: object


Molecule A:  BF3
Molecule B:  N2O


Molecule A Features:  tensor([67.8100,  8.0000,  0.0000,  4.0000,  0.0000], dtype=torch.float64)
Molecule B Features:  tensor([44.0130, 25.8000,  0.0000,  3.0000, 19.1000], dtype=torch.float64)


In [15]:
# SNN class with model
class SiameseNeuralNetwork(Module):
    def __init__(self, start_features):
        super(SiameseNeuralNetwork, self).__init__()
        
        self.fc1 = Linear(in_features=start_features, out_features=15)
        self.fc2 = Linear(in_features=15, out_features=10) # create the feature vector for the input data
        self.fc3 = Linear(in_features=10, out_features=1) # final stage where prediction is made

    def forward_on_input(self, x):
        return self.fc2(F.relu(self.fc1(x)))

    def forward(self, x1, x2):
        y1 = self.forward_on_input(x1)
        y2 = self.forward_on_input(x2)
        d = y1 - y2                      # difference for xe
        # d = torch.abs(y1 - y2)           # difference for Te
        p = torch.sigmoid(self.fc3(d))
        return p

In [16]:
# To be fixed so all code below this has not been run yet. Will throw error!!!
class MakeDataset(Dataset):
    def __init__(self, ec_df, mf_df, num_features):
        self.ecdf = ec_df
        self.mfdf = mf_df
        self.nf = num_features

    def __len__(self):
        return len(self.ecdf)
  
    def __getitem__(self, idx):
        ec = self.ecdf.iloc[idx]
        
        m1f = torch.tensor(np.array(self.mfdf.loc[self.mfdf.mol == ec[0]])[0][1:self.nf+1].astype(float))
        m2f = torch.tensor(np.array(self.mfdf.loc[self.mfdf.mol == ec[1]])[0][1:self.nf+1].astype(float))
        xe = torch.tensor(float(ec[2]))
        Te = torch.tensor(ec[3])
        
        return m1f, m2f, xe, Te

In [17]:
train_dset = MakeDataset(train_ec, molecule_features, starting_features)
test_dset = MakeDataset(test_ec, molecule_features, starting_features)

train_ec_dl = DataLoader(train_dset, shuffle=True, batch_size=batchsize)
test_ec_dl = DataLoader(test_dset, shuffle=True, batch_size=batchsize)

# set the model
model = SiameseNeuralNetwork(starting_features).to(device)

# Set optimizer and loss function. Using MAE since we are comparing a float to another float. 
opt = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = torch.nn.MSELoss()
base_xe = torch.mean(torch.tensor(np.array(eutectic_compilation["xe"]).astype(float)) / 100.0)

In [18]:
# training
for epoch in range(num_epochs):
    train_running_loss = 0.0
    train_base_loss = 0.0
    train_accuracy = 0.0
    model = model.train()

    # training step: iterate through the batch and obtain the 4 data
    for x, (m1, m2, xe, Te) in enumerate(train_ec_dl):   
        # sending to device (GPU or CPU)
        m1 = m1.to(device)
        m2 = m2.to(device)
        xe = (xe/100.0).to(device)
        # Te = Te.to(device)

        # pass 2 sets of inputs into the snn and gives p, the output
        output = model(m1.float(), m2.float())
        loss = criterion(output[:, 0], xe)
        
        base = torch.full((len(xe),), base_xe, device=device)
        base_loss = criterion(base, xe)

        opt.zero_grad()
        loss.backward()
        opt.step()

        train_running_loss += loss.item()
        train_base_loss += base_loss.item()

    model.eval()
    print('Epoch {} | Train Loss: {} | Baseline: {}'.format(epoch+1, train_running_loss, train_base_loss))

Epoch 1 | Train Loss: 14.691761128604412 | Baseline: 11.271969364956021
Epoch 2 | Train Loss: 11.932573046302423 | Baseline: 11.230554484762251
Epoch 3 | Train Loss: 11.746398750692606 | Baseline: 11.336872508749366
Epoch 4 | Train Loss: 11.504698945209384 | Baseline: 11.250636644661427
Epoch 5 | Train Loss: 11.288904912769794 | Baseline: 11.281386563554406
Epoch 6 | Train Loss: 11.240158069878817 | Baseline: 11.277103014290333
Epoch 7 | Train Loss: 11.096206383779645 | Baseline: 11.288867516443133
Epoch 8 | Train Loss: 11.145650863647461 | Baseline: 11.323230799287558
Epoch 9 | Train Loss: 11.093204252421856 | Baseline: 11.276492374017835
Epoch 10 | Train Loss: 10.979264341294765 | Baseline: 11.253032088279724


In [19]:
test_loss = 0.0
test_baseline = 0.0
for (m1, m2, xe, Te) in test_ec_dl:
    m1 = m1.to(device)
    m2 = m2.to(device)
    xe = (xe/100.0).to(device)
    # Te = Te.to(device)

    outputs = model(m1.float(), m2.float())
    test_loss += criterion(outputs[:, 0], xe).item()
    
    base = torch.full((len(xe),), base_xe, device=device)
    test_baseline = criterion(base, xe).item()
    
print('Test Loss: {} | Baseline: {}\n'.format(test_loss, test_baseline))
print("Original Data: {}\n".format(xe))
print("Predicted: {}\n".format(outputs))

Test Loss: 2.0663128420710564 | Baseline: 0.060593221336603165

Original Data: tensor([0.0750, 0.1500, 0.5500, 0.2600, 0.1400, 0.2100, 0.5300, 0.8300, 0.1100,
        0.2300, 0.5770, 0.4370, 0.4900, 0.4500, 0.3940, 0.5000, 0.7800],
       device='cuda:0')

Predicted: tensor([[0.5223],
        [0.5771],
        [0.3669],
        [0.4521],
        [0.4866],
        [0.5151],
        [0.5118],
        [0.6282],
        [0.4527],
        [0.5359],
        [0.5243],
        [0.5372],
        [0.5589],
        [0.5007],
        [0.5256],
        [0.5067],
        [0.4672]], device='cuda:0', grad_fn=<SigmoidBackward0>)

