In [2]:
import pandas as pd
import numpy as np
import pickle
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import time
import argparse
import os
import matplotlib.pyplot as plt


In [3]:
df = pd.read_csv('../baselines/RNN/datasets/Dataset 1.csv') #to be comparable with the size of other datasets, select ~5000 data points from the reference
df['TRIMER_mol'] = df['TRIMER'].apply(Chem.MolFromSmiles)
df = df.dropna()
#df = df.head(1000)

In [4]:
nbits = 1024
fp = df['TRIMER_mol'].apply(lambda m: AllChem.GetMorganFingerprintAsBitVect(m, radius=3, nBits=nbits))
df['fps'] = fp

In [5]:
df.columns

Index(['TRIMER', 'Excitation Energy (eV)', 'IP (eV)', 'EA (eV)',
       'Calib. IP (eV)', 'Calib. EA (eV)', 'Calib. Excitation Energy (eV)',
       'TRIMER_mol', 'fps'],
      dtype='object')

In [6]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

#reset index
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [7]:
from torch.utils.data import Dataset, DataLoader

class TrimerDataset(Dataset):
    def __init__(self, data, target_column):
        self.data = data
        self.target_column = target_column

    def __getitem__(self, index):
        tokens = self.data.loc[index, "fps"]
        target = self.data.loc[index, self.target_column]

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        target_tensor = torch.tensor(target, dtype=torch.float32)

        return tokens_tensor, target_tensor

    def __len__(self):
        return len(self.data)


In [8]:
target_column = "EA (eV)"
train_dataset = TrimerDataset(train_data, target_column)
test_dataset = TrimerDataset(test_data, target_column)


In [9]:
batch_size = 128
shuffle = True
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=shuffle)


In [10]:
import torch
from torch import nn
from torchvision.transforms import ToTensor

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [11]:
for i in range(5):
    tokens, target = train_dataset[i]
    print(f"Sample {i + 1}:")
    print(f"Tokens: {tokens}")
    print(f"Target: {target}\n")

Sample 1:
Tokens: tensor([1, 0, 0,  ..., 0, 0, 0])
Target: 2.458224058151245

Sample 2:
Tokens: tensor([0, 0, 0,  ..., 0, 0, 0])
Target: 1.7806440591812134

Sample 3:
Tokens: tensor([0, 0, 0,  ..., 0, 0, 0])
Target: 1.791316032409668

Sample 4:
Tokens: tensor([0, 1, 0,  ..., 0, 1, 0])
Target: 1.6426440477371216

Sample 5:
Tokens: tensor([0, 0, 0,  ..., 0, 0, 0])
Target: 2.719320058822632



In [12]:
# without mlp.
class RNN(nn.Module):
    def __init__(self, seq_len, emb_dim, lstm_dim, linear_dim, out_dim, num_tokens=None):
        super(RNN, self).__init__()
        self.seq_len = seq_len
        self.emb = nn.Embedding(num_tokens, emb_dim) if num_tokens is not None else None
        self.lstm1 = nn.LSTM(emb_dim, lstm_dim, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(lstm_dim*2, lstm_dim, bidirectional=True, batch_first=True)
        self.lstm_dim = lstm_dim

        self.mlp = nn.Sequential(
            nn.Linear(seq_len * lstm_dim * 2, linear_dim),
            nn.ReLU(),
            nn.Linear(linear_dim, out_dim),
        )
        
        self.last = nn.Linear(lstm_dim * seq_len *2, out_dim)

    def forward(self, data):  # 2D
        x = self.emb(data) if self.emb else data
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        # x = x.reshape(x.shape[0], -1)  # Flatten time and batch dims
        # x = self.mlp(x) removed mlp
        x = x.reshape(x.shape[0], -1)  # Flatten time dim into last one (instead of before into sample dim)
        #x = self.mlp(x)
        x = self.last(x)
        return x

In [13]:
# from Veronika
class RNN(nn.Module):
    def __init__(self, seq_len, emb_dim, lstm_dim, linear_dim, out_dim, num_tokens=None):
        super(RNN, self).__init__()
        self.seq_len = seq_len
        self.emb = nn.Embedding(num_tokens, emb_dim) if num_tokens is not None else None
        self.lstm1 = nn.LSTM(emb_dim, lstm_dim, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(lstm_dim*2, lstm_dim, bidirectional=True, batch_first=True)
        self.lstm_dim = lstm_dim

        self.mlp = nn.Sequential(
            nn.Linear(lstm_dim * 2, int(lstm_dim/2)),
            nn.ReLU(),
        )
        
        self.last = nn.Linear(int(lstm_dim/2)*seq_len, out_dim)

    def forward(self, data):  # 2D
        x = self.emb(data) if self.emb else data
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        # x = x.reshape(x.shape[0], -1)  # Flatten time and batch dims
        x = self.mlp(x) #removed mlp
        #x = x.reshape(x.shape[0], -1)  # Flatten time dim into last one (instead of before into sample dim)
        x = x.reshape(-1, int(self.lstm_dim/2) * self.seq_len)
        #x = self.mlp(x)
        x = self.last(x)
        return x

In [14]:
seq_len = 1024
emb_dim = 100 #50, 100, 200, or 300
lstm_dim= 20 #50, 100, 200
linear_dim = 100
out_dim = 1
num_tokens=1024
learning_rate = 0.001

model = RNN(seq_len, emb_dim, lstm_dim, linear_dim, out_dim, num_tokens).to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [15]:
from tqdm import tqdm
def train_RNN(model, train_loader, test_loader, loss_fn, optimizer, num_epochs=10, device="cpu"):
    model.to(device)
    train_losses = []
    test_losses = []

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        #for x, y in train_loader:
        for x, y in tqdm(train_loader, desc=f'Training epoch {epoch + 1}/{num_epochs}'):
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            output = model(x)
            loss = loss_fn(output, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        # Testing
        model.eval()
        test_loss = 0
        with torch.no_grad():
            for x, y in test_loader:
                x, y = x.to(device), y.to(device)

                output = model(x)
                loss = loss_fn(output, y)
                test_loss += loss.item()

        test_loss /= len(test_loader)
        test_losses.append(test_loss)

        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

    return train_losses, test_losses

In [None]:
num_epochs = 120
train_losses, test_losses = train_RNN(model, train_dataloader, test_dataloader, loss_fn, optimizer, num_epochs, device)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Training epoch 1/120: 100%|██████████| 300/300 [01:08<00:00,  4.40it/s]
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/120, Train Loss: 0.4808, Test Loss: 0.4343


Training epoch 2/120: 100%|██████████| 300/300 [01:08<00:00,  4.41it/s]


Epoch 2/120, Train Loss: 0.4423, Test Loss: 0.4340


Training epoch 3/120: 100%|██████████| 300/300 [01:07<00:00,  4.42it/s]


Epoch 3/120, Train Loss: 0.4434, Test Loss: 0.4434


Training epoch 4/120: 100%|██████████| 300/300 [01:08<00:00,  4.40it/s]


Epoch 4/120, Train Loss: 0.4393, Test Loss: 0.4448


Training epoch 5/120: 100%|██████████| 300/300 [01:07<00:00,  4.42it/s]


Epoch 5/120, Train Loss: 0.4400, Test Loss: 0.4355


Training epoch 6/120: 100%|██████████| 300/300 [01:07<00:00,  4.41it/s]


Epoch 6/120, Train Loss: 0.4377, Test Loss: 0.4796


Training epoch 7/120: 100%|██████████| 300/300 [01:07<00:00,  4.41it/s]


Epoch 7/120, Train Loss: 0.4422, Test Loss: 0.4356


Training epoch 8/120: 100%|██████████| 300/300 [01:08<00:00,  4.40it/s]


Epoch 8/120, Train Loss: 0.4386, Test Loss: 0.4396


Training epoch 9/120: 100%|██████████| 300/300 [01:07<00:00,  4.41it/s]


Epoch 9/120, Train Loss: 0.4388, Test Loss: 0.4421


Training epoch 10/120: 100%|██████████| 300/300 [01:08<00:00,  4.39it/s]


Epoch 10/120, Train Loss: 0.4379, Test Loss: 0.4344


Training epoch 11/120: 100%|██████████| 300/300 [01:08<00:00,  4.41it/s]


Epoch 11/120, Train Loss: 0.4386, Test Loss: 0.4394


Training epoch 12/120: 100%|██████████| 300/300 [01:07<00:00,  4.41it/s]


Epoch 12/120, Train Loss: 0.4387, Test Loss: 0.4397


Training epoch 13/120: 100%|██████████| 300/300 [01:08<00:00,  4.41it/s]


Epoch 13/120, Train Loss: 0.4371, Test Loss: 0.4339


Training epoch 14/120: 100%|██████████| 300/300 [01:08<00:00,  4.40it/s]


Epoch 14/120, Train Loss: 0.4390, Test Loss: 0.4342


Training epoch 15/120: 100%|██████████| 300/300 [01:08<00:00,  4.41it/s]


Epoch 15/120, Train Loss: 0.4387, Test Loss: 0.4342


Training epoch 16/120: 100%|██████████| 300/300 [01:08<00:00,  4.40it/s]


Epoch 16/120, Train Loss: 0.4388, Test Loss: 0.4439


Training epoch 17/120: 100%|██████████| 300/300 [01:08<00:00,  4.41it/s]


Epoch 17/120, Train Loss: 0.4381, Test Loss: 0.4355


Training epoch 18/120: 100%|██████████| 300/300 [01:07<00:00,  4.43it/s]


Epoch 18/120, Train Loss: 0.4375, Test Loss: 0.4353


Training epoch 19/120: 100%|██████████| 300/300 [01:07<00:00,  4.44it/s]


Epoch 19/120, Train Loss: 0.4379, Test Loss: 0.4350


Training epoch 20/120: 100%|██████████| 300/300 [01:07<00:00,  4.44it/s]


Epoch 20/120, Train Loss: 0.4387, Test Loss: 0.4342


Training epoch 21/120: 100%|██████████| 300/300 [01:07<00:00,  4.43it/s]


Epoch 21/120, Train Loss: 0.4376, Test Loss: 0.4373


Training epoch 22/120: 100%|██████████| 300/300 [01:08<00:00,  4.41it/s]


Epoch 22/120, Train Loss: 0.4392, Test Loss: 0.4497


Training epoch 23/120: 100%|██████████| 300/300 [01:07<00:00,  4.43it/s]


Epoch 23/120, Train Loss: 0.4372, Test Loss: 0.4388


Training epoch 24/120: 100%|██████████| 300/300 [01:07<00:00,  4.43it/s]


Epoch 24/120, Train Loss: 0.4381, Test Loss: 0.4342


Training epoch 25/120: 100%|██████████| 300/300 [01:07<00:00,  4.43it/s]


Epoch 25/120, Train Loss: 0.4398, Test Loss: 0.4345


Training epoch 26/120: 100%|██████████| 300/300 [01:07<00:00,  4.43it/s]


Epoch 26/120, Train Loss: 0.4374, Test Loss: 0.4357


Training epoch 27/120: 100%|██████████| 300/300 [01:07<00:00,  4.42it/s]


Epoch 27/120, Train Loss: 0.4373, Test Loss: 0.4368


Training epoch 28/120: 100%|██████████| 300/300 [01:07<00:00,  4.43it/s]


Epoch 28/120, Train Loss: 0.4387, Test Loss: 0.4344


Training epoch 29/120: 100%|██████████| 300/300 [01:08<00:00,  4.39it/s]


Epoch 29/120, Train Loss: 0.4376, Test Loss: 0.4391


Training epoch 30/120: 100%|██████████| 300/300 [01:08<00:00,  4.38it/s]


Epoch 30/120, Train Loss: 0.4374, Test Loss: 0.4363


Training epoch 31/120: 100%|██████████| 300/300 [01:08<00:00,  4.40it/s]


Epoch 31/120, Train Loss: 0.4372, Test Loss: 0.4343


Training epoch 32/120: 100%|██████████| 300/300 [01:08<00:00,  4.41it/s]


Epoch 32/120, Train Loss: 0.4375, Test Loss: 0.4387


Training epoch 33/120: 100%|██████████| 300/300 [01:08<00:00,  4.41it/s]


Epoch 33/120, Train Loss: 0.4374, Test Loss: 0.4375


Training epoch 34/120: 100%|██████████| 300/300 [01:08<00:00,  4.40it/s]


Epoch 34/120, Train Loss: 0.4372, Test Loss: 0.4345


Training epoch 35/120: 100%|██████████| 300/300 [01:08<00:00,  4.41it/s]


Epoch 35/120, Train Loss: 0.4378, Test Loss: 0.4395


Training epoch 36/120: 100%|██████████| 300/300 [01:08<00:00,  4.41it/s]


Epoch 36/120, Train Loss: 0.4371, Test Loss: 0.4510


Training epoch 37/120: 100%|██████████| 300/300 [01:07<00:00,  4.43it/s]


Epoch 37/120, Train Loss: 0.4370, Test Loss: 0.4356


Training epoch 38/120: 100%|██████████| 300/300 [01:07<00:00,  4.44it/s]


Epoch 38/120, Train Loss: 0.4392, Test Loss: 0.4353


Training epoch 39/120: 100%|██████████| 300/300 [01:07<00:00,  4.44it/s]


Epoch 39/120, Train Loss: 0.4379, Test Loss: 0.4341


Training epoch 40/120: 100%|██████████| 300/300 [01:07<00:00,  4.42it/s]


Epoch 40/120, Train Loss: 0.4365, Test Loss: 0.4362


Training epoch 41/120: 100%|██████████| 300/300 [01:07<00:00,  4.41it/s]


Epoch 41/120, Train Loss: 0.4367, Test Loss: 0.4399


Training epoch 42/120: 100%|██████████| 300/300 [01:08<00:00,  4.40it/s]


Epoch 42/120, Train Loss: 0.4383, Test Loss: 0.4359


Training epoch 43/120: 100%|██████████| 300/300 [01:07<00:00,  4.42it/s]


Epoch 43/120, Train Loss: 0.4375, Test Loss: 0.4344


Training epoch 44/120: 100%|██████████| 300/300 [01:07<00:00,  4.44it/s]


Epoch 44/120, Train Loss: 0.4370, Test Loss: 0.4360


Training epoch 45/120: 100%|██████████| 300/300 [01:07<00:00,  4.44it/s]


Epoch 45/120, Train Loss: 0.4375, Test Loss: 0.4340


Training epoch 46/120: 100%|██████████| 300/300 [01:08<00:00,  4.40it/s]


Epoch 46/120, Train Loss: 0.4367, Test Loss: 0.4428


Training epoch 47/120: 100%|██████████| 300/300 [01:08<00:00,  4.40it/s]


Epoch 47/120, Train Loss: 0.4379, Test Loss: 0.4353


Training epoch 48/120: 100%|██████████| 300/300 [01:07<00:00,  4.42it/s]


Epoch 48/120, Train Loss: 0.4370, Test Loss: 0.4500


Training epoch 49/120: 100%|██████████| 300/300 [01:07<00:00,  4.43it/s]


Epoch 49/120, Train Loss: 0.4369, Test Loss: 0.4341


Training epoch 50/120: 100%|██████████| 300/300 [01:07<00:00,  4.41it/s]


Epoch 50/120, Train Loss: 0.4375, Test Loss: 0.4339


Training epoch 51/120: 100%|██████████| 300/300 [01:08<00:00,  4.38it/s]


Epoch 51/120, Train Loss: 0.4368, Test Loss: 0.4393


Training epoch 52/120: 100%|██████████| 300/300 [01:08<00:00,  4.39it/s]


Epoch 52/120, Train Loss: 0.4369, Test Loss: 0.4356


Training epoch 53/120: 100%|██████████| 300/300 [01:07<00:00,  4.42it/s]


Epoch 53/120, Train Loss: 0.4383, Test Loss: 0.4339


Training epoch 54/120: 100%|██████████| 300/300 [01:07<00:00,  4.42it/s]


Epoch 54/120, Train Loss: 0.4371, Test Loss: 0.4366


Training epoch 55/120: 100%|██████████| 300/300 [01:07<00:00,  4.46it/s]


Epoch 55/120, Train Loss: 0.4360, Test Loss: 0.4356


Training epoch 56/120: 100%|██████████| 300/300 [01:08<00:00,  4.41it/s]


Epoch 56/120, Train Loss: 0.4371, Test Loss: 0.4402


Training epoch 57/120:  98%|█████████▊| 293/300 [01:06<00:01,  4.42it/s]

In [None]:
current_time = time.strftime("%Y%m%d-%H%M%S")
model_path = f"model_{current_time}.pt"

torch.save(model.state_dict(), model_path)
print("Saved PyTorch Model State to "+model_path)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

def evaluate_model(model, dataloader, device):
    model.eval()
    all_outputs = []
    all_targets = []
    
    with torch.no_grad():
        for batch in dataloader:
            x, y = batch
            x = x.to(device)
            y = y.to(device)
            
            output = model(x)
            all_outputs.extend(output.cpu().numpy())
            all_targets.extend(y.cpu().numpy())

    return np.array(all_outputs), np.array(all_targets)

# Evaluate the model
predicted, targets = evaluate_model(model, test_dataloader, device)

# Calculate R2 score, MAE, and RMSE
r2 = r2_score(targets, predicted)
mae = mean_absolute_error(targets, predicted)
rmse = np.sqrt(mean_squared_error(targets, predicted))

print(f"R2 Score: {r2:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")

In [None]:
targets_np = np.array(targets).flatten()
predicted_np = np.array(predicted).flatten()

plt.figure(figsize=(10, 10))
plt.scatter(predicted_np, targets_np, alpha=0.5)

# Fit a linear regression model
m, b = np.polyfit(predicted_np, targets_np, 1)

# Create line points based on the min and max of the predicted values
line_x = np.linspace(min(predicted_np), max(predicted_np), 100)
line_y = m * line_x + b

# Plot the best fit line
plt.plot(line_x, line_y, '--', c='r')

plt.xlabel('Predicted Values')
plt.ylabel('True Values')
plt.title('Scatter plot of Predicted vs. True Values')
plt.show()
