In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split

# Load data
data = pd.read_csv('M_data_modeling_561SMILES.csv')
data

Unnamed: 0,SMILES,ALogP,ALogp2,AMR,apol,nAtom,ATSm1,ATSm2,ATSm3,ATSm4,...,VAdjMat,MW,WTPT.1,WTPT.2,WTPT.3,WPATH,WPOL,XLogP,Zagreb,mean_zscore
0,[2H][C@@](C)(NC(=O)COC(F)F)c1cc(F)cc(Cl)c1COc2...,4.9035,24.044312,132.0162,71.093239,60,53.349434,47.297124,70.977093,73.804066,...,6.321928,533.930913,76.681182,2.017926,36.181243,4808,63,4.639,194,1.511929
1,[2H][C@@](N)(CO)C(O)=O,-1.4888,2.216525,22.1161,13.453551,14,9.690430,7.246378,9.368815,8.992028,...,3.584963,105.092711,14.271680,1.783960,11.777389,62,9,-3.956,26,-0.293797
2,[2H]C([2H])([2H])N(C([2H])([2H])[2H])C([2H])([...,2.2509,5.066551,62.1359,34.790688,31,16.564855,18.002201,24.671700,23.731145,...,5.000000,204.268694,48.532898,1.941316,32.890133,1278,48,1.633,76,0.148202
3,[2H]C1([2H])[C@H]2C[C@@H](Oc3ccc(cn3)C(F)(F)F)...,3.6800,13.542400,107.9118,60.714274,51,42.011121,41.986894,67.242763,64.111806,...,6.209453,458.409095,71.640211,2.046863,33.229469,3752,61,4.196,184,-0.944121
4,[2H]C1([2H])C([C@@H](CC#N)n2cc(cn2)-c3ncnc4[nH...,2.7411,7.513629,87.7190,48.522274,41,25.216222,28.693159,40.251216,37.588003,...,5.700440,306.365663,62.343269,2.011073,37.936323,2435,65,2.696,124,-0.076593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,OS(O)(=O)=O.C[C@H](N)Cc1ccccc1.C[C@H](N)Cc2ccccc2,1.9234,3.699468,101.1792,58.658204,53,34.945590,34.557962,35.311542,18.332364,...,5.584963,368.492698,47.570246,1.902810,16.789671,200000000068,18,1.760,108,0.446026
557,OS(O)(=O)=O.COc1ccc(cc1)S(=O)(=O)NCc2cc(CN3CCC...,1.7642,3.112402,141.8191,81.093755,72,56.531199,60.281187,69.859569,56.323915,...,6.285402,557.683107,73.720532,1.992447,35.620419,160000002987,46,0.840,188,-0.574772
558,OS(O)(=O)=O.N[C@H]1C[C@@H]1c2ccccc2.N[C@H]3C[C...,1.0414,1.084514,97.1628,55.991032,49,34.945590,36.557962,35.311542,20.332364,...,5.700440,364.460935,48.425994,1.937040,16.857489,200000000054,20,0.950,124,-1.809539
559,OS(O)(=O)=O.O[C@H]1CCN(C1)C(=O)Nc2cnn3ccc(nc23...,1.0149,1.030022,124.6342,68.389032,60,51.938537,53.573697,69.483914,57.391041,...,6.285402,526.515572,72.658877,2.018302,42.006338,155000002643,49,1.310,192,0.620174


In [10]:
# Split data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

In [11]:
# Define custom dataset class to handle data
class DescriptorDataset(Dataset):
    def __init__(self, data):
        self.x = data.drop(columns=["SMILES", "mean_zscore"]).values.astype(float)
        self.y = data["mean_zscore"].values.astype(float)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.tensor(self.x[idx]), torch.tensor(self.y[idx])
    
# Define custom neural network
class Net(torch.nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, 64)
        self.fc2 = torch.nn.Linear(64, 32)
        self.fc3 = torch.nn.Linear(32, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Create DataLoader objects for training and validation data
train_dataset = DescriptorDataset(train_data)
val_dataset = DescriptorDataset(val_data)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

# Define neural network model
net = Net(train_dataset.x.shape[1])

# Define optimizer and loss function
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
criterion = torch.nn.MSELoss()

# Train model
num_epochs = 100
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = net(inputs.float())
        loss = criterion(outputs.flatten(), labels.float())
        loss.backward()
        optimizer.step()

    # Compute validation loss
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = net(inputs.float())
            val_loss += criterion(outputs.flatten(), labels.float())
    val_loss /= len(val_loader)
    
    print(f"Epoch {epoch+1}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")


Epoch 1, Train Loss: 275335475579846656.0000, Val Loss: 738118923278352384.0000
Epoch 2, Train Loss: 58381481864593408.0000, Val Loss: 33885466604339200.0000
Epoch 3, Train Loss: 24548437483585536.0000, Val Loss: 31309667342745600.0000
Epoch 4, Train Loss: 7439273124954112.0000, Val Loss: 24035403640078336.0000
Epoch 5, Train Loss: 1713639337754624.0000, Val Loss: 6127645410459648.0000
Epoch 6, Train Loss: 210132046708736.0000, Val Loss: 1053875922534400.0000
Epoch 7, Train Loss: 231723149295616.0000, Val Loss: 209697902690304.0000
Epoch 8, Train Loss: 28519589478400.0000, Val Loss: 108990499389440.0000
Epoch 9, Train Loss: 1248597324595200.0000, Val Loss: 66422134800384.0000
Epoch 10, Train Loss: 126997242052608.0000, Val Loss: 41777608261632.0000
Epoch 11, Train Loss: 13106214338560.0000, Val Loss: 42993520214016.0000
Epoch 12, Train Loss: 175399652818944.0000, Val Loss: 38981953650688.0000
Epoch 13, Train Loss: 12855761960960.0000, Val Loss: 39785309667328.0000
Epoch 14, Train Loss: