In [47]:
import pandas as pd
import torch

import torch.nn as nn
import torch.optim as optim

In [48]:

# Load the data
data = pd.read_csv('davis.txt', header=None, delimiter=' ')

data.rename(columns={0: 'c1', 1: 'c2', 2: 'SMILES', 3: 'Protein', 4: 'pKb'}, inplace=True)
data.head()


Unnamed: 0,c1,c2,SMILES,Protein,pKb
0,11314340,AAK1,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,7.366532
1,11314340,ABL1(E255K),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0
2,11314340,ABL1(F317I),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0
3,11314340,ABL1(F317I)p,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0
4,11314340,ABL1(F317L),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0


In [49]:
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
encoding_protein = {aa: i+1 for i, aa in enumerate(amino_acids)}
smiles_symbols = '=()CNOF123456'
encoding_smiles = {s: i+1 for i, s in enumerate(smiles_symbols)}

data['enc_protein'] = [[encoding_protein.get(char, 0) for char in data.at[i, 'Protein']] for i in range(data.index.size)]
data['enc_protein_len'] = [len(data.at[i, 'enc_protein']) for i in range(data.index.size)]
data['enc_smiles'] = [[encoding_smiles.get(char, 0) for char in data.at[i, 'SMILES']] for i in range(data.index.size)]
data['enc_smiles_len'] = [len(data.at[i, 'enc_smiles']) for i in range(data.index.size)]
data.head()

Unnamed: 0,c1,c2,SMILES,Protein,pKb,enc_protein,enc_protein_len,enc_smiles,enc_smiles_len
0,11314340,AAK1,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,7.366532,"[11, 9, 9, 5, 5, 3, 16, 15, 15, 4, 14, 6, 6, 1...",961,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51
1,11314340,ABL1(E255K),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51
2,11314340,ABL1(F317I),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51
3,11314340,ABL1(F317I)p,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51
4,11314340,ABL1(F317L),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51


In [50]:
df1 = data[['enc_protein_len', 'enc_smiles_len']]
df1.to_csv('csv1.csv')
df1.min(axis='index')

enc_protein_len_min = 244
enc_smiles_len_min = 39

data['enc_protein_trunc'] = [data.at[i, 'enc_protein'][0 : enc_protein_len_min] for i in range(data.index.size)]
data['enc_smiles_trunc'] = [data.at[i, 'enc_smiles'][0 : enc_smiles_len_min] for i in range(data.index.size)]
data.head()

Unnamed: 0,c1,c2,SMILES,Protein,pKb,enc_protein,enc_protein_len,enc_smiles,enc_smiles_len,enc_protein_trunc,enc_smiles_trunc
0,11314340,AAK1,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,7.366532,"[11, 9, 9, 5, 5, 3, 16, 15, 15, 4, 14, 6, 6, 1...",961,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51,"[11, 9, 9, 5, 5, 3, 16, 15, 15, 4, 14, 6, 6, 1...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ..."
1,11314340,ABL1(E255K),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ..."
2,11314340,ABL1(F317I),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ..."
3,11314340,ABL1(F317I)p,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ..."
4,11314340,ABL1(F317L),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ..."


In [51]:
data_to_use = data[['enc_protein_trunc', 'enc_smiles_trunc', 'pKb']]
data_to_use.head()

Unnamed: 0,enc_protein_trunc,enc_smiles_trunc,pKb
0,"[11, 9, 9, 5, 5, 3, 16, 15, 15, 4, 14, 6, 6, 1...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",7.366532
1,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",5.0
2,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",5.0
3,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",5.0
4,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",5.0


In [52]:
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(enc_protein_len_min + enc_smiles_len_min, 50)
        self.fc2 = nn.Linear(50, 10)
        self.fc3 = nn.Linear(10, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Prepare the data
X = data_to_use[['enc_protein_trunc', 'enc_smiles_trunc']].apply(lambda x: x[0] + x[1], axis=1).tolist()
y = data_to_use['pKb'].tolist()

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32).view(-1, 1)

# Split the data into training and testing sets
train_size = int(0.8 * len(X))
test_size = len(X) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(list(zip(X, y)), [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize the model, loss function, and optimizer
model = SimpleNN()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}')

# Evaluate the model
model.eval()
test_loss = 0.0
with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        test_loss += loss.item()
print(f'Test Loss: {test_loss/len(test_loader)}')

  X = data_to_use[['enc_protein_trunc', 'enc_smiles_trunc']].apply(lambda x: x[0] + x[1], axis=1).tolist()


Epoch 1/10, Loss: 0.8752767871470845
Epoch 2/10, Loss: 0.6809692454147847
Epoch 3/10, Loss: 0.6766530201334428
Epoch 4/10, Loss: 0.6614484690684588
Epoch 5/10, Loss: 0.6556634338414098
Epoch 6/10, Loss: 0.6472200329039008
Epoch 7/10, Loss: 0.6394920852035284
Epoch 8/10, Loss: 0.6271812616589856
Epoch 9/10, Loss: 0.6144048577927529
Epoch 10/10, Loss: 0.5978860886509907
Test Loss: 0.5902412586865273
