In [68]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

In [69]:

# Load the data
data = pd.read_csv('davis.txt', header=None, delimiter=' ')

data.rename(columns={0: 'drug_id', 1: 'protein_id', 2: 'SMILES', 3: 'Protein', 4: 'pKb'}, inplace=True)
data.head()


Unnamed: 0,drug_id,protein_id,SMILES,Protein,pKb
0,11314340,AAK1,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,7.366532
1,11314340,ABL1(E255K),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0
2,11314340,ABL1(F317I),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0
3,11314340,ABL1(F317I)p,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0
4,11314340,ABL1(F317L),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0


In [70]:
# encoding data

amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
encoding_protein = {aa: i+1 for i, aa in enumerate(amino_acids)}
smiles_symbols = '=()CNOF123456'
encoding_smiles = {s: i+1 for i, s in enumerate(smiles_symbols)}

data['enc_protein'] = [np.array([encoding_protein.get(char, 0) for char in data.at[i, 'Protein']]) for i in range(data.index.size)]
# data['enc_protein'] = [np.frombuffer(data.at[i, 'Protein'].encode(), dtype=np.int8) - 64 for i in range(data.index.size)]
data['enc_protein_len'] = [len(data.at[i, 'enc_protein']) for i in range(data.index.size)]

data['enc_smiles'] = [np.array([encoding_smiles.get(char, 0) for char in data.at[i, 'SMILES']]) for i in range(data.index.size)]
# data['enc_smiles'] = [np.frombuffer(data.at[i, 'SMILES'].encode(), dtype=np.int8) - 40 for i in range(data.index.size)]
data['enc_smiles_len'] = [len(data.at[i, 'enc_smiles']) for i in range(data.index.size)]
data.head()

Unnamed: 0,drug_id,protein_id,SMILES,Protein,pKb,enc_protein,enc_protein_len,enc_smiles,enc_smiles_len
0,11314340,AAK1,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,7.366532,"[11, 9, 9, 5, 5, 3, 16, 15, 15, 4, 14, 6, 6, 1...",961,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51
1,11314340,ABL1(E255K),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51
2,11314340,ABL1(F317I),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51
3,11314340,ABL1(F317I)p,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51
4,11314340,ABL1(F317L),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51


In [71]:
df1 = data[['enc_protein_len', 'enc_smiles_len']]
print(df1.min(axis='index'))
print(df1.max(axis='index'))

enc_protein_len    244
enc_smiles_len      39
dtype: int64
enc_protein_len    2549
enc_smiles_len       92
dtype: int64


In [72]:
# converting inputs to fixed-length arrays

enc_protein_len_min = 244
enc_smiles_len_min = 39

enc_protein_len_max = 2549
enc_smiles_len_max = 92

# truncating input arrays
# data['enc_protein_pad'] = [data.at[i, 'enc_protein'][0 : enc_protein_len_min] for i in range(data.index.size)]
# data['enc_smiles_pad'] = [data.at[i, 'enc_smiles'][0 : enc_smiles_len_min] for i in range(data.index.size)]

# padding input arrays
data['enc_protein_pad'] = [np.pad(i, (0, enc_protein_len_max - i.size), 'constant', constant_values=(0, 0)) for i in data['enc_protein']]
data['enc_smiles_pad'] = [np.pad(i, (0, enc_smiles_len_max - i.size), 'constant', constant_values=(0, 0)) for i in data['enc_smiles']]
data.head()

Unnamed: 0,drug_id,protein_id,SMILES,Protein,pKb,enc_protein,enc_protein_len,enc_smiles,enc_smiles_len,enc_protein_pad,enc_smiles_pad
0,11314340,AAK1,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,7.366532,"[11, 9, 9, 5, 5, 3, 16, 15, 15, 4, 14, 6, 6, 1...",961,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51,"[11, 9, 9, 5, 5, 3, 16, 15, 15, 4, 14, 6, 6, 1...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ..."
1,11314340,ABL1(E255K),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ..."
2,11314340,ABL1(F317I),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ..."
3,11314340,ABL1(F317I)p,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ..."
4,11314340,ABL1(F317L),CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,5.0,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...",1167,"[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",51,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ..."


In [73]:
data_to_use = data[['enc_protein_pad', 'enc_smiles_pad', 'pKb']]
data_to_use.head()

Unnamed: 0,enc_protein_pad,enc_smiles_pad,pKb
0,"[11, 9, 9, 5, 5, 3, 16, 15, 15, 4, 14, 6, 6, 1...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",7.366532
1,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",5.0
2,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",5.0
3,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",5.0
4,"[13, 5, 19, 9, 8, 10, 12, 13, 10, 10, 4, 15, 6...","[4, 4, 8, 1, 4, 9, 4, 1, 4, 2, 4, 1, 4, 4, 9, ...",5.0


In [74]:
#  Prepare the data
X = data_to_use[['enc_protein_pad', 'enc_smiles_pad']].apply(lambda x: np.concatenate((x.iloc[0], x.iloc[1]), axis=0), axis=1).tolist()
y = data_to_use['pKb'].tolist()

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32).view(-1, 1)

In [75]:
# Random split

def random_split(data, X, y):
    train_size = int(0.8 * len(X))
    test_size = len(X) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(list(zip(X, y)), [train_size, test_size])
    return train_dataset, test_dataset

In [76]:
# Split data such that each protein has a fixed ratio of entries in both training and testing split

def split_by_protein(data, X, y):
    # Group data by protein
    grouped = data.groupby('protein_id')

    train_indices = []
    test_indices = []

    # Split each group
    for _, group in grouped:
        train_idx, test_idx = train_test_split(group.index, test_size=0.3, random_state=42)
        train_indices.extend(train_idx)
        test_indices.extend(test_idx)

    # Create train and test datasets
    train_dataset = torch.utils.data.TensorDataset(X[train_indices], y[train_indices])
    test_dataset = torch.utils.data.TensorDataset(X[test_indices], y[test_indices])
    return train_dataset, test_dataset


In [77]:
# Split data that some proteins are entirely new in the testing split

def split_new_proteins(data, X, y):
    # Get unique protein IDs
    unique_proteins = data['protein_id'].unique()

    # Shuffle the protein IDs
    np.random.shuffle(unique_proteins)

    # Split the protein IDs into training and testing sets
    train_proteins = unique_proteins[:int(0.9 * len(unique_proteins))]
    test_proteins = unique_proteins[int(0.9 * len(unique_proteins)):]

    # Get the indices for the training and testing sets
    train_indices = data[data['protein_id'].isin(train_proteins)].index
    test_indices = data[data['protein_id'].isin(test_proteins)].index

    # Create train and test datasets
    train_dataset = torch.utils.data.TensorDataset(X[train_indices], y[train_indices])
    test_dataset = torch.utils.data.TensorDataset(X[test_indices], y[test_indices])
    return train_dataset, test_dataset

In [78]:
# Split data such that each drug has a fixed ratio of entries in both training and testing split

def split_by_drug(data, X, y):
    # Group data by drug
    grouped = data.groupby('drug_id')

    train_indices = []
    test_indices = []

    # Split each group
    for _, group in grouped:
        train_idx, test_idx = train_test_split(group.index, test_size=0.3, random_state=42)
        train_indices.extend(train_idx)
        test_indices.extend(test_idx)

    # Create train and test datasets
    train_dataset = torch.utils.data.TensorDataset(X[train_indices], y[train_indices])
    test_dataset = torch.utils.data.TensorDataset(X[test_indices], y[test_indices])
    return train_dataset, test_dataset


In [79]:
# Split data that some drugs are entirely new in the testing split

def split_new_drugs(data, X, y):
    # Get unique drug IDs
    unique_drugs = data['drug_id'].unique()

    # Shuffle the drug IDs
    np.random.shuffle(unique_drugs)

    # Split the drug IDs into training and testing sets
    train_drugs = unique_drugs[:int(0.9 * len(unique_drugs))]
    test_drugs = unique_drugs[int(0.9 * len(unique_drugs)):]

    # Get the indices for the training and testing sets
    train_indices = data[data['drug_id'].isin(train_drugs)].index
    test_indices = data[data['drug_id'].isin(test_drugs)].index

    # Create train and test datasets
    train_dataset = torch.utils.data.TensorDataset(X[train_indices], y[train_indices])
    test_dataset = torch.utils.data.TensorDataset(X[test_indices], y[test_indices])
    return train_dataset, test_dataset

In [80]:
# Concordance index

from lifelines.utils import concordance_index

def evaluate_ci(test_targets, test_predictions):
    # Calculate the concordance index
    c_index = concordance_index(test_targets, test_predictions)
    print(f'Concordance Index: {c_index}')
    

In [81]:
# Mean square error

from sklearn.metrics import mean_squared_error

def evaluate_mse(test_targets, test_predictions):
    # Calculate the mean-square error
    mse = mean_squared_error(test_targets, test_predictions)
    print(f'Mean-Square Error: {mse}')

In [82]:
# Pearson correlation coefficient

from scipy.stats import pearsonr

def evaluate_pearsonr(test_targets, test_predictions):
    # Calculate the Pearson correlation coefficient
    pearson_corr, _ = pearsonr(test_targets, test_predictions)
    print(f'Pearson Correlation Coefficient: {pearson_corr}')

In [83]:
# Area under the precision-recall curve

from sklearn.metrics import precision_recall_curve, auc
from sklearn.preprocessing import Binarizer

def evaluate_auprc(test_targets, test_predictions):
    # Binarize the targets and predictions
    binarizer = Binarizer(threshold=7.0)
    test_targets_bin = binarizer.fit_transform(np.array(test_targets).reshape(-1, 1)).flatten()
    test_predictions_bin = binarizer.transform(np.array(test_predictions).reshape(-1, 1)).flatten()
    
    precision, recall, _ = precision_recall_curve(test_targets_bin, test_predictions_bin)
    auprc = auc(recall, precision)
    print(f'Area Under the Precision-Recall Curve: {auprc}')

In [84]:
# r_m^2 index

from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

def evaluate_rm2(test_targets, test_predictions):
    y_obs = np.array(test_targets)
    y_pred = np.array(test_predictions)

    # Calculate r^2
    r2 = r2_score(y_obs, y_pred)

    # Calculate r_0^2 (Linear regression without intercept)
    model_no_intercept = LinearRegression(fit_intercept=False)
    model_no_intercept.fit(y_obs.reshape(-1, 1), y_pred)
    y_pred_no_intercept = model_no_intercept.predict(y_obs.reshape(-1, 1))
    r2_0 = r2_score(y_obs, y_pred_no_intercept)

    if (r2 - r2_0 < 0):
        print(f'r^2: {r2}, r_0^2: {r2_0}')
        print('r_m^2 index is not defined')
        return

    # Calculate r_m^2
    rm2 = r2 * (1 - np.sqrt(r2 - r2_0))
    print(f'rm2: {rm2}')

In [85]:
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(enc_protein_len_max + enc_smiles_len_max, 500)
        self.fc2 = nn.Linear(500, 20)
        self.fc3 = nn.Linear(20, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [86]:
# Prepare training and testing datasets

train_dataset, test_dataset = split_new_drugs(data, X, y)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [87]:
# Initialize the model, loss function, and optimizer
model = SimpleNN()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}')

# Evaluate the model
model.eval()
test_loss = 0.0
with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        test_loss += loss.item()
print(f'Test Loss: {test_loss/len(test_loader)}')



Epoch 1/10, Loss: 0.9482549312452687
Epoch 2/10, Loss: 0.6632592715367163
Epoch 3/10, Loss: 0.6461594050853985
Epoch 4/10, Loss: 0.6320316008737361
Epoch 5/10, Loss: 0.6080344259243985
Epoch 6/10, Loss: 0.5947562843684907
Epoch 7/10, Loss: 0.5983285294450192
Epoch 8/10, Loss: 0.572738699632678
Epoch 9/10, Loss: 0.5637190953816242
Epoch 10/10, Loss: 0.5622930572161612
Test Loss: 0.7603720065167884


In [88]:
# Get the predictions and true values for the test set
model.eval()
test_predictions = []
test_targets = []
with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        test_predictions.extend(outputs.numpy().flatten())
        test_targets.extend(targets.numpy().flatten())


evaluate_ci(test_targets, test_predictions)
evaluate_mse(test_targets, test_predictions)
evaluate_pearsonr(test_targets, test_predictions)
evaluate_auprc(test_targets, test_predictions)
evaluate_rm2(test_targets, test_predictions)

Concordance Index: 0.6800800171187353
Mean-Square Error: 0.7621972494008337
Pearson Correlation Coefficient: 0.2803141089292507
Area Under the Precision-Recall Curve: 0.039107950872656755
r^2: 0.05363047122955322, r_0^2: 0.9964703321456909
r_m^2 index is not defined
