In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import json
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

2025-10-10 22:08:02.410501: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Sequence-length and embedding hyperparameters shared by the tokenizers and the model.
MAX_PROT_LEN = 1200  # protein sequences are truncated / zero-padded to this many tokens
MAX_DRUG_LEN = 85  # SMILES strings are truncated / zero-padded to this many tokens
EMBED_DIM = 128  # output dimension of both Embedding layers

In [7]:
# Location of the whitespace-delimited Davis file (no header row).
DATA_PATH = r"/teamspace/studios/this_studio/datasets/davis.txt"

# Column labels assigned to the five positional fields of each row.
DAVIS_COLUMNS = ["drug_id", "protein_id", "smiles", "sequence", "affinity"]

# The regex separator (any run of spaces/tabs) requires the python parsing engine.
df = pd.read_csv(
    DATA_PATH,
    names=DAVIS_COLUMNS,
    header=None,
    sep=r'\s+',
    engine="python",
)

n_samples = len(df)
print(f"Loaded {n_samples} samples from {DATA_PATH}")
print(df.head())


Loaded 30056 samples from /teamspace/studios/this_studio/datasets/davis.txt
    drug_id    protein_id                                             smiles  \
0  11314340          AAK1  CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...   
1  11314340   ABL1(E255K)  CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...   
2  11314340   ABL1(F317I)  CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...   
3  11314340  ABL1(F317I)p  CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...   
4  11314340   ABL1(F317L)  CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...   

                                            sequence  affinity  
0  MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...  7.366532  
1  PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...  5.000000  
2  PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...  5.000000  
3  PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...  5.000000  
4  PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...  5.000000  


In [None]:
# Character-level vocabularies. Index 0 is never assigned to a symbol because
# the tokenizers use 0 as the padding value, so real tokens start at 1.

# SMILES alphabet: every distinct character seen in the "smiles" column, sorted
# so the character -> index mapping is deterministic.
smiles_corpus = "".join(df["smiles"].astype(str).values)
all_smiles_chars = sorted(set(smiles_corpus))
smiles_token2idx = {
    symbol: position
    for position, symbol in enumerate(all_smiles_chars, start=1)
}

# Protein alphabet: same construction over the "sequence" column.
protein_corpus = "".join(df["sequence"].astype(str).values)
all_protein_aas = sorted(set(protein_corpus))
protein_token2idx = {
    residue: position
    for position, residue in enumerate(all_protein_aas, start=1)
}

def tokenize_smiles(smiles_string, max_len=None, token2idx=None):
    """Encode a SMILES string as a fixed-length vector of integer token ids.

    The string is first truncated to ``max_len`` characters. Characters that are
    not in the vocabulary are then dropped (they do not occupy a position), and
    the remaining ids are right-padded with 0.

    Args:
        smiles_string: SMILES string to encode.
        max_len: Output length. Defaults to the module-level ``MAX_DRUG_LEN``.
        token2idx: Mapping from character to integer id. Defaults to the
            module-level ``smiles_token2idx``.

    Returns:
        A 1-D ``np.int64`` array of length ``max_len``.
    """
    if max_len is None:
        max_len = MAX_DRUG_LEN
    if token2idx is None:
        token2idx = smiles_token2idx

    token_ids = [token2idx[ch] for ch in smiles_string[:max_len] if ch in token2idx]

    # Fill a pre-typed zero buffer instead of calling np.pad on the list: np.pad
    # on an empty list yields float64, which would upcast the stacked matrix.
    encoded = np.zeros(max_len, dtype=np.int64)
    if token_ids:
        encoded[:len(token_ids)] = token_ids

    return encoded

def tokenize_protein(sequence_string, max_len=None, token2idx=None):
    """Encode a protein sequence as a fixed-length vector of integer token ids.

    The sequence is first truncated to ``max_len`` characters. Characters that
    are not in the vocabulary are then dropped (they do not occupy a position),
    and the remaining ids are right-padded with 0.

    Args:
        sequence_string: Amino-acid sequence to encode.
        max_len: Output length. Defaults to the module-level ``MAX_PROT_LEN``.
        token2idx: Mapping from character to integer id. Defaults to the
            module-level ``protein_token2idx``.

    Returns:
        A 1-D ``np.int64`` array of length ``max_len``.
    """
    if max_len is None:
        max_len = MAX_PROT_LEN
    if token2idx is None:
        token2idx = protein_token2idx

    token_ids = [token2idx[ch] for ch in sequence_string[:max_len] if ch in token2idx]

    # Fill a pre-typed zero buffer instead of calling np.pad on the list: np.pad
    # on an empty list yields float64, which would upcast the stacked matrix.
    encoded = np.zeros(max_len, dtype=np.int64)
    if token_ids:
        encoded[:len(token_ids)] = token_ids

    return encoded

# Encode every row of the frame; each tokenizer call returns one fixed-length vector.
drug_token_arrays = [tokenize_smiles(smiles_text) for smiles_text in df["smiles"]]
protein_token_arrays = [tokenize_protein(protein_text) for protein_text in df["sequence"]]

# Stack the per-sample vectors into (n_samples, sequence_length) matrices.
X_drug = np.vstack(drug_token_arrays)
X_prot = np.vstack(protein_token_arrays)

# Regression target as a float32 column vector of shape (n_samples, 1).
affinity_values = df["affinity"].values
y = affinity_values.astype(np.float32).reshape(-1, 1)

print("Drugs:", X_drug.shape)
print("Proteins:", X_prot.shape)
print("Affinities:", y.shape)

Drugs: (30056, 85)
Proteins: (30056, 1200)
Affinities: (30056, 1)


In [9]:
# Shuffled 80/20 split with a fixed seed; the three arrays are split with the
# same row permutation so protein, drug and affinity stay aligned.
split_arrays = train_test_split(
    X_prot,
    X_drug,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
(
    X_prot_train,
    X_prot_test,
    X_drug_train,
    X_drug_test,
    y_train,
    y_test,
) = split_arrays

print("Train size:", len(y_train))
print("Test size:", len(y_test))

Train size: 24044
Test size: 6012


In [10]:
def _conv_branch(sequence_input, vocab_size):
    """Embed a token sequence, apply three stacked 1-D convolutions, and max-pool over positions."""
    features = Embedding(input_dim=vocab_size, output_dim=EMBED_DIM)(sequence_input)
    # (filters, kernel width) for each successive convolution.
    for n_filters, kernel_width in ((32, 4), (64, 6), (96, 8)):
        features = Conv1D(n_filters, kernel_width, activation="relu", padding="valid")(features)
    return GlobalMaxPooling1D()(features)


# Protein branch. The vocabulary size is +1 because id 0 is the padding token.
protein_input = Input(shape=(MAX_PROT_LEN,), name="protein_input")
p_flat = _conv_branch(protein_input, len(all_protein_aas) + 1)

# Drug (SMILES) branch, same architecture over the SMILES vocabulary.
drug_input = Input(shape=(MAX_DRUG_LEN,), name="drug_input")
d_flat = _conv_branch(drug_input, len(all_smiles_chars) + 1)

# Fully connected regression head over the concatenated branch features.
merged = Concatenate()([p_flat, d_flat])
hidden = Dense(1024, activation="relu")(merged)
hidden = Dropout(0.1)(hidden)
hidden = Dense(1024, activation="relu")(hidden)
hidden = Dropout(0.1)(hidden)
hidden = Dense(512, activation="relu")(hidden)
output = Dense(1, activation="linear")(hidden)

model = Model(inputs=[protein_input, drug_input], outputs=output)
model.compile(loss="mse", optimizer=Adam(learning_rate=1e-4))

model.summary()

I0000 00:00:1760134094.920487    4973 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [11]:
# Stop after 30 epochs without val_loss improvement and roll back to the best weights.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=30,
    restore_best_weights=True,
)
# Halve the learning rate after 10 stagnant epochs, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=10,
    min_lr=1e-6,
)

train_inputs = [X_prot_train, X_drug_train]
fit_options = dict(
    validation_split=0.1,
    epochs=100,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=1,
)
history = model.fit(train_inputs, y_train, **fit_options)

Epoch 1/100


2025-10-10 22:08:20.867632: I external/local_xla/xla/service/service.cc:163] XLA service 0x7ce3a0007540 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-10-10 22:08:20.867669: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-10-10 22:08:21.004535: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-10-10 22:08:21.671568: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002


[1m  3/339[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m19s[0m 58ms/step - loss: 32.1475

I0000 00:00:1760134107.272954   24839 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 49ms/step - loss: 2.3508 - val_loss: 0.6075 - learning_rate: 1.0000e-04
Epoch 2/100
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 30ms/step - loss: 0.6518 - val_loss: 0.5436 - learning_rate: 1.0000e-04
Epoch 3/100
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 31ms/step - loss: 0.6131 - val_loss: 0.5561 - learning_rate: 1.0000e-04
Epoch 4/100
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 31ms/step - loss: 0.5907 - val_loss: 0.5137 - learning_rate: 1.0000e-04
Epoch 5/100
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 31ms/step - loss: 0.5802 - val_loss: 0.4936 - learning_rate: 1.0000e-04
Epoch 6/100
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - loss: 0.5652 - val_loss: 0.5838 - learning_rate: 1.0000e-04
Epoch 7/100
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - loss: 0.5265 - 

In [12]:
# Score the held-out split and derive the squared-error metrics.
test_inputs = [X_prot_test, X_drug_test]
y_pred = model.predict(test_inputs)
squared_errors = np.square(y_test - y_pred)
mse = squared_errors.mean()
rmse = np.sqrt(mse)

def concordance_index(y_true, y_pred):
    """Compute the concordance index (CI) between true and predicted values.

    Every pair (i, j) with ``y_true[i] != y_true[j]`` is a comparable pair. A
    comparable pair scores 1 when the predictions are ordered the same way as
    the true values, 0.5 when the two predictions are equal, and 0 otherwise.
    The CI is the total score divided by the number of comparable pairs.

    Args:
        y_true: 1-D sequence/array of ground-truth values.
        y_pred: 1-D sequence/array of predictions, same length as ``y_true``.

    Returns:
        The concordance index as a float, or 0 when there are no comparable pairs.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )

    pairs = 0
    conc = 0.0
    # Compare sample i against all later samples at once: the inner loop of the
    # naive O(n^2) implementation becomes a handful of vectorised operations.
    for i in range(len(y_true) - 1):
        true_i, pred_i = y_true[i], y_pred[i]
        true_rest, pred_rest = y_true[i + 1:], y_pred[i + 1:]

        comparable = true_rest != true_i
        # Strict inequalities on the true values imply the pair is comparable.
        concordant = ((pred_i > pred_rest) & (true_i > true_rest)) | (
            (pred_i < pred_rest) & (true_i < true_rest)
        )
        # Tied predictions are disjoint from concordant pairs (which need a strict order).
        tied = comparable & (pred_rest == pred_i)

        pairs += int(np.count_nonzero(comparable))
        conc += int(np.count_nonzero(concordant)) + 0.5 * int(np.count_nonzero(tied))

    return conc / pairs if pairs > 0 else 0

# Flatten the (n, 1) column vectors so the CI helper receives 1-D arrays.
y_true_flat = y_test.ravel()
y_pred_flat = y_pred.ravel()
ci = concordance_index(y_true=y_true_flat, y_pred=y_pred_flat)

# Report all test-set metrics to four decimals.
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"CI: {ci:.4f}")

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step
MSE: 0.2315
RMSE: 0.4811
CI: 0.8761


In [13]:
# Persist every artifact of the run (model, weights, tokenizers, history,
# metrics, predictions) under a single output directory.
SAVE_DIR = r"/teamspace/studios/this_studio/models/davis_better"
os.makedirs(SAVE_DIR, exist_ok=True)

# Full model (architecture + weights + optimizer state) in the native Keras format.
model_save_path = os.path.join(SAVE_DIR, "deepdta_davis_model.keras")
model.save(model_save_path)
print(f"Model saved at: {model_save_path}")

# Weights only, for loading into an identically built architecture.
weights_path = os.path.join(SAVE_DIR, "deepdta_davis.weights.h5")
model.save_weights(weights_path)
print(f"Weights saved at: {weights_path}")

# Character -> index vocabularies, needed to tokenize inputs at inference time.
tokenizers_path = os.path.join(SAVE_DIR, "tokenizers.pkl")
with open(tokenizers_path, "wb") as f:
    pickle.dump({
        "smiles_token2idx": smiles_token2idx,
        "protein_token2idx": protein_token2idx
    }, f)
print(f"Tokenizers saved at: {tokenizers_path}")

# Per-epoch training curves. Values are coerced to built-in floats because some
# Keras versions log the learning rate as np.float32, which json cannot serialize.
history_path = os.path.join(SAVE_DIR, "training_history.json")
serializable_history = {
    metric_name: [float(value) for value in values]
    for metric_name, values in history.history.items()
}
with open(history_path, "w") as f:
    json.dump(serializable_history, f)
print(f"Training history saved at: {history_path}")

# Scalar test-set metrics; float() converts NumPy scalars for JSON.
results_path = os.path.join(SAVE_DIR, "results.json")
results = {
    "mse": float(mse),
    "rmse": float(rmse),
    "ci": float(ci)
}
with open(results_path, "w") as f:
    json.dump(results, f, indent=4)
print(f"Results saved at: {results_path}")

# Row-aligned true vs. predicted affinities for the test split.
preds_path = os.path.join(SAVE_DIR, "predictions.csv")
pd.DataFrame({
    "y_true": y_test.ravel(),
    "y_pred": y_pred.ravel()
}).to_csv(preds_path, index=False)
print(f"Predictions saved at: {preds_path}")


Model saved at: /teamspace/studios/this_studio/models/davis_better/deepdta_davis_model.keras
Weights saved at: /teamspace/studios/this_studio/models/davis_better/deepdta_davis.weights.h5
Tokenizers saved at: /teamspace/studios/this_studio/models/davis_better/tokenizers.pkl
Training history saved at: /teamspace/studios/this_studio/models/davis_better/training_history.json
Results saved at: /teamspace/studios/this_studio/models/davis_better/results.json
Predictions saved at: /teamspace/studios/this_studio/models/davis_better/predictions.csv
