<a href="https://colab.research.google.com/github/alirezafarhadi01/DrugDesignCourse-FinalProject/blob/main/DeepDTA_Version_5_(ChemBERTa%2C_ProtBERT_).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import json
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.metrics import mean_squared_error
import os
import warnings
import zipfile
import time

warnings.filterwarnings('ignore')

try:
    import torch
    from transformers import AutoTokenizer, AutoModel
except ImportError:
    import subprocess
    import sys
    print("Installing PyTorch and Transformers from Hugging Face...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "transformers"])
    import torch
    from transformers import AutoTokenizer, AutoModel

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['OMP_NUM_THREADS'] = str(os.cpu_count())

np.random.seed(42)
tf.random.set_seed(42)

KIBA_DATA_PATH = './kiba'
SPLIT_PATH = "/content/New Data/kiba_split.json"

PROTBERT_MODEL_NAME = "Rostlab/prot_bert_bfd"
CHEMBERTA_MODEL_NAME = "seyonec/ChemBERTa-zinc-base-v1"
PROTBERT_EMBED_DIM = 1024
CHEMBERTA_EMBED_DIM = 768

try:
    tf.config.optimizer.set_jit(True)
except:
    pass

def generate_embeddings(sequences, model_name, saved_path, max_len=512, batch_size=32):
    if os.path.exists(saved_path):
        print(f"Loading pre-computed embeddings from {saved_path}...")
        return np.load(saved_path)
    print(f"Generating embeddings with {model_name}. This may take a while...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    all_embeddings = []
    num_batches = (len(sequences) + batch_size - 1) // batch_size
    print(f"Total batches to process: {num_batches}")
    for batch_num, i in enumerate(range(0, len(sequences), batch_size)):
        if (batch_num + 1) % 10 == 0:
            print(f"Processing batch {batch_num + 1}/{num_batches}...")
        batch_seqs = sequences[i:i + batch_size]
        if "prot_bert" in model_name.lower():
            batch_seqs = [" ".join(list(seq)) for seq in batch_seqs]
        inputs = tokenizer(batch_seqs, return_tensors="pt", truncation=True, padding=True, max_length=max_len).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        all_embeddings.append(embeddings)
    final_embeddings = np.concatenate(all_embeddings, axis=0)
    print(f"Saving embeddings to {saved_path}...")
    np.save(saved_path, final_embeddings)
    return final_embeddings

def concordance_index(y_true, y_pred):
    y_true = y_true.flatten().astype(np.float32)
    y_pred = y_pred.flatten().astype(np.float32)
    mask = ~(np.isnan(y_true) | np.isnan(y_pred))
    y_true, y_pred = y_true[mask], y_pred[mask]
    if len(y_true) < 2: return 0.0
    if len(y_true) > 10000:
        indices = np.random.choice(len(y_true), 10000, replace=False)
        y_true, y_pred = y_true[indices], y_pred[indices]
    n = len(y_true)
    indices = np.arange(n)
    i_indices, j_indices = np.meshgrid(indices, indices, indexing='ij')
    mask = i_indices < j_indices
    y_true_i, y_true_j = y_true[i_indices[mask]], y_true[j_indices[mask]]
    y_pred_i, y_pred_j = y_pred[i_indices[mask]], y_pred[j_indices[mask]]
    different_mask = y_true_i != y_true_j
    if np.sum(different_mask) == 0: return 0.0
    y_true_i, y_true_j = y_true_i[different_mask], y_true_j[different_mask]
    y_pred_i, y_pred_j = y_pred_i[different_mask], y_pred_j[different_mask]
    concordant = ((y_true_i > y_true_j) & (y_pred_i > y_pred_j)) | ((y_true_i < y_true_j) & (y_pred_i < y_pred_j))
    ties = (y_pred_i == y_pred_j)
    return (np.sum(concordant) + 0.5 * np.sum(ties)) / len(y_true_i)

class OptimizedKibaDataProcessor:
    def __init__(self, data_path='./kiba'):
        self.data_path = data_path

    def load_data(self):
        ligands_path = os.path.join(self.data_path, 'ligands_can.txt')
        proteins_path = os.path.join(self.data_path, 'proteins.txt')
        with open(ligands_path, 'r') as f: drugs_dict = json.load(f)
        with open(proteins_path, 'r') as f: proteins_dict = json.load(f)
        y_file_path = os.path.join(self.data_path, 'Y')
        with open(y_file_path, 'rb') as f: affinity_matrix = pickle.load(f, encoding='latin1')
        valid_indices = np.where(~np.isnan(affinity_matrix))
        total_valid_pairs = len(valid_indices[0])

        if not os.path.exists(SPLIT_PATH):
            raise FileNotFoundError(f"Split file not found at {SPLIT_PATH}")
        with open(SPLIT_PATH, "r") as f:
            split = json.load(f)
        train_indices = np.array(split["train_indices"], dtype=int)
        test_indices  = np.array(split["test_indices"], dtype=int)
        if "total_valid_pairs" in split and int(split["total_valid_pairs"]) != int(total_valid_pairs):
            print(f"[warn] total_valid_pairs in split ({split['total_valid_pairs']}) != current ({total_valid_pairs})")

        return drugs_dict, proteins_dict, affinity_matrix, train_indices, test_indices, valid_indices

    def process_data(self):
        data = self.load_data()
        if data is None: return None
        drugs_dict, proteins_dict, affinity_matrix, train_indices, test_indices, valid_indices = data
        all_drug_smiles = list(drugs_dict.values())
        all_protein_seqs = list(proteins_dict.values())
        drug_embeddings = generate_embeddings(all_drug_smiles, CHEMBERTA_MODEL_NAME, os.path.join(self.data_path, "chemberta_embeddings.npy"))
        prot_embeddings = generate_embeddings(all_protein_seqs, PROTBERT_MODEL_NAME, os.path.join(self.data_path, "protbert_embeddings.npy"))
        def create_dataset_from_embeddings(indices, set_name):
            print(f"Assembling {set_name} set from pre-computed embeddings...")
            original_drug_indices = valid_indices[0][indices]
            original_prot_indices = valid_indices[1][indices]
            drug_vectors = drug_embeddings[original_drug_indices]
            prot_vectors = prot_embeddings[original_prot_indices]
            affinities = affinity_matrix[original_drug_indices, original_prot_indices]
            return drug_vectors, prot_vectors, affinities.astype(np.float32)
        XD_train, XT_train, y_train = create_dataset_from_embeddings(train_indices, "training")
        XD_test, XT_test, y_test = create_dataset_from_embeddings(test_indices, "test")
        return (XD_train, XT_train, y_train), (XD_test, XT_test, y_test)

def build_improved_deepdta_model(drug_embed_dim, prot_embed_dim):
    DENSE_UNITS_1 = 1024
    DENSE_UNITS_2 = 512
    DROPOUT_RATE = 0.2
    LEARNING_RATE = 0.001
    drug_input = layers.Input(shape=(drug_embed_dim,), name='drug_input_vector')
    prot_input = layers.Input(shape=(prot_embed_dim,), name='protein_input_vector')
    combined = layers.Concatenate(axis=-1, name='concatenate_branches')([drug_input, prot_input])
    fc1 = layers.Dense(DENSE_UNITS_1, activation='relu', name='fully_connected_1')(combined)
    bn1 = layers.BatchNormalization(name='batch_norm_1')(fc1)
    drop1 = layers.Dropout(DROPOUT_RATE, name='dropout_1')(bn1)
    fc2 = layers.Dense(DENSE_UNITS_2, activation='relu', name='fully_connected_2')(drop1)
    bn2 = layers.BatchNormalization(name='batch_norm_2')(fc2)
    drop2 = layers.Dropout(DROPOUT_RATE, name='dropout_2')(bn2)
    output = layers.Dense(1, activation='linear', name='output')(drop2)
    model = Model(inputs=[drug_input, prot_input], outputs=output, name='DTA_with_Pretrained_Embeddings')
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_squared_error'])
    return model

print("All functions and classes defined successfully!")
print("OptimizedKibaDataProcessor is now available!")
print("Ready to process data!")


All functions and classes defined successfully!
OptimizedKibaDataProcessor is now available!
Ready to process data!


In [4]:
#Section 2

import zipfile
import os

# Define the path to your zip file
zip_file_path = '/content/data.zip' # Replace with the actual path to your .zip file

# Define the directory where you want to extract the contents
# If the directory doesn't exist, it will be created.
extract_dir = '/content/' # You can change this to your desired extraction path

# Create the extraction directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Extract the zip file
try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print(f"Successfully extracted '{zip_file_path}' to '{extract_dir}'")
except FileNotFoundError:
    print(f"Error: The file '{zip_file_path}' was not found.")
except zipfile.BadZipFile:
    print(f"Error: '{zip_file_path}' is not a valid zip file.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

# Optional: List the contents of the extracted directory to verify
print("\nContents of the extracted directory:")
for item in os.listdir(extract_dir):
    print(os.path.join(extract_dir, item))

Successfully extracted '/content/data.zip' to '/content/'

Contents of the extracted directory:
/content/.config
/content/data.zip
/content/New Data
/content/.ipynb_checkpoints
/content/davis
/content/kiba
/content/sample_data


In [17]:
# Section 3

print("=== Starting Data Processing ===")

if not os.path.exists(KIBA_DATA_PATH):
    print(f"Data path not found: {KIBA_DATA_PATH}")
    print("Please make sure you have uploaded and extracted the kiba folder.")
    print("Current directory contents:")
    for item in os.listdir('.'):
        print(f"  - {item}")
else:
    print(f"Data path found: {KIBA_DATA_PATH}")

    processor = OptimizedKibaDataProcessor(data_path=KIBA_DATA_PATH)

    processed_data = processor.process_data()
    if processed_data is None:
        print("Failed to process data. Please check your data files.")
    else:
        (XD_train, XT_train, y_train), (XD_test, XT_test, y_test) = processed_data

        print("Data processing completed successfully!")
        print("Data shapes:")
        print(f"   XD_train: {XD_train.shape}, dtype: {XD_train.dtype}")
        print(f"   XT_train: {XT_train.shape}, dtype: {XT_train.dtype}")
        print(f"   y_train: {y_train.shape}, dtype: {y_train.dtype}")
        print(f"   XD_test:  {XD_test.shape},  dtype: {XD_test.dtype}")
        print(f"   XT_test:  {XT_test.shape},  dtype: {XT_test.dtype}")
        print(f"   y_test:   {y_test.shape},   dtype: {y_test.dtype}")


=== Starting Data Processing ===
Data path found: ./kiba
Loading pre-computed embeddings from ./kiba/chemberta_embeddings.npy...
Loading pre-computed embeddings from ./kiba/protbert_embeddings.npy...
Assembling training set from pre-computed embeddings...
Assembling test set from pre-computed embeddings...
Data processing completed successfully!
Data shapes:
   XD_train: (94603, 768), dtype: float32
   XT_train: (94603, 1024), dtype: float32
   y_train: (94603,), dtype: float32
   XD_test:  (23651, 768),  dtype: float32
   XT_test:  (23651, 1024),  dtype: float32
   y_test:   (23651,),   dtype: float32


In [18]:
#Section 4

# CORRECTED CONFIGURATION
REDUCE_DATASET = True
DATASET_FRACTION = 1
EPOCHS = 100
BATCH_SIZE = 256  # REDUCED for better stability
VALIDATION_SPLIT = 0.1

print("CORRECTED CONFIGURATION:")
print("=" * 50)
print(f"Batch size: {BATCH_SIZE}")
print(f"Epochs: {EPOCHS}")

# FIXED: Proper dataset reduction
if 'XD_train' in locals() and len(XD_train) > 1000:  # Only if we have full data

    # Use the ORIGINAL full dataset, not the already reduced one
    print(f"Using original dataset: {len(XD_train):,} training samples")

    # Calculate correct sample sizes
    n_train = int(len(XD_train) * DATASET_FRACTION)
    n_test = int(len(XD_test) * DATASET_FRACTION)

    print(f"Target training samples: {n_train:,}")
    print(f"Target test samples: {n_test:,}")

    # Randomly sample subset
    np.random.seed(42)  # For reproducibility
    train_indices = np.random.choice(len(XD_train), n_train, replace=False)
    test_indices = np.random.choice(len(XD_test), n_test, replace=False)

    XD_train = XD_train[train_indices]
    XT_train = XT_train[train_indices]
    y_train = y_train[train_indices]

    XD_test = XD_test[test_indices]
    XT_test = XT_test[test_indices]
    y_test = y_test[test_indices]

    # Calculate batches per epoch
    batches_per_epoch = len(XD_train) // BATCH_SIZE

    print(f"CORRECTED DATASET:")
    print(f"   Training: {len(XD_train):,} samples")
    print(f"   Test: {len(XD_test):,} samples")
    print(f"   Batches per epoch: {batches_per_epoch}")
    print(f"   Much more stable training expected!")

    # Use corrected datasets
    XD_train = XD_train
    XT_train = XT_train
    y_train = y_train
    XD_test = XD_test
    XT_test = XT_test
    y_test = y_test

    print("Ready for corrected training!")

elif 'XD_train' in locals():
    print(f"Current dataset already reduced: {len(XD_train):,} samples")
    print("Please restart and run data processing again for full dataset")

    # Still apply batch size fix for current data
    batches_per_epoch = len(XD_train) // BATCH_SIZE
    print(f"Batches per epoch with current data: {batches_per_epoch}")

else:
    print("Cannot apply optimizations - data not loaded yet!")

CORRECTED CONFIGURATION:
Batch size: 256
Epochs: 100
Using original dataset: 94,603 training samples
Target training samples: 94,603
Target test samples: 23,651
CORRECTED DATASET:
   Training: 94,603 samples
   Test: 23,651 samples
   Batches per epoch: 369
   Much more stable training expected!
Ready for corrected training!


In [19]:
# Section 5

# Build the model (embedding-based)
if 'XD_train' in locals() and 'XT_train' in locals():
    print("\n=== Building Model (Embeddings) ===")

    # infer input dims from data (safer than hard-coding)
    drug_embed_dim = int(XD_train.shape[1])
    prot_embed_dim = int(XT_train.shape[1])

    # or, if you prefer hard-coded constants from Section 1:
    # drug_embed_dim = CHEMBERTA_EMBED_DIM
    # prot_embed_dim = PROTBERT_EMBED_DIM

    model = build_improved_deepdta_model(
        drug_embed_dim=drug_embed_dim,
        prot_embed_dim=prot_embed_dim
    )

    print("\n=== Model Summary ===")
    model.summary()
    print("Model built successfully!")

    # expose for later sections
    fast_model = model
else:
    print("Cannot build model - data processing failed or incomplete")



=== Building Model (Embeddings) ===

=== Model Summary ===


Model built successfully!


In [20]:
#Section 6

# OPTIMIZED TRAINING
if 'fast_model' in locals():
    print("STARTING OPTIMIZED TRAINING ...")
    print("=" * 55)

    # Training configuration summary
    print(f"Configuration Summary:")
    print(f"   Batch size: {BATCH_SIZE}")
    print(f"   Epochs: {EPOCHS}")
    print(f"   Training samples: {len(XD_train):,}")
    print(f"   Test samples: {len(XD_test):,}")
    print(f"   Batches per epoch: {len(XD_train) // BATCH_SIZE}")

    # Optimized callbacks for the full dataset
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=15,  # More patience for larger dataset
            restore_best_weights=True,
            verbose=1
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=7,   # More patience for larger dataset
            min_lr=1e-6,
            verbose=1
        )
    ]

    print(f"\nStarting training...")
    print(f"Expected CI: 0.72-0.78 (significant improvement over 50%)")
    print(f"Expected total time: 1-3 minutes")
    print("=" * 55)

    import time
    start_time = time.time()

    # Train the model
    history = fast_model.fit(
        x=[XD_train, XT_train],
        y=y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=VALIDATION_SPLIT,
        callbacks=callbacks,
        verbose=1
    )

    end_time = time.time()
    training_time = end_time - start_time

    print(f"\nTRAINING COMPLETED!")
    print("=" * 30)
    print(f"Total training time: {training_time/60:.1f} minutes")
    print(f"Average per epoch: {training_time/len(history.history['loss']):.1f} seconds")
    print(f"Best validation loss: {min(history.history['val_loss']):.4f}")
    print(f"Final training loss: {history.history['loss'][-1]:.4f}")
    print(f"Total epochs completed: {len(history.history['loss'])}")

else:
    print("Cannot start training - model not built")

STARTING OPTIMIZED TRAINING ...
Configuration Summary:
   Batch size: 256
   Epochs: 100
   Training samples: 94,603
   Test samples: 23,651
   Batches per epoch: 369

Starting training...
Expected CI: 0.72-0.78 (significant improvement over 50%)
Expected total time: 1-3 minutes
Epoch 1/100
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 122ms/step - loss: 75.8480 - mean_squared_error: 75.8480 - val_loss: 0.8763 - val_mean_squared_error: 0.8763 - learning_rate: 0.0010
Epoch 2/100
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 108ms/step - loss: 2.9600 - mean_squared_error: 2.9600 - val_loss: 0.6656 - val_mean_squared_error: 0.6656 - learning_rate: 0.0010
Epoch 3/100
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 104ms/step - loss: 2.3503 - mean_squared_error: 2.3503 - val_loss: 0.5942 - val_mean_squared_error: 0.5942 - learning_rate: 0.0010
Epoch 4/100
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 109ms/ste

In [21]:
#Section 7

# EVALUATION
if 'history' in locals():
    print("EVALUATING RESULTS...")
    print("=" * 45)

    # Make predictions
    print("Making predictions on test set...")
    y_pred_test = fast_model.predict([XD_test, XT_test], batch_size=BATCH_SIZE, verbose=1)

    # Calculate metrics
    print("Computing metrics...")
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_ci = concordance_index(y_test, y_pred_test)

    print(f"\n100% DATASET RESULTS:")
    print("=" * 30)
    print(f"Test MSE: {test_mse:.6f}")
    print(f"Test CI: {test_ci:.6f}")

    # Performance comparison across all datasets
    print(f"\nPERFORMANCE PROGRESSION:")
    print("-" * 35)
    print(f"   10% dataset: CI = 0.6552")
    print(f"   Full dataset: CI = {test_ci:.4f}")

    # Comparison with literature
    print(f"\nLiterature Comparison:")
    print("-" * 25)
    print(f"   Paper DeepDTA: 0.863")
    print(f"   Our 100% model: {test_ci:.3f}")
    print(f"   Performance ratio: {test_ci/0.863:.1%}")

    # Performance analysis
    if test_ci >= 0.72:
        print(f"\nEXCELLENT! CI ≥ 0.72 - Ready for full dataset!")
        print(f"Next step: Try Full dataset for CI ≈ 0.80+")
    elif test_ci >= 0.68:
        print(f"\nGOOD! CI ≥ 0.68 - Model scaling properly")
        print(f"Continue scaling to Full dataset")
    else:
        print(f"\nCI < 0.68 - May need model architecture improvements")

    # Additional metrics
    mae = np.mean(np.abs(y_test - y_pred_test.flatten()))
    rmse = np.sqrt(test_mse)
    r2 = 1 - np.var(y_test - y_pred_test.flatten()) / np.var(y_test)

    print(f"\nAdditional Metrics:")
    print("-" * 20)
    print(f"MAE: {mae:.6f}")
    print(f"RMSE: {rmse:.6f}")
    print(f"R² Score: {r2:.6f}")

    print("\n" + "="*45)
    print("Full DATASET EVALUATION COMPLETED!")
    print("="*45)

else:
    print("Cannot evaluate - training not completed")

EVALUATING RESULTS...
Making predictions on test set...
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 44ms/step
Computing metrics...

100% DATASET RESULTS:
Test MSE: 0.259478
Test CI: 0.812397

PERFORMANCE PROGRESSION:
-----------------------------------
   10% dataset: CI = 0.6552
   Full dataset: CI = 0.8124

Literature Comparison:
-------------------------
   Paper DeepDTA: 0.863
   Our 100% model: 0.812
   Performance ratio: 94.1%

EXCELLENT! CI ≥ 0.72 - Ready for full dataset!
Next step: Try Full dataset for CI ≈ 0.80+

Additional Metrics:
--------------------
MAE: 0.334473
RMSE: 0.509390
R² Score: 0.625458

Full DATASET EVALUATION COMPLETED!
