<a href="https://colab.research.google.com/github/alirezafarhadi01/DrugDesignCourse-FinalProject/blob/main/DeepDTA_Version_2_(Wide_%2B_BN_%2B_Dropout_0_1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Section 1

import json
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.metrics import mean_squared_error
import os
import warnings

warnings.filterwarnings('ignore')

# Install and import tqdm for progress bars
try:
    from tqdm import tqdm
except ImportError:
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tqdm"])
    from tqdm import tqdm

# Basic optimizations
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['OMP_NUM_THREADS'] = str(os.cpu_count())

# Set random seeds
np.random.seed(42)
tf.random.set_seed(42)

# Constants
KIBA_MAX_DRUG_LEN = 100
KIBA_MAX_PROT_LEN = 1000
EMBEDDING_DIM = 128
KIBA_DATA_PATH = './kiba'
SPLIT_PATH = "/content/New Data/kiba_split.json"

# Enable XLA for faster computation
try:
    tf.config.optimizer.set_jit(True)
except:
    pass

def concordance_index(y_true, y_pred):
    """Optimized concordance index calculation."""
    y_true = y_true.flatten().astype(np.float32)
    y_pred = y_pred.flatten().astype(np.float32)

    # Remove NaN values
    mask = ~(np.isnan(y_true) | np.isnan(y_pred))
    y_true = y_true[mask]
    y_pred = y_pred[mask]

    if len(y_true) < 2:
        return 0.0

    # For large datasets, use sampling for faster calculation
    if len(y_true) > 10000:
        indices = np.random.choice(len(y_true), 10000, replace=False)
        y_true = y_true[indices]
        y_pred = y_pred[indices]

    # Vectorized CI calculation
    n = len(y_true)
    indices = np.arange(n)
    i_indices, j_indices = np.meshgrid(indices, indices, indexing='ij')
    mask = i_indices < j_indices

    y_true_i = y_true[i_indices[mask]]
    y_true_j = y_true[j_indices[mask]]
    y_pred_i = y_pred[i_indices[mask]]
    y_pred_j = y_pred[j_indices[mask]]

    # Only consider pairs with different true values
    different_mask = y_true_i != y_true_j
    if np.sum(different_mask) == 0:
        return 0.0

    y_true_i = y_true_i[different_mask]
    y_true_j = y_true_j[different_mask]
    y_pred_i = y_pred_i[different_mask]
    y_pred_j = y_pred_j[different_mask]

    # Count concordant pairs
    concordant = ((y_true_i > y_true_j) & (y_pred_i > y_pred_j)) | \
                 ((y_true_i < y_true_j) & (y_pred_i < y_pred_j))
    ties = (y_pred_i == y_pred_j)

    return (np.sum(concordant) + 0.5 * np.sum(ties)) / len(y_true_i)

def create_char_vocab(sequences_dict):
    """Create character vocabulary with progress tracking."""
    char_set = set()

    # Use tqdm for vocabulary creation
    for seq in tqdm(sequences_dict.values(), desc="Processing sequences", unit="seq"):
        char_set.update(seq)

    chars = sorted(list(char_set))
    char_to_int = {char: idx + 1 for idx, char in enumerate(chars)}
    return char_to_int

def encode_sequences_batch(sequences, char_vocab, max_length, desc="Encoding sequences"):
    """Batch encode sequences with progress bar."""
    encoded_sequences = []

    for sequence in tqdm(sequences, desc=desc, unit="seq"):
        encoded = [char_vocab.get(char, 0) for char in sequence]
        if len(encoded) < max_length:
            encoded += [0] * (max_length - len(encoded))
        else:
            encoded = encoded[:max_length]
        encoded_sequences.append(encoded)

    return np.array(encoded_sequences, dtype=np.int32)

class OptimizedKibaDataProcessor:
    """Enhanced data processor with detailed progress tracking."""

    def __init__(self, data_path='./kiba', max_drug_len=100, max_prot_len=1000):
        self.data_path = data_path
        self.max_drug_len = max_drug_len
        self.max_prot_len = max_prot_len
        self.drug_vocab = None
        self.prot_vocab = None

    def load_data(self):
        """Load data with progress tracking."""

        try:
            # Load ligands and proteins
            ligands_path = os.path.join(self.data_path, 'ligands_can.txt')
            proteins_path = os.path.join(self.data_path, 'proteins.txt')

            with open(ligands_path, 'r') as f:
                drugs_dict = json.load(f)

            with open(proteins_path, 'r') as f:
                proteins_dict = json.load(f)

            # Load affinity matrix with progress indication
            y_file_path = os.path.join(self.data_path, 'Y')

            if os.path.exists(y_file_path):
                try:
                    with open(y_file_path, 'rb') as f:
                        affinity_matrix = pickle.load(f)
                except:
                    try:
                        with open(y_file_path, 'rb') as f:
                            affinity_matrix = pickle.load(f, encoding='latin1')
                    except Exception as e:
                        raise
            else:
                raise FileNotFoundError(f"Y file not found at {y_file_path}")

            # Find valid indices with progress
            valid_indices = np.where(~np.isnan(affinity_matrix))
            total_valid_pairs = len(valid_indices[0])

            # Load precomputed split (no new random split is created)
            if not os.path.exists(SPLIT_PATH):
                raise FileNotFoundError(f"Split file not found at {SPLIT_PATH}")
            with open(SPLIT_PATH, "r") as f:
                split = json.load(f)
            train_indices = np.array(split["train_indices"], dtype=int)
            test_indices  = np.array(split["test_indices"], dtype=int)

            if "total_valid_pairs" in split and int(split["total_valid_pairs"]) != int(total_valid_pairs):
                print(f"[warn] total_valid_pairs in split ({split['total_valid_pairs']}) != current ({total_valid_pairs})")

            return drugs_dict, proteins_dict, affinity_matrix, train_indices, test_indices, valid_indices

        except Exception as e:
            return None

    def process_data(self):
        """Process data with comprehensive progress tracking."""
        data = self.load_data()
        if data is None:
            return None

        drugs_dict, proteins_dict, affinity_matrix, train_indices, test_indices, valid_indices = data

        self.drug_vocab = create_char_vocab(drugs_dict)
        self.prot_vocab = create_char_vocab(proteins_dict)

        drug_ids = list(drugs_dict.keys())
        protein_ids = list(proteins_dict.keys())

        def create_dataset_optimized(indices, dataset_name):
            """Create dataset with detailed progress tracking."""

            drug_sequences = []
            prot_sequences = []
            affinities = []

            chunk_size = 10000
            total_chunks = (len(indices) + chunk_size - 1) // chunk_size

            # Overall progress bar for chunks
            chunk_pbar = tqdm(total=total_chunks, desc=f"Processing {dataset_name} chunks", unit="chunk")

            for chunk_idx, i in enumerate(range(0, len(indices), chunk_size)):
                chunk_indices = indices[i:i + chunk_size]
                chunk_drugs = []
                chunk_prots = []
                chunk_affs = []

                # Progress bar for processing samples in this chunk
                sample_pbar = tqdm(
                    chunk_indices,
                    desc=f"Chunk {chunk_idx + 1}/{total_chunks}",
                    unit="sample",
                    leave=False
                )

                for idx in sample_pbar:
                    if idx < len(valid_indices[0]):
                        drug_idx = valid_indices[0][idx]
                        prot_idx = valid_indices[1][idx]

                        if drug_idx < len(drug_ids) and prot_idx < len(protein_ids):
                            drug_id = drug_ids[drug_idx]
                            prot_id = protein_ids[prot_idx]

                            chunk_drugs.append(drugs_dict[drug_id])
                            chunk_prots.append(proteins_dict[prot_id])
                            chunk_affs.append(affinity_matrix[drug_idx, prot_idx])

                sample_pbar.close()

                # Batch encode sequences with progress
                if chunk_drugs:
                    encoded_drugs = encode_sequences_batch(
                        chunk_drugs, self.drug_vocab, self.max_drug_len,
                        desc=f"Encoding drugs (chunk {chunk_idx + 1})"
                    )
                    encoded_prots = encode_sequences_batch(
                        chunk_prots, self.prot_vocab, self.max_prot_len,
                        desc=f"Encoding proteins (chunk {chunk_idx + 1})"
                    )

                    drug_sequences.append(encoded_drugs)
                    prot_sequences.append(encoded_prots)
                    affinities.extend(chunk_affs)

                chunk_pbar.update(1)

            chunk_pbar.close()

            # Concatenate all chunks with progress
            if drug_sequences:
                drugs = np.concatenate(drug_sequences, axis=0)
                proteins = np.concatenate(prot_sequences, axis=0)
                affinities = np.array(affinities, dtype=np.float32)
            else:
                drugs = np.array([], dtype=np.int32).reshape(0, self.max_drug_len)
                proteins = np.array([], dtype=np.int32).reshape(0, self.max_prot_len)
                affinities = np.array([], dtype=np.float32)

            return drugs, proteins, affinities

        XD_train, XT_train, y_train = create_dataset_optimized(train_indices, "training")
        XD_test, XT_test, y_test = create_dataset_optimized(test_indices, "test")

        return ((XD_train, XT_train, y_train),
                (XD_test, XT_test, y_test),
                (len(self.drug_vocab) + 1, len(self.prot_vocab) + 1))

def build_improved_deepdta_model(drug_vocab_size, prot_vocab_size, max_drug_len, max_prot_len):
    EMBEDDING_DIM = 128
    DROPOUT_RATE = 0.1
    LEARNING_RATE = 0.001

    # Wide CNN filters
    FILTER_NUM_1 = 64
    FILTER_NUM_2 = 128
    FILTER_NUM_3 = 192

    DRUG_FILTER_LENS = [4, 6, 8]
    PROT_FILTER_LENS = [4, 8, 12]

    FC_NEURONS_1 = 1024
    FC_NEURONS_2 = 1024
    FC_NEURONS_3 = 512

    # Drug branch
    drug_input = layers.Input(shape=(max_drug_len,), name='drug_input', dtype='int32')
    drug_embedding = layers.Embedding(
        input_dim=drug_vocab_size,
        output_dim=EMBEDDING_DIM,
        name='drug_embedding'
    )(drug_input)

    drug_conv1 = layers.Conv1D(FILTER_NUM_1, DRUG_FILTER_LENS[0], padding='valid')(drug_embedding)
    drug_bn1 = layers.BatchNormalization()(drug_conv1)
    drug_act1 = layers.Activation('relu')(drug_bn1)

    drug_conv2 = layers.Conv1D(FILTER_NUM_2, DRUG_FILTER_LENS[1], padding='valid')(drug_act1)
    drug_bn2 = layers.BatchNormalization()(drug_conv2)
    drug_act2 = layers.Activation('relu')(drug_bn2)

    drug_conv3 = layers.Conv1D(FILTER_NUM_3, DRUG_FILTER_LENS[2], padding='valid')(drug_act2)
    drug_bn3 = layers.BatchNormalization()(drug_conv3)
    drug_act3 = layers.Activation('relu')(drug_bn3)

    drug_pool = layers.GlobalMaxPooling1D()(drug_act3)

    # Protein branch
    prot_input = layers.Input(shape=(max_prot_len,), name='protein_input', dtype='int32')
    prot_embedding = layers.Embedding(
        input_dim=prot_vocab_size,
        output_dim=EMBEDDING_DIM,
        name='protein_embedding'
    )(prot_input)

    prot_conv1 = layers.Conv1D(FILTER_NUM_1, PROT_FILTER_LENS[0], padding='valid')(prot_embedding)
    prot_bn1 = layers.BatchNormalization()(prot_conv1)
    prot_act1 = layers.Activation('relu')(prot_bn1)

    prot_conv2 = layers.Conv1D(FILTER_NUM_2, PROT_FILTER_LENS[1], padding='valid')(prot_act1)
    prot_bn2 = layers.BatchNormalization()(prot_conv2)
    prot_act2 = layers.Activation('relu')(prot_bn2)

    prot_conv3 = layers.Conv1D(FILTER_NUM_3, PROT_FILTER_LENS[2], padding='valid')(prot_act2)
    prot_bn3 = layers.BatchNormalization()(prot_conv3)
    prot_act3 = layers.Activation('relu')(prot_bn3)

    prot_pool = layers.GlobalMaxPooling1D()(prot_act3)

    # Merge branches
    combined = layers.Concatenate()([drug_pool, prot_pool])

    fc1 = layers.Dense(FC_NEURONS_1, activation='relu')(combined)
    fc1 = layers.Dropout(DROPOUT_RATE)(fc1)

    fc2 = layers.Dense(FC_NEURONS_2, activation='relu')(fc1)
    fc2 = layers.Dropout(DROPOUT_RATE)(fc2)

    fc3 = layers.Dense(FC_NEURONS_3, activation='relu')(fc2)

    output = layers.Dense(1, activation='linear')(fc3)

    model = Model(inputs=[drug_input, prot_input], outputs=output)

    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_squared_error'])

    return model



print("All functions and classes defined successfully!")
print("OptimizedKibaDataProcessor is now available!")
print("Ready to process data!")


All functions and classes defined successfully!
OptimizedKibaDataProcessor is now available!
Ready to process data!


In [2]:
#Section 2

import zipfile
import os

# Define the path to your zip file
zip_file_path = '/content/data.zip' # Replace with the actual path to your .zip file

# Define the directory where you want to extract the contents
# If the directory doesn't exist, it will be created.
extract_dir = '/content/' # You can change this to your desired extraction path

# Create the extraction directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Extract the zip file
try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print(f"Successfully extracted '{zip_file_path}' to '{extract_dir}'")
except FileNotFoundError:
    print(f"Error: The file '{zip_file_path}' was not found.")
except zipfile.BadZipFile:
    print(f"Error: '{zip_file_path}' is not a valid zip file.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

# Optional: List the contents of the extracted directory to verify
print("\nContents of the extracted directory:")
for item in os.listdir(extract_dir):
    print(os.path.join(extract_dir, item))

Successfully extracted '/content/data.zip' to '/content/'

Contents of the extracted directory:
/content/.config
/content/data.zip
/content/kiba
/content/New Data
/content/.ipynb_checkpoints
/content/davis
/content/sample_data


In [3]:
#Section 3

# Process the data
print("=== Starting Data Processing ===")

# Check if data path exists
if not os.path.exists(KIBA_DATA_PATH):
    print(f"Data path not found: {KIBA_DATA_PATH}")
    print("Please make sure you have uploaded and extracted the kiba folder.")
    print("Current directory contents:")
    for item in os.listdir('.'):
        print(f"  - {item}")
else:
    print(f"Data path found: {KIBA_DATA_PATH}")

    # Now OptimizedKibaDataProcessor should be available
    processor = OptimizedKibaDataProcessor(data_path=KIBA_DATA_PATH)

    processed_data = processor.process_data()
    if processed_data is None:
        print("Failed to process data. Please check your data files.")
    else:
        (XD_train, XT_train, y_train), (XD_test, XT_test, y_test), (drug_vocab_size, prot_vocab_size) = processed_data
        print("Data processing completed successfully!")
        print(f"Data shapes:")
        print(f"   XD_train: {XD_train.shape}, dtype: {XD_train.dtype}")
        print(f"   XT_train: {XT_train.shape}, dtype: {XT_train.dtype}")
        print(f"   y_train: {y_train.shape}, dtype: {y_train.dtype}")
        print(f"   XD_test: {XD_test.shape}, dtype: {XD_test.dtype}")
        print(f"   XT_test: {XT_test.shape}, dtype: {XT_test.dtype}")
        print(f"   y_test: {y_test.shape}, dtype: {y_test.dtype}")


=== Starting Data Processing ===
Data path found: ./kiba


Processing sequences: 100%|██████████| 2111/2111 [00:00<00:00, 792390.88seq/s]
Processing sequences: 100%|██████████| 229/229 [00:00<00:00, 94092.44seq/s]
Processing training chunks:   0%|          | 0/10 [00:00<?, ?chunk/s]
Chunk 1/10:   0%|          | 0/10000 [00:00<?, ?sample/s][A
                                                         [A
Encoding drugs (chunk 1): 100%|██████████| 10000/10000 [00:00<00:00, 236700.21seq/s]

Encoding proteins (chunk 1):   0%|          | 0/10000 [00:00<?, ?seq/s][A
Encoding proteins (chunk 1):  15%|█▌        | 1500/10000 [00:00<00:00, 13942.19seq/s][A
Encoding proteins (chunk 1):  29%|██▉       | 2933/10000 [00:00<00:00, 14161.85seq/s][A
Encoding proteins (chunk 1):  44%|████▍     | 4418/10000 [00:00<00:00, 14468.35seq/s][A
Encoding proteins (chunk 1):  59%|█████▊    | 5866/10000 [00:00<00:00, 14356.74seq/s][A
Encoding proteins (chunk 1):  73%|███████▎  | 7303/10000 [00:00<00:00, 14203.32seq/s][A
Encoding proteins (chunk 1): 100%|██████████| 1

Data processing completed successfully!
Data shapes:
   XD_train: (94603, 100), dtype: int32
   XT_train: (94603, 1000), dtype: int32
   y_train: (94603,), dtype: float32
   XD_test: (23651, 100), dtype: int32
   XT_test: (23651, 1000), dtype: int32
   y_test: (23651,), dtype: float32





In [5]:
#Section 4

# CORRECTED CONFIGURATION
REDUCE_DATASET = True
DATASET_FRACTION = 1
EPOCHS = 100
BATCH_SIZE = 256  # REDUCED for better stability
VALIDATION_SPLIT = 0.1

print("CORRECTED CONFIGURATION:")
print("=" * 50)
print(f"Batch size: {BATCH_SIZE}")
print(f"Epochs: {EPOCHS}")

# FIXED: Proper dataset reduction
if 'XD_train' in locals() and len(XD_train) > 1000:  # Only if we have full data

    # Use the ORIGINAL full dataset, not the already reduced one
    print(f"Using original dataset: {len(XD_train):,} training samples")

    # Calculate correct sample sizes
    n_train = int(len(XD_train) * DATASET_FRACTION)
    n_test = int(len(XD_test) * DATASET_FRACTION)

    print(f"Target training samples: {n_train:,}")
    print(f"Target test samples: {n_test:,}")

    # Randomly sample subset
    np.random.seed(42)  # For reproducibility
    train_indices = np.random.choice(len(XD_train), n_train, replace=False)
    test_indices = np.random.choice(len(XD_test), n_test, replace=False)

    XD_train = XD_train[train_indices]
    XT_train = XT_train[train_indices]
    y_train = y_train[train_indices]

    XD_test = XD_test[test_indices]
    XT_test = XT_test[test_indices]
    y_test = y_test[test_indices]

    # Calculate batches per epoch
    batches_per_epoch = len(XD_train) // BATCH_SIZE

    print(f"CORRECTED DATASET:")
    print(f"   Training: {len(XD_train):,} samples")
    print(f"   Test: {len(XD_test):,} samples")
    print(f"   Batches per epoch: {batches_per_epoch}")
    print(f"   Much more stable training expected!")

    # Use corrected datasets
    XD_train = XD_train
    XT_train = XT_train
    y_train = y_train
    XD_test = XD_test
    XT_test = XT_test
    y_test = y_test

    print("Ready for corrected training!")

elif 'XD_train' in locals():
    print(f"Current dataset already reduced: {len(XD_train):,} samples")
    print("Please restart and run data processing again for full dataset")

    # Still apply batch size fix for current data
    batches_per_epoch = len(XD_train) // BATCH_SIZE
    print(f"Batches per epoch with current data: {batches_per_epoch}")

else:
    print("Cannot apply optimizations - data not loaded yet!")


CORRECTED CONFIGURATION:
Batch size: 256
Epochs: 100
Using original dataset: 94,603 training samples
Target training samples: 94,603
Target test samples: 23,651
CORRECTED DATASET:
   Training: 94,603 samples
   Test: 23,651 samples
   Batches per epoch: 369
   Much more stable training expected!
Ready for corrected training!


In [7]:
#Section 5

# Build the model (only if data processing was successful)
if 'XD_train' in locals() and 'drug_vocab_size' in locals():
    print("\n=== Building Optimized Model ===")

    # Build the fast model
    fast_model = build_improved_deepdta_model(
        drug_vocab_size=drug_vocab_size,
        prot_vocab_size=prot_vocab_size,
        max_drug_len=processor.max_drug_len,
        max_prot_len=processor.max_prot_len
    )

    print("\n=== Model Summary ===")
    fast_model.summary()
    print("Model built successfully!")
else:
    print("Cannot build model - data processing failed or incomplete")





=== Building Optimized Model ===

=== Model Summary ===


Model built successfully!


In [8]:
#Section 6

# OPTIMIZED TRAINING
if 'fast_model' in locals():
    print("STARTING OPTIMIZED TRAINING ...")
    print("=" * 55)

    # Training configuration summary
    print(f"Configuration Summary:")
    print(f"   Batch size: {BATCH_SIZE}")
    print(f"   Epochs: {EPOCHS}")
    print(f"   Training samples: {len(XD_train):,}")
    print(f"   Test samples: {len(XD_test):,}")
    print(f"   Batches per epoch: {len(XD_train) // BATCH_SIZE}")

    # Optimized callbacks for the full dataset
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=15,  # More patience for larger dataset
            restore_best_weights=True,
            verbose=1
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=7,   # More patience for larger dataset
            min_lr=1e-6,
            verbose=1
        )
    ]

    print(f"\nStarting training...")
    print(f"Expected CI: 0.72-0.78 (significant improvement over 50%)")
    print(f"Expected total time: 1-3 minutes")
    print("=" * 55)

    import time
    start_time = time.time()

    # Train the model
    history = fast_model.fit(
        x=[XD_train, XT_train],
        y=y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=VALIDATION_SPLIT,
        callbacks=callbacks,
        verbose=1
    )

    end_time = time.time()
    training_time = end_time - start_time

    print(f"\nTRAINING COMPLETED!")
    print("=" * 30)
    print(f"Total training time: {training_time/60:.1f} minutes")
    print(f"Average per epoch: {training_time/len(history.history['loss']):.1f} seconds")
    print(f"Best validation loss: {min(history.history['val_loss']):.4f}")
    print(f"Final training loss: {history.history['loss'][-1]:.4f}")
    print(f"Total epochs completed: {len(history.history['loss'])}")

else:
    print("Cannot start training - model not built")

STARTING OPTIMIZED TRAINING ...
Configuration Summary:
   Batch size: 256
   Epochs: 100
   Training samples: 94,603
   Test samples: 23,651
   Batches per epoch: 369

Starting training...
Expected CI: 0.72-0.78 (significant improvement over 50%)
Expected total time: 1-3 minutes
Epoch 1/100
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 219ms/step - loss: 8.7894 - mean_squared_error: 8.7894 - val_loss: 38.9234 - val_mean_squared_error: 38.9234 - learning_rate: 0.0010
Epoch 2/100
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 165ms/step - loss: 0.6688 - mean_squared_error: 0.6688 - val_loss: 4.3845 - val_mean_squared_error: 4.3845 - learning_rate: 0.0010
Epoch 3/100
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 164ms/step - loss: 0.6112 - mean_squared_error: 0.6112 - val_loss: 0.4797 - val_mean_squared_error: 0.4797 - learning_rate: 0.0010
Epoch 4/100
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 166ms/s

In [9]:
#Section 7

# EVALUATION
if 'history' in locals():
    print("EVALUATING RESULTS...")
    print("=" * 45)

    # Make predictions
    print("Making predictions on test set...")
    y_pred_test = fast_model.predict([XD_test, XT_test], batch_size=BATCH_SIZE, verbose=1)

    # Calculate metrics
    print("Computing metrics...")
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_ci = concordance_index(y_test, y_pred_test)

    print(f"\n100% DATASET RESULTS:")
    print("=" * 30)
    print(f"Test MSE: {test_mse:.6f}")
    print(f"Test CI: {test_ci:.6f}")

    # Performance comparison across all datasets
    print(f"\nPERFORMANCE PROGRESSION:")
    print("-" * 35)
    print(f"   10% dataset: CI = 0.6552")
    print(f"   Full dataset: CI = {test_ci:.4f}")

    # Comparison with literature
    print(f"\nLiterature Comparison:")
    print("-" * 25)
    print(f"   Paper DeepDTA: 0.863")
    print(f"   Our 100% model: {test_ci:.3f}")
    print(f"   Performance ratio: {test_ci/0.863:.1%}")

    # Performance analysis
    if test_ci >= 0.72:
        print(f"\nEXCELLENT! CI ≥ 0.72 - Ready for full dataset!")
        print(f"Next step: Try Full dataset for CI ≈ 0.80+")
    elif test_ci >= 0.68:
        print(f"\nGOOD! CI ≥ 0.68 - Model scaling properly")
        print(f"Continue scaling to Full dataset")
    else:
        print(f"\nCI < 0.68 - May need model architecture improvements")

    # Additional metrics
    mae = np.mean(np.abs(y_test - y_pred_test.flatten()))
    rmse = np.sqrt(test_mse)
    r2 = 1 - np.var(y_test - y_pred_test.flatten()) / np.var(y_test)

    print(f"\nAdditional Metrics:")
    print("-" * 20)
    print(f"MAE: {mae:.6f}")
    print(f"RMSE: {rmse:.6f}")
    print(f"R² Score: {r2:.6f}")

    print("\n" + "="*45)
    print("Full DATASET EVALUATION COMPLETED!")
    print("="*45)

else:
    print("Cannot evaluate - training not completed")

EVALUATING RESULTS...
Making predictions on test set...
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 62ms/step
Computing metrics...

100% DATASET RESULTS:
Test MSE: 0.476595
Test CI: 0.726964

PERFORMANCE PROGRESSION:
-----------------------------------
   10% dataset: CI = 0.6552
   Full dataset: CI = 0.7270

Literature Comparison:
-------------------------
   Paper DeepDTA: 0.863
   Our 100% model: 0.727
   Performance ratio: 84.2%

EXCELLENT! CI ≥ 0.72 - Ready for full dataset!
Next step: Try Full dataset for CI ≈ 0.80+

Additional Metrics:
--------------------
MAE: 0.500620
RMSE: 0.690359
R² Score: 0.338513

Full DATASET EVALUATION COMPLETED!
