<a href="https://colab.research.google.com/github/alirezafarhadi01/DrugDesignCourse-FinalProject/blob/main/DeepDTA_Datasets_Version_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import zipfile
import os

# Archive to unpack -- point this at the actual location of your .zip file.
zip_file_path = '/content/data.zip'

# Destination folder for the unpacked contents; change it to extract elsewhere.
extract_dir = '/content/'

# Make sure the destination exists before extracting into it.
os.makedirs(extract_dir, exist_ok=True)

# Unpack the archive. Failures are reported with a message rather than raised,
# so execution always continues to the directory listing below.
try:
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        archive.extractall(extract_dir)
    print(f"Successfully extracted '{zip_file_path}' to '{extract_dir}'")
except FileNotFoundError:
    print(f"Error: The file '{zip_file_path}' was not found.")
except zipfile.BadZipFile:
    print(f"Error: '{zip_file_path}' is not a valid zip file.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

# Optional sanity check: show every entry now present in the destination.
print("\nContents of the extracted directory:")
for full_path in (os.path.join(extract_dir, name) for name in os.listdir(extract_dir)):
    print(full_path)

Successfully extracted '/content/data.zip' to '/content/'

Contents of the extracted directory:
/content/.config
/content/data.zip
/content/kiba
/content/davis
/content/sample_data


In [None]:
import os, json, numpy as np, pickle, gzip

# Candidate locations for the KIBA affinity file "Y". first_existing() probes
# them in list order and uses the first one found on disk: the bare file name
# first, then the same name with NumPy / pickle / gzip style extensions.
candidate_paths = [
    "/content/kiba/Y",
    "/content/kiba/Y.npy",
    "/content/kiba/Y.pkl",
    "/content/kiba/Y.pickle",
    "/content/kiba/Y.npz",
    "/content/kiba/Y.gz",
]

def first_existing(paths):
    """Return the first entry of ``paths`` that exists on disk.

    Args:
        paths: Iterable of filesystem paths, probed in iteration order.

    Returns:
        The first path for which ``os.path.exists`` is true.

    Raises:
        FileNotFoundError: If none of the paths exists (or ``paths`` is empty).
    """
    # filter() is lazy, so probing stops at the first hit.
    found = next(filter(os.path.exists, paths), None)
    if found is None:
        raise FileNotFoundError("Could not find Y file in expected locations.")
    return found

def load_affinity_matrix(path):
    """Load the affinity matrix stored at ``path``, trying several formats.

    The loaders are attempted in this order and the first one that succeeds
    wins:

    1. ``np.load`` (``.npy`` / ``.npz``; with ``allow_pickle=True`` it can also
       read some raw pickles),
    2. ``pickle.load`` with the default encoding,
    3. ``pickle.load`` with ``encoding="latin1"`` (Python 2 pickles),
    4. a gzip-compressed pickle, also decoded as ``latin1``.

    SECURITY: every loader here may unpickle data, and unpickling can execute
    arbitrary code. Only call this on files from a trusted source.

    Args:
        path: Filesystem path of the file to load.

    Returns:
        An ``np.ndarray`` when ``np.load`` succeeds (for ``.npz`` archives the
        ``arr_0`` entry, or the first entry if ``arr_0`` is absent); otherwise
        whatever object the pickle contained.

    Raises:
        RuntimeError: If every loader fails. The message lists the error from
            each attempt, and the last underlying exception is chained.
    """
    failures = []  # (loader label, exception) for every attempt that raised

    # 1) NumPy loader (handles .npy/.npz and sometimes raw pickles).
    try:
        obj = np.load(path, allow_pickle=True)  # NOTE: may unpickle -- trusted files only
        if isinstance(obj, np.lib.npyio.NpzFile):
            # An NpzFile keeps the archive open; read the array, then close it.
            with obj:
                key = 'arr_0' if 'arr_0' in obj.files else obj.files[0]
                return obj[key]
        if isinstance(obj, np.ndarray):
            return obj
        # Anything else (e.g. a pickled list) falls through to the pickle loaders.
    except Exception as exc:
        failures.append(("np.load", exc))

    # 2) plain pickle, 3) pickle decoded as latin1 (py2 pickles), 4) gzip+pickle.
    pickle_attempts = (
        ("pickle", open, {}),
        ("pickle(latin1)", open, {"encoding": "latin1"}),
        ("gzip+pickle(latin1)", gzip.open, {"encoding": "latin1"}),
    )
    for label, opener, load_kwargs in pickle_attempts:
        try:
            with opener(path, "rb") as f:
                return pickle.load(f, **load_kwargs)  # NOTE: trusted files only
        except Exception as exc:
            failures.append((label, exc))

    # Report every failure, not just the last one, so the real cause
    # (missing file, wrong format, ...) is visible to the caller.
    details = "; ".join(f"{label}: {exc}" for label, exc in failures)
    raise RuntimeError(
        f"Failed to load affinity matrix from {path}: {details}"
    ) from failures[-1][1]

# Locate the affinity file among the candidates and load it.
y_path = first_existing(candidate_paths)
affinity_matrix = load_affinity_matrix(y_path)

# The loader may hand back a non-array object; coerce it to an ndarray.
if not isinstance(affinity_matrix, np.ndarray):
    affinity_matrix = np.array(affinity_matrix)

# Everything below assumes a 2-D matrix.
if affinity_matrix.ndim != 2:
    raise ValueError(f"Unexpected affinity matrix shape: {affinity_matrix.shape}")

# Entries that are not NaN count as valid pairs.
not_nan_mask = ~np.isnan(affinity_matrix)
valid_indices = np.where(not_nan_mask)
total_valid_pairs = int(valid_indices[0].size)
print(f"Detected Y at: {y_path}")
print(f"Total valid pairs: {total_valid_pairs}")

# The split file is written to /content/New Data (created if needed).
save_dir = "/content/New Data"
os.makedirs(save_dir, exist_ok=True)
split_path = os.path.join(save_dir, "kiba_split.json")

# Shuffle the positions 0..total_valid_pairs-1 with a fixed seed and cut
# them 80% / 20% into train and test.
rng = np.random.default_rng(42)
all_indices = np.arange(total_valid_pairs)
rng.shuffle(all_indices)
split_point = int(0.8 * total_valid_pairs)
train_indices, test_indices = np.split(all_indices, [split_point])

# Persist the split together with the metadata needed to reproduce it.
split = {
    "train_indices": train_indices.tolist(),
    "test_indices": test_indices.tolist(),
    "total_valid_pairs": total_valid_pairs,
    "seed": 42,
    "y_path_used": y_path,
}
with open(split_path, "w") as split_file:
    json.dump(split, split_file)

print(f"Train size: {len(train_indices)}, Test size: {len(test_indices)}")
print(f"Saved JSON: {split_path}")


Detected Y at: /content/kiba/Y
Total valid pairs: 118254
Train size: 94603, Test size: 23651
Saved JSON: /content/New Data/kiba_split.json


In [None]:
import json, numpy as np, os

# Read the persisted split back from disk.
split_path = "/content/New Data/kiba_split.json"
with open(split_path, "r") as split_file:
    split = json.load(split_file)

# JSON stores plain lists; rebuild each one as an integer ndarray.
train_indices, test_indices = (
    np.array(split[key], dtype=int) for key in ("train_indices", "test_indices")
)

print(f"Loaded split -> Train: {len(train_indices)}, Test: {len(test_indices)}")


Loaded split -> Train: 94603, Test: 23651
