In [1]:
import sys
sys.path.append(r"scripts")
from functools import lru_cache
import pandas as pd
import numpy as np
import torch
import torch

# Check if PyTorch is installed
print(f"PyTorch version: {torch.__version__}")
import tensorflow as tf
print(f"Num GPUs Available: {len(tf.config.experimental.list_physical_devices('GPU'))}")
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)
from torch_geometric.data import Data
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import os
import requests
import zipfile
import networkx as nx
import scripts
from scripts import *
import torchmetrics
from torch import nn
import optuna
import models
from optuna.integration import TensorBoardCallback
from model_GNN import ModularPathwayConv, ModularGNN
torch.set_printoptions(threshold=torch.inf)
from model_ResNet import CombinedModel, ResNet, DrugMLP  
from torch_geometric.loader import DataLoader
from torch_geometric.data import Batch
from copy import deepcopy
import itertools
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1e6} MB")
print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1e6} MB")
import uuid
import torch.multiprocessing as mp
import torch.distributed as dist





from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import  DistributedSampler
from tqdm import tqdm  # Import tqdm for the progress bars





PyTorch version: 2.5.1


2024-12-16 17:45:53.086533: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734367553.100591 3118152 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734367553.104965 3118152 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-16 17:45:53.121319: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available: 2
CUDA Available: True
CUDA Version: 12.4
Memory allocated: 0.0 MB
Max memory allocated: 0.0 MB


In [2]:
@lru_cache(maxsize=None)
def get_data(n_fold=0, fp_radius=2):
    """Download, process, and prepare data for use in graph-based machine learning models."""
    import os
    import zipfile
    import requests
    import torch
    import pandas as pd
    import numpy as np
    import networkx as nx
    from torch_geometric.data import Data
    import scripts  # Assuming scripts has required functions

    def download_if_not_present(url, filepath):
        """Download a file from a URL if it does not exist locally."""
        if not os.path.exists(filepath):
            print(f"File not found at {filepath}. Downloading...")
            response = requests.get(url, stream=True)
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            print("Download completed.")
        else:
            print(f"File already exists at {filepath}.")

    # Step 1: Download and load RNA-seq data
    zip_url = "https://cog.sanger.ac.uk/cmp/download/rnaseq_all_20220624.zip"
    zip_filepath = "data/rnaseq.zip"
    rnaseq_filepath = "data/rnaseq_normcount.csv"
    if not os.path.exists(rnaseq_filepath):
        download_if_not_present(zip_url, zip_filepath)
        with zipfile.ZipFile(zip_filepath, "r") as zipf:
            zipf.extractall("data/")
    rnaseq = pd.read_csv(rnaseq_filepath, index_col=0)

    # Step 2: Load gene network, hierarchies, and driver genes
    hierarchies = pd.read_csv("data/gene_to_pathway_final_with_hierarchy.csv")
    driver_genes = pd.read_csv("data/driver_genes_2.csv")['gene'].dropna()
    gene_network = nx.read_edgelist("data/filtered_gene_network.edgelist", nodetype=str)
    ensembl_to_hgnc = dict(zip(hierarchies['Ensembl_ID'], hierarchies['HGNC']))
    mapped_gene_network = nx.relabel_nodes(gene_network, ensembl_to_hgnc)

    # Step 3: Filter RNA-seq data and identify valid nodes
    driver_columns = rnaseq.columns.isin(driver_genes)
    filtered_rna = rnaseq.loc[:, driver_columns]
    valid_nodes = set(filtered_rna.columns)  # Get valid nodes after filtering RNA-seq columns

    # Step 4: Create edge tensors for the graph
    edges_df = pd.DataFrame(
        list(mapped_gene_network.edges(data="weight")),
        columns=["source", "target", "weight"]
    )
    edges_df["weight"] = edges_df["weight"].fillna(1.0).astype(float)
    filtered_edges = edges_df[
        (edges_df["source"].isin(valid_nodes)) & (edges_df["target"].isin(valid_nodes))
    ]
    node_to_idx = {node: idx for idx, node in enumerate(valid_nodes)}
    filtered_edges["source_idx"] = filtered_edges["source"].map(node_to_idx)
    filtered_edges["target_idx"] = filtered_edges["target"].map(node_to_idx)
    edge_index = torch.tensor(filtered_edges[["source_idx", "target_idx"]].values.T, dtype=torch.long)
    edge_attr = torch.tensor(filtered_edges["weight"].values, dtype=torch.float32)

    # Step 5: Process the hierarchy to create pathway groups
    filtered_hierarchy = hierarchies[hierarchies["HGNC"].isin(valid_nodes)]
    pathway_dict = {
        gene: pathway.split(':', 1)[1].split('[', 1)[0].strip() if isinstance(pathway, str) and ':' in pathway else None
        for gene, pathway in zip(filtered_hierarchy['HGNC'], filtered_hierarchy['Level_1'])
    }
    grouped_pathway_dict = {}
    for gene, pathway in pathway_dict.items():
        if pathway:
            grouped_pathway_dict.setdefault(pathway, []).append(gene)
    pathway_groups = {
        pathway: [node_to_idx[gene] for gene in genes if gene in node_to_idx]
        for pathway, genes in grouped_pathway_dict.items()
    }
    # Convert to padded tensor
    pathway_tensors = pad_sequence(
        [torch.tensor(indices, dtype=torch.long) for indices in pathway_groups.values()], 
        batch_first=True, 
        padding_value=-1  # Use -1 as padding
    )

    # Step 6: Create cell-line graphs
    tensor_exp = torch.tensor(filtered_rna.to_numpy())
    cell_dict = {cell: tensor_exp[i] for i, cell in enumerate(filtered_rna.index.to_numpy())}
    graph_data_list = {}
    for cell, x in cell_dict.items():
        if x.ndim == 2 and x.shape[0] == 1:
            x = x.T
        elif x.ndim == 1:
            x = x.unsqueeze(1)
        graph_data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
        graph_data.y = None
        graph_data.cell_line = cell
        graph_data_list[cell] = graph_data

    # Step 7: Load drug data
    smile_dict = pd.read_csv("data/smiles.csv", index_col=0)
    fp = scripts.FingerprintFeaturizer(R=fp_radius)
    drug_dict = fp(smile_dict.iloc[:, 1], smile_dict.iloc[:, 0])

    # Step 8: Load IC50 data and filter for valid cell lines and drugs
    data = pd.read_csv("data/GDSC1.csv", index_col=0)
    data = data.query("SANGER_MODEL_ID in @cell_dict.keys() & DRUG_ID in @drug_dict.keys()")

    # Step 9: Split the data into folds for cross-validation
    unique_cell_lines = data["SANGER_MODEL_ID"].unique()
    np.random.seed(420)
    np.random.shuffle(unique_cell_lines)
    folds = np.array_split(unique_cell_lines, 10)
    train_idxs = list(range(10))
    train_idxs.remove(n_fold)
    validation_idx = np.random.choice(train_idxs)
    train_idxs.remove(validation_idx)
    train_lines = np.concatenate([folds[idx] for idx in train_idxs])
    validation_lines = folds[validation_idx]
    test_lines = folds[n_fold]

    train_data = data.query("SANGER_MODEL_ID in @train_lines")
    validation_data = data.query("SANGER_MODEL_ID in @validation_lines")
    test_data = data.query("SANGER_MODEL_ID in @test_lines")

    # Step 10: Build the datasets for training, validation, and testing
    train_dataset = scripts.OmicsDataset(graph_data_list, drug_dict, train_data)
    validation_dataset = scripts.OmicsDataset(graph_data_list, drug_dict, validation_data)
    test_dataset = scripts.OmicsDataset(graph_data_list, drug_dict, test_data)

    return train_dataset, validation_dataset, test_dataset, pathway_tensors


In [3]:
train_data, val_data, test_data, pathway_groups=get_data(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_edges["source_idx"] = filtered_edges["source"].map(node_to_idx)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_edges["target_idx"] = filtered_edges["target"].map(node_to_idx)


In [4]:
def custom_collate_fn(batch):
    try:
        cell_graphs = [item[0] for item in batch if item[0] is not None]
        if len(cell_graphs) == 0:
            raise ValueError("No graphs to batch in this batch. Batch might be empty or contains None.")
        
        drug_vectors = torch.stack([item[1] for item in batch if item[1] is not None])  # Stack drug vectors
        targets = torch.stack([item[2] for item in batch if item[2] is not None])  # Stack target values
        cell_ids = torch.stack([item[3] for item in batch if item[3] is not None])  # Stack cell IDs
        drug_ids = torch.stack([item[4] for item in batch if item[4] is not None])  # Stack drug IDs

        # Batch the PyG graphs into a single DataBatch
        cell_graph_batch = Batch.from_data_list(cell_graphs)
        return cell_graph_batch, drug_vectors, targets, cell_ids, drug_ids
    
    except Exception as e:
        print(f"Error in custom_collate_fn: {e}")
        print(f"Batch contents: {batch}")
        raise e

In [5]:
#gnn_model = ModularGNN(**config["gnn"])
#drug_mlp = DrugMLP(input_dim=config["drug"]["input_dim"], embed_dim=config["drug"]["embed_dim"])
#resnet = ResNet(embed_dim=config["drug"]["embed_dim"], hidden_dim=config["resnet"]["hidden_dim"], 
#                n_layers=config["resnet"]["n_layers"], dropout=config["resnet"]["dropout"])
#
#combined_model = CombinedModel(gnn=gnn_model, drug_mlp=drug_mlp, resnet=resnet)
#
#device = torch.device(config["env"]["device"])
#combined_model.to(device)
#print(pathway_groups)

In [6]:
config = {
    "gnn": {
        "input_dim": 1,
        "hidden_dim": 128,
        "output_dim": 1,
        "pathway_groups": pathway_groups,  # Specify pathway groups if required
        "layer_modes": [True, True, True],
        "pooling_mode": "pathway",
        "aggr_modes": ["mean", "mean", "mean"],
        "num_pathways_per_instance": 44
    },
    "resnet": {
        "hidden_dim": 44,
        "n_layers": 6,
        "dropout": 0.1,
    },
    "drug": {
        "input_dim": 2048,
        "embed_dim": 44
    },
    "optimizer": {
        "learning_rate": 1e-3,
        "batch_size": 16,
        "clip_norm": 1.0,
        "stopping_patience": 10,
    },
    "env": {
        "device": f"cuda:{0}" if torch.cuda.is_available() else "cpu",  # Use GPU 0
        "max_epochs": 50,
        "world_size": 1,  # Only one GPU
        "rank": 0,  # Only one process
        "local_rank": 0,  # Only one GPU, so local rank is 0
        "master_addr": "localhost",  # Master node address
        "master_port": "12345",  # Communication port
    }
}

In [7]:
R, model = scripts.train_model(config, train_data, val_data)
metrics = torchmetrics.MetricTracker(torchmetrics.MetricCollection({ "R_cellwise_residuals": torchmetrics.PearsonCorrCoef(num_outputs=1), "R_cellwise": torchmetrics.PearsonCorrCoef(num_outputs=1), "MSE": torchmetrics.MeanSquaredError() }))

device = torch.device(config["env"]["device"])
metrics.to(device)

test_dataloader = DataLoader( dataset=test_data, batch_size=config["optimizer"]["batch_size"], shuffle=True, drop_last=True, collate_fn=scripts.custom_collate_fn
)

final_metrics = scripts.evaluate_step(model, test_dataloader, metrics, device)

print("\n\n=== Final Test Metrics ===") 
print(final_metrics)

number of GPUs:2
Lazy layers initialized successfully with a real batch instance.
Running on device: cuda:0




Batch 1, Loss: 107946442752.0000
Batch 2, Loss: 85221687296.0000
Batch 3, Loss: 41068732416.0000
Batch 4, Loss: 13269834752.0000
Batch 5, Loss: 18368745472.0000
Batch 6, Loss: 4297502208.0000
Batch 7, Loss: 3815027968.0000
Batch 8, Loss: 6247688192.0000
Batch 9, Loss: 9071473664.0000
Batch 10, Loss: 4265682432.0000
Batch 11, Loss: 76128144.0000
Batch 12, Loss: 5389819904.0000
Batch 13, Loss: 9044699136.0000
Batch 14, Loss: 3945611264.0000
Batch 15, Loss: 2737492992.0000
Batch 16, Loss: 439898368.0000
Batch 17, Loss: 2187254784.0000
Batch 18, Loss: 3293343744.0000
Batch 19, Loss: 1048024192.0000
Batch 20, Loss: 86713744.0000
Batch 21, Loss: 637604224.0000
Batch 22, Loss: 173974464.0000
Batch 23, Loss: 426823424.0000
Batch 24, Loss: 819325440.0000
Batch 25, Loss: 808972544.0000


KeyboardInterrupt: 