In [18]:
import sys
sys.path.append(r"scripts")
from functools import lru_cache
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
import os
import requests
import zipfile
import networkx as nx
import scripts
import torchmetrics
from torch import nn
import optuna
import models

from optuna.integration import TensorBoardCallback


In [19]:
@lru_cache(maxsize=None)
def get_data(n_fold=0, fp_radius=2):
    def download_if_not_present(url, filepath):
        if not os.path.exists(filepath):
            print(f"File not found at {filepath}. Downloading...")
            response = requests.get(url, stream=True)
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            print("Download completed.")
        else:
            print(f"File already exists at {filepath}.")

    # Download RNA-seq data
    zip_url = "https://cog.sanger.ac.uk/cmp/download/rnaseq_all_20220624.zip"
    zip_filepath = "data/rnaseq.zip"
    rnaseq_filepath = "data/rnaseq_normcount.csv"
    if not os.path.exists(rnaseq_filepath):
        download_if_not_present(zip_url, zip_filepath)
        with zipfile.ZipFile(zip_filepath, "r") as zipf:
            zipf.extractall("data/")
    rnaseq = pd.read_csv(rnaseq_filepath, index_col=0)

    # Load gene network, hierarchies, and driver genes
    hierarchies = pd.read_csv("data/gene_to_pathway_final_with_hierarchy.csv")
    driver_genes = pd.read_csv("data/driver_genes_2.csv").loc[:, "gene"].dropna()
    gene_network = nx.read_edgelist("data/filtered_gene_network.edgelist", nodetype=str)
    ensembl_to_hgnc = dict(zip(hierarchies['Ensembl_ID'], hierarchies['HGNC']))
    mapped_gene_network = nx.relabel_nodes(gene_network, ensembl_to_hgnc)

    # Create edge tensors for the graph
    edges_df = pd.DataFrame(
        list(mapped_gene_network.edges(data="weight")),
        columns=["source", "target", "weight"]
    )
    edges_df["weight"] = edges_df["weight"].fillna(1.0).astype(float)
    valid_nodes = rnaseq.columns.intersection(hierarchies["HGNC"])
    filtered_edges = edges_df[edges_df["source"].isin(valid_nodes) & edges_df["target"].isin(valid_nodes)]
    node_to_idx = {node: idx for idx, node in enumerate(valid_nodes)}
    edge_index = torch.tensor(
        filtered_edges[["source", "target"]].replace(node_to_idx).values.T,
        dtype=torch.long
    )
    edge_attr = torch.tensor(filtered_edges["weight"].values, dtype=torch.float32)

    # Prepare RNA-seq data for graph construction
    driver_columns = rnaseq.columns.isin(driver_genes)
    filtered_rna = rnaseq.loc[:, driver_columns]
    pathway_dict = {gene: pathway.split(':')[1].split('[')[0].strip()
                    for gene, pathway in zip(hierarchies["HGNC"], hierarchies["Level_1"])
                    if isinstance(pathway, str)}
    pathway_groups = {
        pathway: torch.tensor([node_to_idx[gene] for gene in genes if gene in node_to_idx])
        for pathway, genes in pathway_dict.items()
    }

    # Create cell-line graphs
    tensor_exp = torch.tensor(filtered_rna.to_numpy())
    cell_dict = {cell: Data(
        x=tensor_exp[i].unsqueeze(1),
        edge_index=edge_index,
        edge_attr=edge_attr,
        y=None,
        cell_line=cell
    ) for i, cell in enumerate(filtered_rna.index)}

    # Load drug data
    smile_dict = pd.read_csv("data/smiles.csv", index_col=0)
    fp = scripts.FingerprintFeaturizer(R=fp_radius)
    drug_dict = fp(smile_dict.iloc[:, 1], smile_dict.iloc[:, 0])

    # Load IC50 data
    data = pd.read_csv("data/GDSC1.csv", index_col=0)
    data = data.query("SANGER_MODEL_ID in @cell_dict.keys() & DRUG_ID in @drug_dict.keys()")

    # Split data into folds
    unique_cell_lines = data["SANGER_MODEL_ID"].unique()
    np.random.seed(420)
    np.random.shuffle(unique_cell_lines)
    folds = np.array_split(unique_cell_lines, 10)
    train_idxs = list(range(10))
    train_idxs.remove(n_fold)
    validation_idx = np.random.choice(train_idxs)
    train_idxs.remove(validation_idx)
    train_lines = np.concatenate([folds[idx] for idx in train_idxs])
    validation_lines = folds[validation_idx]
    test_lines = folds[n_fold]

    train_data = data.query("SANGER_MODEL_ID in @train_lines")
    validation_data = data.query("SANGER_MODEL_ID in @validation_lines")
    test_data = data.query("SANGER_MODEL_ID in @test_lines")

    # Build datasets
    train_dataset = scripts.OmicsDataset(cell_dict, drug_dict, train_data)
    validation_dataset = scripts.OmicsDataset(cell_dict, drug_dict, validation_data)
    test_dataset = scripts.OmicsDataset(cell_dict, drug_dict, test_data)

    return train_dataset, validation_dataset, test_dataset, pathway_groups

In [20]:
train_data, val_data, test_data, pathway_groups=get_data(0)

  filtered_edges[["source", "target"]].replace(node_to_idx).values.T,


AttributeError: module 'scripts' has no attribute 'FingerprintFeaturizer'

In [None]:
config = {
    "gnn": {
        "input_dim": 100,  # Number of node features
        "hidden_dim": 128,  # Hidden dimension for GNN layers
        "output_dim": 128,  # Output dimension of GNN
        "pathway_groups": pathway_groups,  # Pathway groups from your data
        "layer_modes": [False, False, True],  # Pathway-specific message passing in the last layer
        "pooling_mode": "pathway",  # Use pathway-specific pooling
        "aggr_modes": ["sum", "sum", "sum"],  # Aggregation types for GNN layers
    },
    "drug": {
        "input_dim": 256,  # Dimension of raw drug features
        "embed_dim": 128,  # Embedding dimension (must match GNN output)
    },
    "resnet": {
        "hidden_dim": 1024,  # Hidden dimension of ResNet layers
        "n_layers": 6,  # Number of ResNet layers
        "dropout": 0.1,  # Dropout rate
    },
    "optimizer": {
        "learning_rate": 1e-3,  # Learning rate
        "batch_size": 32,  # Batch size
        "clip_norm": 1.0,  # Gradient clipping norm
        "stopping_patience": 10,  # Patience for early stopping
        "use_momentum": True,  # Use momentum in optimization
    },
    "env": {
        "device": "cuda" if torch.cuda.is_available() else "cpu",  # Use GPU if available
        "max_epochs": 50,  # Maximum number of epochs
    },
}

In [21]:
gnn_model = ModularGNN(**config["gnn"])
drug_mlp = DrugMLP(input_dim=config["drug"]["input_dim"], embed_dim=config["drug"]["embed_dim"])
resnet = ResNet(embed_dim=config["drug"]["embed_dim"], hidden_dim=config["resnet"]["hidden_dim"], 
                n_layers=config["resnet"]["n_layers"], dropout=config["resnet"]["dropout"])

# Create the combined model
combined_model = CombinedModel(gnn=gnn_model, drug_mlp=drug_mlp, resnet=resnet)

# Move model to device
device = torch.device(config["env"]["device"])
combined_model.to(device)

NameError: name 'ModularGNN' is not defined

In [None]:
# Assuming train_data is a PyTorch dataset
single_instance = train_data[0]  # Extract the first instance

# Unpack the instance
cell_graph, drug_vector, target_value = single_instance

# Move data to the appropriate device
cell_graph = cell_graph.to(device)
drug_vector = drug_vector.to(device)

# Forward pass through the model
with torch.no_grad():
    output = combined_model(cell_graph, drug_vector)

# Print the result
print("Predicted output:", output.item())
print("True target value:", target_value)