In [1]:
# %%
# === Step 1: setup and initialisation ===
# Load IPython's autoreload extension and switch it to mode 2, so that edits
# to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
# Import system libraries and project modules.
import os
import sys
import torch
import scanpy as sc
import numpy as np
import pandas as pd

# Make sure the current directory is on the Python path so the project
# modules below can be imported.
if '.' not in sys.path:
    sys.path.insert(0, '.')

# Import all of the project's modules.
# NOTE(review): the star imports hide where names such as load_tahoe_data or
# PPI_BUILD_METHOD come from; consider explicit imports once the exported
# names of each module are confirmed.
from config import *
from data_preprocessing import *
from graph_construction import *
from model import CellLineKGModel
from train import train_model
from test import evaluate_model

print("✅ 所有模块导入成功，autoreload 已启用！")


✅ 所有模块导入成功，autoreload 已启用！


In [3]:
# %%
print("🚀 步骤 1: 加载 Tahoe 数据...")

# Location of the Tahoe dataset (adjust to your environment).
TAHOE_GCS_PATH = "gs://arc-ctc-tahoe100/2025-02-25/tutorial/plate3_2k-obs.h5ad"

# ========== Development ==========
# Read only 1000 cells to keep iteration fast.
adata = load_tahoe_data(TAHOE_GCS_PATH, n_cells=1000)

# ========== Full run ==========
# adata = load_tahoe_data(TAHOE_GCS_PATH)  # read the full dataset

# This print used to be fused onto the end of the comment line above, so it
# never executed; it is now a statement of its own.
print(f"数据加载完成: {adata.shape} (cells x genes)")


🚀 步骤 1: 加载 Tahoe 数据...
⚠️  [DEBUG] Subsetting to first 1000 cells.
✅ Loaded data: 1000 cells, 62710 genes.


In [4]:
# %%
# === Step 2: load and preprocess the Tahoe data ===
# NOTE(review): this cell rebinds `adata` to the full dataset, discarding the
# 1000-cell development subset loaded by the previous cell. Confirm which of
# the two cells is meant to be run.
print("🚀 步骤 1: 加载 Tahoe 数据...")

# Location of the Tahoe dataset (adjust to your environment).
TAHOE_GCS_PATH = "gs://arc-ctc-tahoe100/2025-02-25/tutorial/plate3_2k-obs.h5ad"

# Load the data (no n_cells limit).
adata = load_tahoe_data(TAHOE_GCS_PATH)
print(f"数据加载完成: {adata.shape} (cells x genes)")


🚀 步骤 1: 加载 Tahoe 数据...
✅ Loaded data: 2000 cells, 62710 genes.
数据加载完成: (2000, 62710) (cells x genes)


In [5]:
# === Step 3: filter protein nodes ===
print("🧬 步骤 2: 过滤蛋白节点 (基于表达量)...")

# Filter genes using the threshold from the project configuration.
# `filtered_genes` is reused by every later edge-building cell.
filtered_genes = filter_proteins_by_expression(adata, min_cell_lines=MIN_CELL_LINES_FOR_TOP80)
print(f"过滤后保留基因数: {len(filtered_genes)}")


🧬 步骤 2: 过滤蛋白节点 (基于表达量)...
Filtered from 62710 to 20602 genes.
过滤后保留基因数: 20602


In [6]:
# === Step 4: build the edge sets ===
print("🔗 步骤 3: 构建异质图的边...")

# 1. Build Protein-Protein (PPI) edges.
print(f"  → 构建 PPI 边 (方法: {PPI_BUILD_METHOD})...")
ppi_edges = build_protein_protein_edges(adata, filtered_genes)

# Original author's note: the downstream compute_coexpression_ppi function
# (not shown here) is O(n²). With n = 20602 the number of gene pairs is
# 20602 × 20601 / 2 = 212,210,901.
# The recorded run of this cell ended in a KeyboardInterrupt.

🔗 步骤 3: 构建异质图的边...
  → 构建 PPI 边 (方法: coexpression)...


KeyboardInterrupt: 

In [None]:
# 2. Build Drug-Protein edges (sample data; replace with real DTI).
print("  → 构建 Drug-Protein 边...")
# Placeholder (drug, target gene) pairs. In a real project these should be
# loaded from PINNACLE/ZINC.
drug_protein_edges = [
    ('Everolimus', 'MTOR'),
    ('Infigratinib', 'FGFR1'),
    ('Paclitaxel', 'TUBB1')
]
# Unique drug names in first-appearance order. The position of a drug in this
# list is later used as its row in `drug_features` and as its node id via
# `drugs.index(...)`, so the order must be stable across runs. list(set(...))
# is not: string hashing is randomised per interpreter process.
drugs = list(dict.fromkeys(drug for drug, _ in drug_protein_edges))


In [None]:
# 3. Build CellLine-Protein edges.
# Each edge is a (cell_line, gene, mean_expression) tuple, emitted when the
# mean of adata.X over that cell line's cells exceeds EXPR_Z_THRESHOLD.
# NOTE(review): the threshold's name suggests a z-score, but the value compared
# here is the plain per-gene mean of adata.X — confirm X is already scaled.
print("  → 构建 CellLine-Protein 边...")
cell_lines = adata.obs['cell_name'].unique().tolist()
cell_line_protein_edges = []
for cell_line in cell_lines:
    cell_mask = adata.obs['cell_name'] == cell_line
    if not cell_mask.any():
        # No cell carries this label; nothing to average.
        continue
    # Expression restricted to this cell line's cells and the filtered genes.
    line_matrix = adata[cell_mask, filtered_genes].X
    # Objects exposing toarray() (e.g. sparse matrices) are densified first.
    dense_matrix = line_matrix.toarray() if hasattr(line_matrix, "toarray") else line_matrix
    mean_expr = np.mean(dense_matrix, axis=0)
    cell_line_protein_edges.extend(
        (cell_line, gene, float(level))
        for gene, level in zip(filtered_genes, mean_expr)
        if level > EXPR_Z_THRESHOLD
    )



In [None]:
# 4. Build Disease-Protein edges.
# For each disease, keep the (disease, gene) pairs whose gene survived the
# expression filter.
print("  → 构建 Disease-Protein 边...")
diseases = ['Breast Cancer', 'Lung Cancer']  # sample diseases
# Build the lookup once: testing membership against a long sequence inside the
# nested loop would rescan it for every candidate gene.
# Assumes gene identifiers are hashable (e.g. strings) — TODO confirm.
filtered_gene_lookup = set(filtered_genes)
disease_protein_edges = []
for disease in diseases:
    disease_genes = get_disease_genes(disease, top_k=DISEASE_PROTEIN_TOP_K)
    for gene in disease_genes:
        if gene in filtered_gene_lookup:
            disease_protein_edges.append((disease, gene))



In [None]:
# 5. Build Disease-CellLine edges.
# The construction method is taken from the DISEASE_CELL_BUILD_METHOD setting.
print(f"  → 构建 Disease-CellLine 边 (方法: {DISEASE_CELL_BUILD_METHOD})...")
disease_cell_line_edges = build_disease_cell_line_edges(
    diseases, cell_lines, adata, method=DISEASE_CELL_BUILD_METHOD
)


In [None]:
# === Step 5: build the heterogeneous graph ===
# Combines the four node lists and five edge lists produced by the previous
# cells into a single graph object `hg`.
print("📊 步骤 4: 构建 CellLineBioHG 异质图...")

hg = create_cell_line_bio_hg(
    proteins=filtered_genes,
    drugs=drugs,
    cell_lines=cell_lines,
    diseases=diseases,
    drug_protein_edges=drug_protein_edges,
    protein_protein_edges=ppi_edges,
    cell_line_protein_edges=cell_line_protein_edges,
    disease_protein_edges=disease_protein_edges,
    disease_cell_line_edges=disease_cell_line_edges
)

# Report the total node and edge counts of the resulting graph.
print(f"✅ 图构建完成: {hg.number_of_nodes()} 个节点, {hg.number_of_edges()} 条边")


In [None]:
# === Step 6: prepare node features ===
print("🧠 步骤 5: 准备节点初始特征...")

# Seed both generators. The features below are drawn with torch.randn, which
# is controlled by torch's RNG; seeding NumPy alone left them different on
# every run.
np.random.seed(42)
torch.manual_seed(42)

# Drug: random stand-in fingerprints, one 1024-dim row per entry of `drugs`
# (in a real project these should be Morgan fingerprints computed from SMILES).
drug_features = torch.randn(len(drugs), 1024)

# Protein: random initialisation, one 256-dim row per entry of `filtered_genes`.
protein_features = torch.randn(len(filtered_genes), 256)

# CellLine & Disease: no initial features are supplied; the model is expected
# to learn embeddings for them.
cell_line_features = None
disease_features = None

# Use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"使用设备: {device}")


In [None]:
# === Step 7: train the model ===
# Passes the graph, the node features prepared above (None for cell lines and
# diseases) and the selected device to the project's train_model function.
print("🏋️ 步骤 6: 训练 CellLineKG 模型...")

model = train_model(
    graph=hg,
    drug_features=drug_features,
    protein_features=protein_features,
    cell_line_features=cell_line_features,
    disease_features=disease_features,
    device=device
)

print("✅ 模型训练完成!")


In [None]:
# === Step 8: evaluate the model (example) ===
print("📈 步骤 7: 模型评估...")

# Placeholder only: real evaluation logic still has to be implemented.
# NOTE(review): evaluate_model is called without arguments, so the trained
# `model` and the graph are not passed to it — confirm its signature.
evaluate_model()

print("🎉 项目主流程运行完毕！")

# %%
# === Step 9: make a prediction (example) ===
print("🔮 步骤 8: 预测药物-疾病关系...")

# Example query: the effect of "Everolimus" on "Breast Cancer".
drug_name = "Everolimus"
disease_name = "Breast Cancer"

# Look up node ids as positions in the `drugs` / `diseases` lists.
# Note: this presumes graph_construction.py assigns ids in list order;
# create_cell_line_bio_hg should return a name-to-id mapping, or the mapping
# must be rebuilt here. The two ids are currently only used by the
# commented-out pseudo-code below.
drug_id = drugs.index(drug_name)
disease_id = diseases.index(disease_name)

# Fetch node embeddings (requires changing forward() in model.py so that it
# can return embeddings). Once that is done, the call could look like:
# node_embeddings = model(hg, drug_features, protein_features, task='embeddings')

# Pseudo-code example:
# score = model.predict_drug_disease(
#     node_embeddings['drug'][drug_id],
#     node_embeddings['disease'][disease_id]
# )
# print(f"药物 '{drug_name}' 治疗疾病 '{disease_name}' 的预测得分: {score:.4f}")

print("⚠️ 预测功能待实现，请根据 model.py 中的 predict_drug_disease 函数完善此部分。")

# %%
# === Appendix: show the configuration ===
# Prints the four configuration values that the edge-building and filtering
# cells above read.
print("⚙️ 当前配置参数:")
print(f"- PPI 构建方法: {PPI_BUILD_METHOD}")
print(f"- Disease-CellLine 构建方法: {DISEASE_CELL_BUILD_METHOD}")
print(f"- 蛋白过滤阈值 (min_cell_lines): {MIN_CELL_LINES_FOR_TOP80}")
print(f"- CellLine-Protein 边阈值: {EXPR_Z_THRESHOLD}")
