In [None]:
import pandas as pd
import networkx as nx
import re
from networkx.algorithms.community import greedy_modularity_communities

# === STEP 1: 读取 reaction.tsv ===
def load_reactions_tsv(file_path):
    return pd.read_csv(file_path, sep="\t")

# === STEP 2: 解析 equation 字符串，提取代谢物及系数，并标准化 compartment ===
def parse_equation(equation):
    arrow = '<=>' if '<=>' in equation else '=>' if '=>' in equation else '<='
    lhs, rhs = equation.split(arrow)

    def get_mets_with_stoich(side):
        mets = []
        for m in side.split('+'):
            m = m.strip()
            match = re.match(r'^\((\d+)\)\s*(.+)', m)
            if match:
                coef, met = match.groups()
            else:
                coef, met = 1, m
            # 标准化 compartment：cpdxxxx[0] → cpdxxxx_c0，cpdxxxx[1] → cpdxxxx_e0
            met = re.sub(r"\[(\d+)\]", lambda x: {'0': '_c0', '1': '_e0'}.get(x.group(1), f"_x{x.group(1)}"), met)

            mets.append((met.strip(), int(coef)))
        return mets

    return get_mets_with_stoich(lhs), get_mets_with_stoich(rhs), arrow

# === STEP 3: 构建 NetworkX 代谢物图（包含方向和计量信息） ===
def build_metabolite_graph(df):
    G = nx.DiGraph()
    for idx, row in df.iterrows():
        try:
            lhs, rhs, direction = parse_equation(row['equation'])
            rxn_id = row['id']
            for l_met, l_coef in lhs:
                for r_met, r_coef in rhs:
                    G.add_edge(l_met, r_met, reaction=rxn_id,
                               l_coef=l_coef, r_coef=r_coef)
                    if direction == '<=>':
                        G.add_edge(r_met, l_met, reaction=rxn_id,
                                   l_coef=r_coef, r_coef=l_coef)
        except Exception as e:
            print(f" Failed parsing {row['id']}: {e}")
    return G

# === STEP 4: 注释代谢物图中的节点属性 ===
def annotate_metabolites(G, compounds_file):
    comp_df = pd.read_csv(compounds_file, sep="\t", low_memory=False)
    for _, row in comp_df.iterrows():
        met_id = row['id']
        for comp in ['_c0', '_e0']:  
            full_id = f"{met_id}{comp}"
            if full_id in G.nodes:
                G.nodes[full_id]['name'] = row.get('name', '')
                G.nodes[full_id]['formula'] = row.get('formula', '')
                G.nodes[full_id]['charge'] = row.get('charge', '')
                G.nodes[full_id]['base_id'] = met_id
                G.nodes[full_id]['compartment'] = comp

# === STEP 5: 社区检测 ===
def detect_communities(G):
    communities = list(greedy_modularity_communities(G.to_undirected()))
    # 给每个节点加 community 属性
    for i, comm in enumerate(communities):
        for node in comm:
            G.nodes[node]['community'] = i
    return communities

# === STEP 6: 显示图信息 ===
def print_graph_summary(G):
    print(f"图中包含 {G.number_of_nodes()} 个代谢物节点，{G.number_of_edges()} 条反应边")

# === STEP 7: 可视化 ===
def draw_metabolite_graph(G, with_labels=False, node_size=50):
    import matplotlib.pyplot as plt
    pos = nx.spring_layout(G, seed=42)
    nx.draw(G, pos, node_size=node_size, with_labels=with_labels, arrows=True)
    plt.title("Metabolite Graph from ModelSEED Reactions")
    plt.show()

# === STEP 8: 保存特定社区子图 ===
def save_community_subgraph(G, communities, target_met, out_file="community_subgraph.graphml"):
    for i, comm in enumerate(communities):
        if target_met in comm:
            subG = G.subgraph(comm).copy()
            nx.write_graphml(subG, out_file)
            print(f" {target_met} 属于社区 {i}，子图已保存为 {out_file}")
            return subG
    print(f" 未找到 {target_met} 所属社区。")
    return None


In [47]:
reactions_file = "../../data/database/reactions.tsv"
compounds_file = "../../data/database/compounds.tsv"

# 加载并构建图
reactions_df = load_reactions_tsv(reactions_file)
G = build_metabolite_graph(reactions_df)
print_graph_summary(G)

图中包含 23891 个代谢物节点，158091 条反应边


In [48]:
# 加载代谢物注释
annotate_metabolites(G, compounds_file)

In [50]:
related = [n for n in G.nodes if "cpd00211" in n]
print("所有含 cpd00211 的节点：")
for n in related:
    print(" -", n)

所有含 cpd00211 的节点：
 - cpd00211_c0
 - cpd00211_e0


In [51]:
communities = detect_communities(G)
print(f" 共检测到 {len(communities)} 个模块（子图）")

 共检测到 421 个模块（子图）


In [58]:
import networkx.algorithms.community as nx_comm

communities2 = list(nx_comm.label_propagation_communities(G.to_undirected()))
print(f" 共检测到 {len(communities2)} 个模块（子图）")

 共检测到 1161 个模块（子图）


In [63]:
target_met = "cpd00211_c0"

for i, comm in enumerate(communities2):
    if target_met in comm:
        print(f" {target_met} 属于社区 {i}，该社区包含 {len(comm)} 个代谢物。")
        break
else:
    print(f" {target_met} 不属于任何社区。")

 cpd00211_c0 属于社区 0，该社区包含 20151 个代谢物。


In [54]:
target_met = "cpd00211_e0"

for i, comm in enumerate(communities):
    if target_met in comm:
        print(f" {target_met} 属于社区 {i}，该社区包含 {len(comm)} 个代谢物。")
        break
else:
    print(f" {target_met} 不属于任何社区。")


 cpd00211_e0 属于社区 2，该社区包含 4665 个代谢物。


In [60]:
G.nodes["cpd00211_c0"]


{'name': 'Butyrate',
 'formula': 'C4H7O2',
 'charge': -1,
 'base_id': 'cpd00211',
 'compartment': '_c0',
 'community': 1}

In [56]:
G.nodes["cpd00211_e0"]

{'name': 'Butyrate',
 'formula': 'C4H7O2',
 'charge': -1,
 'base_id': 'cpd00211',
 'compartment': '_e0',
 'community': 2}

In [64]:
nx.write_graphml(G, "full_metabolite_graph.graphml")


In [65]:

def save_community_subgraph(G, communities, target_met, out_file="community_subgraph.graphml"):
    # 1. 找到该代谢物所属社区
    for i, comm in enumerate(communities):
        if target_met in comm:
            print(f" 目标代谢物 {target_met} 属于社区 {i}，该社区包含 {len(comm)} 个节点。")

            # 2. 提取该社区子图
            subG = G.subgraph(comm).copy()

            # 3. 保存为 graphml（或其他格式）
            nx.write_graphml(subG, out_file)
            print(f" 子图已保存为: {out_file}")
            return subG

    print(f" 未找到 {target_met} 所属社区。")
    return None

save_community_subgraph(G, communities2, "cpd00211_c0", out_file="cpd00211_community.graphml")



 目标代谢物 cpd00211_c0 属于社区 0，该社区包含 20151 个节点。
 子图已保存为: cpd00211_community.graphml


<networkx.classes.digraph.DiGraph at 0x3cbcf3a30>

In [67]:
def get_reactions_for_node(G, node_id):
    reactions = set()
    
    # 上游反应（它是产物）
    for pred in G.predecessors(node_id):
        edge = G[pred][node_id]
        reactions.add(edge.get('reaction'))

    # 下游反应（它是底物）
    for succ in G.successors(node_id):
        edge = G[node_id][succ]
        reactions.add(edge.get('reaction'))

    return reactions
rxns = get_reactions_for_node(G, "cpd00211_c0")
print(f"参与的反应共有 {len(rxns)} 个：")
for r in rxns:
    print(f" - {r}")


参与的反应共有 21 个：
 - rxn36392
 - rxn45696
 - rxn01238
 - rxn36395
 - rxn02879
 - rxn43672
 - rxn00994
 - rxn08183
 - rxn01237
 - rxn02683
 - rxn11378
 - rxn13713
 - rxn36394
 - rxn38354
 - rxn36391
 - rxn36393
 - rxn01236
 - rxn00873
 - rxn13427
 - rxn47214
 - rxn11382


In [1]:
##
import pandas as pd
from cobra import Reaction, Metabolite
import pickle
import random

# ===== 文件路径（请根据你本地路径修改） =====
reaction_file_path = "../../data/database/reactions.tsv"
compound_file_path = "../../data/database/compounds.tsv"
output_pickle_path = "reaction_dict.pkl"

# ===== 读取数据 =====
df_rxn = pd.read_csv(reaction_file_path, sep="\t")
df_cpd = pd.read_csv(compound_file_path, sep="\t")

# ===== 构建化合物信息字典 =====
compound_info = {}
for idx, row in df_cpd.iterrows():
    if pd.notna(row.get('id')):
        compound_info[row['id']] = {
            'name': row.get('name', row['id']),
            'formula': row.get('formula', ''),
            'charge': row.get('charge', 0)
        }

# ===== 构建 reaction_dict =====
reaction_dict = {}

for idx, row in df_rxn.iterrows():
    rxn_id = row['id']
    stoich_raw = row['stoichiometry']
    reversibility = row['reversibility']
    rxn_name = row['name'] if pd.notna(row['name']) else rxn_id

    if pd.isna(stoich_raw) or pd.isna(rxn_id):
        continue

    rxn = Reaction(rxn_id)
    rxn.name = rxn_name
    rxn.lower_bound = -1000 if reversibility == '=' else 0
    rxn.upper_bound = 1000

    stoich_items = stoich_raw.split(';')
    met_dict = {}

    for item in stoich_items:
        if not item or 'null' in item.lower():
            continue

        parts = item.split(':')
        if len(parts) < 3:
            continue

        try:
            coeff = float(parts[0])
            base_id = parts[1].strip()
            compartment_index = parts[2].strip()
        except:
            continue

        if compartment_index == '0':
            met_id = base_id + '_c0'
            comp = 'c'
        elif compartment_index == '1':
            met_id = base_id + '_e0'
            comp = 'e'
        else:
            met_id = base_id
            comp = ''

        met_info = compound_info.get(base_id, {})
        met = Metabolite(
            id=met_id,
            name=met_info.get('name', base_id),
            formula=met_info.get('formula', ''),
            charge=met_info.get('charge', 0),
            compartment=comp
        )
        met_dict[met] = coeff

    rxn.add_metabolites(met_dict)
    reaction_dict[rxn_id] = rxn

# ===== 输出 & 保存 =====
print(f" 已构建反应数量: {len(reaction_dict)}")
print("\n 示例反应方程式:")
for rxn_id in random.sample(list(reaction_dict.keys()), min(5, len(reaction_dict))):
    rxn = reaction_dict[rxn_id]
    print(f"[{rxn.id}] {rxn.reaction}")

with open(output_pickle_path, "wb") as f:
    pickle.dump(reaction_dict, f)
    print(f"\n reaction_dict 已保存至: {output_pickle_path}")


  df_cpd = pd.read_csv(compound_file_path, sep="\t")


 已构建反应数量: 43763

 示例反应方程式:
[rxn41725] cpd31852_c0 + cpd32612_c0 --> cpd00012_c0 + cpd00067_c0 + cpd24820_c0
[rxn43988] cpd00042_c0 + cpd19935_c0 --> cpd00067_c0 + cpd33692_c0
[rxn46297] 28.0 cpd02338_c0 + cpd23200_c0 <=> 28.0 cpd00014_c0 + cpd23213_c0
[rxn16422] 2.0 cpd00005_c0 + 2.0 cpd00007_c0 + cpd00066_c0 + 2.0 cpd00067_c0 --> 3.0 cpd00001_c0 + 2.0 cpd00006_c0 + cpd00011_c0 + cpd20962_c0
[rxn05244] cpd00067_e0 + cpd00322_e0 <=> cpd00067_c0 + cpd00322_c0

 reaction_dict 已保存至: reaction_dict.pkl


In [2]:
import pickle
import random

# ===== 修改为你的实际路径 =====
reaction_dict_path = "reaction_dict.pkl"

# ===== 加载 reaction_dict =====
with open(reaction_dict_path, "rb") as f:
    reaction_dict = pickle.load(f)

print(f" reaction_dict 已加载，总反应数: {len(reaction_dict)}")

# ===== 随机抽查若干反应 =====
sample_size = min(5, len(reaction_dict))
sample_rxns = random.sample(list(reaction_dict.keys()), sample_size)

print(f"\n 随机抽查 {sample_size} 条反应：\n")

for rxn_id in sample_rxns:
    rxn = reaction_dict[rxn_id]
    print(f" Reaction: {rxn.id}")
    print(f"   Name    : {rxn.name}")
    print(f"   Equation: {rxn.reaction}")
    print("   Metabolites:")
    for met, coeff in rxn.metabolites.items():
        print(f"     - ID: {met.id}")
        print(f"       Name     : {met.name}")
        print(f"       Coefficient: {coeff}")
        print(f"       Formula  : {met.formula}")
        print(f"       Charge   : {met.charge}")
        print(f"       Compartment: {met.compartment}")
    print("-" * 50)


 reaction_dict 已加载，总反应数: 43763

 随机抽查 5 条反应：

 Reaction: rxn00340
   Name    : L-aspartate:ammonia ligase (AMP-forming)
   Equation: cpd00002_c0 + cpd00013_c0 + cpd00041_c0 --> cpd00012_c0 + cpd00018_c0 + 2.0 cpd00067_c0 + cpd00132_c0
   Metabolites:
     - ID: cpd00002_c0
       Name     : ATP
       Coefficient: -1.0
       Formula  : C10H13N5O13P3
       Charge   : -3
       Compartment: c
     - ID: cpd00013_c0
       Name     : NH3
       Coefficient: -1.0
       Formula  : H4N
       Charge   : 1
       Compartment: c
     - ID: cpd00041_c0
       Name     : L-Aspartate
       Coefficient: -1.0
       Formula  : C4H6NO4
       Charge   : -1
       Compartment: c
     - ID: cpd00012_c0
       Name     : PPi
       Coefficient: 1.0
       Formula  : HO7P2
       Charge   : -3
       Compartment: c
     - ID: cpd00018_c0
       Name     : AMP
       Coefficient: 1.0
       Formula  : C10H12N5O7P
       Charge   : -2
       Compartment: c
     - ID: cpd00067_c0
       Name     : H+
 