In [3]:
###step1
###根据药材列表，从BATMAN数据库获取成分和靶点,
import os
import re
import csv
import sys
import time
import json
import requests

# 防止 CSV 长字段溢出
csv.field_size_limit(sys.maxsize)

####################################
# 输入输出文件设置
####################################
DTD_input = "/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/GBL-herb/GBL.txt"
ingredient_output = "/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/GBL-herb/GBL-ingredient-fromBATMAN.csv"
target_output     = "/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/GBL-herb/GBL-target-fromBATMAN.csv"

herb_browse_path                 = "/slurm/home/yrd/liaolab/tanshuoyan/training_Data/TCM-database/BATMAN-TCM/download-data/herb_browse.txt"
known_file_path                  = "/slurm/home/yrd/liaolab/tanshuoyan/training_Data/TCM-database/BATMAN-TCM/download-data/known_browse_by_ingredients.txt"
predicted_ing_file               = "/slurm/home/yrd/liaolab/tanshuoyan/training_Data/TCM-database/BATMAN-TCM/download-data/predicted_browse_by_ingredinets.txt"
predicted_target_mapping_file    = "/slurm/home/yrd/liaolab/tanshuoyan/training_Data/TCM-database/BATMAN-TCM/download-data/predicted_browse_by_targets.txt"

####################################
# 第 1 部分：读取 herb 列表
####################################
herb_list = []
with open(DTD_input, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # 按制表符或逗号切分，取第 1 列
        herb = re.split(r'[\t,]\s*', line)[0]
        herb_list.append(herb)

####################################
# 第 2 部分：加载 herb_browse.txt
####################################
herb_data = {}
with open(herb_browse_path, "r", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    header = next(reader)
    for row in reader:
        if len(row) < 5:
            continue
        chinese_name   = row[1].strip()
        ingredients_str = row[4].strip()
        herb_data[chinese_name] = ingredients_str

####################################
# 第 3 部分：生成成分信息
####################################
output_rows = []
for herb in herb_list:
    if herb in herb_data:
        ingredients_str = herb_data[herb]
        if ingredients_str:
            for ingredient in ingredients_str.split("|"):
                ingredient = ingredient.strip()
                if "(" in ingredient and ingredient.endswith(")"):
                    idx = ingredient.rfind("(")
                    name = ingredient[:idx].strip()
                    cid  = ingredient[idx+1:-1].strip()
                    output_rows.append([herb, name, cid])
                else:
                    output_rows.append([herb, ingredient, ""])
        else:
            output_rows.append([herb, "", ""])
    else:
        print(f"Warning: 药材 {herb} 在 herb_browse.txt 中未找到。")

with open(ingredient_output, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["herbname", "ingredient_name", "PubChem_CID"])
    writer.writerows(output_rows)
print(f"成分文件已生成：{ingredient_output}")

####################################
# 第 4 部分：生成靶点信息
####################################
# 已知
known_targets = {}
with open(known_file_path, "r", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    next(reader)
    for row in reader:
        if len(row) < 3: continue
        cid = row[0].strip()
        prots = [p.strip() for p in row[2].split("|") if p.strip()]
        known_targets[cid] = prots

# 预测
predicted_targets = {}
with open(predicted_ing_file, "r", encoding="utf-8") as f:
    lines = f.readlines()
    for line in lines[1:]:
        parts = line.strip().split(maxsplit=2)
        if len(parts) < 3: continue
        cid = parts[0].strip()
        preds = []
        for tok in parts[2].split("|"):
            tok = tok.strip()
            if "(" in tok and tok.endswith(")"):
                idx = tok.rfind("(")
                pid = tok[:idx].strip()
                prob = tok[idx+1:-1].strip()
                try: prob = float(prob)
                except: prob = None
                preds.append((pid, prob))
        predicted_targets[cid] = preds

# ID 映射
pred_map = {}
with open(predicted_target_mapping_file, "r", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    next(reader)
    for row in reader:
        if len(row) < 2: continue
        pred_map[row[0].strip()] = row[1].strip()

# 组合成分→靶点
ingredient_data = []
with open(ingredient_output, "r", encoding="utf-8") as f:
    reader = csv.reader(f)
    next(reader)
    for herbname, _, cid in reader:
        ingredient_data.append((herbname, cid))

output_target_rows = []
for herbname, cid in ingredient_data:
    if cid in known_targets:
        for prot in known_targets[cid]:
            output_target_rows.append([herbname, cid, prot, 1])
    if cid in predicted_targets:
        for pid, prob in predicted_targets[cid]:
            pname = pred_map.get(pid, pid)
            output_target_rows.append([herbname, cid, pname, prob])

with open(target_output, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["herbname", "PubChem_CID", "protein", "probability"])
    writer.writerows(output_target_rows)
print(f"靶点文件已生成：{target_output}")

####################################
# 第 5 部分：批量获取 CanonicalSMILES 并写回
####################################
# 读取刚才的成分文件
rows = []
unique_cids = set()
with open(ingredient_output, "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for r in reader:
        rows.append(r)
        cid = r.get("PubChem_CID","").strip()
        if cid:
            unique_cids.add(cid)

if not rows:
    print("■ 未检测到任何成分行，跳过 SMILES 获取 ■")
else:
    # 分批调用 PubChem API
    cid_list = list(unique_cids)
    batch_size = 100
    cid2smiles = {}
    for i in range(0, len(cid_list), batch_size):
        batch = cid_list[i:i+batch_size]
        url = ("https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
               f"compound/cid/{','.join(batch)}/property/CanonicalSMILES/JSON")
        try:
            resp = requests.get(url, timeout=10)
            data = resp.json()
            for prop in data.get("PropertyTable",{}).get("Properties",[]):
                c = str(prop.get("CID","")).strip()
                s = prop.get("CanonicalSMILES","").strip()
                cid2smiles[c] = s
        except Exception as e:
            print(f"SMILES 批 {i}-{i+batch_size} 请求异常：{e}")
        time.sleep(0.2)

    # 更新并写回
    for r in rows:
        cid = r.get("PubChem_CID","").strip()
        r["smiles"] = cid2smiles.get(cid, "")
    fieldnames = list(rows[0].keys())
    with open(ingredient_output, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f"已更新 smiles 字段，文件：{ingredient_output}")

成分文件已生成：/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/GBL-herb/GBL-ingredient-fromBATMAN.csv
靶点文件已生成：/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/GBL-herb/GBL-target-fromBATMAN.csv
已更新 smiles 字段，文件：/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/GBL-herb/GBL-ingredient-fromBATMAN.csv


In [34]:
##step2
####对方剂的靶点进行加权统计获得药材靶点权重，method 1
####这里忽略成分和相互作用概率,得到的第二列protein_weight均≤1
import os
import pandas as pd

# 路径设置
base_dir   = "/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/TGD-herb"
txt_file   = os.path.join(base_dir, "TGD.txt")
target_csv = os.path.join(base_dir, "TGD-target-fromBATMAN.csv")
out_dir    = os.path.join(base_dir, "target")
os.makedirs(out_dir, exist_ok=True)
out_csv    = os.path.join(out_dir, "TGD-target-from-BATMAN-processed-ignore-compound.csv")

# 1. 读取配方中药材用量，计算权重
#    DBYW.txt 无表头，两列：药材,用量
df_w = pd.read_csv(
    txt_file,
    sep=",",            # 按逗号切分
    header=None,
    names=["herbname", "amount"],
    dtype={"herbname": str, "amount": float},
    engine="python"
)
total_amount = df_w["amount"].sum()
df_w["weight"] = df_w["amount"] / total_amount

# 2. 读取 BATMAN 靶点文件
#    四列：herbname, PubChem_CID, protein, probability
df_t = pd.read_csv(target_csv, dtype=str)

# 3. 合并药材权重到靶点表（按 herbname）
df = pd.merge(
    df_t,
    df_w[["herbname", "weight"]],
    on="herbname",
    how="left"
)
# 如果有个别 herbname 未匹配上，用 0 权重替代
df["weight"] = df["weight"].fillna(0.0)

# 4. 对同一 herb–protein 对只计一次权重（避免重复行多次累加）
df_unique = df[["herbname", "protein", "weight"]].drop_duplicates(subset=["herbname", "protein"])

# 5. 按 protein 聚合：各 herb weight 直接相加
df_out = (
    df_unique
    .groupby("protein", as_index=False)["weight"]
    .sum()
    .rename(columns={"weight": "protein_weight"})
)

# 6. 保存结果
df_out.to_csv(out_csv, index=False, encoding="utf-8-sig")
print(f"处理完成，结果已保存到：{out_csv}")

处理完成，结果已保存到：/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/TGD-herb/target/TGD-target-from-BATMAN-processed-ignore-compound.csv


In [35]:
##step2
####对方剂的靶点进行加权统计获得药材靶点权重
####这里考虑成分，但是忽略相互作用概率，method 2
# 路径设置
base_dir   = "/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/TGD-herb"
txt_file   = os.path.join(base_dir, "TGD.txt")
target_csv = os.path.join(base_dir, "TGD-target-fromBATMAN.csv")
out_dir    = os.path.join(base_dir, "target")
os.makedirs(out_dir, exist_ok=True)
out_csv    = os.path.join(out_dir, "TGD-target-from-BATMAN-processed-consider-compound.csv")

# 1. 读取配方中药材用量，计算权重
df_w = pd.read_csv(
    txt_file,
    sep=",",            # 按逗号切分
    header=None,
    names=["herbname", "amount"],
    dtype={"herbname": str, "amount": float},
    engine="python"
)
total_amount = df_w["amount"].sum()
df_w["weight"] = df_w["amount"] / total_amount

# 2. 读取 BATMAN 靶点文件
#    四列：herbname, PubChem_CID, protein, probability
df_t = pd.read_csv(target_csv, dtype=str)

# 3. 合并药材权重到靶点表（按 herbname）
df = pd.merge(
    df_t,
    df_w[["herbname", "weight"]],
    on="herbname",
    how="left"
)
df["weight"] = df["weight"].fillna(0.0)

# 4. 考虑成分层面：每个 herb–compound–protein 一次计重
df_unique = df.drop_duplicates(subset=["herbname", "PubChem_CID", "protein"])

# 5. 按 protein 聚合：对所有命中该 protein 的 herb–compound 权重求和
df_out = (
    df_unique
    .groupby("protein", as_index=False)["weight"]
    .sum()
    .rename(columns={"weight": "protein_weight"})
)

# 6. 保存结果
df_out.to_csv(out_csv, index=False, encoding="utf-8-sig")
print(f"处理完成，结果已保存到：{out_csv}")

处理完成，结果已保存到：/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/TGD-herb/target/TGD-target-from-BATMAN-processed-consider-compound.csv


In [36]:
##step2
####对方剂的靶点进行加权统计获得药材靶点权重
####这里考虑成分并且也考虑了相互作用概率，method 3
import os
import pandas as pd

# 路径设置
base_dir   = "/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/TGD-herb"
txt_file   = os.path.join(base_dir, "TGD.txt")
target_csv = os.path.join(base_dir, "TGD-target-fromBATMAN.csv")
out_dir    = os.path.join(base_dir, "target")
os.makedirs(out_dir, exist_ok=True)
out_csv    = os.path.join(out_dir, "TGD-target-from-BATMAN-processed-consider-compound-probability.csv")

# 1. 读取配方中药材用量，计算 herb 权重
df_w = pd.read_csv(
    txt_file,
    sep=",",
    header=None,
    names=["herbname", "amount"],
    dtype={"herbname": str, "amount": float},
    engine="python"
)
total_amount = df_w["amount"].sum()
df_w["weight"] = df_w["amount"] / total_amount

# 2. 读取 BATMAN 靶点文件
#    列：herbname, PubChem_CID, protein, probability
df_t = pd.read_csv(target_csv, dtype=str)

# 3. 转成 numeric，缺失或非数字设 0
df_t["probability"] = pd.to_numeric(df_t["probability"], errors="coerce").fillna(0.0)

# 4. 合并 herb 权重
df = pd.merge(
    df_t,
    df_w[["herbname", "weight"]],
    on="herbname",
    how="left"
)
df["weight"] = df["weight"].fillna(0.0)

# 5. 去重到 herb–compound–protein 级别，保留对应的 probability
df_unique = df.drop_duplicates(subset=["herbname", "PubChem_CID", "protein"])

# 6. 计算每条记录对 protein 的贡献： herb_weight * probability
df_unique["contribution"] = df_unique["weight"] * df_unique["probability"]

# 7. 按 protein 聚合贡献值之和
df_out = (
    df_unique
    .groupby("protein", as_index=False)["contribution"]
    .sum()
    .rename(columns={"contribution": "protein_weight"})
)

# 8. 保存结果
df_out.to_csv(out_csv, index=False, encoding="utf-8-sig")
print(f"处理完成，结果已保存到：{out_csv}")

处理完成，结果已保存到：/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/TGD-herb/target/TGD-target-from-BATMAN-processed-consider-compound-probability.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique["contribution"] = df_unique["weight"] * df_unique["probability"]


In [47]:
##step2：
##对蛋白权重进行标准化处理，标准化方法使用最大值归一化（Max–Scaling）方法，并另存为标准化后的权重文件
import os
import pandas as pd

# 目标目录
target_dir = "/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/TGD-herb/target"

# 要处理的文件列表
filenames = [
    "TGD-target-from-BATMAN-processed-consider-compound.csv",
    "TGD-target-from-BATMAN-processed-consider-compound-probability.csv"
]

for fname in filenames:
    # 构造完整路径并读取
    path = os.path.join(target_dir, fname)
    df = pd.read_csv(path, dtype=str)
    
    # 确定权重列（假设是第二列）
    weight_col = df.columns[1]
    df[weight_col] = df[weight_col].astype(float)
    
    # 最大值归一化作为药材靶点的标准化权重
    max_w = df[weight_col].max()
    df_out = pd.DataFrame({
        "protein": df["protein"],
        "protein_weight": df[weight_col] / max_w
    })
    
    # 构造输出文件名，并保存
    base, _ = os.path.splitext(fname)
    out_fname = f"{base}-standardization.csv"
    out_path = os.path.join(target_dir, out_fname)
    df_out.to_csv(out_path, index=False, encoding="utf-8-sig")
    print(f"已保存标准化结果到：{out_path}")

已保存标准化结果到：/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/TGD-herb/target/TGD-target-from-BATMAN-processed-consider-compound-standardization.csv
已保存标准化结果到：/slurm/home/yrd/liaolab/tanshuoyan/TCM-opt-Tan/PD/herb/TGD-herb/target/TGD-target-from-BATMAN-processed-consider-compound-probability-standardization.csv
