## pbmc数据生成

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Path to the input AnnData (.h5ad) file.
file_path = "/home/yu.zhiyin/CellRAG/pbmc_rare_0.003619.h5ad"


def expression_vector(x):
    """Return ``x`` as a flat 1-D NumPy array.

    Accepts either an object exposing ``toarray`` (e.g. a sparse matrix) or a
    dense array, so a dense ``X`` no longer raises ``AttributeError``.
    """
    if hasattr(x, "toarray"):
        x = x.toarray()
    return np.asarray(x).ravel()


def format_cell_record(expression, gene_names, cell_type, tissue, top_n=100):
    """Build one text line describing a cell by its highest-expressed genes.

    Args:
        expression: 1-D array of expression values, one entry per gene.
        gene_names: Sequence of gene names aligned with ``expression``.
        cell_type: Label written after ``Cell type:``.
        tissue: Label written after ``Tissue:``.
        top_n: Number of genes to list (fewer are listed if the cell has
            fewer genes than ``top_n``).

    Returns:
        A newline-terminated string listing the genes from highest to lowest
        expression, followed by the cell type and tissue.
    """
    # argsort is ascending, so reverse it before taking the first ``top_n``.
    ranked = np.argsort(expression)[::-1][:top_n]
    genes = ", ".join(str(name) for name in np.asarray(gene_names)[ranked])
    return (
        f"Top {top_n} genes for this cell (highest expression first): {genes}. "
        f"Cell type: {cell_type}. Tissue: {tissue}.\n"
    )


# True when this cell is run in a notebook or as a script; keeps the file
# loading and writing from running if the helpers above are merely imported.
if __name__ == "__main__":
    # Load the data and keep only cells whose tissue is Bone_Marrow.
    adata = sc.read_h5ad(file_path)
    bone_marrow_cells = adata[adata.obs['tissue'] == 'Bone_Marrow']

    # Only the first 15 distinct annotations are processed.
    cell_types = bone_marrow_cells.obs['final_annotation'].unique()
    cell_types = cell_types[:15]

    # Converted once here instead of once per cell inside the loops.
    gene_names = np.asarray(adata.var_names)

    # Number of cells drawn per cell type for each split.
    train_size = 30
    test_size = 20

    train_data = []
    test_data = []

    for cell_type in cell_types:
        # All Bone_Marrow cells carrying this annotation.
        cell_type_data = bone_marrow_cells[bone_marrow_cells.obs['final_annotation'] == cell_type]
        cell_indices = cell_type_data.obs.index

        # Skip cell types that cannot fill both splits.
        if len(cell_indices) < train_size + test_size:
            print(f"Warning: Not enough samples for cell type {cell_type}. Skipping.")
            continue

        # Seeded random draw of disjoint train and test cell ids.
        train_cells, test_cells = train_test_split(
            cell_indices, train_size=train_size, test_size=test_size, random_state=42)

        # Train and test cells are described identically; only the target
        # list differs.
        for cells, records in ((train_cells, train_data), (test_cells, test_data)):
            for cell_id in cells:
                cell_expr = expression_vector(cell_type_data[cell_id].X)
                records.append(
                    format_cell_record(cell_expr, gene_names, cell_type, "Bone_Marrow"))

    # Write both splits; an explicit encoding keeps the files identical
    # across platforms.
    with open('train_data.txt', 'w', encoding='utf-8') as f:
        f.writelines(train_data)

    with open('test_data.txt', 'w', encoding='utf-8') as f:
        f.writelines(test_data)

    print("数据生成完成")


## heart数据生成

In [None]:
import scanpy as sc
import numpy as np
from sklearn.model_selection import train_test_split

# Path to the input AnnData (.h5ad) file.
file_path = r"E:\yzy\大四上\特斯联\research\human_heart_atlas.h5ad"


def expression_vector(x):
    """Return ``x`` as a flat 1-D NumPy array.

    Accepts either an object exposing ``toarray`` (e.g. a sparse matrix) or a
    dense array, so a dense ``X`` no longer raises ``AttributeError``.
    """
    if hasattr(x, "toarray"):
        x = x.toarray()
    return np.asarray(x).ravel()


def format_cell_record(expression, gene_names, cell_type, tissue, top_n=100):
    """Build one text line describing a cell by its highest-expressed genes.

    Args:
        expression: 1-D array of expression values, one entry per gene.
        gene_names: Sequence of gene names aligned with ``expression``.
        cell_type: Label written after ``Cell type:``.
        tissue: Label written after ``Tissue:``.
        top_n: Number of genes to list (fewer are listed if the cell has
            fewer genes than ``top_n``).

    Returns:
        A newline-terminated string listing the genes from highest to lowest
        expression, followed by the cell type and tissue.
    """
    # argsort is ascending, so reverse it before taking the first ``top_n``.
    ranked = np.argsort(expression)[::-1][:top_n]
    genes = ", ".join(str(name) for name in np.asarray(gene_names)[ranked])
    return (
        f"Top {top_n} genes for this cell (highest expression first): {genes}. "
        f"Cell type: {cell_type}. Tissue: {tissue}.\n"
    )


# True when this cell is run in a notebook or as a script; keeps the file
# loading and writing from running if the helpers above are merely imported.
if __name__ == "__main__":
    adata = sc.read_h5ad(file_path)

    # Report how many cells each tissue has.
    tissue_counts = adata.obs.groupby('tissue').size()
    print(tissue_counts)

    tissues = adata.obs['tissue'].unique()

    # Gene names come from the 'feature_name' column of adata.var; converted
    # once here instead of once per cell inside the loops.
    gene_names = np.asarray(adata.var['feature_name'])

    # Number of cells drawn per tissue for each split.
    train_size = 75
    test_size = 50

    train_data = []
    test_data = []

    for tissue in tissues:
        # All cells belonging to this tissue.
        tissue_data = adata[adata.obs['tissue'] == tissue]
        cell_indices = tissue_data.obs.index

        # Skip tissues that cannot fill both splits.
        if len(cell_indices) < train_size + test_size:
            print(f"Warning: Not enough samples for tissue {tissue}. Skipping.")
            continue

        # Draw the 75 train and 50 test cells in a single seeded call. The
        # previous unseeded np.random.choice pre-sampling made the generated
        # files differ on every run even though random_state was fixed.
        train_cells, test_cells = train_test_split(
            cell_indices, train_size=train_size, test_size=test_size, random_state=42)

        # Train and test cells are described identically; only the target
        # list differs.
        for cells, records in ((train_cells, train_data), (test_cells, test_data)):
            for cell_id in cells:
                cell_expr = expression_vector(tissue_data[cell_id].X)
                cell_type = tissue_data.obs.loc[cell_id, 'cell_type']
                # Every cell in tissue_data has this tissue label, so the loop
                # variable is used instead of re-reading obs for each cell.
                records.append(
                    format_cell_record(cell_expr, gene_names, cell_type, tissue))

    # Write both splits; an explicit encoding keeps the files identical
    # across platforms.
    with open('train_data_1.txt', 'w', encoding='utf-8') as f:
        f.writelines(train_data)

    with open('test_data_1.txt', 'w', encoding='utf-8') as f:
        f.writelines(test_data)

## pancreas数据生成

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Input files: cell-type labels and the gene expression matrix.
label_file = 'human_pancreas_data_label.csv'
expression_file = 'human_pancreas_data.csv'

# Output files. The test file was previously written to 'test_data.txt_2'
# (misplaced suffix); it now follows the same pattern as the train file.
TRAIN_OUTPUT_PATH = 'train_data_2.txt'
TEST_OUTPUT_PATH = 'test_data_2.txt'

# (train_size, test_size) to draw for each cell type. Entries are toggled by
# commenting them in or out.
cell_type_sample_sizes = {
    # 'acinar': (75, 50),
    # 'alpha': (75, 50),
    # 'beta': (75, 50),
    # 'delta': (75, 50),
    'endothelial': (2, 1),
    # 'gamma': (75, 50),
    # 'ductal': (95, 65),
}


def format_cell_record(expression, gene_names, cell_type, tissue, top_n=100):
    """Build one text line describing a cell by its highest-expressed genes.

    Args:
        expression: 1-D array of expression values, one entry per gene.
        gene_names: Sequence of gene names aligned with ``expression``.
        cell_type: Label written after ``Cell type:``.
        tissue: Label written after ``Tissue:``.
        top_n: Number of genes to list (fewer are listed if the cell has
            fewer genes than ``top_n``).

    Returns:
        A newline-terminated string listing the genes from highest to lowest
        expression, followed by the cell type and tissue.
    """
    # argsort is ascending, so reverse it before taking the first ``top_n``.
    ranked = np.argsort(expression)[::-1][:top_n]
    genes = ", ".join(str(name) for name in np.asarray(gene_names)[ranked])
    return (
        f"Top {top_n} genes for this cell (highest expression first): {genes}. "
        f"Cell type: {cell_type}. Tissue: {tissue}.\n"
    )


def collect_records(cell_ids, expression_data, cell_type, tissue, top_n=100):
    """Return one formatted record per cell id found in ``expression_data``.

    Args:
        cell_ids: Iterable of cell ids to describe.
        expression_data: DataFrame whose index holds gene names and whose
            columns are cell ids.
        cell_type: Label written into every record.
        tissue: Tissue name written into every record.
        top_n: Number of genes listed per record.

    Returns:
        A list of record strings. Cell ids that are not a column of
        ``expression_data`` are reported with a warning and skipped.
    """
    gene_names = np.asarray(expression_data.index)
    records = []
    for cell_id in cell_ids:
        if cell_id not in expression_data.columns:
            print(f"Warning: Cell ID {cell_id} not found in expression data. Skipping.")
            continue
        # Each column holds the expression values of one cell.
        cell_expr = expression_data[cell_id].values
        records.append(
            format_cell_record(cell_expr, gene_names, cell_type, tissue, top_n))
    return records


# True when this cell is run in a notebook or as a script; keeps the file
# loading and writing from running if the helpers above are merely imported.
if __name__ == "__main__":
    labels = pd.read_csv(label_file)
    # The first CSV column becomes the index (gene names); the remaining
    # columns are cell ids.
    expression_data = pd.read_csv(expression_file, index_col=0)

    train_data = []
    test_data = []

    for cell_type, (train_size, test_size) in cell_type_sample_sizes.items():
        # Ids of all cells labelled with this cell type.
        cell_type_cells = labels[labels['celltype'] == cell_type]['cell_id'].values

        # Skip cell types that cannot fill both splits.
        if len(cell_type_cells) < train_size + test_size:
            print(f"Warning: Not enough samples for cell type {cell_type}. Skipping.")
            continue

        # Seeded random draw of disjoint train and test cell ids.
        train_cells, test_cells = train_test_split(
            cell_type_cells, train_size=train_size, test_size=test_size, random_state=42)

        train_data.extend(
            collect_records(train_cells, expression_data, cell_type, "pancreas"))
        test_data.extend(
            collect_records(test_cells, expression_data, cell_type, "pancreas"))

    # Write both splits; an explicit encoding keeps the files identical
    # across platforms.
    with open(TRAIN_OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.writelines(train_data)

    with open(TEST_OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.writelines(test_data)

    print("Data processing complete.")

