### Analyze hub genes and high-weight edges.

#### Hub gene analysis

##### Obtain the gene connectivity matrix (for each cell type, there are five degree matrices, one for each fold)

In [None]:
# Shell escape: run get_degree_matrix_train.py on the Baron_Human expression file,
# passing ../../result/datasets/ as -outdir together with -ca 0.01 and -hvgs 2000.
! python ../get_degree_matrix_train.py -expr ../../data/pre_data/scRNAseq_datasets/Baron_Human.npz \
    -outdir ../../result/datasets/ \
    -ca 0.01 -hvgs 2000

##### Save the hub genes for each cell type for the five folds (100 genes for each fold).

In [None]:
import pandas as pd
import numpy as np
import os

# Load the dataset dictionary; `label` is read here but not used again in this cell.
seq_dict = np.load('../../result/datasets/Baron_Human/seq_dict.npz', allow_pickle=True) 
label = seq_dict['label']

# Archive holding the training-fold degree matrices.
matrix_dict = np.load('../../result/datasets/Baron_Human/degree_matrix_train_Baron_Human_a0.01_hvgs2000.npz', allow_pickle=True)

print("Keys in loaded matrix_dict:", matrix_dict.files)

str_labels = matrix_dict['str_labels']
print("cell-type: ", str_labels)

print("Length of str_labels:", len(str_labels))

# One archive entry per cell type, keyed by the stringified position in str_labels.
for k in range(len(str_labels)):
    degree_matrix_key = f'{k}' 
    # Each entry is a pickled dict (hence .item()) indexed by 'CV_1' ... 'CV_5'.
    degree_matrix_dict = matrix_dict[degree_matrix_key].item()  
    print(degree_matrix_dict.keys())
    fold_top_indices_df = pd.DataFrame()

    for i in range(5):
        fold = i + 1
        cur_fold_degree_matrix = degree_matrix_dict[f'CV_{fold}']
        # Average along axis 1 -- presumably over cells, one row per gene (TODO confirm).
        mean_degree = np.mean(cur_fold_degree_matrix, axis=1)
        # Row indices of the 100 largest means, largest first.
        top_100_indices = np.argsort(mean_degree)[-100:][::-1]     
        fold_top_indices_df[f'Fold_{fold}'] = top_100_indices

    # One TSV per cell type with a Fold_1 ... Fold_5 column each.
    cell_type_name = str_labels[k]
    os.makedirs("data/Baron_Human/Gene_degree",exist_ok=True)
    tsv_filename = f'data/Baron_Human/Gene_degree/{cell_type_name}_top100_indices.tsv'
    fold_top_indices_df.to_csv(tsv_filename, sep='\t', index=False)

    print(f"Saved top 100 indices for cell type '{cell_type_name}' to {tsv_filename}")

##### Obtain the union of hub genes for each cell type to get the characteristic gene set for each cell type.

In [None]:
import pandas as pd
import os

def read_and_union_tsv(cell_type):
    """Return the union of the per-fold hub-gene indices of one cell type.

    Reads ``data/Baron_Human/Gene_degree/{cell_type}_top100_indices.tsv`` and
    merges the non-missing values of every column into one set of integers.

    Args:
        cell_type: Cell-type name used to build the TSV file name.

    Returns:
        The unique indices as a list sorted in ascending order, so repeated
        runs produce identical output (the edge-set counterpart
        ``get_union_edges`` also returns a sorted list). An empty list is
        returned, after printing a message, when the file is missing or
        cannot be processed.
    """
    file_path = f'data/Baron_Human/Gene_degree/{cell_type}_top100_indices.tsv'

    try:
        data = pd.read_csv(file_path, sep='\t')
        union_set = set()
        for col in data.columns:
            union_set.update(data[col].dropna().astype(int))

        # Sort instead of exposing the set's arbitrary iteration order.
        return sorted(union_set)
    except FileNotFoundError:
        print(f"Warning: File not found for cell type {cell_type}")
        return []
    except Exception as e:
        # Deliberate best effort: report the problem and let the caller go on
        # with the remaining cell types.
        print(f"Error processing {cell_type}: {str(e)}")
        return []

def process_cell_types(str_labels):
    """Collect the hub-gene index union of every cell type into one table.

    Args:
        str_labels: Iterable of cell-type names; each one is passed to
            ``read_and_union_tsv``.

    Returns:
        A DataFrame with one nullable-integer ('Int64') column per cell type.
        Columns are built from Series of different lengths, so shorter ones
        end in missing values.
    """
    columns_by_type = {}
    for cell_type in str_labels:
        gene_indices = read_and_union_tsv(cell_type)
        print(f"{cell_type}: {len(gene_indices)} genes")
        columns_by_type[cell_type] = pd.Series(gene_indices, dtype='Int64')

    return pd.DataFrame(columns_by_type)

# Cell-type names come from the dataset dictionary. `np` is not imported in this
# cell, so it must already be in the notebook namespace.
seq_dict = np.load('../../result/datasets/Baron_Human/seq_dict.npz', allow_pickle=True) 
str_labels = seq_dict['str_labels']

# One column of hub-gene indices per cell type.
result_df = process_cell_types(str_labels)

output_path = 'data/Baron_Human/Baron_Human_character_gene_set_indices.tsv'
result_df.to_csv(output_path, sep='\t', index=False)
print(f"\nResults saved to: {output_path}")

##### Save the gene symbols corresponding to the 2000 HVGs (Highly Variable Genes). 

In [None]:
# All gene symbols of the dataset.
seq_dict = np.load('../../result/datasets/Baron_Human/seq_dict.npz', allow_pickle=True) 
genes = seq_dict['gene_symbol']

all_filtered_genes_file = '../../result/datasets/Baron_Human/Baron_Human_filtered_hvgs2000.npy'

# The first element of the saved array is used as the positions of the selected
# genes within `genes` (presumably the 2000 HVGs -- TODO confirm the file layout).
all_filtered_genes_array = np.load(all_filtered_genes_file, allow_pickle=True)
filtered_genes_index = all_filtered_genes_array[0]
filtered_genes_index = filtered_genes_index.astype(int)
print(filtered_genes_index.shape)

filtered_genes = genes[filtered_genes_index]

print(filtered_genes)
print(filtered_genes.shape)
# Row i of this table is the symbol later looked up for index i.
df = pd.DataFrame(filtered_genes, columns=['gene_symbol'])
df.to_csv('data/Baron_Human/Baron_Human_gene_symbol_hvgs2000.tsv',  sep='\t', index=False, header=True)

##### Based on the gene symbols obtained above, convert the indices in the character gene set to the corresponding gene symbols

In [None]:
import pandas as pd

# Table of hub-gene indices (one column per cell type, missing values as padding).
file_path = 'data/Baron_Human/Baron_Human_character_gene_set_indices.tsv'
gene_idx_df = pd.read_csv(file_path, sep='\t')

gene_symbol_file = 'data/Baron_Human/Baron_Human_gene_symbol_hvgs2000.tsv'
gene_symbol = pd.read_csv(gene_symbol_file, sep='\t')

# Row position in the symbol table -> gene symbol (first column).
gene_symbol_dict = dict(enumerate(gene_symbol.iloc[:, 0]))

def map_to_gene_symbol(value):
    """Translate one index cell to its gene symbol.

    Returns None for missing cells and for indices absent from
    ``gene_symbol_dict``.
    """
    if pd.isna(value):
        return None
    return gene_symbol_dict.get(int(value), None)

# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new name when the installed pandas has it, otherwise fall back.
if hasattr(pd.DataFrame, 'map'):
    gene_idx_df = gene_idx_df.map(map_to_gene_symbol)
else:
    gene_idx_df = gene_idx_df.applymap(map_to_gene_symbol)

output_file_path = 'data/Baron_Human/Baron_Human_character_gene_set.tsv'
gene_idx_df.to_csv(output_file_path, sep='\t', index=False)

print(f"Save to {output_file_path}")


#### hub gene plot

##### Venn: R

In [None]:
Rscript Figure-hub-genes.R

##### Upset: Differential analysis of different cell types.

In [None]:
import pandas as pd
import numpy as np
from upsetplot import from_memberships
from upsetplot import UpSet
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# Hub-gene index table: one column per cell type, missing values as padding.
file_path = "data/Baron_Human/Baron_Human_character_gene_set_indices.tsv"
data = pd.read_csv(file_path, sep="\t", header=0, na_values="NA")

# Non-missing entries of each column form that cell type's gene set.
cell_type_sets = {}
for column in data.columns:
    genes = set(data[column].dropna().values)
    cell_type_sets[column] = genes
all_genes = list(set.union(*cell_type_sets.values()))
cell_types = list(cell_type_sets.keys())
# Membership matrix: rows are genes, columns are cell types, 1 = gene is in the set.
binary_matrix = pd.DataFrame(0, index=all_genes, columns=cell_types)

for gene in all_genes:
    for cell_type in cell_types:
        if gene in cell_type_sets[cell_type]:
            binary_matrix.loc[gene, cell_type] = 1

# Each gene's membership pattern as a tuple, then how many genes share each pattern.
combinations = binary_matrix.apply(tuple, axis=1)
combination_counts = combinations.value_counts()

# Translate every pattern back to the list of cell-type names it contains.
membership_lists = []
for comb in combination_counts.index:
    current_members = []
    for i, v in enumerate(comb):
        if v:
            current_members.append(cell_types[i])
    membership_lists.append(current_members)

upset_data = from_memberships(
    membership_lists,
    data=combination_counts.values
)

plt.figure(figsize=(3, 1))

upset = UpSet(upset_data,
              min_subset_size=3,
              show_counts=True,
              sort_by='cardinality',
              element_size=20,
              facecolor='#34495e',          
              other_dots_color=0.3,         
              shading_color='#f5f6fa',
              )     

upset.plot()

plt.tight_layout()

plt.savefig('../../result/Figures/Baron_Human/upset_cell_type_gene.png', dpi=1200, bbox_inches='tight',format='png')
plt.show()

print("\nGene set statistics")
for cell_type, genes in cell_type_sets.items():
    print(f"{cell_type}: {len(genes)} genes")

print("\nThe largest intersection:")
# value_counts() sorts by count, so head() yields the most frequent patterns.
# The comprehension has its own `i`; the outer enumerate index is unaffected.
for i, (comb, count) in enumerate(combination_counts.head().items()):
    cell_types_in_combo = [cell_types[i] for i, v in enumerate(comb) if v]
    print(f"intersection {i+1}: {', '.join(cell_types_in_combo)} - {count} genes")

#### Analysis of highly correlated gene pairs.

##### Obtain the top 100 high-weight edges (gene pairs) for each cell type

In [None]:
import numpy as np
import os
import torch
from scipy import sparse
import heapq
from collections import defaultdict

def find_top_edges(matrices, top_k=100):
    """Return the ``top_k`` undirected edges with the largest mean weight.

    Every matrix contributes its stored non-zero, off-diagonal entries.
    Entries (i, j) and (j, i) are folded into the single key
    ``(min(i, j), max(i, j))``. The summed weight of an edge is divided by the
    total number of matrices, so a matrix that lacks the edge counts as zero.

    Args:
        matrices: Sequence of SciPy sparse matrices.
        top_k: Maximum number of edges to return.

    Returns:
        List of ``(node1, node2, avg_weight)`` tuples with ``node1 < node2``,
        ordered by descending ``avg_weight``.
    """
    total_matrices = len(matrices)
    edge_weights = defaultdict(float)

    for matrix in matrices:
        # Take rows, columns and values from ONE COO view so they stay
        # aligned. ``matrix.nonzero()`` skips explicitly stored zeros while
        # ``matrix.data`` keeps them, so pairing those two by position can
        # attach a weight to the wrong edge.
        coo = matrix.tocoo()
        keep = coo.data != 0
        triples = zip(
            coo.row[keep].tolist(),
            coo.col[keep].tolist(),
            coo.data[keep].tolist(),
        )
        for row, col, weight in triples:
            if row == col:
                continue  # self-loops are ignored
            key = (row, col) if row < col else (col, row)
            edge_weights[key] += weight

    edge_avg_weights = [
        (node1, node2, weight_sum / total_matrices)
        for (node1, node2), weight_sum in edge_weights.items()
    ]

    return heapq.nlargest(top_k, edge_avg_weights, key=lambda x: x[2])


def load_and_process_matrices(cell_test_folder, cur_label_idxs):
    """Load per-cell graph files and turn each into a sparse square matrix.

    Args:
        cell_test_folder: Directory containing ``cell_{idx}.pt`` files.
        cur_label_idxs: Iterable of the ``idx`` values to load.

    Returns:
        List of ``scipy.sparse.csr_matrix`` objects, one per index and in the
        same order. Each matrix is ``n x n`` where ``n`` is the first
        dimension of the loaded object's ``x`` attribute.
    """
    def _as_sparse(graph):
        # The loaded object must expose edge_index, edge_weight and x
        # (presumably a torch_geometric Data object -- TODO confirm).
        endpoints = graph.edge_index.cpu().numpy()
        weights = graph.edge_weight.cpu().numpy()
        n_nodes = graph.x.shape[0]
        return sparse.csr_matrix(
            (weights, (endpoints[0], endpoints[1])),
            shape=(n_nodes, n_nodes),
        )

    return [
        _as_sparse(torch.load(os.path.join(cell_test_folder, f'cell_{idx}.pt')))
        for idx in cur_label_idxs
    ]
    


def save_cell_type_edges(cell_type, fold_edges, save_folder, base_filename):
    """Write the top edges of one cell type, for all folds, to an ``.npz`` file.

    The file is ``{save_folder}/{base_filename}_edges/{cell_type}_top_edges.npz``;
    the directory is created when absent.

    Args:
        cell_type: Cell-type name; stored in the archive and used in the file name.
        fold_edges: Mapping ``fold index -> list of (node1, node2, weight)``.
        save_folder: Parent directory of the output folder.
        base_filename: Prefix of the output folder; also stored in the archive.

    Returns:
        None. The archive holds ``cell_type``, ``base_filename`` and one
        structured array ``fold_{k}_edges`` (int32, int32, float32) per fold.
    """
    edge_record = np.dtype([
        ('node1', 'int32'),
        ('node2', 'int32'),
        ('weight', 'float32'),
    ])

    target_dir = os.path.join(save_folder, f"{base_filename}_edges")
    os.makedirs(target_dir, exist_ok=True)

    payload = {'cell_type': cell_type, 'base_filename': base_filename}
    for fold_idx, edges in fold_edges.items():
        payload[f'fold_{fold_idx}_edges'] = np.array(edges, dtype=edge_record)

    np.savez(os.path.join(target_dir, f"{cell_type}_top_edges.npz"), **payload)


In [None]:
seq_dict = np.load('../../result/datasets/Baron_Human/seq_dict.npz', allow_pickle=True) 
label = seq_dict['label'] 
str_labels = seq_dict['str_labels']
save_folder = 'data/Baron_Human/Edge_weight'

# Never filled in this cell.
cell_type_edges = {}

# The position of a name in str_labels is compared against the values in `label`.
for cur_label, cell_type in enumerate(str_labels):
    print("cur_label: ", cur_label)
    print("cell_type: ", cell_type)
    fold_edges = {}
    
    for k in range(5):
        k_fold = k + 1
        train_index = seq_dict[f'train_index_{k_fold}']
        label_train = label[train_index]
        # Positions WITHIN the training subset; they are used as the numbers in
        # the cell_{idx}.pt file names of this fold's folder.
        cur_label_idxs = np.where(label_train == cur_label)[0].tolist()


        cell_train_folder = os.path.join(
            "../../result/datasets/Baron_Human/wcsn_a0.01_hvgs2000", 
            f"train_f{k_fold}", 
            'processed'
        )
        
        # Top edges (default top_k) averaged over this cell type's training cells.
        cur_mat = load_and_process_matrices(cell_train_folder, cur_label_idxs)
        cur_top_edges = find_top_edges(cur_mat)

        fold_edges[k_fold] = cur_top_edges

    save_cell_type_edges(cell_type, fold_edges, save_folder,  base_filename="Baron_Human")

In [None]:
import numpy as np
import pandas as pd
import os
import glob

def create_edge_strings(edges):
    """Render each edge as ``"<node1>-<node2>"``.

    Only the first two items of every edge are used; both are converted with
    ``int`` before formatting.
    """
    labels = []
    for edge in edges:
        first, second = int(edge[0]), int(edge[1])
        labels.append("-".join((str(first), str(second))))
    return labels

def process_npz_file(file_path):
    """Export the per-fold top edges of one cell-type archive as CSV files.

    The cell-type name is the file's base name without ``_top_edges.npz``.
    For every ``fold_{i}_edges`` entry (i = 1..5) found in the archive, a
    one-column CSV (``edges``) is written to
    ``data/Baron_Human/Edge_weight/{cell_type}_top_edges/fold_{i}_edges.csv``.

    Args:
        file_path: Path of a ``*_top_edges.npz`` archive.

    Returns:
        None. Failures are reported with a printed message instead of raised.
    """
    # Extract cell type name from file path
    cell_type = os.path.basename(file_path).replace('_top_edges.npz', '')

    # Create output directory
    output_dir = os.path.join('data/Baron_Human/Edge_weight', f'{cell_type}_top_edges')
    os.makedirs(output_dir, exist_ok=True)

    try:
        # np.load keeps the .npz archive open; the context manager closes it
        # once all folds have been exported.
        with np.load(file_path, allow_pickle=True) as data:
            for i in range(1, 6):  # 5-fold
                fold_key = f'fold_{i}_edges'
                if fold_key not in data:
                    continue
                edge_strings = create_edge_strings(data[fold_key])

                # Save to CSV
                output_file = os.path.join(output_dir, f'fold_{i}_edges.csv')
                pd.DataFrame({'edges': edge_strings}).to_csv(output_file, index=False)

        print(f"Successfully processed {cell_type}")

    except Exception as e:
        # Deliberate best effort: one unreadable archive must not stop the batch.
        print(f"Error processing {cell_type}: {str(e)}")

def process_all_cell_types(base_dir):
    """Run ``process_npz_file`` on every ``*_top_edges.npz`` file in ``base_dir``.

    Prints a notice and returns early when no file matches.
    """
    pattern = os.path.join(base_dir, '*_top_edges.npz')
    npz_files = glob.glob(pattern)

    # Nothing to do: tell the user and stop.
    if len(npz_files) == 0:
        print(f"No npz files found in {base_dir}")
        return

    print(f"Found {len(npz_files)} files to process")

    for archive_path in npz_files:
        process_npz_file(archive_path)

    print("Processing complete!")

# Folder holding the per-cell-type *_top_edges.npz archives to export as CSV.
base_dir = 'data/Baron_Human/Edge_weight/Baron_Human_edges'
process_all_cell_types(base_dir)

##### Obtain the union of highly correlated gene pairs for each cell type to get the character edge set for each cell type.

In [None]:
import numpy as np
import pandas as pd
import os

def get_union_edges(npz_file):
    """Return the union of the edges stored for folds 1-5 in one archive.

    Every record of each ``fold_{i}_edges`` entry present in the file is
    rendered as ``"<node1>-<node2>"``. The union is printed (unsorted) and
    returned as a list sorted as strings.
    """
    data = np.load(npz_file, allow_pickle=True)

    fold_keys = [f'fold_{i}_edges' for i in range(1, 6)]
    all_edges = {
        f"{int(edge[0])}-{int(edge[1])}"
        for fold_key in fold_keys
        if fold_key in data
        for edge in data[fold_key]
    }

    print(list(all_edges))
    return sorted(all_edges)


seq_dict = np.load('../../result/datasets/Baron_Human/seq_dict.npz', allow_pickle=True)
str_labels = seq_dict['str_labels']

cell_types = str_labels.tolist()
print("cell types: ", cell_types)

# cell type -> sorted list of "node1-node2" strings (union over the folds).
edges_dict = {}

for cell_type in cell_types:
    npz_file = f'data/Baron_Human/Edge_weight/Baron_Human_edges/{cell_type}_top_edges.npz'  # path of the npz file
    if os.path.exists(npz_file):
        edges_dict[cell_type] = get_union_edges(npz_file)
    else:
        print(f"Warning: {npz_file} not found; skipping {cell_type}")

# default=0 keeps max() from raising when no archive was found at all.
max_length = max((len(edges) for edges in edges_dict.values()), default=0)

# Pad every column with '' so all columns have the same length.
for cell_type in edges_dict:
    if len(edges_dict[cell_type]) < max_length:
        edges_dict[cell_type].extend([''] * (max_length - len(edges_dict[cell_type])))

df = pd.DataFrame(edges_dict)
print(df.columns)

save_path = 'data/Baron_Human/Baron_Human_character_edge_set_indices.tsv'
df.to_csv(save_path, sep='\t', index=False)

print(f"Save to: {save_path}")

print("\n The number of different cell types:")

# Iterate over edges_dict, not cell_types: a cell type whose archive was
# missing has no entry and would raise KeyError.
for cell_type in edges_dict:
    edge_count = len([x for x in edges_dict[cell_type] if x != ''])
    print(f"{cell_type}: {edge_count}")

##### Convert the indices in the character edge set file obtained above to the corresponding gene pair gene symbols and save them.

In [None]:
import pandas as pd

gene_symbol_file = 'data/Baron_Human/Baron_Human_gene_symbol_hvgs2000.tsv'
gene_symbol = pd.read_csv(gene_symbol_file, sep='\t')

# Row position in the symbol table -> gene symbol (first column).
gene_symbol_dict = {i: gene_symbol.iloc[i, 0] for i in range(len(gene_symbol))}

# Edge table with "node1-node2" index strings; columns are cell types.
file_path = 'data/Baron_Human/Baron_Human_character_edge_set_indices.tsv'
df = pd.read_csv(file_path, sep='\t')

def convert_edges_to_gene_symbols(edge_str):
    """Translate an ``"idx1-idx2"`` edge string into ``"symbol1-symbol2"``.

    Missing values yield None. An index that is not a key of
    ``gene_symbol_dict`` is rendered as ``'Unknown'``.
    """
    if pd.isna(edge_str):
        return None

    left, right = edge_str.split('-')
    lookup = gene_symbol_dict.get
    left_symbol = lookup(int(left), 'Unknown')
    right_symbol = lookup(int(right), 'Unknown')
    return "-".join((str(left_symbol), str(right_symbol)))

# Replace every "idx1-idx2" cell by "symbol1-symbol2"; padding cells become None.
for column in df.columns:
    df[column] = df[column].apply(convert_edges_to_gene_symbols)

save_path = 'data/Baron_Human/Baron_Human_character_edge_set.tsv'
df.to_csv(save_path, sep='\t', index=False)

print(f"Save to: {save_path}")

print("\n The number of different cell type: ")
for column in df.columns:
    # Padding is None/NaN at this point (never ''), so comparing against ''
    # would count every row; count the non-missing cells instead.
    edge_count = df[column].notna().sum()
    print(f"{column}: {edge_count} edges")


#### Highly correlated gene pairs: Plot

##### Venn: R

In [None]:
Rscript Figure-high-weight.R

##### Upset: Differential analysis of different cell types.

In [None]:
import pandas as pd
import numpy as np
from upsetplot import from_memberships
from upsetplot import UpSet
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Edge table: one column per cell type holding "node1-node2" strings.
file_path = "data/Baron_Human/Baron_Human_character_edge_set_indices.tsv"
data = pd.read_csv(file_path, sep="\t", header=0, na_values="NA")

# NOTE: the variables below are named *genes* but hold edge strings in this cell.
cell_type_sets = {}
for column in data.columns:
    genes = set(data[column].dropna().values)
    cell_type_sets[column] = genes

all_genes = list(set.union(*cell_type_sets.values()))
cell_types = list(cell_type_sets.keys())
# Membership matrix: rows are edges, columns are cell types, 1 = edge is in the set.
binary_matrix = pd.DataFrame(0, index=all_genes, columns=cell_types)

for gene in all_genes:
    for cell_type in cell_types:
        if gene in cell_type_sets[cell_type]:
            binary_matrix.loc[gene, cell_type] = 1

# Each edge's membership pattern as a tuple, then how many edges share each pattern.
combinations = binary_matrix.apply(tuple, axis=1)
combination_counts = combinations.value_counts()

# Translate every pattern back to the list of cell-type names it contains.
membership_lists = []
for comb in combination_counts.index:
    current_members = []
    for i, v in enumerate(comb):
        if v:
            current_members.append(cell_types[i])
    membership_lists.append(current_members)

upset_data = from_memberships(
    membership_lists,
    data=combination_counts.values
)

plt.figure(figsize=(10, 5))

upset = UpSet(upset_data,
              min_subset_size=3,
              show_counts=True,
              sort_by='cardinality',
              element_size=20,
              facecolor='#34495e',           
              other_dots_color=0.3,          
              shading_color='#f5f6fa',
              )       

upset.plot()

plt.tight_layout()

# The same figure is written twice: as SVG and as PNG.
plt.savefig('../../result/Figures/Baron_Human/upset_cell_type_edge.svg', dpi=1200, bbox_inches='tight',format='svg')
plt.savefig('../../result/Figures/Baron_Human/upset_cell_type_edge.png', dpi=1200, bbox_inches='tight',format='png')

plt.show()
plt.close()

print("\n Edge set statistics:")
for cell_type, genes in cell_type_sets.items():
    print(f"{cell_type}: {len(genes)} edges")

print("\n The largest intersection:")
# value_counts() sorts by count, so head() yields the most frequent patterns.
# The comprehension has its own `i`; the outer enumerate index is unaffected.
for i, (comb, count) in enumerate(combination_counts.head().items()):
    cell_types_in_combo = [cell_types[i] for i, v in enumerate(comb) if v]
    print(f"Intersection: {i+1}: {', '.join(cell_types_in_combo)} - {count} edges")

### tsne

#### True label tsne

In [None]:
# true
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'


seq_dict_file = '../../result/datasets/Baron_Human/seq_dict.npz'
seq_dict = np.load(seq_dict_file, allow_pickle=True) 
label = seq_dict['label']
str_labels = seq_dict['str_labels']
barcodes = seq_dict['barcode']

genes = seq_dict['gene_symbol']

# Symbols of the selected genes; they are only printed in this cell.
all_filtered_genes_file = '../../result/datasets/Baron_Human/Baron_Human_filtered_hvgs2000.npy'
all_filtered_genes_array = np.load(all_filtered_genes_file, allow_pickle=True)
filtered_genes_index = all_filtered_genes_array[0]
filtered_genes_index = filtered_genes_index.astype(int)
print(filtered_genes_index.shape)

gene_hvgs = genes[filtered_genes_index]

print(gene_hvgs)

# Prediction file: three tables stored under the keys embedding / cell_type / pred_prob.
pred_path = '../../result/wcsn_preds/Baron_Human_a0.01_hvgs2000_prediction.h5'

cell_embedding = pd.read_hdf(pred_path, key='embedding')

cell_type = pd.read_hdf(pred_path, key='cell_type')
pred_prob = pd.read_hdf(pred_path, key='pred_prob')
print(cell_embedding.shape)
print(cell_type.shape)
print(cell_type[:5])

# k is zero-based, so [3] selects fold 4 (test_index_4).
lst = [3]
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    # Embedding rows are selected by barcode label.
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    print("Embedding shape:", cur_cell_emb.shape)  
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # NOTE(review): newer scikit-learn releases rename TSNE's n_iter to max_iter --
    # confirm against the installed version.
    tsne = TSNE(n_components=2,      
                perplexity=perp,       
                early_exaggeration=ee,
                random_state=42,     
                n_iter=1000,         
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())
    
    plt.figure(figsize=(8, 5))
    unique_labels = np.unique(cell_type['true_cell_type'].loc[barcodes_test])

    colors = sns.color_palette('husl', n_colors=len(unique_labels))
    color_dict = dict(zip(unique_labels, colors))

    # One scatter call per true label; the label value indexes str_labels for the
    # legend text. The loop variable rebinds `label` (the array loaded above).
    for label in unique_labels:
        mask = cell_type['true_cell_type'].loc[barcodes_test] == label
        plt.scatter(embeddings_2d[mask, 0], 
                embeddings_2d[mask, 1],
                c=[color_dict[label]],
                label=str_labels[label],
                alpha=0.6,
                s=20)

    plt.legend(bbox_to_anchor=(1.01, 1),
                loc='upper left',
                borderaxespad=0,
                fontsize=12)
    plt.xlabel('t-SNE 1', fontsize=14,fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14,fontweight='bold')

    plt.tight_layout()

    plt.savefig(f'../../result/Figures/Baron_Human/tsne/tsne_visualization_true_f{k_fold}.svg', 
                dpi=1200, 
                bbox_inches='tight',
                format='svg')

    plt.savefig(f'../../result/Figures/Baron_Human/tsne/tsne_visualization_true_f{k_fold}.png', 
                dpi=1200, 
                bbox_inches='tight',
                format='png')

    plt.show()

#### Pred label tsne

In [None]:
# pred

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'


seq_dict_file = '../../result/datasets/Baron_Human/seq_dict.npz'
seq_dict = np.load(seq_dict_file, allow_pickle=True) 
label = seq_dict['label']
str_labels = seq_dict['str_labels']
barcodes = seq_dict['barcode']

genes = seq_dict['gene_symbol']

# Symbols of the selected genes; they are only printed in this cell.
all_filtered_genes_file = '../../result/datasets/Baron_Human/Baron_Human_filtered_hvgs2000.npy'

all_filtered_genes_array = np.load(all_filtered_genes_file, allow_pickle=True)
filtered_genes_index = all_filtered_genes_array[0]
filtered_genes_index = filtered_genes_index.astype(int)
print(filtered_genes_index.shape)

gene_hvgs = genes[filtered_genes_index]

print(gene_hvgs)

# Prediction file: three tables stored under the keys embedding / cell_type / pred_prob.
pred_path = '../../result/wcsn_preds/Baron_Human_a0.01_hvgs2000_prediction.h5'

cell_embedding = pd.read_hdf(pred_path, key='embedding')

cell_type = pd.read_hdf(pred_path, key='cell_type')
pred_prob = pd.read_hdf(pred_path, key='pred_prob')
print(cell_embedding.shape)
print(cell_type.shape)
print(cell_type[:5])

# k is zero-based, so [3] selects fold 4 (test_index_4).
lst = [3]
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    # Embedding rows are selected by barcode label.
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    print("Embedding shape:", cur_cell_emb.shape)  
    print("Number of unique labels:", len(np.unique(cell_type['pred_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # NOTE(review): newer scikit-learn releases rename TSNE's n_iter to max_iter --
    # confirm against the installed version.
    tsne = TSNE(n_components=2,      
                perplexity=perp,       
                early_exaggeration=ee,
                random_state=42,    
                n_iter=1000,        
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())
    
    plt.figure(figsize=(8, 5))
    unique_labels = np.unique(cell_type['pred_cell_type'].loc[barcodes_test])

    colors = sns.color_palette('husl', n_colors=len(unique_labels))
    color_dict = dict(zip(unique_labels, colors))

    # One scatter call per predicted label; the label value indexes str_labels for
    # the legend text. The loop variable rebinds `label` (the array loaded above).
    for label in unique_labels:
        mask = cell_type['pred_cell_type'].loc[barcodes_test] == label
        plt.scatter(embeddings_2d[mask, 0], 
                embeddings_2d[mask, 1],
                c=[color_dict[label]],
                label=str_labels[label],
                alpha=0.6,
                s=20)

    plt.legend(bbox_to_anchor=(1.01, 1),
                loc='upper left',
                borderaxespad=0,
                fontsize=12)

    plt.xlabel('t-SNE 1', fontsize=14,fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14,fontweight='bold')

    plt.tight_layout()

    plt.savefig(f'../../result/Figures/Baron_Human/tsne/tsne_visualization_pred_f{k_fold}.svg', 
                dpi=1200, 
                bbox_inches='tight',
                format='svg')

    plt.savefig(f'../../result/Figures/Baron_Human/tsne/tsne_visualization_pred_f{k_fold}.png', 
                dpi=1200, 
                bbox_inches='tight',
                format='png')

    plt.show()

#### gene-tsne

##### Obtain the degree matrix for cells and genes in the test set for each fold.

In [None]:
import numpy as np
import os
import torch
import os
pathjoin = os.path.join

seq_dict = np.load('../../result/datasets/Baron_Human/seq_dict.npz', allow_pickle=True)
label = seq_dict['label']
str_labels = seq_dict['str_labels']

# Not used for the saved result; kept so the notebook namespace is unchanged.
cur_label = 0
matrix_dict = {}
matrix_dict['str_labels'] = str_labels

# 'CV_k' -> degree matrix of fold k's test cells (rows: node index, columns: cells).
degree_matrices_for_folds = {}

for k in range(5):
    k_fold = k + 1
    test_index = seq_dict[f'test_index_{k_fold}']
    label_test = label[test_index]
    cell_test_folder = f'../../result/datasets/Baron_Human/wcsn_a0.01_hvgs2000/test_f{k_fold}/processed'

    degree_matrix = []
    print(test_index.shape)
    # Files are numbered by position within the fold's test subset.
    for idx in range(len(test_index)):
        data = torch.load(os.path.join(cell_test_folder, f'cell_{idx}.pt'))
        edge_index = data.edge_index
        edge_weight = data.edge_weight
        row, col = edge_index
        # Add the reversed direction so both endpoints of every edge are counted.
        symmetric_edge_index = torch.cat([edge_index, torch.stack([col, row])], dim=1)
        # minlength guarantees at least 2000 entries and keeps the integer
        # dtype; padding afterwards with float zeros mixed int64 and float32.
        degrees = torch.bincount(symmetric_edge_index[0], minlength=2000)

        degree_matrix.append(degrees.numpy())

    degree_matrix = np.array(degree_matrix).T
    print(degree_matrix.shape)
    degree_matrices_for_folds[f'CV_{k_fold}'] = degree_matrix

degree_file = pathjoin('data/Baron_Human', f'degree_matrix_Baron_Human_per_fold_a0.01_hvgs2000.npz')

print(f"Matrix dict structure before saving: {type(degree_matrices_for_folds)}")
for key in degree_matrices_for_folds:
    print(f"Key: {key}, Type: {type(degree_matrices_for_folds[key])}")
    if isinstance(degree_matrices_for_folds[key], dict):
        for inner_key in degree_matrices_for_folds[key]:
            print(f"  Inner Key: {inner_key}, Type: {type(degree_matrices_for_folds[key][inner_key])}")


np.savez(degree_file, **degree_matrices_for_folds)


##### Obtain the cell-type-specific genes from the character gene set for each cell type

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
import numpy as np

# Hub-gene index table, read as strings; one column per cell type.
Baron_Human_union_top_100_genes_file = 'data/Baron_Human/Baron_Human_character_gene_set_indices.tsv'
Baron_Human_union_genes = pd.read_csv(Baron_Human_union_top_100_genes_file, sep='\t', dtype=str, header=0)
print(Baron_Human_union_genes.iloc[:5, :5])
columns = Baron_Human_union_genes.columns.tolist()

# Non-missing entries of each column form that cell type's gene set.
sets = {}
for col in columns:
    sets[col] = set(Baron_Human_union_genes[col].dropna().values)

print("\nSize of each set:")
for col, s in sets.items():
    print(f"{col}: {len(s)} genes")

# set.intersection() needs at least one set; an empty table has no common genes.
common_genes = set.intersection(*sets.values()) if sets else set()
# Report the actual number of sets instead of a hard-coded count.
print(f"\nNumber of common genes across all {len(sets)} sets: {len(common_genes)}")
if len(common_genes) > 0:
    print("List of common genes:")
    print(common_genes)


# Calculate unique genes for each region
unique_genes = {}
for col in columns:
    print("Current cell type: ", col)
    # Calculate genes that only appear in this set. set().union(*others) also
    # works when there is no other column, where set.union(*[]) raises TypeError.
    unique = sets[col] - set().union(*[sets[c] for c in columns if c != col])
    unique_genes[col] = unique
    print(f"Number of unique genes in {col}: {len(unique)}")
    if len(unique) > 0:
        print("List of unique genes:")
        print(unique)
    print("\n")


##### Plot tsne

In [None]:
# Dataset dictionary: labels, cell-type names, barcodes and gene symbols.
seq_dict_file = '../../result/datasets/Baron_Human/seq_dict.npz'
seq_dict = np.load(seq_dict_file, allow_pickle=True) 
label = seq_dict['label']
str_labels = seq_dict['str_labels']
barcodes = seq_dict['barcode']

genes = seq_dict['gene_symbol']

all_filtered_genes_file = '../../result/datasets/Baron_Human/Baron_Human_filtered_hvgs2000.npy'

# The first element of the saved array is used as integer positions into `genes`.
all_filtered_genes_array = np.load(all_filtered_genes_file, allow_pickle=True)
filtered_genes_index = all_filtered_genes_array[0]
filtered_genes_index = filtered_genes_index.astype(int)
print(filtered_genes_index.shape)

# gene_hvgs[i] is the symbol printed for index i in the plotting cells.
gene_hvgs = genes[filtered_genes_index]

print(gene_hvgs)

In [None]:
# Prediction file: three tables stored under the keys embedding / cell_type / pred_prob.
pred_path = '../../result/wcsn_preds/Baron_Human_a0.01_hvgs2000_prediction.h5'

cell_embedding = pd.read_hdf(pred_path, key='embedding')

cell_type = pd.read_hdf(pred_path, key='cell_type')
pred_prob = pd.read_hdf(pred_path, key='pred_prob')
print(cell_embedding.shape)
print(cell_type.shape)
print(cell_type[:5])

# `pathjoin` is not defined in this cell; other cells bind it to os.path.join,
# so one of them must have run first.
degree_file = pathjoin('data/Baron_Human', f'degree_matrix_Baron_Human_per_fold_a0.01_hvgs2000.npz')
degree_fold = np.load(degree_file, allow_pickle=True)
# Only the fold-4 matrix is kept.
degree_f4 = degree_fold['CV_4']
print(degree_f4.shape)
print(degree_f4[:5, :5])

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import os

pathjoin = os.path.join

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# k is zero-based, so [3] selects fold 4, matching the fold of `degree_f4`.
lst = [3]
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    # Row 344 of the fold-4 degree matrix supplies one colour value per cell.
    # The title below assumes this index is CCDC157 -- check the printed symbol.
    print(gene_hvgs[344])
    target_gene_degree = degree_f4[344]

    gene_degrees = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # NOTE(review): newer scikit-learn releases rename TSNE's n_iter to max_iter --
    # confirm against the installed version.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                n_iter=1000,
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())

    plt.figure(figsize=(7, 5))

    scatter = plt.scatter(embeddings_2d[:, 0],
                        embeddings_2d[:, 1],
                        c=gene_degrees,
                        cmap='viridis',
                        alpha=0.6,
                        s=20)

    cbar = plt.colorbar(scatter)
    plt.title(f'CCDC157 (acinar)', fontsize=14)
    plt.xlabel('t-SNE 1', fontsize=14,fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14,fontweight='bold')

    plt.tight_layout()

    # Figures belong under ../../result/, like every other figure written by
    # this notebook; the previous '../result/' prefix pointed one level too low.
    plt.savefig(f'../../result/Figures/Baron_Human/tsne/gene/tsne_acinar_CCDC157_degree_f{k_fold}_perp{perp}_ee{ee}.svg',
                dpi=1200,
                bbox_inches='tight',
                format='svg')
    plt.savefig(f'../../result/Figures/Baron_Human/tsne/gene/tsne_acinar_CCDC157_degree_f{k_fold}_perp{perp}_ee{ee}.png',
                dpi=1200,
                bbox_inches='tight',
                format='png')

    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import os

pathjoin = os.path.join

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# k is zero-based, so [3] selects fold 4, matching the fold of `degree_f4`.
lst = [3]
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    # Row 685 of the fold-4 degree matrix supplies one colour value per cell.
    # The title below assumes this index is EPN1 -- check the printed symbol.
    print(gene_hvgs[685])
    target_gene_degree = degree_f4[685]

    gene_degrees = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)  
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # NOTE(review): newer scikit-learn releases rename TSNE's n_iter to max_iter --
    # confirm against the installed version.
    tsne = TSNE(n_components=2,     
                perplexity=perp,       
                early_exaggeration=ee,
                random_state=42,    
                n_iter=1000,        
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())
    plt.figure(figsize=(7, 5))

    scatter = plt.scatter(embeddings_2d[:, 0], 
                        embeddings_2d[:, 1],
                        c=gene_degrees,  
                        cmap='viridis',  
                        alpha=0.6,
                        s=20)

    cbar = plt.colorbar(scatter)
    plt.title(f'EPN1 (activated_stellate)', fontsize=14)
    plt.xlabel('t-SNE 1', fontsize=14,fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14,fontweight='bold')

    plt.tight_layout()

    # The same figure is written twice: as SVG and as PNG.
    plt.savefig(f'../../result/Figures/Baron_Human/tsne/gene/tsne_activated_stellate_EPN1_degree_f{k_fold}_perp{perp}_ee{ee}.svg', 
                dpi=1200, 
                bbox_inches='tight',
                format='svg')    
    plt.savefig(f'../../result/Figures/Baron_Human/tsne/gene/tsne_activated_stellate_EPN1_degree_f{k_fold}_perp{perp}_ee{ee}.png', 
                dpi=1200, 
                bbox_inches='tight',
                format='png')

    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import os

pathjoin = os.path.join

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'
# Colour the t-SNE map of one fold's test cells by the degree of a single gene.
lst = [3]  # zero-based fold indices to plot
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    # Keep only the embeddings of the cells in this fold's test split.
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    idx = 959  # position of the gene of interest within gene_hvgs
    print(gene_hvgs[idx])
    # NOTE(review): degree_f4 comes from an earlier cell and is presumably tied
    # to fold 4; confirm it is ordered like barcodes_test before changing `lst`.
    target_gene_degree = degree_f4[idx]

    gene_degrees = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # `n_iter` was deprecated in scikit-learn 1.5 and removed in 1.7 (renamed
    # `max_iter`). 1000 iterations is the default under both names, so the
    # argument is omitted to keep the call valid across versions.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())

    plt.figure(figsize=(7, 5))
    scatter = plt.scatter(embeddings_2d[:, 0],
                          embeddings_2d[:, 1],
                          c=gene_degrees,
                          cmap='viridis',
                          alpha=0.6,
                          s=20)

    cbar = plt.colorbar(scatter)
    # NOTE(review): the gene name is hard-coded here and in the file name;
    # confirm it matches the gene_hvgs[idx] value printed above.
    plt.title('GPR4 (alpha)', fontsize=14)
    plt.xlabel('t-SNE 1', fontsize=14, fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14, fontweight='bold')

    plt.tight_layout()

    # One base path for both formats so the two outputs cannot drift apart.
    fig_base = f'../../result/Figures/Baron_Human/tsne/gene/tsne_alpha_GPR4_degree_f{k_fold}_perp{perp}_ee{ee}'
    for ext in ('svg', 'png'):
        plt.savefig(f'{fig_base}.{ext}',
                    dpi=1200,
                    bbox_inches='tight',
                    format=ext)

    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import os

pathjoin = os.path.join

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# Colour the t-SNE map of one fold's test cells by the degree of a single gene.
lst = [3]  # zero-based fold indices to plot
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    # Keep only the embeddings of the cells in this fold's test split.
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    idx = 1130  # position of the gene of interest within gene_hvgs
    print(gene_hvgs[idx])
    # NOTE(review): degree_f4 comes from an earlier cell and is presumably tied
    # to fold 4; confirm it is ordered like barcodes_test before changing `lst`.
    target_gene_degree = degree_f4[idx]

    gene_degrees = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # `n_iter` was deprecated in scikit-learn 1.5 and removed in 1.7 (renamed
    # `max_iter`). 1000 iterations is the default under both names, so the
    # argument is omitted to keep the call valid across versions.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())

    plt.figure(figsize=(7, 5))
    scatter = plt.scatter(embeddings_2d[:, 0],
                          embeddings_2d[:, 1],
                          c=gene_degrees,
                          cmap='viridis',
                          alpha=0.6,
                          s=20)

    cbar = plt.colorbar(scatter)
    plt.title(f'{gene_hvgs[idx]} (ductal)', fontsize=14)
    plt.xlabel('t-SNE 1', fontsize=14, fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14, fontweight='bold')

    plt.tight_layout()

    # One base path for both formats so the two outputs cannot drift apart.
    fig_base = f'../../result/Figures/Baron_Human/tsne/gene/tsne_ductal_{gene_hvgs[idx]}_degree_f{k_fold}_perp{perp}_ee{ee}'
    for ext in ('svg', 'png'):
        plt.savefig(f'{fig_base}.{ext}',
                    dpi=1200,
                    bbox_inches='tight',
                    format=ext)

    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import os

pathjoin = os.path.join

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# Colour the t-SNE map of one fold's test cells by the degree of a single gene.
lst = [3]  # zero-based fold indices to plot
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    # Keep only the embeddings of the cells in this fold's test split.
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    idx = 1006  # position of the gene of interest within gene_hvgs
    print(gene_hvgs[idx])
    # NOTE(review): degree_f4 comes from an earlier cell and is presumably tied
    # to fold 4; confirm it is ordered like barcodes_test before changing `lst`.
    target_gene_degree = degree_f4[idx]

    gene_degrees = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # `n_iter` was deprecated in scikit-learn 1.5 and removed in 1.7 (renamed
    # `max_iter`). 1000 iterations is the default under both names, so the
    # argument is omitted to keep the call valid across versions.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())

    plt.figure(figsize=(7, 5))

    scatter = plt.scatter(embeddings_2d[:, 0],
                          embeddings_2d[:, 1],
                          c=gene_degrees,
                          cmap='viridis',
                          alpha=0.6,
                          s=20)

    cbar = plt.colorbar(scatter)

    plt.title(f'{gene_hvgs[idx]} (endothelial)', fontsize=14)
    plt.xlabel('t-SNE 1', fontsize=14, fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14, fontweight='bold')

    plt.tight_layout()

    # One base path for both formats so the two outputs cannot drift apart.
    fig_base = f'../../result/Figures/Baron_Human/tsne/gene/tsne_endothelial_{gene_hvgs[idx]}_degree_f{k_fold}_perp{perp}_ee{ee}'
    for ext in ('svg', 'png'):
        plt.savefig(f'{fig_base}.{ext}',
                    dpi=1200,
                    bbox_inches='tight',
                    format=ext)

    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import os

pathjoin = os.path.join

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# Colour the t-SNE map of one fold's test cells by the degree of a single gene.
lst = [3]  # zero-based fold indices to plot
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    # Keep only the embeddings of the cells in this fold's test split.
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    idx = 955  # position of the gene of interest within gene_hvgs
    print(gene_hvgs[idx])
    # NOTE(review): degree_f4 comes from an earlier cell and is presumably tied
    # to fold 4; confirm it is ordered like barcodes_test before changing `lst`.
    target_gene_degree = degree_f4[idx]
    gene_degrees = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # `n_iter` was deprecated in scikit-learn 1.5 and removed in 1.7 (renamed
    # `max_iter`). 1000 iterations is the default under both names, so the
    # argument is omitted to keep the call valid across versions.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())
    plt.figure(figsize=(7, 5))

    scatter = plt.scatter(embeddings_2d[:, 0],
                          embeddings_2d[:, 1],
                          c=gene_degrees,
                          cmap='viridis',
                          alpha=0.6,
                          s=20)

    cbar = plt.colorbar(scatter)

    plt.title(f'{gene_hvgs[idx]} (macrophage)', fontsize=14)
    plt.xlabel('t-SNE 1', fontsize=14, fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14, fontweight='bold')

    plt.tight_layout()

    # One base path for both formats so the two outputs cannot drift apart.
    fig_base = f'../../result/Figures/Baron_Human/tsne/gene/tsne_macrophage_{gene_hvgs[idx]}_degree_f{k_fold}_perp{perp}_ee{ee}'
    for ext in ('svg', 'png'):
        plt.savefig(f'{fig_base}.{ext}',
                    dpi=1200,
                    bbox_inches='tight',
                    format=ext)

    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import os

pathjoin = os.path.join

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# Colour the t-SNE map of one fold's test cells by the degree of a single gene.
lst = [3]  # zero-based fold indices to plot
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    # Keep only the embeddings of the cells in this fold's test split.
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    idx = 1480  # position of the gene of interest within gene_hvgs
    print(gene_hvgs[idx])
    # NOTE(review): degree_f4 comes from an earlier cell and is presumably tied
    # to fold 4; confirm it is ordered like barcodes_test before changing `lst`.
    target_gene_degree = degree_f4[idx]

    gene_degrees = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # `n_iter` was deprecated in scikit-learn 1.5 and removed in 1.7 (renamed
    # `max_iter`). 1000 iterations is the default under both names, so the
    # argument is omitted to keep the call valid across versions.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())

    plt.figure(figsize=(7, 5))

    scatter = plt.scatter(embeddings_2d[:, 0],
                          embeddings_2d[:, 1],
                          c=gene_degrees,
                          cmap='viridis',
                          alpha=0.6,
                          s=20)

    cbar = plt.colorbar(scatter)

    plt.title(f'{gene_hvgs[idx]} (mast)', fontsize=14)
    plt.xlabel('t-SNE 1', fontsize=14, fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14, fontweight='bold')

    plt.tight_layout()

    # One base path for both formats so the two outputs cannot drift apart.
    fig_base = f'../../result/Figures/Baron_Human/tsne/gene/tsne_mast_{gene_hvgs[idx]}_degree_f{k_fold}_perp{perp}_ee{ee}'
    for ext in ('svg', 'png'):
        plt.savefig(f'{fig_base}.{ext}',
                    dpi=1200,
                    bbox_inches='tight',
                    format=ext)

    plt.show()

#### Edge t-SNE

##### Identify gene pairs specific to each cell type.

In [None]:
# Load the characteristic edge table and turn every column into a set of edge
# labels. Columns may have different lengths, so padding NaNs are dropped.
Baron_Human_union_edges_file = 'data/Baron_Human/Baron_Human_character_edge_set_indices.tsv'
Baron_Human_union_edges = pd.read_csv(Baron_Human_union_edges_file, sep='\t', header=0)
print(Baron_Human_union_edges)
columns = Baron_Human_union_edges.columns.tolist()

sets = {col: set(Baron_Human_union_edges[col].dropna().values) for col in columns}

print("\nSize of each set:")
for col, s in sets.items():
    print(f"{col}: {len(s)} edges")

# Edges shared by every column. The unbound set.intersection() needs at least
# one argument, so an empty table is handled explicitly.
common_edges = set.intersection(*sets.values()) if sets else set()
# Report the real number of columns instead of a hard-coded count.
print(f"\nNumber of gene pairs common to all {len(sets)} sets: {len(common_edges)}")
if common_edges:
    print("List of common gene pairs:")
    print(common_edges)

# Edges that appear in exactly one column.
unique_edges = {}
for col in columns:
    # set().union(...) also works when there are no other columns, whereas the
    # unbound set.union(*[]) raises TypeError for a single-column table.
    other_edges = set().union(*(sets[c] for c in columns if c != col))
    unique = sets[col] - other_edges
    unique_edges[col] = unique
    print(f"\nNumber of gene pairs unique to {col}: {len(unique)}")
    if unique:
        print("List of unique gene pairs:")
        print(unique)


##### Obtain the edge weight matrix for test set cells in each fold

In [None]:
import numpy as np
import pandas as pd

# Union of all per-column edge sets. Calling union() on an empty set instance
# (rather than the unbound set.union) also works when `sets` is empty.
all_edges = set().union(*sets.values())

def extract_node_indices(edge_str):
    """Parse an edge label of the form ``"<node1>-<node2>"``.

    Args:
        edge_str: Edge label containing exactly one ``-`` separator.

    Returns:
        Tuple ``(node1, node2)`` with both endpoints converted to ``int``.

    Raises:
        ValueError: If the label does not split into exactly two parts or a
            part is not an integer literal.
    """
    endpoints = edge_str.split('-')
    first, second = endpoints
    return (int(first), int(second))

# Parse every edge label once. The pairs are sorted because iterating a set of
# strings yields an order that can differ between interpreter sessions, which
# would make the row order of `edges_df` (and files saved from it) unstable.
edge_pairs = sorted(extract_node_indices(edge_str) for edge_str in all_edges)

# Every node index that occurs as an endpoint of at least one edge.
all_nodes = {node for pair in edge_pairs for node in pair}

edge_array = np.array(edge_pairs)
edges_df = pd.DataFrame(edge_pairs, columns=['Node1', 'Node2'])

unique_nodes = sorted(all_nodes)

# Bundle the three views of the same edge list.
result = {
    'edge_pairs': edge_array,
    'node_set': all_nodes,
    'edges_df': edges_df,
}

In [None]:
# Write the node-index edge list as a tab-separated file with a header row
# and without the DataFrame index.
edges_df.to_csv(
    'data/Baron_Human/all_union_edges.tsv',
    sep='\t',
    index=False,
    header=True,
)

In [None]:
# Translate both endpoints of every edge from an index into gene_hvgs to the
# entry stored there, keeping the row order and index of edges_df.
edges_genes_only = pd.DataFrame({
    'Gene1': edges_df['Node1'].apply(lambda node: gene_hvgs[node]),
    'Gene2': edges_df['Node2'].apply(lambda node: gene_hvgs[node]),
})
print(edges_genes_only)

# Save the translated edge list as a headed, index-free TSV.
edges_genes_only.to_csv(
    'data/Baron_Human/all_union_edges_genes.tsv',
    sep='\t',
    header=True,
    index=False,
)

In [None]:
import numpy as np
import torch
import os
from tqdm import tqdm
import pandas as pd
from scipy import sparse

def process_cell_graph(cell_data, target_edges):
    """Look up the weight of each target edge in one cell's graph.

    Args:
        cell_data: Object exposing ``edge_index`` (indexable so that
            ``edge_index[0]`` holds source nodes and ``edge_index[1]`` holds
            destination nodes) and ``edge_weight``; all must support
            ``.tolist()``.
        target_edges: DataFrame with integer ``Node1`` and ``Node2`` columns.

    Returns:
        1-D ``numpy.ndarray`` with one weight per row of ``target_edges``, in
        row order. Edges missing from the cell's graph get ``0.0``.
    """
    sources = cell_data.edge_index[0].tolist()
    destinations = cell_data.edge_index[1].tolist()

    # Keyed by (source, destination): the lookup is direction-sensitive, and
    # tuple keys avoid formatting one string per edge for every cell.
    weight_by_edge = dict(zip(zip(sources, destinations),
                              cell_data.edge_weight.tolist()))

    # Walk the two columns directly; iterrows() builds a Series per row, which
    # is slow for a function that runs once per cell.
    wanted = zip(target_edges['Node1'].tolist(), target_edges['Node2'].tolist())
    return np.array([weight_by_edge.get(edge, 0.0) for edge in wanted])


def extract_edge_weights(seq_dict_file, edges_df, gene_hvgs, batch_size=100):
    """Collect the target-edge weights of every test cell, for each of 5 folds.

    Args:
        seq_dict_file: Path to the ``.npz`` archive providing ``barcode`` and
            ``test_index_<fold>`` arrays.
        edges_df: DataFrame with ``Node1`` and ``Node2`` columns listing the
            edges to extract.
        gene_hvgs: Unused; kept so existing callers keep working.
        batch_size: Number of cells handled per progress-bar step.

    Returns:
        Dict mapping ``'CV_<fold>'`` to a DataFrame with one row per test cell
        (indexed by barcode) and one ``"<Node1>-<Node2>"`` column per edge.
    """
    seq_dict = np.load(seq_dict_file, allow_pickle=True)
    # Read the barcodes from the same archive as the test indices rather than
    # depending on a `barcodes` variable that happens to exist in the notebook.
    barcodes = seq_dict['barcode']
    edge_weight_matrices = {}

    column_names = [
        f"{node1}-{node2}"
        for node1, node2 in zip(edges_df['Node1'].tolist(), edges_df['Node2'].tolist())
    ]

    for k in range(5):
        k_fold = k + 1

        test_index = seq_dict[f'test_index_{k_fold}']
        n_cells = len(test_index)

        cell_test_folder = f'../../result/datasets/Baron_Human/wcsn_a0.01_hvgs2000/test_f{k_fold}/processed'

        edge_weight_matrix = []

        for i in tqdm(range(0, n_cells, batch_size), desc=f"Fold {k_fold}"):
            # Cell files are numbered by position within the fold's test set.
            for idx in range(i, min(i + batch_size, n_cells)):
                # NOTE(review): recent torch releases default torch.load to
                # weights_only=True, which may reject pickled graph objects;
                # confirm against the installed torch version.
                cell_data = torch.load(os.path.join(cell_test_folder, f'cell_{idx}.pt'))
                edge_weight_matrix.append(process_cell_graph(cell_data, edges_df))

        df = pd.DataFrame(edge_weight_matrix, columns=column_names)
        df.index = barcodes[test_index]

        edge_weight_matrices[f'CV_{k_fold}'] = df

        print(f"Fold {k_fold} DataFrame shape: {df.shape}")
        print(f"Memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB")

    return edge_weight_matrices

# Driver: rebuild the HVG gene list, extract the per-fold edge-weight tables
# and store them in a single HDF5 file, one key per fold.
if __name__ == "__main__":
    seq_dict_file = '../../result/datasets/Baron_Human/seq_dict.npz'
    output_file = 'data/Baron_Human/edge_weight_matrices_Baron_Human_per_fold_a0.01_hvgs2000.h5'
    

    seq_dict = np.load(seq_dict_file, allow_pickle=True) 
    genes = seq_dict['gene_symbol']

    all_filtered_genes_file = '../../result/datasets/Baron_Human/Baron_Human_filtered_hvgs2000.npy'

    all_filtered_genes_array = np.load(all_filtered_genes_file, allow_pickle=True)
    # NOTE(review): row 0 is presumably the positions of the selected HVGs
    # within `genes` - confirm against the script that writes this file.
    filtered_genes_index = all_filtered_genes_array[0]
    # Cast to int so the array can be used for positional indexing below.
    filtered_genes_index = filtered_genes_index.astype(int)
    print(filtered_genes_index.shape)

    gene_hvgs = genes[filtered_genes_index]

    # `edges_df` is not created here; it must already exist from an earlier cell.
    edge_weight_matrices = extract_edge_weights(seq_dict_file, edges_df, gene_hvgs)

    # mode='w' starts from a fresh file, replacing any previous output.
    with pd.HDFStore(output_file, mode='w') as store:
        for fold, df in edge_weight_matrices.items():
            store[fold] = df

    print(f"\nResults saved to: {output_file}")

    # Re-open the file read-only and summarise every stored table as a check.
    print("\nVerify the saved DataFrame:")
    with pd.HDFStore(output_file, mode='r') as store:
        for fold in store.keys():
            df = store[fold]
            print(f"\n{fold}:")
            print(f"Shape: {df.shape}")
            print(f"Columns (first 5): {df.columns[:5]}")
            print(f"Index (first 5): {df.index[:5]}")
            print(f"Memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB")

In [None]:
# Load the fold-4 edge-weight table back from the HDF5 file and inspect it.
fold4_df = pd.read_hdf(
    'data/Baron_Human/edge_weight_matrices_Baron_Human_per_fold_a0.01_hvgs2000.h5',
    key='CV_4',
)

# info() writes its own report and returns None, so this also prints "None".
print(fold4_df.info())
# First rows, column labels and row labels, one after the other.
print(fold4_df.head(), fold4_df.columns, fold4_df.index, sep='\n')

##### t-SNE

In [None]:
# Reload the dataset archive and pull out the arrays used by the plots below.
seq_dict_file = '../../result/datasets/Baron_Human/seq_dict.npz'
seq_dict = np.load(seq_dict_file, allow_pickle=True)
label, str_labels, barcodes, genes = (
    seq_dict[field] for field in ('label', 'str_labels', 'barcode', 'gene_symbol')
)

# Row 0 of the filtered-genes file is used as integer positions into `genes`.
all_filtered_genes_file = '../../result/datasets/Baron_Human/Baron_Human_filtered_hvgs2000.npy'
all_filtered_genes_array = np.load(all_filtered_genes_file, allow_pickle=True)
filtered_genes_index = all_filtered_genes_array[0].astype(int)
print(filtered_genes_index.shape)

gene_hvgs = genes[filtered_genes_index]
print(gene_hvgs)

In [None]:
# Read the three tables stored in the prediction file, in the same order as
# before: embedding, cell type, prediction probabilities.
pred_path = '../../result/wcsn_preds/Baron_Human_a0.01_hvgs2000_prediction.h5'

cell_embedding, cell_type, pred_prob = (
    pd.read_hdf(pred_path, key=table_key)
    for table_key in ('embedding', 'cell_type', 'pred_prob')
)

# Quick sanity output: both shapes, then the first five cell-type rows.
print(cell_embedding.shape, cell_type.shape, sep='\n')
print(cell_type[:5])

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# Colour the t-SNE map of one fold's test cells by the weight of a single edge.
lst = [3]  # zero-based fold indices to plot
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    # Keep only the embeddings of the cells in this fold's test split.
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    # Edge-weight table of this fold; columns are named "<node1>-<node2>".
    fold_df = pd.read_hdf('data/Baron_Human/edge_weight_matrices_Baron_Human_per_fold_a0.01_hvgs2000.h5',
                          key=f'CV_{k_fold}')

    gene1 = 344
    gene2 = 1441
    # Despite its name this holds the edge weights (one value per test cell).
    target_gene_degree = fold_df[f'{gene1}-{gene2}']
    print(gene_hvgs[gene1])
    print(gene_hvgs[gene2])
    edge_w = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # `n_iter` was deprecated in scikit-learn 1.5 and removed in 1.7 (renamed
    # `max_iter`). 1000 iterations is the default under both names, so the
    # argument is omitted to keep the call valid across versions.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())
    plt.figure(figsize=(7, 5))

    scatter = plt.scatter(embeddings_2d[:, 0],
                          embeddings_2d[:, 1],
                          c=edge_w,
                          cmap='viridis',
                          alpha=0.6,
                          s=20)

    cbar = plt.colorbar(scatter)
    plt.title(f'{gene_hvgs[gene1]}-{gene_hvgs[gene2]} (acinar)',
              fontsize=14)

    plt.xlabel('t-SNE 1', fontsize=14, fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14, fontweight='bold')

    plt.tight_layout()

    # One base path for both formats so the two outputs cannot drift apart.
    fig_base = f'../../result/Figures/Baron_Human/tsne/edge/tsne_acinar_{gene_hvgs[gene1]}-{gene_hvgs[gene2]}_weight_f{k_fold}_perp{perp}'
    for ext in ('svg', 'png'):
        plt.savefig(f'{fig_base}.{ext}',
                    dpi=1200,
                    bbox_inches='tight',
                    format=ext)

    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# Colour the t-SNE map of one fold's test cells by the weight of a single edge.
lst = [3]  # zero-based fold indices to plot
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    # Keep only the embeddings of the cells in this fold's test split.
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    # Edge-weight table of this fold; columns are named "<node1>-<node2>".
    fold_df = pd.read_hdf('data/Baron_Human/edge_weight_matrices_Baron_Human_per_fold_a0.01_hvgs2000.h5',
                          key=f'CV_{k_fold}')

    gene1 = 426
    gene2 = 1825
    # Despite its name this holds the edge weights (one value per test cell).
    target_gene_degree = fold_df[f'{gene1}-{gene2}']
    print(gene_hvgs[gene1])
    print(gene_hvgs[gene2])
    edge_w = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # `n_iter` was deprecated in scikit-learn 1.5 and removed in 1.7 (renamed
    # `max_iter`). 1000 iterations is the default under both names, so the
    # argument is omitted to keep the call valid across versions.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                learning_rate='auto')
    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())

    plt.figure(figsize=(7, 5))

    scatter = plt.scatter(embeddings_2d[:, 0],
                          embeddings_2d[:, 1],
                          c=edge_w,
                          cmap='viridis',
                          alpha=0.6,
                          s=20)

    cbar = plt.colorbar(scatter)
    plt.title(f'{gene_hvgs[gene1]}-{gene_hvgs[gene2]} (activated_stellate)',
              fontsize=14)

    plt.xlabel('t-SNE 1', fontsize=14, fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14, fontweight='bold')

    plt.tight_layout()

    # One base path for both formats so the two outputs cannot drift apart.
    fig_base = f'../../result/Figures/Baron_Human/tsne/edge/tsne_activated_stellate_{gene_hvgs[gene1]}-{gene_hvgs[gene2]}_weight_f{k_fold}_perp{perp}'
    for ext in ('svg', 'png'):
        plt.savefig(f'{fig_base}.{ext}',
                    dpi=1200,
                    bbox_inches='tight',
                    format=ext)

    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Set the font
from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# Colour the t-SNE map of one fold's test cells by the weight of a single edge.
lst = [3]  # zero-based fold indices to plot
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    # Keep only the embeddings of the cells in this fold's test split.
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    # Edge-weight table of this fold; columns are named "<node1>-<node2>".
    fold_df = pd.read_hdf('data/Baron_Human/edge_weight_matrices_Baron_Human_per_fold_a0.01_hvgs2000.h5',
                          key=f'CV_{k_fold}')

    gene1 = 1129
    gene2 = 1908
    # Despite its name this holds the edge weights (one value per test cell).
    target_gene_degree = fold_df[f'{gene1}-{gene2}']
    print(gene_hvgs[gene1])
    print(gene_hvgs[gene2])

    edge_w = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # `n_iter` was deprecated in scikit-learn 1.5 and removed in 1.7 (renamed
    # `max_iter`). 1000 iterations is the default under both names, so the
    # argument is omitted to keep the call valid across versions.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())

    plt.figure(figsize=(7, 5))

    scatter = plt.scatter(embeddings_2d[:, 0],
                          embeddings_2d[:, 1],
                          c=edge_w,
                          cmap='viridis',
                          alpha=0.6,
                          s=20)

    cbar = plt.colorbar(scatter)
    plt.title(f'{gene_hvgs[gene1]}-{gene_hvgs[gene2]} (alpha)',
              fontsize=14)

    plt.xlabel('t-SNE 1', fontsize=14, fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14, fontweight='bold')

    plt.tight_layout()

    # One base path for both formats so the two outputs cannot drift apart.
    fig_base = f'../../result/Figures/Baron_Human/tsne/edge/tsne_alpha_{gene_hvgs[gene1]}-{gene_hvgs[gene2]}_weight_f{k_fold}_perp{perp}'
    for ext in ('svg', 'png'):
        plt.savefig(f'{fig_base}.{ext}',
                    dpi=1200,
                    bbox_inches='tight',
                    format=ext)

    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# Colour the t-SNE map of one fold's test cells by the weight of a single edge.
lst = [3]  # zero-based fold indices to plot
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    # Keep only the embeddings of the cells in this fold's test split.
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    # Edge-weight table of this fold; columns are named "<node1>-<node2>".
    fold_df = pd.read_hdf('data/Baron_Human/edge_weight_matrices_Baron_Human_per_fold_a0.01_hvgs2000.h5',
                          key=f'CV_{k_fold}')

    gene1 = 310
    gene2 = 478
    # Despite its name this holds the edge weights (one value per test cell).
    target_gene_degree = fold_df[f'{gene1}-{gene2}']
    print(gene_hvgs[gene1])
    print(gene_hvgs[gene2])
    edge_w = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # `n_iter` was deprecated in scikit-learn 1.5 and removed in 1.7 (renamed
    # `max_iter`). 1000 iterations is the default under both names, so the
    # argument is omitted to keep the call valid across versions.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())
    plt.figure(figsize=(7, 5))
    scatter = plt.scatter(embeddings_2d[:, 0],
                          embeddings_2d[:, 1],
                          c=edge_w,
                          cmap='viridis',
                          alpha=0.6,
                          s=20)

    cbar = plt.colorbar(scatter)
    plt.title(f'{gene_hvgs[gene1]}-{gene_hvgs[gene2]} (macrophage)',
              fontsize=14)

    plt.xlabel('t-SNE 1', fontsize=14, fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14, fontweight='bold')

    plt.tight_layout()

    # One base path for both formats so the two outputs cannot drift apart.
    fig_base = f'../../result/Figures/Baron_Human/tsne/edge/tsne_macrophage_{gene_hvgs[gene1]}-{gene_hvgs[gene2]}_weight_f{k_fold}_perp{perp}'
    for ext in ('svg', 'png'):
        plt.savefig(f'{fig_base}.{ext}',
                    dpi=1200,
                    bbox_inches='tight',
                    format=ext)

    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# Colour the t-SNE map of one fold's test cells by the weight of a single edge.
lst = [3]  # zero-based fold indices to plot
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    # Keep only the embeddings of the cells in this fold's test split.
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    # Edge-weight table of this fold; columns are named "<node1>-<node2>".
    fold_df = pd.read_hdf('data/Baron_Human/edge_weight_matrices_Baron_Human_per_fold_a0.01_hvgs2000.h5',
                          key=f'CV_{k_fold}')

    gene1 = 909
    gene2 = 948
    # Despite its name this holds the edge weights (one value per test cell).
    target_gene_degree = fold_df[f'{gene1}-{gene2}']
    print(gene_hvgs[gene1])
    print(gene_hvgs[gene2])
    edge_w = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # `n_iter` was deprecated in scikit-learn 1.5 and removed in 1.7 (renamed
    # `max_iter`). 1000 iterations is the default under both names, so the
    # argument is omitted to keep the call valid across versions.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())

    plt.figure(figsize=(7, 5))

    scatter = plt.scatter(embeddings_2d[:, 0],
                          embeddings_2d[:, 1],
                          c=edge_w,
                          cmap='viridis',
                          alpha=0.6,
                          s=20)

    cbar = plt.colorbar(scatter)
    plt.title(f'{gene_hvgs[gene1]}-{gene_hvgs[gene2]} (beta)',
              fontsize=14)

    plt.xlabel('t-SNE 1', fontsize=14, fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14, fontweight='bold')

    plt.tight_layout()

    # Both formats go to the same '../../result/...' directory. Previously the
    # SVG alone was written under '../result/...', unlike every other figure.
    fig_base = f'../../result/Figures/Baron_Human/tsne/edge/tsne_beta_{gene_hvgs[gene1]}-{gene_hvgs[gene2]}_weight_f{k_fold}_perp{perp}'
    for ext in ('svg', 'png'):
        plt.savefig(f'{fig_base}.{ext}',
                    dpi=1200,
                    bbox_inches='tight',
                    format=ext)

    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

# Colour the t-SNE map of one fold's test cells by the weight of a single edge.
lst = [3]  # zero-based fold indices to plot
for k in lst:
    k_fold = k + 1
    print("k_fold: ", k_fold)
    # Keep only the embeddings of the cells in this fold's test split.
    test_index = seq_dict[f'test_index_{k_fold}']
    barcodes_test = barcodes[test_index]
    cur_cell_emb = cell_embedding.loc[barcodes_test]

    # Edge-weight table of this fold; columns are named "<node1>-<node2>".
    fold_df = pd.read_hdf('data/Baron_Human/edge_weight_matrices_Baron_Human_per_fold_a0.01_hvgs2000.h5',
                          key=f'CV_{k_fold}')

    gene1 = 730
    gene2 = 1525
    # Despite its name this holds the edge weights (one value per test cell).
    target_gene_degree = fold_df[f'{gene1}-{gene2}']
    print(gene_hvgs[gene1])
    print(gene_hvgs[gene2])
    edge_w = target_gene_degree

    print("Embedding shape:", cur_cell_emb.shape)
    print("Number of unique labels:", len(np.unique(cell_type['true_cell_type'].loc[barcodes_test])))

    perp = 30
    ee = 12
    # `n_iter` was deprecated in scikit-learn 1.5 and removed in 1.7 (renamed
    # `max_iter`). 1000 iterations is the default under both names, so the
    # argument is omitted to keep the call valid across versions.
    tsne = TSNE(n_components=2,
                perplexity=perp,
                early_exaggeration=ee,
                random_state=42,
                learning_rate='auto')

    embeddings_2d = tsne.fit_transform(cur_cell_emb.to_numpy())

    plt.figure(figsize=(7, 5))

    scatter = plt.scatter(embeddings_2d[:, 0],
                          embeddings_2d[:, 1],
                          c=edge_w,
                          cmap='viridis',
                          alpha=0.6,
                          s=20)

    cbar = plt.colorbar(scatter)
    plt.title(f'{gene_hvgs[gene1]}-{gene_hvgs[gene2]} (mast)',
              fontsize=14)

    plt.xlabel('t-SNE 1', fontsize=14, fontweight='bold')
    plt.ylabel('t-SNE 2', fontsize=14, fontweight='bold')

    plt.tight_layout()

    # One base path for both formats so the two outputs cannot drift apart.
    fig_base = f'../../result/Figures/Baron_Human/tsne/edge/tsne_mast_{gene_hvgs[gene1]}-{gene_hvgs[gene2]}_weight_f{k_fold}_perp{perp}'
    for ext in ('svg', 'png'):
        plt.savefig(f'{fig_base}.{ext}',
                    dpi=1200,
                    bbox_inches='tight',
                    format=ext)

    plt.show()