#### Analysis of gene degree on AMB

In [None]:
# Generate the gene degree matrix for each cell type and each fold based on WCSN for AMB.
! python get_degree_matrix.py -expr ../../dataset/pre_data/scRNAseq_datasets/AMB.npz \
    -ca 0.01 -hvgs 2000 

In [None]:
# Generate the indices of the top 100 high-degree genes for each fold of all cell types.
#
# For every cell type stored in the degree-matrix archive, this cell averages each
# row of the per-fold degree matrix, keeps the 100 rows with the highest mean, and
# writes one TSV per cell type with one column per fold (Fold_1 ... Fold_5).

import os

import numpy as np  # required below; the cell previously relied on another cell importing it
import pandas as pd

# Load seq_dict file
seq_dict = np.load('../../dataset/5fold_data/AMB/seq_dict.npz', allow_pickle=True)
label = seq_dict['label']

# Load degree matrix file
matrix_dict = np.load('../../dataset/5fold_data/AMB/degree_matrix_AMB_a0.01_hvgs2000.npz', allow_pickle=True)

# Print keys in the loaded matrix_dict
print("Keys in loaded matrix_dict:", matrix_dict.files)

# Read str_labels from the matrix_dict
str_labels = matrix_dict['str_labels']

# Check the length of str_labels
print("Length of str_labels:", len(str_labels))

# Make sure the output folder exists before any TSV is written
os.makedirs('data/AMB/Gene_degree', exist_ok=True)

# Access degree matrix for each cell type
for k, cell_type_name in enumerate(str_labels):
    # The archive stores one pickled dict per cell type under the string form of its index
    degree_matrix_key = f'{k}'
    degree_matrix_dict = matrix_dict[degree_matrix_key].item()  # Unpack 0-d object array to dictionary
    print(degree_matrix_dict.keys())  # Print keys of the current degree matrix dictionary

    # Top-100 row indices per fold, keyed by output column name
    fold_top_indices = {}

    for fold in range(1, 6):
        cur_fold_degree_matrix = degree_matrix_dict[f'CV_{fold}']  # Access current fold's degree matrix

        # Mean over axis 1, i.e. one value per row
        # (rows are presumably genes and columns cells -- TODO confirm against get_degree_matrix.py)
        mean_degree = np.mean(cur_fold_degree_matrix, axis=1)

        # Ascending argsort, keep the last 100, then reverse so the largest mean comes first
        top_100_indices = np.argsort(mean_degree)[-100:][::-1]

        # Print results
        print("Mean values per row:", mean_degree)
        print("Top 100 indices with highest mean values:", top_100_indices)

        fold_top_indices[f'Fold_{fold}'] = top_100_indices

    # One column per fold, in fold order
    fold_top_indices_df = pd.DataFrame(fold_top_indices)

    # Save the DataFrame as a .tsv file, including the cell type name in the filename
    tsv_filename = f'data/AMB/Gene_degree/{cell_type_name}_top100_indices.tsv'
    fold_top_indices_df.to_csv(tsv_filename, sep='\t', index=False)

    print(f"Saved top 100 indices for cell type '{cell_type_name}' to {tsv_filename}")


In [None]:
import pandas as pd

# Define a function to get the union of the top 100 genes from each fold in a TSV file
def read_and_union_tsv(file_path):
    """
    Return the union of the values found in every column of a tab-separated file.

    Parameters:
    file_path: str - Path of the TSV file; each column holds the entries of one fold

    Returns:
    list - The distinct integer values across all columns (missing cells are ignored);
           the order of the list is not defined
    """
    fold_table = pd.read_csv(file_path, sep='\t')

    # One set of integer values per column, skipping empty cells
    per_fold_values = []
    for _, fold_column in fold_table.items():
        per_fold_values.append(set(fold_column.dropna().astype(int)))

    merged = set.union(*per_fold_values)
    return list(merged)

# Cell types to merge: (name used in the per-cell-type file names and messages,
# column name used in the merged output table)
gene_degree_dir = 'data/AMB/Gene_degree'
cell_type_columns = [
    ('GABAergic', 'GABAergic'),
    ('Glutamatergic', 'Glutamatergic'),
    ('Non-Neuronal', 'NonNeuronal'),
]

# Union of the per-fold top-100 indices, computed for every cell type first
unions = {
    file_stem: read_and_union_tsv(f'{gene_degree_dir}/{file_stem}_top100_indices.tsv')
    for file_stem, _ in cell_type_columns
}

# Report how many distinct genes each union contains
for file_stem, genes in unions.items():
    print(f"Number of genes in {file_stem} union:", len(genes))

# Unions differ in length, so each becomes a nullable-integer Series and the
# shorter columns are padded with missing values when the frame is assembled
result_df = pd.DataFrame({
    column: pd.Series(unions[file_stem], dtype='Int64')
    for file_stem, column in cell_type_columns
})

# Write the merged table as TSV
result_df.to_csv(f'{gene_degree_dir}/AMB_merged_top100_genes.tsv', sep='\t', index=False)


#### Analysis of edge weight on AMB

In [None]:
import numpy as np
import os
import torch
from scipy import sparse
import heapq
from collections import defaultdict

def find_top_edges(matrices, top_k=100):
    """
    Find the top edges with the highest average weight across multiple sparse matrices.

    Every stored, non-zero, off-diagonal entry (i, j) contributes its value to the
    undirected edge (min(i, j), max(i, j)); if a matrix stores both (i, j) and
    (j, i), both values are added. Each edge sum is then divided by the total
    number of matrices, so a matrix that lacks the edge counts as weight 0.

    Parameters:
    matrices: List[scipy.sparse matrix] - Sparse matrices (anything providing .tocoo())
    top_k: int - The number of top edges to return

    Returns:
    List[tuple] - A list of (node1, node2, avg_weight) tuples of Python ints/float,
                  sorted by average weight in descending order
    """
    total_matrices = len(matrices)  # Denominator of the average
    edge_weights = defaultdict(float)

    for matrix in matrices:
        # Read coordinates and values from the same COO view so they stay aligned.
        # Pairing matrix.nonzero() with matrix.data is unsafe: nonzero() drops
        # explicitly stored zeros while .data keeps them, which shifts every
        # following value onto the wrong edge.
        coo = matrix.tocoo()
        # .tolist() yields Python-native ints and floats
        entries = zip(coo.row.tolist(), coo.col.tolist(), coo.data.tolist())
        for row, col, value in entries:
            if value == 0 or row == col:  # Skip stored zeros and self-loops
                continue
            edge = (row, col) if row < col else (col, row)
            edge_weights[edge] += value

    # Average weight of every edge seen at least once
    edge_avg_weights = [
        (node1, node2, weight_sum / total_matrices)
        for (node1, node2), weight_sum in edge_weights.items()
    ]

    # Return the top k edges by average weight
    return heapq.nlargest(top_k, edge_avg_weights, key=lambda x: x[2])


def load_and_process_matrices(cell_test_folder, cur_label_idxs):
    """
    Build one sparse adjacency matrix per saved cell graph.

    Each file 'cell_<idx>.pt' in the folder is read with torch.load; the loaded
    object must expose `edge_index`, `edge_weight` and `x`
    (presumably a torch_geometric Data object -- confirm against the dataset writer).

    Parameters:
    cell_test_folder: str - Folder that contains the 'cell_<idx>.pt' files
    cur_label_idxs: Iterable[int] - Indices of the cells to load

    Returns:
    List[scipy.sparse.csr_matrix] - One (num_nodes x num_nodes) matrix per index,
                                    in the order the indices were given
    """
    def _graph_to_csr(graph):
        # Pull tensors off the loaded graph, then size the matrix by the row count of x
        index_tensor = graph.edge_index
        weight_tensor = graph.edge_weight
        n_nodes = graph.x.shape[0]

        endpoints = index_tensor.cpu().numpy()
        edge_values = weight_tensor.cpu().numpy()

        # Row 0 of edge_index supplies the row coordinates, row 1 the column coordinates
        return sparse.csr_matrix(
            (edge_values, (endpoints[0], endpoints[1])),
            shape=(n_nodes, n_nodes)
        )

    return [
        _graph_to_csr(torch.load(os.path.join(cell_test_folder, f'cell_{idx}.pt')))
        for idx in cur_label_idxs
    ]


def save_cell_type_edges(cell_type, fold_edges, save_folder, base_filename):
    """
    Write the per-fold top edges of one cell type to a single .npz archive.

    The archive is stored at
    <save_folder>/<base_filename>_top_edges/<cell_type>_top_edges.npz and holds
    'cell_type', 'base_filename' and one structured array 'fold_<k>_edges'
    (fields node1:int32, node2:int32, weight:float32) per entry of fold_edges.

    Parameters:
    cell_type: str - The name of the cell type
    fold_edges: dict - Maps a fold identifier to its list of (node1, node2, weight) tuples
    save_folder: str - Folder under which the output directory is created
    base_filename: str - Prefix of the output directory name
    """
    edge_record = [
        ('node1', 'int32'),
        ('node2', 'int32'),
        ('weight', 'float32')
    ]

    # Output directory is created on demand
    target_dir = os.path.join(save_folder, f"{base_filename}_top_edges")
    os.makedirs(target_dir, exist_ok=True)

    # Metadata first, then one structured array per fold
    payload = {'cell_type': cell_type, 'base_filename': base_filename}
    payload.update({
        f'fold_{fold_idx}_edges': np.array(edges, dtype=edge_record)
        for fold_idx, edges in fold_edges.items()
    })

    np.savez(os.path.join(target_dir, f"{cell_type}_top_edges.npz"), **payload)

In [10]:
# For every cell type and every fold, load the test-set cell graphs of that type,
# rank their edges by average weight and save the top edges per cell type.
seq_dict = np.load('../../dataset/5fold_data/AMB/seq_dict.npz', allow_pickle=True)
label = seq_dict['label']
str_labels = seq_dict['str_labels']
# save_cell_type_edges appends '<base_filename>_top_edges', so this yields
# 'data/AMB/AMB_top_edges/' -- the folder the conversion cells below read from.
# (The previous value 'data/AMB/Gene_degree/' wrote the archives where no later cell looks.)
save_folder = 'data/AMB/'

# The position of a name in str_labels is used as its numeric label
for cur_label, cell_type in enumerate(str_labels):
    print("cur_label: ", cur_label)
    print("cell_type: ", cell_type)
    fold_edges = {}

    for k_fold in range(1, 6):
        test_index = seq_dict[f'test_index_{k_fold}']
        label_test = label[test_index]
        # Positions inside this fold's test set whose label is the current cell type;
        # they are used as the <idx> of the 'cell_<idx>.pt' files
        cur_label_idxs = np.where(label_test == cur_label)[0].tolist()

        cell_test_folder = os.path.join(
            "../../dataset/5fold_data/AMB/wcsn_a0.01_hvgs2000",
            f"test_f{k_fold}",
            'processed'
        )

        cur_mat = load_and_process_matrices(cell_test_folder, cur_label_idxs)
        cur_top_edges = find_top_edges(cur_mat)

        fold_edges[k_fold] = cur_top_edges

    save_cell_type_edges(cell_type, fold_edges, save_folder, base_filename="AMB")

cur_label:  0
cell_type:  GABAergic
cur_label:  1
cell_type:  Glutamatergic
cur_label:  2
cell_type:  Non-Neuronal


In [13]:
# Convert npz data into R-readable format:
# one CSV per fold with a single 'edges' column of 'node1-node2' strings.
import os  # imported here so the cell does not depend on an earlier cell

import numpy as np
import pandas as pd


def create_edge_strings(edges):
    """Format every edge record as '<first field>-<second field>' with integer endpoints."""
    return [f"{int(edge[0])}-{int(edge[1])}" for edge in edges]


# The context manager closes the archive once all folds are exported
with np.load('data/AMB/AMB_top_edges/Non-Neuronal_top_edges.npz', allow_pickle=True) as data:
    os.makedirs('data/AMB/AMB_top_edges/NonNeuronal_top_edges', exist_ok=True)

    for i in range(1, 6):  # 5 folds
        fold_key = f'fold_{i}_edges'
        if fold_key in data:  # folds missing from the archive are skipped
            edges = data[fold_key]
            edge_strings = create_edge_strings(edges)
            pd.DataFrame({'edges': edge_strings}).to_csv(f'data/AMB/AMB_top_edges/NonNeuronal_top_edges/fold_{i}_edges.csv', index=False)

In [15]:
# Convert npz data into R-readable format:
# one CSV per fold with a single 'edges' column of 'node1-node2' strings.
import os  # imported here so the cell does not depend on an earlier cell

import numpy as np
import pandas as pd


def create_edge_strings(edges):
    """Format every edge record as '<first field>-<second field>' with integer endpoints."""
    return [f"{int(edge[0])}-{int(edge[1])}" for edge in edges]


# The context manager closes the archive once all folds are exported
with np.load('data/AMB/AMB_top_edges/Glutamatergic_top_edges.npz', allow_pickle=True) as data:
    # Created once up front; no need to re-check inside the loop
    os.makedirs('data/AMB/AMB_top_edges/Glutamatergic_top_edges', exist_ok=True)

    for i in range(1, 6):  # 5 folds
        fold_key = f'fold_{i}_edges'
        if fold_key in data:  # folds missing from the archive are skipped
            edges = data[fold_key]
            edge_strings = create_edge_strings(edges)
            pd.DataFrame({'edges': edge_strings}).to_csv(f'data/AMB/AMB_top_edges/Glutamatergic_top_edges/fold_{i}_edges.csv', index=False)

In [None]:
# Convert npz data into R-readable format:
# one CSV per fold with a single 'edges' column of 'node1-node2' strings.
import os  # imported here so the cell does not depend on an earlier cell

import numpy as np
import pandas as pd


def create_edge_strings(edges):
    """Format every edge record as '<first field>-<second field>' with integer endpoints."""
    return [f"{int(edge[0])}-{int(edge[1])}" for edge in edges]


# The context manager closes the archive once all folds are exported
with np.load('data/AMB/AMB_top_edges/GABAergic_top_edges.npz', allow_pickle=True) as data:
    # Created once up front; no need to re-check inside the loop
    os.makedirs('data/AMB/AMB_top_edges/GABAergic_top_edges', exist_ok=True)

    for i in range(1, 6):  # 5 folds
        fold_key = f'fold_{i}_edges'
        if fold_key in data:  # folds missing from the archive are skipped
            edges = data[fold_key]
            edge_strings = create_edge_strings(edges)
            pd.DataFrame({'edges': edge_strings}).to_csv(f'data/AMB/AMB_top_edges/GABAergic_top_edges/fold_{i}_edges.csv', index=False)

In [None]:
import numpy as np
import pandas as pd
import os

def get_union_edges(npz_file):
    """
    Collect the distinct edges stored across the five folds of a top-edges archive.

    Parameters:
    npz_file: str - Path of an .npz file with arrays named 'fold_1_edges' ... 'fold_5_edges';
                    folds that are absent from the archive are skipped

    Returns:
    list[str] - Edges formatted as 'node1-node2', sorted as strings
                (lexicographically, so '10-2' sorts before '9-1')
    """
    all_edges = set()

    # Open the archive in a context manager so its file handle is released
    with np.load(npz_file, allow_pickle=True) as data:
        # Iterate over the 5 folds
        for i in range(1, 6):
            fold_key = f'fold_{i}_edges'
            if fold_key in data:
                edges = data[fold_key]
                # The first two fields of every record are the edge endpoints
                all_edges.update(f"{int(edge[0])}-{int(edge[1])}" for edge in edges)

    print(list(all_edges))
    return sorted(all_edges)

# Merge the per-cell-type edge unions into one table (one column per cell type)
# and save it as TSV.
cell_types = ['GABAergic', 'Glutamatergic', 'Non-Neuronal']
# Output column names that differ from the name used in the archive file names
column_names = {'Non-Neuronal': 'NonNeuronal'}
edges_dict = {}

for cell_type in cell_types:
    npz_file = f'data/AMB/AMB_top_edges/{cell_type}_top_edges.npz'
    if os.path.exists(npz_file):
        edges_dict[cell_type] = get_union_edges(npz_file)
    else:
        print(f"Skipping '{cell_type}': {npz_file} not found")

# Without any archive there is nothing to merge; fail with a clear message
# instead of the ValueError that max() raises on an empty sequence
if not edges_dict:
    raise FileNotFoundError("No *_top_edges.npz archive found in data/AMB/AMB_top_edges/")

max_length = max(len(edges) for edges in edges_dict.values())

# Pad the shorter lists with empty strings so all columns have equal length
for edges in edges_dict.values():
    edges.extend([''] * (max_length - len(edges)))

df = pd.DataFrame(edges_dict)
print(df.columns)

# Rename by mapping rather than assigning a fixed three-name list, which raised
# a length-mismatch error whenever one of the archives had been skipped
df = df.rename(columns=column_names)

save_path = 'data/AMB/AMB_top_edges/union_edges.tsv'
df.to_csv(save_path, sep='\t', index=False)