#### Obtain the gene expression matrix after cell filtering and highly variable gene (HVG) selection, together with the data splits for five-fold cross-validation. Save the expression matrices as CSV files and the training-split indices as text files, for network construction with both the WGCNA and PCA-PMI methods.

In [None]:
import numpy as np
import pandas as pd

def get_logExpr3(expr_npz):
    """Load a raw count matrix and return two log-transformed versions of it.

    The archive at ``expr_npz`` must contain a ``'count'`` array laid out as
    rows = cells, columns = genes. Each row is rescaled so that its counts
    sum to 1e6 (counts-per-million), cast to float32, and then passed
    through ``np.log1p``.

    Args:
        expr_npz: Path to an ``.npz`` archive holding the ``'count'`` array.

    Returns:
        A tuple ``(logExpr0, logExpr1)`` where ``logExpr0`` is
        ``log1p(cpm)`` and ``logExpr1`` is ``log1p(cpm + 1e-5)``.
        Rows whose total count is zero come back as all zeros in
        ``logExpr0`` instead of NaN.
    """
    # NOTE: allow_pickle=True can execute arbitrary code while loading; only
    # use this on trusted archives.
    # The NpzFile keeps the underlying file open, so close it as soon as the
    # array has been read into memory.
    with np.load(expr_npz, allow_pickle=True) as data:
        countExpr = data['count']  # count: row-cell, column-gene
    print("raw (cells, genes): ", countExpr.shape)

    row_sums = countExpr.sum(axis=1, keepdims=True)
    # A cell with no counts would yield 0/0 = NaN for its whole row; dividing
    # by 1 instead keeps that row at zero and leaves all other rows unchanged.
    safe_row_sums = np.where(row_sums == 0, 1, row_sums)
    normalized_data = 1e6 * countExpr / safe_row_sums
    normalized_data = normalized_data.astype(np.float32)

    logExpr0 = np.log1p(normalized_data)
    logExpr1 = np.log1p(normalized_data + 1e-5)
    return logExpr0, logExpr1

In [None]:
# get hvgs and split
import os
import numpy as np
import pandas as pd

scRNA_datasets = ['Muraro', 'Baron_Mouse', 'Segerstolpe', 'Baron_Human', 'Zhang_T', 'Kang_ctrl', 'AMB', 'TM', 'Zheng68K']
pathjoin = os.path.join

# The output folders are the same for every dataset and fold, so create them
# once up front; makedirs on the nested "splits" folder also creates its parent.
hvgs_output_dir = '../dataset/pre_data/scRNAseq_datasets_hvgs'
splits_output_dir = pathjoin(hvgs_output_dir, 'splits')
os.makedirs(splits_output_dir, exist_ok=True)

# For each dataset: write the HVG-restricted log-expression matrix as a CSV
# (rows indexed by barcode, columns named by gene symbol) and write the
# training indices of each of the five folds as a text file.
for base_filename in scRNA_datasets:
    print("\n\n")
    print(base_filename)
    seq_folder = f"../dataset/5fold_data/{base_filename}"

    # The NpzFile keeps a file handle open; the `with` block closes it once
    # this dataset (including its per-fold indices) has been fully processed.
    with np.load(f'../dataset/5fold_data/{base_filename}/seq_dict.npz', allow_pickle=True) as seq_dict:
        gene_symbol = seq_dict['gene_symbol']
        barcodes = seq_dict['barcode']

        # The first entry of the saved array is used as the column indices of
        # the selected genes.
        all_filtered_genes_file = pathjoin(seq_folder, f'{base_filename}_filtered_hvgs2000.npy')
        all_filtered_genes_array = np.load(all_filtered_genes_file, allow_pickle=True)
        filtered_genes_index = all_filtered_genes_array[0].astype(int)
        gene_hvgs = gene_symbol[filtered_genes_index]

        expr_npz = f"../dataset/pre_data/scRNAseq_datasets/{base_filename}.npz"
        logExpr0, _ = get_logExpr3(expr_npz)
        print(logExpr0.shape)
        logExpr0_filtered = logExpr0[:, filtered_genes_index]
        print(logExpr0_filtered.shape)
        df = pd.DataFrame(logExpr0_filtered, index=barcodes, columns=gene_hvgs)

        # Save to CSV
        output_csv_path = pathjoin(hvgs_output_dir, f"{base_filename}_hvgs.csv")
        df.to_csv(output_csv_path)
        print(f"Saved filtered expression data for {base_filename} to {output_csv_path}.")

        # Process each fold (folds are numbered 1..5 in seq_dict)
        for k_fold in range(1, 6):
            print(f"Processing k_fold: {k_fold} for {base_filename}")
            # Extract train index
            train_index = seq_dict[f'train_index_{k_fold}']
            # Save train_index to a file, one integer per line
            output_file_path = pathjoin(splits_output_dir, f"{base_filename}_train_f{k_fold}.txt")
            np.savetxt(output_file_path, train_index, fmt='%d')
            print(f"Saved train_index for k_fold {k_fold} to {output_file_path}.")