# Libraries

In [1]:
import os
import scanpy as sc
import pandas as pd

# Configurations

In [None]:
meta = pd.read_csv('meta_all_gene.csv')
meta = meta[meta['species'] == 'human']

path = "../data"

In [3]:
meta

Unnamed: 0,slide,species,tissue,pmid,title,abstract,keywords,involve_cancer,tech,spot_num,gene_num
0,GSE144239_GSM4284316,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,ST,666,17138
1,GSE144239_GSM4284317,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,ST,646,17344
2,GSE144239_GSM4284318,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,ST,638,17883
3,GSE144239_GSM4284319,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,ST,590,16959
4,GSE144239_GSM4284320,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,ST,521,17689
...,...,...,...,...,...,...,...,...,...,...,...
1114,Human_Prostate_Erickson_08102022_Visium_Patien...,human,prostate,35948708,Spatially resolved clonal copy number alterati...,Defining the transition from benign to maligna...,,True,Visium,3190,33538
1115,Human_Prostate_Erickson_08102022_Visium_Patien...,human,prostate,35948708,Spatially resolved clonal copy number alterati...,Defining the transition from benign to maligna...,,True,Visium,3554,33538
1116,Human_Prostate_Erickson_08102022_Visium_Patien...,human,prostate,35948708,Spatially resolved clonal copy number alterati...,Defining the transition from benign to maligna...,,True,Visium,2736,33538
1145,Human_Colon_10X_03252024_VisiumHD,human,colon,,,,,False,VisiumHD,545913,18085


# Preprocessing functions

In [None]:
def save_slides(filtered_meta, tissue):
        try:
            # Define file path
            slide_dir = os.path.join(f"{path}/preprocessed_genes/slides")
            os.makedirs(slide_dir, exist_ok=True)
    
            filtered_meta['slide'].to_csv(f"{slide_dir}/{tissue}_slide.csv", index=False, header=False)
            print(f"Success Slide file created for tissue: {tissue}")
        except Exception as e:
            print(f"Error Creating slide for {tissue}: {e}")

In [None]:
def find_hvg(slide):
    try:
        gene_exp_slide = pd.read_csv(f'{path}/Visium/gene_exp/{slide}_count.csv',sep=',',index_col=0)
        adata = sc.AnnData(gene_exp_slide)
        adata.var_names_make_unique()
        
        sc.pp.filter_cells(adata, min_genes=1)
        sc.experimental.pp.highly_variable_genes(adata, n_top_genes=128)
        
        # sort genes by highly_variable
        adata.var_names_make_unique()
        hvg_list = adata.var['highly_variable_rank']
        hvg_list = hvg_list.sort_values()
        hvg_list = hvg_list.dropna()
        
        adata_hvg = adata[:, hvg_list.index]
        sc.pp.normalize_total(adata_hvg)
        sc.pp.log1p(adata_hvg)
        hvg = adata_hvg.X
        hvg = pd.DataFrame(hvg)
        hvg.index = adata_hvg.obs.index
        hvg.to_csv(f'{path}/preprocessed_genes/HVG/{slide}_count_hvg.csv',sep=',')
        print(f"FOUND HVG: {path}/Visium/gene_exp/{slide}_count.csv")
    except Exception as e:
        print(f"Error finding HVG {slide}: {e}")

In [None]:
def find_hvgs(tissue):
    try:
        # Process each row efficiently
        os.makedirs(f"{path}/preprocessed_genes/HVG", exist_ok=True)

        slide_file = f'{path}/preprocessed_genes/slides/{tissue}_slide.csv'
        with open(slide_file, 'r') as f:
                slides = f.read().splitlines()
        
        for slide in slides:
            find_hvg(slide)
        print(f"Success HVG files created for tissue: {tissue}")
    except Exception as e:
        print(f"Error creating HVG files for {tissue}: {e}")

In [None]:
def combine_hvg(tissue):
    try:
        path = '../data'
        combined_hvg_dir='../data/preprocessed_genes/combined_hvg/'
        output_file = f'{combined_hvg_dir}{tissue}_count_hvg.csv'
        slide_file = f'../data/preprocessed_genes/slides/{tissue}_slide.csv'

        os.makedirs(combined_hvg_dir, exist_ok=True)

        # Remove the output file if it exists
        if os.path.exists(output_file):
            os.remove(output_file)
        
        # Read slide names from file
        with open(slide_file, 'r') as f:
            slides = f.read().splitlines()
        
        # Process each slide file
        for slide in slides:
            input_file = os.path.join(path, 'preprocessed_genes', 'HVG', f'{slide}_count_hvg.csv')
            if os.path.exists(input_file):
                df = pd.read_csv(input_file, skiprows=1)  # Skip the first row (header)
                df.to_csv(output_file, mode='a', index=False, header=False)
            # print(slide)
        print(f"Success HVG files combined for tissue: {tissue}")
    except Exception as e:
        print(f"Error combining HVG files for {tissue}: {e}")

In [None]:
def find_overlap_genes(filtered_meta, tissue):
    try:
        gene_name_overlap = []
        data = filtered_meta
        data.index = range(len(data.index))
        slides = pd.read_csv(f"../data/preprocessed_genes/slides/{tissue}_slide.csv")
        slides = slides.iloc[:, 0].tolist()
        
        i = 0
        for slide in slides:
            
            try:
                gene_exp_slide = pd.read_csv(f'../data/Visium/gene_exp/{slide}_count.csv',sep=',',nrows=1,index_col=0)
                if i == 0:
                    gene_name_overlap = gene_exp_slide.columns
                else:
                    gene_name_overlap = gene_name_overlap.intersection(gene_exp_slide.columns)
                i += 1
            except:
                pass
        
        if len(gene_name_overlap) != 0:
            save_dir = "../data/preprocessed_genes/overlap-genes"
            os.makedirs(save_dir, exist_ok=True)
            pd.DataFrame(gene_name_overlap).to_csv(f'{save_dir}/{tissue}_gene.csv',index=False,header=False)
        print(f"Success Overlap genes found for tissue: {tissue}")
    except Exception as e:
        print(f"Error finding overlap genes for {tissue}: {e}")

In [None]:
def extract_overlap_gene(tissue):
    try:
        gene_list = tissue+'_gene'
        slide_file = f'../data/preprocessed_genes/slides/{tissue}_slide.csv'
        with open(slide_file, 'r') as f:
            slides = f.read().splitlines()
        
        for slide in slides:
            try:
                gene_exp_slide = pd.read_csv(f'../data/Visium/gene_exp/{slide}_count.csv',sep=',',index_col=0)
                overlap_gene = pd.read_csv(f'../data/preprocessed_genes/overlap-genes/{gene_list}.csv',header=None)
                gene_exp_slide = gene_exp_slide.loc[:,overlap_gene[0]]
                os.makedirs("../data/preprocessed_genes/overlap-hvg", exist_ok=True)
                gene_exp_slide.to_csv(f'../data/preprocessed_genes/overlap-hvg/{slide}_{gene_list}.csv',sep=',')
                print(f"DONE: ../data/Visium/gene_exp/{slide}_count.csv")
            except Exception as e:
                print(f"Error in slide: {slide}: {e}")
        print(f"Success overlap genes extracted for tissue: {tissue}")
    except Exception as e:
        print(f"Error extracting overlap genes for {tissue}: {e}")

In [None]:
def overlap_tissue(tissue):
    try:
        # Define variables
        path = '../data/preprocessed_genes/overlap-hvg/'
        output_file = f'../data/preprocessed_genes/overlap-tissue/{tissue}_count_overlap.csv'
        slide_file = f'../data/preprocessed_genes/slides/{tissue}_slide.csv'

        os.makedirs("../data/preprocessed_genes/overlap-tissue", exist_ok=True)
        # Remove existing output file if it exists
        if os.path.exists(output_file):
            os.remove(output_file)

        # Read slide names
        with open(slide_file, 'r') as file:
            slides = file.read().splitlines()

        # Process slides
        missing_slides = []

        for slide in slides:
            input_file = os.path.join(path, f"{slide}_{tissue}_gene.csv")
        
            if os.path.isfile(input_file):
                df = pd.read_csv(input_file, skiprows=1)  # Skip the first row (header)
                df.to_csv(output_file, mode='a', index=False, header=False)  # Append without header
            else:
                missing_slides.append(slide)
        print(f"Success overlapped tissue: {tissue}")
    except Exception as e:
        print(f"Error overlapping {tissue}: {e}")

In [None]:
def overlap_hvg(tissue):
    try:
        # Here we take input of combined gene expression of one tissue type
        gene_exp_slide = pd.read_csv(f'../data/preprocessed_genes/overlap-tissue/{tissue}_count_overlap.csv',sep=',',index_col=0,header=None)
        gene_exp_slide.columns = ['gene'+str(i) for i in range(gene_exp_slide.shape[1])]

        adata = sc.AnnData(gene_exp_slide)
        sc.pp.filter_cells(adata, min_genes=1)
        sc.pp.filter_genes(adata, min_cells=1)
        # Normalizing to median total counts
        sc.pp.normalize_total(adata)
        # Logarithmize the data
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata, n_top_genes=100)
        # sort genes by highly_variable
        adata_hvg = adata[:, adata.var.highly_variable]
        hvg = adata_hvg.X
        hvg = pd.DataFrame(hvg)
        hvg.index = adata_hvg.obs.index

        os.makedirs("../data/all_genes", exist_ok=True)
        hvg.to_csv(f'../data/all_genes/{tissue}_count_overlap_hvg.csv',sep=',')
        print(f"Success overlapped hvg tissue: {tissue}")
    except Exception as e:
        print(f"Error overlapping hvg {tissue}: {e}")

# Find HVGs

In [None]:
tissues= meta['tissue'].unique()

for tissue in tissues:
    print(f"=== Processing Tissue: {tissue} ===")
    
    # Step 1: Filter meta file for the current tissue
    print("🔎 Filtering meta file for current tissue...")
    existing_files = {file.split('_count.csv')[0] for file in os.listdir("../data/Visium/gene_exp")}
    filtered_meta = meta[(meta['tissue'] == tissue) & (meta['slide'].isin(existing_files))]
    
    # Step 2: Save slides for the current tissue
    print("💾 Saving slides...")
    save_slides(filtered_meta=filtered_meta, tissue=tissue)
    
    # Step 3: Identify Highly Variable Genes (HVGs)
    print("🧬 Identifying Highly Variable Genes (HVGs)...")
    find_hvgs(tissue=tissue)
    
    # Step 4: Combine HVGs for the tissue
    print("🧩 Combining HVGs...")
    combine_hvg(tissue=tissue)
    
    # Step 5: Identify Overlapping Genes
    print("🔬 Finding overlapping genes...")
    find_overlap_genes(filtered_meta=filtered_meta, tissue=tissue)
    
    # Step 6: Extract Overlapping Genes
    print("📋 Extracting overlapping genes...")
    extract_overlap_gene(tissue=tissue)
    
    # Step 7: Process Overlapping Tissue Data
    print("🧪 Processing overlapping tissue data...")
    overlap_tissue(tissue=tissue)
    
    # Step 8: Process Overlapping HVGs
    print("🗂️ Processing overlapping HVGs...")
    overlap_hvg(tissue=tissue)
    
    print(f"✅ PREPROCESSING COMPLETED FOR TISSUE: {tissue}")


=== Processing Tissue: skin ===
🔎 Filtering meta file for current tissue...
💾 Saving slides...
Success Slide file created for tissue: skin
🧬 Identifying Highly Variable Genes (HVGs)...
Success HVG files created for tissue: skin
🧩 Combining HVGs...
Success HVG files combined for tissue: skin
🔬 Finding overlapping genes...
Error finding overlap genes for skin: No columns to parse from file
📋 Extracting overlapping genes...
Success overlap genes extracted for tissue: skin
🧪 Processing overlapping tissue data...
Success overlapped tissue: skin
🗂️ Processing overlapping HVGs...
Error overlapping hvg skin: [Errno 2] No such file or directory: '../data/overlap-tissue/skin_count_overlap.csv'
✅ PREPROCESSING COMPLETED FOR TISSUE: skin
=== Processing Tissue: breast ===
🔎 Filtering meta file for current tissue...
💾 Saving slides...
Success Slide file created for tissue: breast
🧬 Identifying Highly Variable Genes (HVGs)...
Success HVG files created for tissue: breast
🧩 Combining HVGs...
Success HV