# Libraries

In [3]:
import sys
import os
import scanpy as sc
import pandas as pd
import numpy as np

# Configurations

In [4]:
meta = pd.read_csv('meta_all_gene.csv')
path = '.'

In [4]:
def find_hvg(slide, tech):
    try:
        gene_exp_slide = pd.read_csv(f'{path}/{tech}/gene_exp/{slide}.csv',sep=',',index_col=0)
        adata = sc.AnnData(gene_exp_slide)
        adata.var_names_make_unique()
        
        sc.pp.filter_cells(adata, min_genes=1)
        sc.experimental.pp.highly_variable_genes(adata, n_top_genes=128)
        
        # sort genes by highly_variable
        adata.var_names_make_unique()
        hvg_list = adata.var['highly_variable_rank']
        hvg_list = hvg_list.sort_values()
        hvg_list = hvg_list.dropna()
        
        adata_hvg = adata[:, hvg_list.index]
        sc.pp.normalize_total(adata_hvg)
        sc.pp.log1p(adata_hvg)
        hvg = adata_hvg.X
        hvg = pd.DataFrame(hvg)
        hvg.index = adata_hvg.obs.index
        hvg.to_csv(f'./HVG/{slide}_count_hvg.csv',sep=',')
        print(f"FOUND HVG: {path}/{tech}/gene_exp/{slide}.csv")
    except:
        pass

# Find HVGs

In [6]:
for i in range(len(meta)):
    slide = meta['slide'][i]
    tech = meta['tech'][i]

    find_hvg(slide, tech)

  view_to_actual(adata)


FOUND HVG: ./Visium/gene_exp/GSE144239_GSM4565823.csv
FOUND HVG: ./Visium/gene_exp/GSE144239_GSM4565824.csv


  view_to_actual(adata)


# Combine HVG of Human Skin

In [7]:
data = meta.loc[(meta['species'] == 'human') & (meta['tissue'] == 'skin'),:]
data.slide.to_csv(f'human_skin_slide.csv',index=False,header=False)

In [1]:
import os
import pandas as pd

type_ = 'human_skin'
path = './'
output_file = f'./{type_}_count_hvg.csv'
slide_file = f'./{type_}_slide.csv'

# Remove the output file if it exists
if os.path.exists(output_file):
    os.remove(output_file)

# Read slide names from file
with open(slide_file, 'r') as f:
    slides = f.read().splitlines()

# Process each slide file
for slide in slides:
    input_file = os.path.join(path, 'HVG', f'{slide}_count_hvg.csv')
    if os.path.exists(input_file):
        df = pd.read_csv(input_file, skiprows=1)  # Skip the first row (header)
        df.to_csv(output_file, mode='a', index=False, header=False)
    print(slide)


GSE144239_GSM4284316
GSE144239_GSM4284317
GSE144239_GSM4284318
GSE144239_GSM4284319
GSE144239_GSM4284320
GSE144239_GSM4284321
GSE144239_GSM4284322
GSE144239_GSM4284323
GSE144239_GSM4284324
GSE144239_GSM4284325
GSE144239_GSM4284326
GSE144239_GSM4284327
GSE144239_GSM4565823
GSE144239_GSM4565824
GSE144239_GSM4565825
GSE144239_GSM4565826
GSE173651_GSM5273010
GSE173651_GSM5273011
GSE173651_GSM5273012
GSE173651_GSM5273013
GSE173651_GSM5273014
GSE173651_GSM5273015
GSE182208_GSM5531131
GSE197023_GSM5907077
GSE197023_GSM5907078
GSE197023_GSM5907079
GSE197023_GSM5907080
GSE197023_GSM5907081
GSE197023_GSM5907082
GSE197023_GSM5907083
GSE197023_GSM5907084
GSE197023_GSM5907085
GSE197023_GSM5907086
GSE197023_GSM5907088
GSE197023_GSM5907089
GSE197023_GSM5907090
GSE197023_GSM5907091
GSE197023_GSM5907092
GSE197023_GSM5907093
GSE197023_GSM5907094
GSE197023_GSM5907095
GSE197023_GSM5907096
GSE237771_GSM7648697
GSE237771_GSM7648698
GSE237771_GSM7648699
GSE237771_GSM7648700
GSE249729_GSM7962127
GSE249729_GSM

# Overlap HVG

In [13]:
data = meta.loc[(meta['species'] == 'human') & (meta['tissue'] == 'skin'),:]
data.index = range(len(data.index))
gene_name_overlap = []

In [16]:
data[data['slide'] == 'GSE144239_GSM4565823']

Unnamed: 0,slide,species,tissue,pmid,title,abstract,keywords,involve_cancer,tech,spot_num,gene_num
12,GSE144239_GSM4565823,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,Visium,744,33538


In [18]:
# here we find the overlap genes in human brain


for i in range(len(data.index)):
    try:
        gene_exp_slide = pd.read_csv(f'./{data.tech[i]}/gene_exp/{data.slide[i]}.csv',sep=',',nrows=1,index_col=0)
        if i == 12:
            gene_name_overlap = gene_exp_slide.columns
        else:
            gene_name_overlap = gene_name_overlap.intersection(gene_exp_slide.columns)
        print(i)
    except:
        pass

pd.DataFrame(gene_name_overlap).to_csv('human_skin_gene.csv',index=False,header=False)

12
13


In [37]:
species = 'human'
tissue = 'skin'
gene_list = 'human_skin_gene'
meta = pd.read_csv('meta_all_gene.csv')

In [38]:
if tissue == 'all':
    data = meta.loc[(meta['species'] == species), :].reset_index(drop=True)
    data.slide.to_csv(f'{species}_slide.csv', index=False, header=False)
else:
    data = meta.loc[(meta['species'] == species) & (meta['tissue'] == tissue), :].reset_index(drop=True)
    data.slide.to_csv(f'{species}_{tissue}_slide.csv', index=False, header=False)

In [39]:
data[data['slide'] == 'GSE144239_GSM4565823']

Unnamed: 0,slide,species,tissue,pmid,title,abstract,keywords,involve_cancer,tech,spot_num,gene_num
12,GSE144239_GSM4565823,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,Visium,744,33538


In [40]:
for index in range(len(data.index)):
    try:
        slide = data['slide'][index]
        gene_exp_slide = pd.read_csv(f'./{data.tech[index]}/gene_exp/{slide}.csv',sep=',',index_col=0)
        overlap_gene = pd.read_csv(f'{gene_list}.csv',header=None)
        gene_exp_slide = gene_exp_slide.loc[:,overlap_gene[0]]
        gene_exp_slide.to_csv(f'./overlap-hvg/{data.slide[index]}_{gene_list}.csv',sep=',')
        print(f"DONE: ./{data.tech[index]}/gene_exp/{data.slide[index]}.csv")
    except:
        pass

DONE: ./Visium/gene_exp/GSE144239_GSM4565823.csv
DONE: ./Visium/gene_exp/GSE144239_GSM4565824.csv


In [44]:
import os
import pandas as pd

# Define variables
type_ = 'human_skin'
path = './overlap-hvg/'
output_file = f'{type_}_count_overlap.csv'
slide_file = f'{type_}_slide.csv'

# Remove the existing output file if it exists
if os.path.exists(output_file):
    os.remove(output_file)

# Read slide names from the slide file
with open(slide_file, 'r') as file:
    slides = file.read().splitlines()

# Process each slide
missing_slides = []
for slide in slides:
    input_file = os.path.join(path, f"{slide}_{type_}_gene.csv")

    if os.path.isfile(input_file):
        df = pd.read_csv(input_file, skiprows=1)  # Skip the first row (header)
        df.to_csv(output_file, mode='a', index=False, header=False)  # Append without header
    else:
        missing_slides.append(slide)

# Print missing slides
for slide in missing_slides:
    print(slide)


GSE144239_GSM4284316
GSE144239_GSM4284317
GSE144239_GSM4284318
GSE144239_GSM4284319
GSE144239_GSM4284320
GSE144239_GSM4284321
GSE144239_GSM4284322
GSE144239_GSM4284323
GSE144239_GSM4284324
GSE144239_GSM4284325
GSE144239_GSM4284326
GSE144239_GSM4284327
GSE144239_GSM4565825
GSE144239_GSM4565826
GSE173651_GSM5273010
GSE173651_GSM5273011
GSE173651_GSM5273012
GSE173651_GSM5273013
GSE173651_GSM5273014
GSE173651_GSM5273015
GSE182208_GSM5531131
GSE197023_GSM5907077
GSE197023_GSM5907078
GSE197023_GSM5907079
GSE197023_GSM5907080
GSE197023_GSM5907081
GSE197023_GSM5907082
GSE197023_GSM5907083
GSE197023_GSM5907084
GSE197023_GSM5907085
GSE197023_GSM5907086
GSE197023_GSM5907088
GSE197023_GSM5907089
GSE197023_GSM5907090
GSE197023_GSM5907091
GSE197023_GSM5907092
GSE197023_GSM5907093
GSE197023_GSM5907094
GSE197023_GSM5907095
GSE197023_GSM5907096
GSE237771_GSM7648697
GSE237771_GSM7648698
GSE237771_GSM7648699
GSE237771_GSM7648700
GSE249729_GSM7962127
GSE249729_GSM7962128
GSE249729_GSM7962129


In [46]:
import sys
import pandas as pd
import numpy as np
import scanpy as sc
import os

path='.'

def overlap_hvg(tissue_type):
    # Here we take input of combined gene expression of one tissue type
    # see below for how to generate this file
    gene_exp_slide = pd.read_csv(f'{path}/{tissue_type}_count_overlap.csv',sep=',',index_col=0,header=None)
    gene_exp_slide.columns = ['gene'+str(i) for i in range(gene_exp_slide.shape[1])]
    adata = sc.AnnData(gene_exp_slide)
    sc.pp.filter_cells(adata, min_genes=1)
    sc.pp.filter_genes(adata, min_cells=1)
    # Normalizing to median total counts
    sc.pp.normalize_total(adata)
    # Logarithmize the data
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=100)
    # sort genes by highly_variable
    adata_hvg = adata[:, adata.var.highly_variable]
    hvg = adata_hvg.X
    hvg = pd.DataFrame(hvg)
    hvg.index = adata_hvg.obs.index
    hvg.to_csv(f'./all/{tissue_type}_count_overlap_hvg.csv',sep=',')

overlap_hvg("human_skin")