In [2]:
import scanpy as sc
import pandas as pd 
import numpy as np
import anndata
import re
import h5py
import scipy.sparse as scs
import concurrent.futures
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csc_matrix
from concurrent.futures import ThreadPoolExecutor, as_completed
import umap
import random
import multiprocessing
# Seed Python's built-in `random` module only; no other RNG (e.g. NumPy's) is seeded in this cell.
random.seed(123)
from joblib import Parallel, delayed
import copy
import os
from tqdm import tqdm
import math
import scanpy.external as sce
from concurrent.futures import ProcessPoolExecutor

from PIL import Image, ImageDraw, ImageFont
import gc
import re
import multiprocessing

In [3]:
def run_leiden(adata, resolution, key_added):
    """Run Leiden clustering at a single resolution and return the resulting ``obs`` table.

    The caller's ``adata`` is not modified: ``sc.tl.leiden(..., copy=True)`` works on
    its own copy and returns it. An additional ``copy.deepcopy`` beforehand is
    therefore unnecessary and would only duplicate the whole object one more time.

    Parameters
    ----------
    adata
        Annotated data object handed to ``sc.tl.leiden``.
        Presumably needs a neighbor graph computed beforehand — TODO confirm with callers.
    resolution
        Leiden resolution parameter, forwarded unchanged.
    key_added
        Name of the ``obs`` column that receives the cluster labels.

    Returns
    -------
    The ``obs`` attribute of the clustered copy, which contains the ``key_added`` column.
    """
    clustered = sc.tl.leiden(
        adata,
        resolution=resolution,
        key_added=key_added,
        n_iterations=3,
        copy=True,  # cluster a copy so the input object stays untouched
    )
    return clustered.obs

def run_leiden_parallel(adata, tasks, max_workers=5):
    """Run several Leiden clusterings in worker processes and attach the labels to ``adata``.

    Parameters
    ----------
    adata
        Annotated data object to cluster. Its ``obs`` gains one column per task.
    tasks
        Iterable of ``(resolution, key_added)`` pairs; each pair becomes one
        ``run_leiden`` call in a worker process.
    max_workers
        Size of the process pool. Defaults to 5, the value that used to be hard-coded.

    Returns
    -------
    The same ``adata`` object, with ``adata.obs[key_added]`` set for every task.

    Raises
    ------
    Any exception raised inside a worker is re-raised here by ``future.result()``.
    """
    # Materialize once: `tasks` is walked twice (submit + collect), which would
    # silently drop every result if a one-shot iterator were passed in.
    tasks = list(tasks)

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Arguments are pickled when they are shipped to the worker process, so each
        # worker already operates on its own copy; a per-task deepcopy in the parent
        # only multiplies peak memory.
        futures = [
            executor.submit(run_leiden, adata, resolution, key_added)
            for resolution, key_added in tasks
        ]
        # Collect in submission order so results line up with `tasks`.
        results = [future.result() for future in futures]

    for result, (_, key_added) in zip(results, tasks):
        adata.obs[key_added] = result[key_added]

    return adata

In [4]:
# os.listdir returns directory entries in arbitrary order; sort them so the files are
# always loaded (and later concatenated) in the same order from run to run.
files = sorted(os.listdir("h5_cleaned_by_celltype/"))
# NOTE(review): `pattern` is compiled here but not used by any cell visible in this
# notebook — confirm whether `files` was meant to be filtered with it.
pattern = re.compile(r'mono')

In [None]:
# Load every file listed in `files` into its own AnnData object.
# A comprehension keeps the per-file object out of the notebook namespace: with the
# former explicit loop, the loop variable kept referencing the last-loaded AnnData,
# so deleting `adata_list` later could not release that object's memory.
adata_list = [
    sc.read_h5ad("h5_cleaned_by_celltype/" + file_name)
    for file_name in files
]
    

In [None]:
# Stack all loaded objects into a single AnnData, using anndata.concat's default settings.
adata = anndata.concat(adata_list)

In [None]:
# Show the summary of the concatenated object (bare last expression → cell output).
adata

In [None]:
# The per-file objects have been concatenated into `adata`; drop the list and run a
# garbage-collection pass so its memory can be reclaimed.
del adata_list
gc.collect()

In [None]:
%%time

# Normalize every cell to a total of 1e4, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# A later cell restores the full matrix via `adata.raw.to_adata()`, but no cell in this
# notebook assigns `.raw`. If the input files already provided one it is kept as is;
# otherwise snapshot the log-normalized, all-gene matrix now (before the HVG subset)
# so that later step does not fail on `None`.
# NOTE(review): confirm this is the state that should be restored after clustering.
if adata.raw is None:
    adata.raw = adata

sc.pp.highly_variable_genes(adata)

# Keep only the highly variable genes. Select with the boolean mask itself (no
# round-trip through gene names) and materialize a real copy, so scaling operates on
# an actual AnnData rather than on a view.
hvg_mask = adata.var['highly_variable'].to_numpy()
adata = adata[:, hvg_mask].copy()

sc.pp.scale(adata)
sc.tl.pca(adata, svd_solver='arpack')

In [None]:
%%time
# Harmony integration keyed on the `pool_id` column, capped at 10 Harmony iterations.
sce.pp.harmony_integrate(
    adata,
    'pool_id',
    max_iter_harmony=10,
)

In [None]:
%%time
# Neighbor graph on the `X_pca_harmony` representation (30 components, 50 neighbors),
# followed by the UMAP embedding.
sc.pp.neighbors(
    adata,
    n_neighbors=50,
    n_pcs=30,
    use_rep='X_pca_harmony',
)
sc.tl.umap(adata, min_dist=0.45)

In [None]:
# Checkpoint the object to disk before the Leiden clustering cell runs.
adata.write_h5ad('All_processed_pre_leiden.h5ad')

In [None]:
%%time
# One (resolution, obs column name) pair per Leiden run.
tasks = [
    (1, "leiden_resolution_1"),
    (1.5, "leiden_resolution_1.5"),
    (2, "leiden_resolution_2"),
]
adata = run_leiden_parallel(adata, tasks)

In [None]:
# Replace the working object with the AnnData rebuilt from `.raw`.
# NOTE(review): this requires `adata.raw` to be populated. No cell visible in this
# notebook assigns it, so presumably it comes from the input files — confirm,
# otherwise this line fails because `adata.raw` is None.
adata=adata.raw.to_adata()

In [None]:
# Save the final object to disk.
adata.write_h5ad('All_leiden.h5ad')