In [14]:
import celltypist
from celltypist import models
import scanpy as sc
import pandas as pd 
import numpy as np
import anndata
import re
import h5py
import scipy.sparse as scs
import concurrent.futures
import scanpy.external as sce
import os

In [1]:
pip install scrublet

Note: you may need to restart the kernel to use updated packages.


In [16]:
def read_mat(h5_con):
    """Assemble the sparse matrix stored under the 'matrix' group of an open HDF5 handle.

    Parameters
    ----------
    h5_con : mapping-like
        Open handle exposing h5_con['matrix'] with the datasets 'data',
        'indices', 'indptr' and 'shape'.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix built from the (data, indices, indptr) triplet with the stored shape.
    """
    matrix_group = h5_con['matrix']
    counts = matrix_group['data'][:]          # non-zero values
    row_indices = matrix_group['indices'][:]  # row index of every stored value
    col_pointers = matrix_group['indptr'][:]  # offsets delimiting each column
    dims = tuple(matrix_group['shape'][:])    # matrix dimensions
    return scs.csc_matrix((counts, row_indices, col_pointers), shape=dims)


def read_obs(h5con):
    """Read per-barcode metadata from an open HDF5 handle into a DataFrame.

    Parameters
    ----------
    h5con : mapping-like
        Open handle exposing h5con['matrix']['barcodes'] (byte strings) and
        the group h5con['matrix']['observations'], one dataset per column.

    Returns
    -------
    pandas.DataFrame
        A 'barcodes' column followed by one column per dataset found under
        'observations'. Byte-string columns are decoded as UTF-8; all other
        columns are stored as read.
    """
    barcodes = [x.decode('UTF-8') for x in h5con['matrix']['barcodes'][:]]

    # Initialize the DataFrame with the cell barcodes
    obs_df = pd.DataFrame({ 'barcodes' : barcodes })

    observations = h5con['matrix']['observations']

    for col in observations.keys():
        values = observations[col][:]
        # Only peek at the first element when there is one: an empty column
        # (a file with zero barcodes) must not raise IndexError.
        if len(values) > 0 and isinstance(values[0], (bytes, bytearray)):
            # Decode byte strings
            values = [x.decode('UTF-8') for x in values]
        obs_df[col] = values

    return obs_df
# define a function to construct anndata object from a h5 file
def read_h5_anndata(h5_file):
    """Build an AnnData object from an HDF5 file.

    Parameters
    ----------
    h5_file : str
        Path of the .h5 file to open read-only.

    Returns
    -------
    anndata.AnnData
        The transposed matrix from read_mat, with the DataFrame from read_obs
        as .obs and the decoded ['matrix']['features']['name'] entries as
        var_names (made unique).
    """
    # The context manager guarantees the HDF5 handle is closed, also on error.
    # Every dataset is copied into memory with [:] before the file closes, so
    # nothing below the with-block touches the handle.
    with h5py.File(h5_file, mode = 'r') as h5_con:
        # extract the expression matrix
        mat = read_mat(h5_con)
        # extract gene names
        genes = [x.decode('UTF-8') for x in h5_con['matrix']['features']['name'][:]]
        # extract metadata
        obs_df = read_obs(h5_con)

    # construct anndata; the stored matrix is transposed so its columns become rows
    adata = anndata.AnnData(mat.T,
                             obs = obs_df)
    # make sure the gene names aligned
    adata.var_names = genes

    adata.var_names_make_unique()
    return adata
def get_last_pattern(inputstr):
    """Return the trailing run of non-'/' characters of ``inputstr``.

    Yields an empty string when there is no such run, e.g. for an empty
    string or one that ends with '/'.
    """
    found = re.compile(r"[^/]+(?=$)").search(inputstr)
    return found.group(0) if found else ""

In [17]:
# Load the sample metadata table; its 'file.path' column supplies the .h5
# paths that are handed to process_file further down.
meta_data=pd.read_csv('hise_meta_data_2024-01-23_fixed.csv')

In [18]:
def process_file(file_name):
    """Run scrublet on one .h5 file and write its doublet calls to CSV.

    The output path is 'Doublet_Scores/<pbmc_sample_id>.csv', using the
    'pbmc_sample_id' of the first row of the file's metadata. If that file
    already exists and is non-empty the sample is skipped.

    Parameters
    ----------
    file_name : str
        Path of the .h5 file to process.

    Returns
    -------
    None
    """
    result = read_h5_anndata(file_name)
    # .iloc[0] is explicitly positional; a bare [0] is a label lookup that only
    # falls back to positional access, a behaviour pandas has deprecated.
    sample_id = result.obs['pbmc_sample_id'].iloc[0]
    output_file = 'Doublet_Scores/' + sample_id + '.csv'
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"File {output_file} already exists and is not empty. Skipping processing.")
        return
    # Ensure the output directory exists so to_csv does not fail on a fresh
    # checkout; exist_ok keeps this safe when several workers get here at once.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    sc.external.pp.scrublet(result)
    result.obs[['barcodes', 'predicted_doublet', 'doublet_score']].to_csv(output_file)

In [19]:
from concurrent.futures import ThreadPoolExecutor

# Collect the return value of process_file for every path in the metadata table.
results = []

# Fan the per-file work out over a pool of 60 threads; map() yields results in
# the same order as the input paths.
with ThreadPoolExecutor(max_workers=60) as pool:
    results.extend(pool.map(process_file, meta_data['file.path']))

File Doublet_Scores/PB00393-01.csv already exists and is not empty. Skipping processing.
File Doublet_Scores/PB00166-01.csv already exists and is not empty. Skipping processing.
File Doublet_Scores/PB00504-01.csv already exists and is not empty. Skipping processing.
File Doublet_Scores/PB00002-01.csv already exists and is not empty. Skipping processing.
File Doublet_Scores/PB00363-01.csv already exists and is not empty. Skipping processing.
File Doublet_Scores/PB00350-01.csv already exists and is not empty. Skipping processing.
File Doublet_Scores/PB00041-01.csv already exists and is not empty. Skipping processing.
File Doublet_Scores/PB00016-01.csv already exists and is not empty. Skipping processing.
File Doublet_Scores/PB00384-01.csv already exists and is not empty. Skipping processing.
File Doublet_Scores/PB00501-01.csv already exists and is not empty. Skipping processing.
File Doublet_Scores/PB00368-07.csv already exists and is not empty. Skipping processing.
File Doublet_Scores/P