# QC filtering of subsets

In this notebook, we apply data QC filters to each of the major sets of data that were compiled. These include removing cells flagged as Doublets by scrublet, and removal of cells with abnormally low or high gene counts.

## Load libraries

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

## Set QC Cutoffs

In [2]:
max_mito = 10
min_genes = 200
max_genes = 5000

## Helper functions

These functions make reading HISE .h5ad files straightforward

In [3]:
def cache_uuid_path(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        hise_res = hisepy.reader.cache_files([uuid])
    filename = os.listdir(cache_path)[0]
    cache_file = '{p}/{f}'.format(p = cache_path, f = filename)
    return cache_file

In [4]:
def read_adata_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = sc.read_h5ad(cache_file)
    return res

This function will be used to apply the QC filtering criteria, write the output file, and return counts so we can track what happens at each step.

In [5]:
def apply_qc_filters(
    adata, 
    group_name, 
    out_files, 
    max_mito, 
    min_genes, 
    max_genes):

    counts = {
        'group': group_name
    }
    
    # Filter doublets
    counts['n_start'] = [adata.shape[0]]
    counts['n_doublets'] = [sum(adata.obs['predicted_doublet'] == True)]
    adata = adata[adata.obs['predicted_doublet'] == False]
    counts['n_singlets'] = [adata.shape[0]]
    
    # Compute fraction mitochondrial
    adata.var["mito"] = adata.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True)
    
    counts['n_high_mito'] = [sum(adata.obs["pct_counts_mito"] >= max_mito)]
    adata = adata[adata.obs["pct_counts_mito"] < max_mito]
    counts['n_low_mito'] = [adata.shape[0]]

    # Remove low gene counts
    counts['n_low_genes'] = [sum(adata.obs['n_genes'] <= min_genes)]
    adata = adata[adata.obs['n_genes'] > min_genes]
    counts['n_above_min_genes'] = [adata.shape[0]]

    # Remove high gene counts
    counts['n_high_genes'] = [sum(adata.obs['n_genes'] >= max_genes)]
    adata = adata[adata.obs['n_genes'] < max_genes]
    counts['n_below_max_genes'] = [adata.shape[0]]
    
    counts['total_removed'] = [adata.shape[0] - counts['n_start'][0]]
    counts['n_final'] = [adata.shape[0]]
    
    adata.write_h5ad(out_files['h5ad_file'])

    obs = adata.obs
    obs.to_csv(out_files['csv_file'])
    obs.to_parquet(out_files['parquet_file'])

    counts_df = pd.DataFrame(counts)
    
    return counts_df

## Identify HISE input files

In [21]:
search_id = 'cobalt-neptunium-cadmium'

Retrieve files stored in our HISE project store

In [22]:
ps_df = hisepy.list_files_in_project_store('cohorts')
ps_df = ps_df[['id', 'name']]

Filter for files from the previous notebook using our search_id

In [23]:
search_df = ps_df[ps_df['name'].str.contains(search_id)]
search_df = search_df.sort_values('name')

Select .h5ad files

In [None]:
search_df = ps_df[ps_df['name'].str.contains('.h5ad')]

In [24]:
search_df['name'].tolist()

['ASDC',
 'C1Q+ CD16 monocyte',
 'CD14+ cDC2',
 'Core CD14 monocyte',
 'Core CD16 monocyte',
 'HLA-DRhi cDC2',
 'IL1B+ CD14 monocyte',
 'ISG+ CD14 monocyte',
 'ISG+ CD16 monocyte',
 'ISG+ cDC2',
 'Intermediate monocyte',
 'cDC1',
 'pDC']

In [6]:
h5ad_uuids = {
    'BR1_Female_Negative': 'f7cbcafb-a748-4b32-958a-3085673a9630',
    'BR1_Female_Positive': '84d46f63-5979-47ed-a8ca-714fdfbdfd08',
    'BR1_Male_Negative':   '1e56fff4-d085-4f4d-b732-9e0c20ec2680',
    'BR1_Male_Positive':   'f7a8b4b9-0f88-46b8-96d9-99537a7b740f',
    'BR2_Female_Negative': 'b3b11ddc-e354-4e4b-b7df-63c64c3f6022',
    'BR2_Female_Positive': '60167536-fc17-42dc-9ba7-dd97ceda7bfa',
    'BR2_Male_Negative':   'ada15e33-18e1-4731-94be-dbdf1d5f2209',
    'BR2_Male_Positive':   '830df098-81ec-47f6-89ea-bcde336deb88'
}

## Set up output filenames

In [7]:
out_dir = 'output'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

In [8]:
out_files = {}
for group_name in h5ad_uuids.keys():
    h5ad_file = 'output/diha_PBMC_{g}_qc_labeled_{d}.h5ad'.format(
        g = group_name,
        d = date.today()
    )
    csv_file = 'output/diha_PBMC_{g}_qc_labeled_meta_{d}.csv'.format(
        g = group_name,
        d = date.today()
    )
    parquet_file = 'output/diha_PBMC_{g}_qc_labeled_meta_{d}.parquet'.format(
        g = group_name,
        d = date.today()
    )
    out_files[group_name] = {
        'h5ad_file': h5ad_file,
        'csv_file': csv_file,
        'parquet_file': parquet_file
    }

## Apply to each subset

In [9]:
filter_counts = []
for group_name, uuid in h5ad_uuids.items():
    print(group_name)
    group_out_files = out_files[group_name]

    adata = read_adata_uuid(uuid)
    print(adata.shape)
    
    group_counts = apply_qc_filters(
        adata, 
        group_name = group_name, 
        out_files = group_out_files, 
        max_mito = max_mito, 
        min_genes = min_genes, 
        max_genes = max_genes
    )
    print(group_counts)
    
    filter_counts.append(group_counts)

BR1_Female_Negative
(2970934, 33538)


  adata.var["mito"] = adata.var_names.str.startswith("MT-")


                 group  n_start  n_doublets  n_singlets  n_high_mito  \
0  BR1_Female_Negative  2970934       23464     2947470       130110   

   n_low_mito  n_low_genes  n_above_min_genes  n_high_genes  \
0     2817360         1074            2816286          3355   

   n_below_max_genes  total_removed  n_final  
0            2812931        -158003  2812931  
BR1_Female_Positive
downloading fileID: 84d46f63-5979-47ed-a8ca-714fdfbdfd08
Files have been successfully downloaded!
(1406534, 33538)


  adata.var["mito"] = adata.var_names.str.startswith("MT-")


                 group  n_start  n_doublets  n_singlets  n_high_mito  \
0  BR1_Female_Positive  1406534       12027     1394507        81066   

   n_low_mito  n_low_genes  n_above_min_genes  n_high_genes  \
0     1313441          161            1313280          1295   

   n_below_max_genes  total_removed  n_final  
0            1311985         -94549  1311985  
BR1_Male_Negative
downloading fileID: 1e56fff4-d085-4f4d-b732-9e0c20ec2680
Files have been successfully downloaded!
(1947023, 33538)


  adata.var["mito"] = adata.var_names.str.startswith("MT-")


               group  n_start  n_doublets  n_singlets  n_high_mito  \
0  BR1_Male_Negative  1947023       23506     1923517        88172   

   n_low_mito  n_low_genes  n_above_min_genes  n_high_genes  \
0     1835345          304            1835041          2378   

   n_below_max_genes  total_removed  n_final  
0            1832663        -114360  1832663  
BR1_Male_Positive
downloading fileID: f7a8b4b9-0f88-46b8-96d9-99537a7b740f
Files have been successfully downloaded!
(1399879, 33538)


  adata.var["mito"] = adata.var_names.str.startswith("MT-")


               group  n_start  n_doublets  n_singlets  n_high_mito  \
0  BR1_Male_Positive  1399879       17840     1382039        78649   

   n_low_mito  n_low_genes  n_above_min_genes  n_high_genes  \
0     1303390          112            1303278          1979   

   n_below_max_genes  total_removed  n_final  
0            1301299         -98580  1301299  
BR2_Female_Negative
downloading fileID: b3b11ddc-e354-4e4b-b7df-63c64c3f6022
Files have been successfully downloaded!
(1576589, 33538)


  adata.var["mito"] = adata.var_names.str.startswith("MT-")


                 group  n_start  n_doublets  n_singlets  n_high_mito  \
0  BR2_Female_Negative  1576589       12756     1563833        73461   

   n_low_mito  n_low_genes  n_above_min_genes  n_high_genes  \
0     1490372         2695            1487677          1606   

   n_below_max_genes  total_removed  n_final  
0            1486071         -90518  1486071  
BR2_Female_Positive
downloading fileID: 60167536-fc17-42dc-9ba7-dd97ceda7bfa
Files have been successfully downloaded!
(2960616, 33538)


  adata.var["mito"] = adata.var_names.str.startswith("MT-")


                 group  n_start  n_doublets  n_singlets  n_high_mito  \
0  BR2_Female_Positive  2960616       28780     2931836       139852   

   n_low_mito  n_low_genes  n_above_min_genes  n_high_genes  \
0     2791984          792            2791192          3103   

   n_below_max_genes  total_removed  n_final  
0            2788089        -172527  2788089  
BR2_Male_Negative
downloading fileID: ada15e33-18e1-4731-94be-dbdf1d5f2209
Files have been successfully downloaded!
(2005249, 33538)


  adata.var["mito"] = adata.var_names.str.startswith("MT-")


               group  n_start  n_doublets  n_singlets  n_high_mito  \
0  BR2_Male_Negative  2005249       24405     1980844       104165   

   n_low_mito  n_low_genes  n_above_min_genes  n_high_genes  \
0     1876679          715            1875964          2979   

   n_below_max_genes  total_removed  n_final  
0            1872985        -132264  1872985  
BR2_Male_Positive
downloading fileID: 830df098-81ec-47f6-89ea-bcde336deb88
Files have been successfully downloaded!
(1515062, 33538)


  adata.var["mito"] = adata.var_names.str.startswith("MT-")


               group  n_start  n_doublets  n_singlets  n_high_mito  \
0  BR2_Male_Positive  1515062       13603     1501459        75107   

   n_low_mito  n_low_genes  n_above_min_genes  n_high_genes  \
0     1426352          449            1425903          1904   

   n_below_max_genes  total_removed  n_final  
0            1423999         -91063  1423999  


## Assemble all counts

In [10]:
all_filter_counts = pd.concat(filter_counts)
all_filter_counts

Unnamed: 0,group,n_start,n_doublets,n_singlets,n_high_mito,n_low_mito,n_low_genes,n_above_min_genes,n_high_genes,n_below_max_genes,total_removed,n_final
0,BR1_Female_Negative,2970934,23464,2947470,130110,2817360,1074,2816286,3355,2812931,-158003,2812931
0,BR1_Female_Positive,1406534,12027,1394507,81066,1313441,161,1313280,1295,1311985,-94549,1311985
0,BR1_Male_Negative,1947023,23506,1923517,88172,1835345,304,1835041,2378,1832663,-114360,1832663
0,BR1_Male_Positive,1399879,17840,1382039,78649,1303390,112,1303278,1979,1301299,-98580,1301299
0,BR2_Female_Negative,1576589,12756,1563833,73461,1490372,2695,1487677,1606,1486071,-90518,1486071
0,BR2_Female_Positive,2960616,28780,2931836,139852,2791984,792,2791192,3103,2788089,-172527,2788089
0,BR2_Male_Negative,2005249,24405,1980844,104165,1876679,715,1875964,2979,1872985,-132264,1872985
0,BR2_Male_Positive,1515062,13603,1501459,75107,1426352,449,1425903,1904,1423999,-91063,1423999


In [21]:
counts_file = 'output/diha_PBMC_qc_filter_counts_{d}.csv'.format(d = date.today())
all_filter_counts.to_csv(counts_file)

## Upload assembled data to HISE

Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

In [22]:
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA PBMC Labeled and QC Filtered .h5ad {d}'.format(d = date.today())

In [27]:
in_files = list(h5ad_uuids.values())
in_files

['f7cbcafb-a748-4b32-958a-3085673a9630',
 '84d46f63-5979-47ed-a8ca-714fdfbdfd08',
 '1e56fff4-d085-4f4d-b732-9e0c20ec2680',
 'f7a8b4b9-0f88-46b8-96d9-99537a7b740f',
 'b3b11ddc-e354-4e4b-b7df-63c64c3f6022',
 '60167536-fc17-42dc-9ba7-dd97ceda7bfa',
 'ada15e33-18e1-4731-94be-dbdf1d5f2209',
 '830df098-81ec-47f6-89ea-bcde336deb88']

In [24]:
out_list = []
for file_dict in out_files.values():
    for fn in file_dict.values():
        out_list.append(fn)

In [25]:
out_list = out_list + [counts_file]

In [26]:
out_list

['output/diha_PBMC_BR1_Female_Negative_qc_labeled_2024-03-13.h5ad',
 'output/diha_PBMC_BR1_Female_Negative_qc_labeled_meta_2024-03-13.csv',
 'output/diha_PBMC_BR1_Female_Negative_qc_labeled_meta_2024-03-13.parquet',
 'output/diha_PBMC_BR1_Female_Positive_qc_labeled_2024-03-13.h5ad',
 'output/diha_PBMC_BR1_Female_Positive_qc_labeled_meta_2024-03-13.csv',
 'output/diha_PBMC_BR1_Female_Positive_qc_labeled_meta_2024-03-13.parquet',
 'output/diha_PBMC_BR1_Male_Negative_qc_labeled_2024-03-13.h5ad',
 'output/diha_PBMC_BR1_Male_Negative_qc_labeled_meta_2024-03-13.csv',
 'output/diha_PBMC_BR1_Male_Negative_qc_labeled_meta_2024-03-13.parquet',
 'output/diha_PBMC_BR1_Male_Positive_qc_labeled_2024-03-13.h5ad',
 'output/diha_PBMC_BR1_Male_Positive_qc_labeled_meta_2024-03-13.csv',
 'output/diha_PBMC_BR1_Male_Positive_qc_labeled_meta_2024-03-13.parquet',
 'output/diha_PBMC_BR2_Female_Negative_qc_labeled_2024-03-13.h5ad',
 'output/diha_PBMC_BR2_Female_Negative_qc_labeled_meta_2024-03-13.csv',
 'output

In [28]:
hisepy.upload.upload_files(
    files = out_list,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)

output/diha_PBMC_BR1_Female_Negative_qc_labeled_2024-03-13.h5ad
output/diha_PBMC_BR1_Female_Negative_qc_labeled_meta_2024-03-13.csv
output/diha_PBMC_BR1_Female_Negative_qc_labeled_meta_2024-03-13.parquet
output/diha_PBMC_BR1_Female_Positive_qc_labeled_2024-03-13.h5ad
output/diha_PBMC_BR1_Female_Positive_qc_labeled_meta_2024-03-13.csv
output/diha_PBMC_BR1_Female_Positive_qc_labeled_meta_2024-03-13.parquet
output/diha_PBMC_BR1_Male_Negative_qc_labeled_2024-03-13.h5ad
output/diha_PBMC_BR1_Male_Negative_qc_labeled_meta_2024-03-13.csv
output/diha_PBMC_BR1_Male_Negative_qc_labeled_meta_2024-03-13.parquet
output/diha_PBMC_BR1_Male_Positive_qc_labeled_2024-03-13.h5ad
output/diha_PBMC_BR1_Male_Positive_qc_labeled_meta_2024-03-13.csv
output/diha_PBMC_BR1_Male_Positive_qc_labeled_meta_2024-03-13.parquet
output/diha_PBMC_BR2_Female_Negative_qc_labeled_2024-03-13.h5ad
output/diha_PBMC_BR2_Female_Negative_qc_labeled_meta_2024-03-13.csv
output/diha_PBMC_BR2_Female_Negative_qc_labeled_meta_2024-03-13.

(y/n) y


{'trace_id': '2478d8d8-4bda-4fc9-8e14-fb9766d7260b',
 'files': ['output/diha_PBMC_BR1_Female_Negative_qc_labeled_2024-03-13.h5ad',
  'output/diha_PBMC_BR1_Female_Negative_qc_labeled_meta_2024-03-13.csv',
  'output/diha_PBMC_BR1_Female_Negative_qc_labeled_meta_2024-03-13.parquet',
  'output/diha_PBMC_BR1_Female_Positive_qc_labeled_2024-03-13.h5ad',
  'output/diha_PBMC_BR1_Female_Positive_qc_labeled_meta_2024-03-13.csv',
  'output/diha_PBMC_BR1_Female_Positive_qc_labeled_meta_2024-03-13.parquet',
  'output/diha_PBMC_BR1_Male_Negative_qc_labeled_2024-03-13.h5ad',
  'output/diha_PBMC_BR1_Male_Negative_qc_labeled_meta_2024-03-13.csv',
  'output/diha_PBMC_BR1_Male_Negative_qc_labeled_meta_2024-03-13.parquet',
  'output/diha_PBMC_BR1_Male_Positive_qc_labeled_2024-03-13.h5ad',
  'output/diha_PBMC_BR1_Male_Positive_qc_labeled_meta_2024-03-13.csv',
  'output/diha_PBMC_BR1_Male_Positive_qc_labeled_meta_2024-03-13.parquet',
  'output/diha_PBMC_BR2_Female_Negative_qc_labeled_2024-03-13.h5ad',
  'ou

In [29]:
import session_info
session_info.show()