# Generate separate L3 review data

In this notebook, we perform UMAP using data from each individual AIFI_L3 cell type. This can be used to assess if our label refinement has been effective and if our cell types are largely uniform.

**Note**: Because some of these are *very* large cell types, you'll need to ensure that a lot of disk space is available (> 400GB)

## Load packages

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import re
import tarfile

In [2]:
out_dir = 'output'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

## Helper functions

Helpers for reading data based on UUIDs from HISE

In [3]:
def cache_uuid_path(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        hise_res = hisepy.reader.cache_files([uuid])
    filename = os.listdir(cache_path)[0]
    cache_file = '{p}/{f}'.format(p = cache_path, f = filename)
    return cache_file

In [4]:
def read_csv_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = pd.read_csv(cache_file)
    return res

In [5]:
def read_adata_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = sc.read_h5ad(cache_file)
    return res

This function formats cell types for use in filenames

In [6]:
def format_cell_type(cell_type):
    cell_type = re.sub('\\+', 'pos', cell_type)
    cell_type = re.sub('-', 'neg', cell_type)
    cell_type = re.sub(' ', '_', cell_type)
    return cell_type

This function corrects type labels generated when there are ties in selection of most frequent labels:

In [7]:
def fix_cat_labels(labels):
    label_cat = labels.cat.categories
    for cat in label_cat:
        cat = str(cat)
        if '[' in cat:
            new_cat = re.sub('\', .+\n.+','',cat)
            new_cat = re.sub('\[\'','',new_cat)
            if not new_cat in label_cat:
                labels = labels.cat.add_categories(new_cat)
            labels[labels == cat] = new_cat
    return labels

In [8]:
def process_adata(adata):
    
    # Keep a copy of the raw data
    adata = adata.raw.to_adata()
    adata.raw = adata

    print('Normalizing', end = "; ")
    # Normalize and log transform
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)

    print('Finding HVGs', end = "; ")
    # Restrict downstream steps to variable genes
    sc.pp.highly_variable_genes(adata)
    adata = adata[:, adata.var_names[adata.var['highly_variable']]].copy()

    print('Scaling', end = "; ")
    # Scale variable genes
    sc.pp.scale(adata)

    print('PCA', end = "; ")
    # Run PCA
    sc.tl.pca(adata, svd_solver = 'arpack')

    print('Harmony', end = "; ")
    # Integrate subjects
    sce.pp.harmony_integrate(
        adata, 
        'subject.subjectGuid',
        max_iter_harmony = 30,
        verbose = False
    )
    
    print('Neighbors', end = "; ")
    # Find nearest neighbors
    sc.pp.neighbors(
        adata, 
        n_neighbors = 50,
        n_pcs = 30,
        use_rep = 'X_pca_harmony'
    )

    print('UMAP', end = "; ")
    # Run UMAP
    sc.tl.umap(adata, min_dist = 0.05)
    
    print('Renormalizing')
    adata = adata.raw.to_adata()
    adata.raw = adata

    # Normalize and log transform
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    
    return adata

## Identify files in HISE

In [9]:
h5ad_uuids = {
    'monocyte_marker': '85f54006-7242-4474-bfbb-ae52783d8dd6',
    'asdc': 'eb71216b-8f71-4b73-9a5e-1775e1804205',
    'b_naive': '1ae3bda3-c0f9-4b70-8bb6-eb70ce214abc',
    'b_memory': '4e5e30b5-d725-4f35-9d5a-92f9d348e77a',
    'dc': 'bd744517-075d-4118-84f8-6a594f555151',
    'dnt_mait': 'fa2aea32-d174-422e-9334-436bdb6d3404',
    'eryth_platelet': '3321e686-0dc2-4370-b06b-9cbaccb7f227',
    'gdt_cd8aa': 'fe17ec4f-4b45-4c0c-9eb0-b41162e6ab2b',
    'monocyte_label': '4e523f48-d4b7-4445-b681-08e76ed489d7',
    'nk_cells': '91bfb846-8948-4bef-9b59-010d669a3946',
    'plasma_ilc': '0228d6e0-b5db-4b66-b0f8-b609910a298e',
    'pro': '02958072-3eca-4539-b3fc-53d8b7830ba6'
}

## Process data to generate UMAP coords.

In [10]:
def get_file_stem(file):
    stem = re.sub('_2024.+','',file)
    stem = re.sub('.+/','',stem)
    return stem

In [11]:
def check_previous(file, path):
    path_files = os.listdir(path)
    path_bases = [get_file_stem(f) for f in path_files]
    file_base = get_file_stem(file)
    return(file_base in path_bases)

In [None]:
out_h5ads = []
for type_group, uuid in h5ad_uuids.items():
    adata_path = cache_uuid_path(uuid)
    # get L3 types and check for previous processing
    adata = sc.read_h5ad(adata_path, backed = 'r')
    cell_types = adata.obs['AIFI_L3'].copy()
    cell_types = fix_cat_labels(cell_types)
    cell_types = cell_types.unique()
    
    for cell_type in cell_types:
        out_type = format_cell_type(cell_type)
        if type_group == 'monocyte_marker':
            out_type = 'marker-{t}'.format(t = out_type)
        
        out_h5ad = 'output/diha_AIFI_L3_{t}_{d}.h5ad'.format(t = out_type, d = date.today())
        out_csv = 'output/diha_AIFI_L3_{t}_meta_{d}.csv'.format(t = out_type, d = date.today())
        out_parquet = 'output/diha_AIFI_L3_{t}_meta_{d}.parquet'.format(t = out_type, d = date.today())
        
        if check_previous(out_h5ad, 'output/'):
            print('previously analyzed {t}; skipping.'.format(t = cell_type))
            out_h5ads.append(out_h5ad)
            continue
        
        adata = sc.read_h5ad(adata_path)
        adata.obs['AIFI_L3'] = fix_cat_labels(adata.obs['AIFI_L3'])
        adata = adata[adata.obs['AIFI_L3'] == cell_type]
        print('{t}: {n} cells'.format(t = cell_type, n = adata.shape[0]))

        if(adata.shape[0] < 100):
            print('Too few cells. Skipping')
            continue
        
        adata = process_adata(adata)
        adata.write_h5ad(out_h5ad)
        out_h5ads.append(out_h5ad)

    cache_dir = re.sub('diha.+','',adata_path)
    rm_call = 'rm -r {d}'.format(d = cache_dir)
    os.system(rm_call)



previously analyzed Core CD14 monocyte; skipping.




ISG+ CD14 monocyte: 307720 cells
Normalizing; Finding HVGs; Scaling; PCA; Harmony; 

2024-04-09 22:36:30,159 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-04-09 22:38:23,044 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing




C1Q+ CD16 monocyte: 29413 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-04-09 23:02:24,021 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-04-09 23:02:39,371 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing




Intermediate monocyte: 73846 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-04-09 23:07:00,507 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-04-09 23:07:36,220 - harmonypy - INFO - sklearn.KMeans initialization complete.


## Upload L3 h5ad's to HISE

Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

In [13]:
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA Non-CD4 or CD8 T separate L3 objects {d}'.format(d = date.today())

In [14]:
in_files = list(h5ad_uuids.values())
in_files

['85f54006-7242-4474-bfbb-ae52783d8dd6',
 'eb71216b-8f71-4b73-9a5e-1775e1804205',
 '1ae3bda3-c0f9-4b70-8bb6-eb70ce214abc',
 '4e5e30b5-d725-4f35-9d5a-92f9d348e77a',
 'bd744517-075d-4118-84f8-6a594f555151',
 'fa2aea32-d174-422e-9334-436bdb6d3404',
 '3321e686-0dc2-4370-b06b-9cbaccb7f227',
 'fe17ec4f-4b45-4c0c-9eb0-b41162e6ab2b',
 '4e523f48-d4b7-4445-b681-08e76ed489d7',
 '91bfb846-8948-4bef-9b59-010d669a3946',
 '0228d6e0-b5db-4b66-b0f8-b609910a298e',
 '02958072-3eca-4539-b3fc-53d8b7830ba6']

In [21]:
output_files = os.listdir('output')
out_files = []
for f in output_files:
    if '.h5ad' in f:
        out_files.append('output/' + f)

In [22]:
out_files

['output/diha_AIFI_L3_cDC1_2024-04-08.h5ad',
 'output/diha_AIFI_L3_ASDC_2024-04-08.h5ad',
 'output/diha_AIFI_L3_Platelet_2024-04-08.h5ad',
 'output/diha_AIFI_L3_Early_memory_B_cell_2024-04-08.h5ad',
 'output/diha_AIFI_L3_marker-IL1Bpos_CD14_monocyte_2024-04-09.h5ad',
 'output/diha_AIFI_L3_GZMKpos_CD56dim_NK_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_CD14pos_cDC2_2024-04-08.h5ad',
 'output/diha_AIFI_L3_CD27neg_effector_B_cell_2024-04-08.h5ad',
 'output/diha_AIFI_L3_CMP_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_KLRF1neg_effector_Vd1_gdT_2024-04-08.h5ad',
 'output/diha_AIFI_L3_Type_2_polarized_memory_B_cell_2024-04-08.h5ad',
 'output/diha_AIFI_L3_IL1Bpos_CD14_monocyte_2024-04-08.h5ad',
 'output/diha_AIFI_L3_Erythrocyte_2024-04-08.h5ad',
 'output/diha_AIFI_L3_CD8_MAIT_2024-04-08.h5ad',
 'output/diha_AIFI_L3_Plasma_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_Core_naive_B_cell_2024-04-08.h5ad',
 'output/diha_AIFI_L3_Activated_memory_B_cell_2024-04-08.h5ad',
 'output/diha_AIFI_L3_ISGpos

In [23]:
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)

you are trying to upload file_ids... ['output/diha_AIFI_L3_cDC1_2024-04-08.h5ad', 'output/diha_AIFI_L3_ASDC_2024-04-08.h5ad', 'output/diha_AIFI_L3_Platelet_2024-04-08.h5ad', 'output/diha_AIFI_L3_Early_memory_B_cell_2024-04-08.h5ad', 'output/diha_AIFI_L3_marker-IL1Bpos_CD14_monocyte_2024-04-09.h5ad', 'output/diha_AIFI_L3_GZMKpos_CD56dim_NK_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_CD14pos_cDC2_2024-04-08.h5ad', 'output/diha_AIFI_L3_CD27neg_effector_B_cell_2024-04-08.h5ad', 'output/diha_AIFI_L3_CMP_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_KLRF1neg_effector_Vd1_gdT_2024-04-08.h5ad', 'output/diha_AIFI_L3_Type_2_polarized_memory_B_cell_2024-04-08.h5ad', 'output/diha_AIFI_L3_IL1Bpos_CD14_monocyte_2024-04-08.h5ad', 'output/diha_AIFI_L3_Erythrocyte_2024-04-08.h5ad', 'output/diha_AIFI_L3_CD8_MAIT_2024-04-08.h5ad', 'output/diha_AIFI_L3_Plasma_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_Core_naive_B_cell_2024-04-08.h5ad', 'output/diha_AIFI_L3_Activated_memory_B_cell_2024-04-08.h5ad', 'output

(y/n) y


{'trace_id': '379a19c6-6763-4aa0-8f41-6fb5b3281be3',
 'files': ['output/diha_AIFI_L3_cDC1_2024-04-08.h5ad',
  'output/diha_AIFI_L3_ASDC_2024-04-08.h5ad',
  'output/diha_AIFI_L3_Platelet_2024-04-08.h5ad',
  'output/diha_AIFI_L3_Early_memory_B_cell_2024-04-08.h5ad',
  'output/diha_AIFI_L3_marker-IL1Bpos_CD14_monocyte_2024-04-09.h5ad',
  'output/diha_AIFI_L3_GZMKpos_CD56dim_NK_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_CD14pos_cDC2_2024-04-08.h5ad',
  'output/diha_AIFI_L3_CD27neg_effector_B_cell_2024-04-08.h5ad',
  'output/diha_AIFI_L3_CMP_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_KLRF1neg_effector_Vd1_gdT_2024-04-08.h5ad',
  'output/diha_AIFI_L3_Type_2_polarized_memory_B_cell_2024-04-08.h5ad',
  'output/diha_AIFI_L3_IL1Bpos_CD14_monocyte_2024-04-08.h5ad',
  'output/diha_AIFI_L3_Erythrocyte_2024-04-08.h5ad',
  'output/diha_AIFI_L3_CD8_MAIT_2024-04-08.h5ad',
  'output/diha_AIFI_L3_Plasma_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_Core_naive_B_cell_2024-04-08.h5ad',
  'output/diha

In [24]:
import session_info
session_info.show()