# Generate separate L3 review data

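In this notebook, we compute a separate UMAP embedding for each individual AIFI_L3 cell type, using only the cells assigned to that type. The resulting embeddings and cell metadata can be used to assess whether our label refinement has been effective and whether each cell type is largely uniform.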

## Load packages

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import re
import tarfile

In [2]:
# Directory that receives every file written by this notebook
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)

## Helper functions

Helpers for reading data based on UUIDs from HISE

In [3]:
def cache_uuid_path(uuid):
    """
    Return the path of the locally cached file for a HISE file UUID.

    The cache directory for the UUID is looked up under /home/jupyter/cache/.
    If that directory does not exist yet, hisepy.reader.cache_files() is
    called for the UUID before the directory is listed.

    Parameters
    ----------
    uuid : str
        HISE file UUID. Must be non-empty.

    Returns
    -------
    str
        Path to the first entry listed in the UUID's cache directory.

    Raises
    ------
    ValueError
        If uuid is empty or None. An empty UUID would otherwise resolve to
        the cache root and return an unrelated entry.
    FileNotFoundError
        If the cache directory exists but contains no entries.
    """
    if not uuid:
        raise ValueError('uuid must be a non-empty string')

    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        hisepy.reader.cache_files([uuid])

    filenames = os.listdir(cache_path)
    if not filenames:
        raise FileNotFoundError(
            'no cached file found in {p}'.format(p = cache_path)
        )

    cache_file = '{p}/{f}'.format(p = cache_path, f = filenames[0])
    return cache_file

In [4]:
def read_csv_uuid(uuid):
    """Read the cached file for a HISE UUID as CSV and return the DataFrame."""
    return pd.read_csv(cache_uuid_path(uuid))

In [5]:
def read_adata_uuid(uuid):
    """Read the cached file for a HISE UUID with sc.read_h5ad and return the result."""
    return sc.read_h5ad(cache_uuid_path(uuid))

This function formats cell types for use in filenames

In [6]:
def format_cell_type(cell_type):
    """
    Make a cell type label safe for use in a file name.

    '+' becomes 'pos', '-' becomes 'neg', and spaces become underscores.
    """
    # None of the replacement strings contain a character that is itself
    # replaced, so a single translation pass matches sequential substitution.
    substitutions = str.maketrans({'+': 'pos', '-': 'neg', ' ': '_'})
    return cell_type.translate(substitutions)

This function corrects type labels generated when there are ties in selection of most frequent labels:

In [7]:
def fix_cat_labels(labels):
    """
    Replace list-like tie labels with the first label they contain.

    Any category whose string form contains '[' is treated as a tie label.
    Everything from the first "', " through the end of the following line
    is removed, then the leading "['" is stripped, leaving the first
    listed label. Cells carrying the tie label are reassigned to it.

    Parameters
    ----------
    labels : pandas.Series
        Categorical series of labels.

    Returns
    -------
    pandas.Series
        A new categorical series with tie labels replaced. The input series
        is not modified. The original tie categories are kept as (unused)
        categories.
    """
    # Work on a copy so the caller's series is never modified in place
    labels = labels.copy()

    for cat in labels.cat.categories:
        cat = str(cat)
        if '[' in cat:
            new_cat = re.sub(r"', .+\n.+", '', cat)
            new_cat = re.sub(r"\['", '', new_cat)
            # Check the current categories rather than a snapshot taken before
            # the loop: several tie labels can collapse to the same new label,
            # and adding an existing category raises ValueError.
            if new_cat not in labels.cat.categories:
                labels = labels.cat.add_categories(new_cat)
            labels[labels == cat] = new_cat
    return labels

This function retrieves both observations and UMAP coordinates in a single table for review.

In [8]:
def obs_with_umap(adata):
    """
    Return the observation table with UMAP coordinates added as columns.

    Parameters
    ----------
    adata : AnnData
        Object providing .obs and a two-column .obsm['X_umap'].

    Returns
    -------
    pandas.DataFrame
        adata.obs with 'umap_1' and 'umap_2' columns. The columns are added
        to adata.obs itself, and that same object is returned.
    """
    obs = adata.obs

    umap_mat = adata.obsm['X_umap']
    umap_df = pd.DataFrame(umap_mat, columns = ['umap_1', 'umap_2'])

    # umap_df has a positional RangeIndex while obs is indexed by cell, so
    # assigning the Series directly would align on index and fill the columns
    # with NaN. Assign the raw values so rows are matched by position.
    obs['umap_1'] = umap_df['umap_1'].to_numpy()
    obs['umap_2'] = umap_df['umap_2'].to_numpy()

    return obs

In [9]:
def process_adata(adata):
    """
    Compute a Harmony-integrated UMAP for one set of cells.

    Steps, in order: rebuild the object from .raw; normalize and log
    transform; subset to highly variable genes; scale; PCA; Harmony
    integration across 'subject.subjectGuid'; nearest neighbors on the
    Harmony representation; UMAP. The object is then rebuilt from .raw
    once more and normalized/log transformed again before being returned.

    Progress is printed as each step starts.

    NOTE(review): the embeddings are expected to carry over through the
    final raw.to_adata() call (obs_with_umap reads obsm['X_umap'] from the
    result) -- confirm against the anndata version in use.
    """

    def rebuild_from_raw(current):
        # Start over from the stored raw data and register it as .raw again
        rebuilt = current.raw.to_adata()
        rebuilt.raw = rebuilt
        return rebuilt

    def normalize_and_log(target):
        # Normalize and log transform in place
        sc.pp.normalize_total(target)
        sc.pp.log1p(target)

    # Keep a copy of the raw data
    adata = rebuild_from_raw(adata)

    print('Normalizing', end = "; ")
    normalize_and_log(adata)

    # Restrict downstream steps to variable genes
    print('Finding HVGs', end = "; ")
    sc.pp.highly_variable_genes(adata)
    hvg_names = adata.var_names[adata.var['highly_variable']]
    adata = adata[:, hvg_names].copy()

    # Scale variable genes
    print('Scaling', end = "; ")
    sc.pp.scale(adata)

    print('PCA', end = "; ")
    sc.tl.pca(adata, svd_solver = 'arpack')

    # Integrate subjects
    print('Harmony', end = "; ")
    sce.pp.harmony_integrate(
        adata,
        'subject.subjectGuid',
        max_iter_harmony = 30,
        verbose = False
    )

    # Find nearest neighbors on the integrated representation
    print('Neighbors', end = "; ")
    sc.pp.neighbors(
        adata,
        n_neighbors = 50,
        n_pcs = 30,
        use_rep = 'X_pca_harmony'
    )

    print('UMAP', end = "; ")
    sc.tl.umap(adata, min_dist = 0.05)

    # Go back to the full gene set, then normalize and log transform again
    print('Renormalizing')
    adata = rebuild_from_raw(adata)
    normalize_and_log(adata)

    return adata

## Identify files in HISE

In [10]:
# Maps each cell type group name to the HISE UUID of its .h5ad file.
# NOTE(review): three entries below are empty strings -- presumably
# placeholders for files not yet available; confirm and fill in before
# running the processing loop, which passes every value to cache_uuid_path().
h5ad_uuids = {
    'asdc': 'eb71216b-8f71-4b73-9a5e-1775e1804205',
    'b_naive': '1ae3bda3-c0f9-4b70-8bb6-eb70ce214abc',
    'b_memory': '4e5e30b5-d725-4f35-9d5a-92f9d348e77a',
    'dc': 'bd744517-075d-4118-84f8-6a594f555151',
    'dnt_mait': 'fa2aea32-d174-422e-9334-436bdb6d3404',
    'eryth_platelet': '3321e686-0dc2-4370-b06b-9cbaccb7f227',
    'gdt_cd8aa': 'fe17ec4f-4b45-4c0c-9eb0-b41162e6ab2b',
    'monocyte_label': '4e523f48-d4b7-4445-b681-08e76ed489d7',
    'monocyte_marker': '85f54006-7242-4474-bfbb-ae52783d8dd6',
    'nk_cells': '91bfb846-8948-4bef-9b59-010d669a3946',
    'plasma_ilc': '0228d6e0-b5db-4b66-b0f8-b609910a298e',
    'pro': '02958072-3eca-4539-b3fc-53d8b7830ba6',
    't_cd4_memory': '',  # TODO: UUID missing
    't_cd4_naive': '',  # TODO: UUID missing
    't_cd8_memory': '',  # TODO: UUID missing
    't_cd8_naive': 'f7fb9efe-7791-4be3-a137-372478db40ca',
    'treg': 'e4e31bdb-d6b4-4451-bd56-44085b70068d'
}

## Process data to generate UMAP coords.

In [None]:
# Resolve the date stamp once so that every file written by this run shares
# the same suffix, even if the loop runs past midnight.
run_date = date.today()

out_h5ads = []
out_csvs = []
out_parquets = []
for type_group, uuid in h5ad_uuids.items():
    # Drop the previous group's data before loading anything new
    group_adata = None

    # An empty UUID cannot be resolved to a cached file
    if not uuid:
        print('no UUID for {g}; skipping.'.format(g = type_group))
        continue

    adata_path = cache_uuid_path(uuid)

    # Get L3 types from the backed file, so the full file is not loaded
    # just to check for previous processing. Close the handle afterwards.
    backed_adata = sc.read_h5ad(adata_path, backed = 'r')
    cell_types = backed_adata.obs['AIFI_L3'].copy()
    backed_adata.file.close()
    cell_types = fix_cat_labels(cell_types)
    # Missing labels cannot be formatted into a file name
    cell_types = cell_types.dropna().unique()

    for cell_type in cell_types:
        out_type = format_cell_type(cell_type)
        if type_group == 'monocyte_marker':
            out_type = 'marker-{t}'.format(t = out_type)

        out_h5ad = os.path.join(
            out_dir, 'diha_AIFI_L3_{t}_{d}.h5ad'.format(t = out_type, d = run_date)
        )
        out_csv = os.path.join(
            out_dir, 'diha_AIFI_L3_{t}_meta_{d}.csv'.format(t = out_type, d = run_date)
        )
        out_parquet = os.path.join(
            out_dir, 'diha_AIFI_L3_{t}_meta_{d}.parquet'.format(t = out_type, d = run_date)
        )

        if os.path.isfile(out_h5ad):
            if os.path.isfile(out_csv) and os.path.isfile(out_parquet):
                print('previously analyzed {t}; skipping.'.format(t = cell_type))
            else:
                # An earlier run wrote the .h5ad but not both metadata tables;
                # rebuild them so the recorded paths point at real files.
                print('previously analyzed {t}; rebuilding metadata.'.format(t = cell_type))
                obs_df = obs_with_umap(sc.read_h5ad(out_h5ad))
                obs_df.to_csv(out_csv)
                obs_df.to_parquet(out_parquet)
            out_h5ads.append(out_h5ad)
            out_csvs.append(out_csv)
            out_parquets.append(out_parquet)
            continue

        # Load the group's full data at most once, and only when some cell
        # type in the group actually needs processing.
        if group_adata is None:
            group_adata = sc.read_h5ad(adata_path)
            group_adata.obs['AIFI_L3'] = fix_cat_labels(group_adata.obs['AIFI_L3'])

        adata = group_adata[group_adata.obs['AIFI_L3'] == cell_type]
        print('{t}: {n} cells'.format(t = cell_type, n = adata.shape[0]))

        if adata.shape[0] < 100:
            print('Too few cells. Skipping')
            continue

        adata = process_adata(adata)
        adata.write_h5ad(out_h5ad)
        out_h5ads.append(out_h5ad)

        obs_df = obs_with_umap(adata)
        obs_df.to_csv(out_csv)
        out_csvs.append(out_csv)
        obs_df.to_parquet(out_parquet)
        out_parquets.append(out_parquet)



previously analyzed ASDC; skipping.




previously analyzed Core naive B cell; skipping.
previously analyzed Transitional B cell; skipping.
previously analyzed ISG+ naive B cell; skipping.




previously analyzed Core memory B cell; skipping.
previously analyzed Type 2 polarized memory B cell; skipping.
previously analyzed CD95 memory B cell; skipping.
previously analyzed CD27+ effector B cell; skipping.
previously analyzed Early memory B cell; skipping.
previously analyzed Activated memory B cell; skipping.
previously analyzed CD27- effector B cell; skipping.




previously analyzed cDC1; skipping.
previously analyzed HLA-DRhi cDC2; skipping.
previously analyzed ISG+ cDC2; skipping.




CLP cell: 29 cells
Too few cells. Skipping
previously analyzed CD14+ cDC2; skipping.
previously analyzed pDC; skipping.




previously analyzed DN T cell; skipping.
previously analyzed CD8 MAIT; skipping.
previously analyzed CD4 MAIT; skipping.
previously analyzed ISG+ MAIT; skipping.




previously analyzed Erythrocyte; skipping.
previously analyzed Core naive CD4 T cell; skipping.
previously analyzed Platelet; skipping.




previously analyzed CD8aa; skipping.
previously analyzed KLRF1- effector Vd1 gdT; skipping.
previously analyzed KLRF1+ effector Vd1 gdT; skipping.
previously analyzed SOX4+ Vd1 gdT; skipping.
previously analyzed Naive Vd1 gdT; skipping.
previously analyzed GZMB+ Vd2 gdT; skipping.
previously analyzed GZMK+ Vd2 gdT; skipping.




previously analyzed Core CD14 monocyte; skipping.




ISG+ CD14 monocyte: 268729 cells
Normalizing; Finding HVGs; Scaling; PCA; Harmony; 

2024-04-08 21:14:55,163 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-04-08 21:17:21,942 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing
