# Generate separate L3 review data for CD4 and CD8 T cells

In this notebook, we compute a separate UMAP embedding for each individual AIFI_L3 cell type. These embeddings can be used to assess whether our label refinement has been effective and whether each cell type is largely uniform.

**Note**: Because these are *very* large cell types, you'll need to ensure that a lot of disk space is available (> 400GB)

## Load packages

In [15]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import os
import re
import shutil
import tarfile

import hisepy
import pandas as pd
import scanpy as sc
import scanpy.external as sce

In [16]:
# Directory that receives every output file written by this notebook.
out_dir = 'output'
# exist_ok makes this a no-op on re-runs and avoids the check-then-create race.
os.makedirs(out_dir, exist_ok = True)

## Helper functions

Helpers for reading data based on UUIDs from HISE

In [17]:
def cache_uuid_path(uuid, cache_root = '/home/jupyter/cache'):
    """
    Return the local path of the file cached for a HISE file UUID.

    If no cache directory exists for the UUID yet, the file is first
    requested with hisepy.reader.cache_files().

    Parameters
    ----------
    uuid : str
        HISE file UUID; also the name of the cache sub-directory.
    cache_root : str
        Directory that holds one sub-directory per cached UUID.
        NOTE(review): hisepy is assumed to download into the default
        location; a custom value is only useful for files already on disk.

    Returns
    -------
    str
        '{cache_root}/{uuid}/{filename}', where filename is the first entry
        of the UUID's cache directory in sorted order.

    Raises
    ------
    FileNotFoundError
        If the cache directory is missing or contains no files.
    """
    cache_path = '{r}/{u}'.format(r = cache_root, u = uuid)
    if not os.path.isdir(cache_path):
        hisepy.reader.cache_files([uuid])

    # Sort so the choice is deterministic if the directory holds several files.
    filenames = sorted(os.listdir(cache_path))
    if not filenames:
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = cache_path)
        )

    return '{p}/{f}'.format(p = cache_path, f = filenames[0])

In [18]:
def read_csv_uuid(uuid):
    """Read the cached file for a HISE file UUID with pandas.read_csv()."""
    return pd.read_csv(cache_uuid_path(uuid))

In [19]:
def read_adata_uuid(uuid):
    """Read the cached file for a HISE file UUID with scanpy's read_h5ad()."""
    return sc.read_h5ad(cache_uuid_path(uuid))

This function formats cell types for use in filenames

In [20]:
def format_cell_type(cell_type):
    """
    Make a cell type label safe to use in a filename.

    '+' becomes 'pos', '-' becomes 'neg', and spaces become underscores,
    applied in that order.
    """
    substitutions = (
        ('\\+', 'pos'),
        ('-', 'neg'),
        (' ', '_'),
    )
    formatted = cell_type
    for pattern, replacement in substitutions:
        formatted = re.sub(pattern, replacement, formatted)
    return formatted

This function corrects type labels generated when there are ties in selection of most frequent labels:

In [21]:
def fix_cat_labels(labels):
    """
    Collapse tied label categories to their first candidate label.

    A category is treated as a tie when its text contains '['. Its text is
    expected to be the printed form of a list of labels followed by a second
    line; everything after the first quoted label is discarded, along with
    the leading bracket and quote.

    Parameters
    ----------
    labels : pandas.Series
        Series with categorical dtype.

    Returns
    -------
    pandas.Series
        A new categorical Series in which every tied value has been replaced
        by its first candidate label. The input Series is never modified.
        The tied categories themselves stay in the category list, unused.
    """
    # Work on a copy so the caller's Series is never changed in place.
    labels = labels.copy()

    for cat in labels.cat.categories:
        cat = str(cat)
        if '[' not in cat:
            continue

        # Drop the rest of the first line after the first label, plus the line after it.
        new_cat = re.sub("', .+\n.+", '', cat)
        # Drop the leading bracket and opening quote.
        new_cat = re.sub(r"\['", '', new_cat)

        # Check the live categories: an earlier tie may already have added
        # this label, and adding it twice raises ValueError.
        if new_cat not in labels.cat.categories:
            labels = labels.cat.add_categories(new_cat)
        labels[labels == cat] = new_cat

    return labels

In [23]:
def process_adata(adata):
    """
    Compute a Harmony-integrated UMAP for one AnnData object.

    The matrix stored in adata.raw is restored and log-normalized, then
    highly variable genes are selected, scaled, and reduced by PCA. The PCs
    are integrated with Harmony across 'subject.subjectGuid', and neighbors
    and UMAP are computed on 'X_pca_harmony'. Finally the matrix is restored
    from .raw once more and log-normalized again before being returned.

    Parameters
    ----------
    adata : AnnData
        Must have .raw populated and an obs column 'subject.subjectGuid'.

    Returns
    -------
    AnnData
        Object rebuilt from .raw, normalized and log-transformed.
        NOTE(review): the embeddings computed above are presumably carried
        over by raw.to_adata() - confirm against the anndata version in use.
    """
    def _announce(step):
        # Progress marker; all steps of one run share a single output line.
        print(step, end = "; ")

    def _restore_raw(obj):
        # Rebuild the object from .raw and keep that matrix as the new .raw.
        restored = obj.raw.to_adata()
        restored.raw = restored
        return restored

    def _lognorm(obj):
        # Normalize and log transform in place.
        sc.pp.normalize_total(obj)
        sc.pp.log1p(obj)

    adata = _restore_raw(adata)

    _announce('Normalizing')
    _lognorm(adata)

    # Restrict downstream steps to variable genes
    _announce('Finding HVGs')
    sc.pp.highly_variable_genes(adata)
    hvg_names = adata.var_names[adata.var['highly_variable']]
    adata = adata[:, hvg_names].copy()

    _announce('Scaling')
    sc.pp.scale(adata)

    _announce('PCA')
    sc.tl.pca(adata, svd_solver = 'arpack')

    # Integrate subjects
    _announce('Harmony')
    sce.pp.harmony_integrate(
        adata,
        'subject.subjectGuid',
        max_iter_harmony = 30,
        verbose = False
    )

    _announce('Neighbors')
    sc.pp.neighbors(
        adata,
        n_neighbors = 50,
        n_pcs = 30,
        use_rep = 'X_pca_harmony'
    )

    _announce('UMAP')
    sc.tl.umap(adata, min_dist = 0.05)

    # Swap the scaled HVG matrix back for the matrix kept in .raw
    print('Renormalizing')
    adata = _restore_raw(adata)
    _lognorm(adata)

    return adata

## Identify files in HISE

In [24]:
# HISE file UUIDs of the input .h5ad files, keyed by T cell group.
# Insertion order is the order in which the groups are processed below.
h5ad_uuids = dict([
    ('t_cd8_memory', 'bb05f8df-0003-4ad9-9ee1-85fe386fa279'),
    ('t_cd8_naive', 'f7fb9efe-7791-4be3-a137-372478db40ca'),
    ('treg', 'e4e31bdb-d6b4-4451-bd56-44085b70068d'),
    ('t_cd4_memory', '52ad1826-5e77-41cc-b99a-77e6f145e9a1'),
    ('t_cd4_naive', '76b94b6b-f438-4414-a25c-82f1d473eab9'),
])

## Process data to generate UMAP coords.

In [25]:
def get_file_stem(file):
    """
    Strip the date suffix and the directory from an output file path.

    Everything from the '_YYYY-MM-DD' date stamp onward is removed, then
    everything up to and including the last '/'. The date is matched for any
    year, so files written in different years still share a stem.

    Parameters
    ----------
    file : str
        File name or path, e.g. 'output/diha_AIFI_L3_X_2024-04-09.h5ad'.

    Returns
    -------
    str
        The stem, e.g. 'diha_AIFI_L3_X'. Names without a date stamp only
        lose their directory part.
    """
    stem = re.sub(r'_\d{4}-\d{2}-\d{2}.*', '', file)
    stem = re.sub('.+/', '', stem)
    return stem

In [26]:
def check_previous(file, path):
    """
    Report whether a directory already holds a file with the same stem.

    Stems are compared with get_file_stem(), for both `file` and every
    entry of `path`. Returns True on a match and False otherwise.
    """
    existing_stems = {get_file_stem(entry) for entry in os.listdir(path)}
    wanted_stem = get_file_stem(file)
    if wanted_stem in existing_stems:
        return True
    return False

In [None]:
# Paths of all per-cell-type .h5ad files, both newly written and found from earlier runs.
out_h5ads = []

# Fix the date once so every file of this multi-hour run carries the same stamp.
run_date = date.today()

for uuid in h5ad_uuids.values():
    adata_path = cache_uuid_path(uuid)

    # Get L3 types from a backed (on-disk) read, then release the file handle.
    backed_adata = sc.read_h5ad(adata_path, backed = 'r')
    cell_types = fix_cat_labels(backed_adata.obs['AIFI_L3'].copy()).unique()
    backed_adata.file.close()
    del backed_adata

    # The full object is loaded lazily, at most once per input file.
    group_adata = None

    for cell_type in cell_types:
        out_type = format_cell_type(cell_type)
        out_h5ad = os.path.join(
            out_dir,
            'diha_AIFI_L3_{t}_{d}.h5ad'.format(t = out_type, d = run_date)
        )

        if check_previous(out_h5ad, out_dir):
            print('previously analyzed {t}; skipping.'.format(t = cell_type))
            # Record the file that is actually on disk: it may carry an
            # earlier date than today's, and the upload needs a real path.
            out_stem = get_file_stem(out_h5ad)
            previous = sorted(
                f for f in os.listdir(out_dir)
                if f.endswith('.h5ad') and get_file_stem(f) == out_stem
            )
            if previous:
                # Date stamps sort lexicographically, so the last is the newest.
                out_h5ads.append(os.path.join(out_dir, previous[-1]))
            else:
                out_h5ads.append(out_h5ad)
            continue

        if group_adata is None:
            group_adata = sc.read_h5ad(adata_path)
            group_adata.obs['AIFI_L3'] = fix_cat_labels(group_adata.obs['AIFI_L3'])

        adata = group_adata[group_adata.obs['AIFI_L3'] == cell_type]
        print('{t}: {n} cells'.format(t = cell_type, n = adata.shape[0]))

        if adata.shape[0] < 100:
            print('Too few cells. Skipping')
            continue

        adata = process_adata(adata)
        adata.write_h5ad(out_h5ad)
        out_h5ads.append(out_h5ad)
        # Free the processed object before the next cell type is handled.
        del adata

    del group_adata

    # Remove this file's cache directory to reclaim disk space (best effort).
    cache_dir = os.path.dirname(adata_path)
    shutil.rmtree(cache_dir, ignore_errors = True)

downloading fileID: bb05f8df-0003-4ad9-9ee1-85fe386fa279
Files have been successfully downloaded!
GZMK+ CD27+ EM CD8 T cell: 558187 cells
Normalizing; Finding HVGs; Scaling; PCA; Harmony; 

2024-04-09 17:09:10,716 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-04-09 17:13:31,345 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing
CM CD8 T cell: 224543 cells
Normalizing; Finding HVGs; Scaling; PCA; Harmony; 

2024-04-09 17:57:53,613 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-04-09 18:00:19,798 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing
KLRF1- GZMB+ CD27- EM CD8 T cell: 444330 cells
Normalizing; Finding HVGs; Scaling; PCA; Harmony; 

2024-04-09 18:20:13,203 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-04-09 18:24:22,383 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing
GZMK- CD27+ EM CD8 T cell: 37349 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-04-09 19:16:53,022 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-04-09 19:17:20,531 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing
KLRF1+ GZMB+ CD27- EM CD8 T cell: 217888 cells
Normalizing; Finding HVGs; Scaling; PCA; Harmony; 

2024-04-09 19:26:13,945 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-04-09 19:28:30,949 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing
ISG+ memory CD8 T cell: 4710 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-04-09 19:49:53,832 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-04-09 19:49:58,522 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing
downloading fileID: f7fb9efe-7791-4be3-a137-372478db40ca
Files have been successfully downloaded!




Core naive CD8 T cell: 725788 cells
Normalizing; Finding HVGs; Scaling; PCA; Harmony; 

2024-04-09 20:07:24,168 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-04-09 20:14:17,305 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; 

IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


Renormalizing




SOX4+ naive CD8 T cell: 49932 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-04-09 21:17:32,523 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-04-09 21:18:08,309 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing




ISG+ naive CD8 T cell: 2930 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-04-09 21:23:58,880 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-04-09 21:24:02,398 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing
downloading fileID: e4e31bdb-d6b4-4451-bd56-44085b70068d
Files have been successfully downloaded!




Memory CD4 Treg: 147022 cells
Finding HVGs; Scaling; PCA; 

2024-04-09 21:26:23,416 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-04-09 21:27:51,256 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; UMAP; Renormalizing




KLRB1+ memory CD8 Treg: 4725 cells
Finding HVGs; Scaling; PCA; 

2024-04-09 21:42:55,190 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

## Upload L3 h5ad files to HISE

Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

In [28]:
# HISE study space that receives the upload.
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'

# Upload title, stamped with today's date.
upload_date = date.today()
title = 'DIHA CD4 and CD8 T cell separate L3 objects {d}'.format(d = upload_date)

In [29]:
# UUIDs of every input file, recorded with the upload as its inputs.
in_files = [file_uuid for file_uuid in h5ad_uuids.values()]
in_files

['bb05f8df-0003-4ad9-9ee1-85fe386fa279',
 'f7fb9efe-7791-4be3-a137-372478db40ca',
 'e4e31bdb-d6b4-4451-bd56-44085b70068d',
 '52ad1826-5e77-41cc-b99a-77e6f145e9a1',
 '76b94b6b-f438-4414-a25c-82f1d473eab9']

In [30]:
# Files to upload: the per-cell-type .h5ad paths collected by the processing loop.
# This is an alias of out_h5ads, not a copy.
out_files = out_h5ads
out_files

['output/diha_AIFI_L3_GZMKpos_CD27pos_EM_CD8_T_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_CM_CD8_T_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_KLRF1neg_GZMBpos_CD27neg_EM_CD8_T_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_GZMKneg_CD27pos_EM_CD8_T_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_KLRF1pos_GZMBpos_CD27neg_EM_CD8_T_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_ISGpos_memory_CD8_T_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_Core_naive_CD8_T_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_SOX4pos_naive_CD8_T_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_ISGpos_naive_CD8_T_cell_2024-04-09.h5ad',
 'output/diha_AIFI_L3_Memory_CD4_Treg_2024-04-09.h5ad',
 'output/diha_AIFI_L3_KLRB1pos_memory_CD8_Treg_2024-04-09.h5ad',
 'output/diha_AIFI_L3_Naive_CD4_Treg_2024-04-09.h5ad',
 'output/diha_AIFI_L3_KLRB1pos_memory_CD4_Treg_2024-04-09.h5ad',
 'output/diha_AIFI_L3_GZMKpos_memory_CD4_Treg_2024-04-09.h5ad',
 'output/diha_AIFI_L3_Memory_CD8_Treg_2024-04-09.h5ad',
 'output/diha_AIFI_L3_CM_CD4_T_

In [31]:
# Send the per-cell-type .h5ad files to the study space, recording the
# input file UUIDs alongside them. The call asks for confirmation (y/n).
hisepy.upload.upload_files(
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    files = out_files
)

output/diha_AIFI_L3_GZMKpos_CD27pos_EM_CD8_T_cell_2024-04-09.h5ad
output/diha_AIFI_L3_CM_CD8_T_cell_2024-04-09.h5ad
output/diha_AIFI_L3_KLRF1neg_GZMBpos_CD27neg_EM_CD8_T_cell_2024-04-09.h5ad
output/diha_AIFI_L3_GZMKneg_CD27pos_EM_CD8_T_cell_2024-04-09.h5ad
output/diha_AIFI_L3_KLRF1pos_GZMBpos_CD27neg_EM_CD8_T_cell_2024-04-09.h5ad
output/diha_AIFI_L3_ISGpos_memory_CD8_T_cell_2024-04-09.h5ad
output/diha_AIFI_L3_Core_naive_CD8_T_cell_2024-04-09.h5ad
output/diha_AIFI_L3_SOX4pos_naive_CD8_T_cell_2024-04-09.h5ad
output/diha_AIFI_L3_ISGpos_naive_CD8_T_cell_2024-04-09.h5ad
output/diha_AIFI_L3_Memory_CD4_Treg_2024-04-09.h5ad
output/diha_AIFI_L3_KLRB1pos_memory_CD8_Treg_2024-04-09.h5ad
output/diha_AIFI_L3_Naive_CD4_Treg_2024-04-09.h5ad
output/diha_AIFI_L3_KLRB1pos_memory_CD4_Treg_2024-04-09.h5ad
output/diha_AIFI_L3_GZMKpos_memory_CD4_Treg_2024-04-09.h5ad
output/diha_AIFI_L3_Memory_CD8_Treg_2024-04-09.h5ad
output/diha_AIFI_L3_CM_CD4_T_cell_2024-04-09.h5ad
output/diha_AIFI_L3_GZMBneg_CD27pos_EM_CD

 1


you are trying to upload file_ids... ['output/diha_AIFI_L3_GZMKpos_CD27pos_EM_CD8_T_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_CM_CD8_T_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_KLRF1neg_GZMBpos_CD27neg_EM_CD8_T_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_GZMKneg_CD27pos_EM_CD8_T_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_KLRF1pos_GZMBpos_CD27neg_EM_CD8_T_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_ISGpos_memory_CD8_T_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_Core_naive_CD8_T_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_SOX4pos_naive_CD8_T_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_ISGpos_naive_CD8_T_cell_2024-04-09.h5ad', 'output/diha_AIFI_L3_Memory_CD4_Treg_2024-04-09.h5ad', 'output/diha_AIFI_L3_KLRB1pos_memory_CD8_Treg_2024-04-09.h5ad', 'output/diha_AIFI_L3_Naive_CD4_Treg_2024-04-09.h5ad', 'output/diha_AIFI_L3_KLRB1pos_memory_CD4_Treg_2024-04-09.h5ad', 'output/diha_AIFI_L3_GZMKpos_memory_CD4_Treg_2024-04-09.h5ad', 'output/diha_AIFI_L3_Memory_CD8_Treg_2024-04-09.h5ad', 'output/

(y/n) y


{'trace_id': '9de58cb4-1702-48ff-af33-1b6d826c375a',
 'files': ['output/diha_AIFI_L3_GZMKpos_CD27pos_EM_CD8_T_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_CM_CD8_T_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_KLRF1neg_GZMBpos_CD27neg_EM_CD8_T_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_GZMKneg_CD27pos_EM_CD8_T_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_KLRF1pos_GZMBpos_CD27neg_EM_CD8_T_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_ISGpos_memory_CD8_T_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_Core_naive_CD8_T_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_SOX4pos_naive_CD8_T_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_ISGpos_naive_CD8_T_cell_2024-04-09.h5ad',
  'output/diha_AIFI_L3_Memory_CD4_Treg_2024-04-09.h5ad',
  'output/diha_AIFI_L3_KLRB1pos_memory_CD8_Treg_2024-04-09.h5ad',
  'output/diha_AIFI_L3_Naive_CD4_Treg_2024-04-09.h5ad',
  'output/diha_AIFI_L3_KLRB1pos_memory_CD4_Treg_2024-04-09.h5ad',
  'output/diha_AIFI_L3_GZMKpos_memory_CD4_Treg_2024-04-09.h5ad',
  'output/d

In [None]:
# Record the package versions used for this run.
import session_info
session_info.show()