# Cluster CellTypist L3 B cells

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import re
import scanpy as sc

In [2]:
out_dir = 'output'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

## Helper functions

In [3]:
def cache_uuid_path(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        hise_res = hisepy.reader.cache_files([uuid])
    filename = os.listdir(cache_path)[0]
    cache_file = '{p}/{f}'.format(p = cache_path, f = filename)
    return cache_file

In [4]:
def read_csv_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = pd.read_csv(cache_file)
    return res

In [5]:
def read_adata_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = sc.read_h5ad(cache_file)
    return res

In [6]:
def rm_cache_uuid(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    rm_call = 'rm -r {d}'.format(d = cache_path)
    os.system(rm_call)

In [7]:
def format_cell_type(cell_type):
    cell_type = re.sub('\\+', 'pos', cell_type)
    cell_type = re.sub('-', 'neg', cell_type)
    cell_type = re.sub(' ', '_', cell_type)
    return cell_type

In [8]:
def element_id(n = 3):
    import periodictable
    from random import randrange
    rand_el = []
    for i in range(n):
        el = randrange(0,118)
        rand_el.append(periodictable.elements[el].name)
    rand_str = '-'.join(rand_el)
    return rand_str

In [9]:
def process_adata(adata, resolution = 2):
    
    # Keep a copy of the raw data
    adata = adata.raw.to_adata()
    adata.raw = adata

    print('Normalizing', end = "; ")
    # Normalize and log transform
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)

    print('Finding HVGs', end = "; ")
    # Restrict downstream steps to variable genes
    sc.pp.highly_variable_genes(adata)
    adata = adata[:, adata.var_names[adata.var['highly_variable']]].copy()

    print('Scaling', end = "; ")
    # Scale variable genes
    sc.pp.scale(adata)

    print('PCA', end = "; ")
    # Run PCA
    sc.tl.pca(adata, svd_solver = 'arpack')
    
    print('Neighbors', end = "; ")
    # Find nearest neighbors
    sc.pp.neighbors(
        adata, 
        n_neighbors = 50,
        n_pcs = 30
    )

    print('Leiden', end = "; ")
    # Find clusters
    sc.tl.leiden(
        adata, 
        resolution = resolution, 
        key_added = 'leiden_{r}'.format(r = resolution),
        n_iterations = 2
    )

    print('UMAP', end = "; ")
    # Run UMAP
    sc.tl.umap(adata, min_dist = 0.05)
    
    print('Renormalizing')
    adata = adata.raw.to_adata()
    adata.raw = adata

    # Normalize and log transform
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    
    return adata

## Read cell type hierarchy

In [10]:
hierarchy_uuid = '1a44252c-8cab-4c8f-92c9-d8f3af633790'
hierarchy_df = read_csv_uuid(hierarchy_uuid)

## Identify files for use in HISE

In [11]:
search_id = 'gadolinium-mercury-cadmium'
l1_types = ['B cell']

Get L3 cell types in the format used for filenames

In [13]:
l3_types = hierarchy_df['AIFI_L3'].loc[hierarchy_df['AIFI_L1'].isin(l1_types)]
l3_types = l3_types.tolist()

In [14]:
l3_file_types = [format_cell_type(ct) for ct in l3_types]

Retrieve files stored in our HISE project store

In [15]:
ps_df = hisepy.list_files_in_project_store('cohorts')
ps_df = ps_df[['id', 'name']]

Filter for files from the previous notebook using our search_id

In [16]:
search_df = ps_df[ps_df['name'].str.contains(search_id)]

Filter for cells related to the L1 cell type based on l3_types

In [21]:
type_string = '|'.join(l3_file_types)
type_df = search_df[search_df['name'].str.contains(type_string)]

In [24]:
type_df['name'].tolist()

['gadolinium-mercury-cadmium/diha_celltypist_L3_CD27neg_effector_B_cell.h5ad',
 'gadolinium-mercury-cadmium/diha_celltypist_L3_CD27pos_effector_B_cell.h5ad',
 'gadolinium-mercury-cadmium/diha_celltypist_L3_ISGpos_naive_B_cell.h5ad',
 'gadolinium-mercury-cadmium/diha_celltypist_L3_Plasma_cell.h5ad',
 'gadolinium-mercury-cadmium/diha_celltypist_L3_Early_memory_B_cell.h5ad',
 'gadolinium-mercury-cadmium/diha_celltypist_L3_Activated_memory_B_cell.h5ad',
 'gadolinium-mercury-cadmium/diha_celltypist_L3_Type_2_polarized_memory_B_cell.h5ad',
 'gadolinium-mercury-cadmium/diha_celltypist_L3_Transitional_B_cell.h5ad',
 'gadolinium-mercury-cadmium/diha_celltypist_L3_CD95_memory_B_cell.h5ad',
 'gadolinium-mercury-cadmium/diha_celltypist_L3_Core_memory_B_cell.h5ad',
 'gadolinium-mercury-cadmium/diha_celltypist_L3_Core_naive_B_cell.h5ad']

## Process data for each L3 type

In [25]:
for uuid in type_df['id']:
    adata = read_adata_uuid(uuid)
    
    cell_type = adata.obs['AIFI_L3'][0]
    out_type = format_cell_type(cell_type)
    
    out_file = 'output/diha_clustered_celltypist_L3_{ct}_{d}.h5ad'.format(ct = out_type, d = date.today())
    if os.path.isfile(out_file):
        print('Previously processed {ct}; Skipping.'.format(ct = out_type))
        out_files.append(out_file)
    else:
        adata = process_adata(adata, resolution = 2)
        adata.write_h5ad(out_file)
    
    rm_cache_uuid(uuid)

downloading fileID: 1b86698f-b7b6-4981-9047-2876b3bec2ca
Files have been successfully downloaded!
Normalizing; Finding HVGs; Scaling; PCA; Neighbors; Leiden; UMAP; Renormalizing
downloading fileID: fbd9e245-0f42-4eb8-aaef-a91a800247ab
Files have been successfully downloaded!
Normalizing; Finding HVGs; Scaling; PCA; Neighbors; Leiden; UMAP; Renormalizing
downloading fileID: 98145ad6-78b8-4a37-9ee6-694f53bc0388
Files have been successfully downloaded!
Normalizing; Finding HVGs; Scaling; PCA; Neighbors; Leiden; UMAP; Renormalizing
downloading fileID: 9c247835-999c-441f-a0a1-f3987259fcbb
Files have been successfully downloaded!
Normalizing; Finding HVGs; Scaling; PCA; Neighbors; Leiden; UMAP; Renormalizing
downloading fileID: 64567d64-384e-4d3e-9764-7e5378c104f2
Files have been successfully downloaded!
Normalizing; Finding HVGs; Scaling; PCA; Neighbors; Leiden; UMAP; Renormalizing
downloading fileID: 78d996ec-b373-465a-9525-ab5604c303ba
Files have been successfully downloaded!
Normalizing;

## Upload assembled results to HISE

In [44]:
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA CellTypist L3 B cells Clustered {d}'.format(d = date.today())

In [45]:
search_id = element_id()
search_id

'aluminum-gallium-gold'

In [46]:
in_files = type_df['id'].tolist()
in_files

['1b86698f-b7b6-4981-9047-2876b3bec2ca',
 'fbd9e245-0f42-4eb8-aaef-a91a800247ab',
 '98145ad6-78b8-4a37-9ee6-694f53bc0388',
 '9c247835-999c-441f-a0a1-f3987259fcbb',
 '64567d64-384e-4d3e-9764-7e5378c104f2',
 '78d996ec-b373-465a-9525-ab5604c303ba',
 'fc733449-d407-4a21-9a5b-2ee4f5557828',
 '31a76abf-0b28-43cc-b605-3f375d9d34fc',
 'c7010ad8-4de4-4659-ae41-557add2dd92e',
 'ccb68279-73da-4498-b784-96269d3aa2e4',
 '41e27829-f53a-4475-b588-b5fdb90f6b49']

In [47]:
out_files = ['output/' + f for f in os.listdir('output')]

In [48]:
out_files

['output/diha_clustered_celltypist_L3_Core_naive_B_cell_2024-04-13.h5ad',
 'output/diha_clustered_celltypist_L3_Early_memory_B_cell_2024-04-13.h5ad',
 'output/diha_clustered_celltypist_L3_ISGpos_naive_B_cell_2024-04-13.h5ad',
 'output/diha_clustered_celltypist_L3_Activated_memory_B_cell_2024-04-13.h5ad',
 'output/diha_clustered_celltypist_L3_CD95_memory_B_cell_2024-04-13.h5ad',
 'output/diha_clustered_celltypist_L3_Type_2_polarized_memory_B_cell_2024-04-13.h5ad',
 'output/diha_clustered_celltypist_L3_CD27neg_effector_B_cell_2024-04-13.h5ad',
 'output/diha_clustered_celltypist_L3_Core_memory_B_cell_2024-04-13.h5ad',
 'output/diha_clustered_celltypist_L3_Transitional_B_cell_2024-04-13.h5ad',
 'output/diha_clustered_celltypist_L3_CD27pos_effector_B_cell_2024-04-13.h5ad',
 'output/diha_clustered_celltypist_L3_Plasma_cell_2024-04-13.h5ad']

In [49]:
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = search_id
)

output/diha_clustered_celltypist_L3_Core_naive_B_cell_2024-04-13.h5ad
output/diha_clustered_celltypist_L3_Early_memory_B_cell_2024-04-13.h5ad
output/diha_clustered_celltypist_L3_ISGpos_naive_B_cell_2024-04-13.h5ad
output/diha_clustered_celltypist_L3_Activated_memory_B_cell_2024-04-13.h5ad
output/diha_clustered_celltypist_L3_CD95_memory_B_cell_2024-04-13.h5ad
output/diha_clustered_celltypist_L3_Type_2_polarized_memory_B_cell_2024-04-13.h5ad
output/diha_clustered_celltypist_L3_CD27neg_effector_B_cell_2024-04-13.h5ad
output/diha_clustered_celltypist_L3_Core_memory_B_cell_2024-04-13.h5ad
output/diha_clustered_celltypist_L3_Transitional_B_cell_2024-04-13.h5ad
output/diha_clustered_celltypist_L3_CD27pos_effector_B_cell_2024-04-13.h5ad
output/diha_clustered_celltypist_L3_Plasma_cell_2024-04-13.h5ad
you are trying to upload file_ids... ['output/diha_clustered_celltypist_L3_Core_naive_B_cell_2024-04-13.h5ad', 'output/diha_clustered_celltypist_L3_Early_memory_B_cell_2024-04-13.h5ad', 'output/dih

(y/n) y


{'trace_id': 'fddd9d49-3b1f-4e9f-80c1-7b1253ae5105',
 'files': ['output/diha_clustered_celltypist_L3_Core_naive_B_cell_2024-04-13.h5ad',
  'output/diha_clustered_celltypist_L3_Early_memory_B_cell_2024-04-13.h5ad',
  'output/diha_clustered_celltypist_L3_ISGpos_naive_B_cell_2024-04-13.h5ad',
  'output/diha_clustered_celltypist_L3_Activated_memory_B_cell_2024-04-13.h5ad',
  'output/diha_clustered_celltypist_L3_CD95_memory_B_cell_2024-04-13.h5ad',
  'output/diha_clustered_celltypist_L3_Type_2_polarized_memory_B_cell_2024-04-13.h5ad',
  'output/diha_clustered_celltypist_L3_CD27neg_effector_B_cell_2024-04-13.h5ad',
  'output/diha_clustered_celltypist_L3_Core_memory_B_cell_2024-04-13.h5ad',
  'output/diha_clustered_celltypist_L3_Transitional_B_cell_2024-04-13.h5ad',
  'output/diha_clustered_celltypist_L3_CD27pos_effector_B_cell_2024-04-13.h5ad',
  'output/diha_clustered_celltypist_L3_Plasma_cell_2024-04-13.h5ad']}

In [None]:
import session_info
session_info.show()