# Cluster CellTypist L3 CD4 T cells

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import re
import scanpy as sc

In [2]:
out_dir = 'output'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

## Helper functions

In [3]:
def cache_uuid_path(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        hise_res = hisepy.reader.cache_files([uuid])
    filename = os.listdir(cache_path)[0]
    cache_file = '{p}/{f}'.format(p = cache_path, f = filename)
    return cache_file

In [4]:
def read_csv_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = pd.read_csv(cache_file)
    return res

In [5]:
def read_adata_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = sc.read_h5ad(cache_file)
    return res

In [6]:
def rm_cache_uuid(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    rm_call = 'rm -r {d}'.format(d = cache_path)
    os.system(rm_call)

In [7]:
def format_cell_type(cell_type):
    cell_type = re.sub('\\+', 'pos', cell_type)
    cell_type = re.sub('-', 'neg', cell_type)
    cell_type = re.sub(' ', '_', cell_type)
    return cell_type

In [8]:
def element_id(n = 3):
    import periodictable
    from random import randrange
    rand_el = []
    for i in range(n):
        el = randrange(0,118)
        rand_el.append(periodictable.elements[el].name)
    rand_str = '-'.join(rand_el)
    return rand_str

In [9]:
def process_adata(adata, resolution = 2):
    
    # Keep a copy of the raw data
    adata = adata.raw.to_adata()
    adata.raw = adata

    print('Normalizing', end = "; ")
    # Normalize and log transform
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)

    print('Finding HVGs', end = "; ")
    # Restrict downstream steps to variable genes
    sc.pp.highly_variable_genes(adata)
    adata = adata[:, adata.var_names[adata.var['highly_variable']]].copy()

    print('Scaling', end = "; ")
    # Scale variable genes
    sc.pp.scale(adata)

    print('PCA', end = "; ")
    # Run PCA
    sc.tl.pca(adata, svd_solver = 'arpack')
    
    print('Neighbors', end = "; ")
    # Find nearest neighbors
    sc.pp.neighbors(
        adata, 
        n_neighbors = 50,
        n_pcs = 30
    )

    print('Leiden', end = "; ")
    # Find clusters
    sc.tl.leiden(
        adata, 
        resolution = resolution, 
        key_added = 'leiden_{r}'.format(r = resolution),
        n_iterations = 2
    )

    print('UMAP', end = "; ")
    # Run UMAP
    sc.tl.umap(adata, min_dist = 0.05)
    
    print('Renormalizing')
    adata = adata.raw.to_adata()
    adata.raw = adata

    # Normalize and log transform
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    
    return adata

## Read cell type hierarchy

In [10]:
hierarchy_uuid = '1a44252c-8cab-4c8f-92c9-d8f3af633790'
hierarchy_df = read_csv_uuid(hierarchy_uuid)

In [11]:
hierarchy_df['AIFI_L2'].unique()

array(['Effector B cell', 'Memory B cell', 'Naive B cell', 'Plasma cell',
       'Transitional B cell', 'ASDC', 'cDC1', 'cDC2', 'pDC',
       'Erythrocyte', 'ILC', 'CD14 monocyte', 'CD16 monocyte',
       'Intermediate monocyte', 'CD56bright NK cell', 'CD56dim NK cell',
       'Proliferating NK cell', 'Platelet', 'Progenitor cell', 'CD8aa',
       'DN T cell', 'MAIT', 'Memory CD4 T cell', 'Memory CD8 T cell',
       'Naive CD4 T cell', 'Naive CD8 T cell', 'Proliferating T cell',
       'Treg', 'gdT'], dtype=object)

## Identify files for use in HISE

In [12]:
search_id = 'nitrogen-rhenium-hafnium'
l2_types = ['Memory CD4 T cell', 'Naive CD4 T cell']

Get L3 cell types in the format used for filenames

In [13]:
l3_types = hierarchy_df['AIFI_L3'].loc[hierarchy_df['AIFI_L2'].isin(l2_types)]
l3_types = l3_types.tolist()

In [14]:
l3_file_types = [format_cell_type(ct) for ct in l3_types]

Retrieve files stored in our HISE project store

In [15]:
ps_df = hisepy.list_files_in_project_store('cohorts')
ps_df = ps_df[['id', 'name']]

Filter for files from the previous notebook using our search_id

In [16]:
search_df = ps_df[ps_df['name'].str.contains(search_id)]

Filter for cells related to the L1 cell type based on l3_types

In [17]:
type_string = '|'.join(l3_file_types)
type_df = search_df[search_df['name'].str.contains(type_string)]

In [18]:
type_df['name'].tolist()

['nitrogen-rhenium-hafnium/diha_celltypist_L3_SOX4pos_naive_CD4_T_cell.h5ad',
 'nitrogen-rhenium-hafnium/diha_celltypist_L3_GZMBneg_CD27neg_EM_CD4_T_cell.h5ad',
 'nitrogen-rhenium-hafnium/diha_celltypist_L3_Core_naive_CD4_T_cell.h5ad',
 'nitrogen-rhenium-hafnium/diha_celltypist_L3_ISGpos_memory_CD4_T_cell.h5ad',
 'nitrogen-rhenium-hafnium/diha_celltypist_L3_CM_CD4_T_cell.h5ad',
 'nitrogen-rhenium-hafnium/diha_celltypist_L3_GZMBneg_CD27pos_EM_CD4_T_cell.h5ad',
 'nitrogen-rhenium-hafnium/diha_celltypist_L3_KLRF1neg_GZMBpos_CD27neg_memory_CD4_T_cell.h5ad',
 'nitrogen-rhenium-hafnium/diha_celltypist_L3_ISGpos_naive_CD4_T_cell.h5ad']

## Process data for each L3 type

In [19]:
out_files = []
for uuid in type_df['id']:
    adata = read_adata_uuid(uuid)
    
    cell_type = adata.obs['AIFI_L3'][0]
    out_type = format_cell_type(cell_type)
    
    out_file = 'output/diha_clustered_celltypist_L3_{ct}_{d}.h5ad'.format(ct = out_type, d = date.today())
    if os.path.isfile(out_file):
        print('Previously processed {ct}; Skipping.'.format(ct = out_type))
    else:
        adata = process_adata(adata, resolution = 2)
        adata.write_h5ad(out_file)
        
    out_files.append(out_file)
    
    rm_cache_uuid(uuid)

downloading fileID: 0259f074-282f-4e03-b6ae-80a4721d0640
Files have been successfully downloaded!
Previously processed SOX4pos_naive_CD4_T_cell; Skipping.
downloading fileID: bf42fa00-3f8d-42ad-a4cf-6cbc795047be
Files have been successfully downloaded!
Previously processed GZMBneg_CD27neg_EM_CD4_T_cell; Skipping.
Normalizing; Finding HVGs; Scaling; PCA; Neighbors; Leiden; UMAP; 

IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


Renormalizing
downloading fileID: e3ff7bbd-917a-4676-8958-c021182ce96a
Files have been successfully downloaded!
Normalizing; Finding HVGs; Scaling; PCA; Neighbors; Leiden; UMAP; Renormalizing
downloading fileID: a26d23aa-9592-4ba1-8a9e-774256cbda1e
Files have been successfully downloaded!
Normalizing; Finding HVGs; Scaling; PCA; Neighbors; Leiden; UMAP; 

IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


Renormalizing
downloading fileID: d3c8773b-9f8d-4c89-922a-364871505103
Files have been successfully downloaded!
Normalizing; Finding HVGs; Scaling; PCA; Neighbors; Leiden; UMAP; Renormalizing
downloading fileID: da0969b9-9fd7-495f-a242-5ae8a401dbe1
Files have been successfully downloaded!
Normalizing; Finding HVGs; Scaling; PCA; Neighbors; Leiden; UMAP; Renormalizing
downloading fileID: 88e0f251-f639-4e95-9d69-5cfa12458c71
Files have been successfully downloaded!
Normalizing; Finding HVGs; Scaling; PCA; Neighbors; Leiden; UMAP; Renormalizing


## Upload assembled results to HISE

In [20]:
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA CellTypist L3 CD4 T cells Clustered {d}'.format(d = date.today())

In [21]:
search_id = element_id()
search_id

'niobium-cerium-barium'

In [22]:
in_files = type_df['id'].tolist()
in_files

['0259f074-282f-4e03-b6ae-80a4721d0640',
 'bf42fa00-3f8d-42ad-a4cf-6cbc795047be',
 '5027acc8-7696-4075-a165-a2b533d2ddb6',
 'e3ff7bbd-917a-4676-8958-c021182ce96a',
 'a26d23aa-9592-4ba1-8a9e-774256cbda1e',
 'd3c8773b-9f8d-4c89-922a-364871505103',
 'da0969b9-9fd7-495f-a242-5ae8a401dbe1',
 '88e0f251-f639-4e95-9d69-5cfa12458c71']

In [23]:
out_files

['output/diha_clustered_celltypist_L3_SOX4pos_naive_CD4_T_cell_2024-04-21.h5ad',
 'output/diha_clustered_celltypist_L3_GZMBneg_CD27neg_EM_CD4_T_cell_2024-04-21.h5ad',
 'output/diha_clustered_celltypist_L3_Core_naive_CD4_T_cell_2024-04-21.h5ad',
 'output/diha_clustered_celltypist_L3_ISGpos_memory_CD4_T_cell_2024-04-22.h5ad',
 'output/diha_clustered_celltypist_L3_CM_CD4_T_cell_2024-04-22.h5ad',
 'output/diha_clustered_celltypist_L3_GZMBneg_CD27pos_EM_CD4_T_cell_2024-04-22.h5ad',
 'output/diha_clustered_celltypist_L3_KLRF1neg_GZMBpos_CD27neg_memory_CD4_T_cell_2024-04-22.h5ad',
 'output/diha_clustered_celltypist_L3_ISGpos_naive_CD4_T_cell_2024-04-22.h5ad']

In [24]:
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = search_id
)

you are trying to upload file_ids... ['output/diha_clustered_celltypist_L3_SOX4pos_naive_CD4_T_cell_2024-04-21.h5ad', 'output/diha_clustered_celltypist_L3_GZMBneg_CD27neg_EM_CD4_T_cell_2024-04-21.h5ad', 'output/diha_clustered_celltypist_L3_Core_naive_CD4_T_cell_2024-04-21.h5ad', 'output/diha_clustered_celltypist_L3_ISGpos_memory_CD4_T_cell_2024-04-22.h5ad', 'output/diha_clustered_celltypist_L3_CM_CD4_T_cell_2024-04-22.h5ad', 'output/diha_clustered_celltypist_L3_GZMBneg_CD27pos_EM_CD4_T_cell_2024-04-22.h5ad', 'output/diha_clustered_celltypist_L3_KLRF1neg_GZMBpos_CD27neg_memory_CD4_T_cell_2024-04-22.h5ad', 'output/diha_clustered_celltypist_L3_ISGpos_naive_CD4_T_cell_2024-04-22.h5ad']. Do you truly want to proceed?


(y/n) y


{'trace_id': 'f107c8a1-bc4c-4404-b484-54c659ed4a7a',
 'files': ['output/diha_clustered_celltypist_L3_SOX4pos_naive_CD4_T_cell_2024-04-21.h5ad',
  'output/diha_clustered_celltypist_L3_GZMBneg_CD27neg_EM_CD4_T_cell_2024-04-21.h5ad',
  'output/diha_clustered_celltypist_L3_Core_naive_CD4_T_cell_2024-04-21.h5ad',
  'output/diha_clustered_celltypist_L3_ISGpos_memory_CD4_T_cell_2024-04-22.h5ad',
  'output/diha_clustered_celltypist_L3_CM_CD4_T_cell_2024-04-22.h5ad',
  'output/diha_clustered_celltypist_L3_GZMBneg_CD27pos_EM_CD4_T_cell_2024-04-22.h5ad',
  'output/diha_clustered_celltypist_L3_KLRF1neg_GZMBpos_CD27neg_memory_CD4_T_cell_2024-04-22.h5ad',
  'output/diha_clustered_celltypist_L3_ISGpos_naive_CD4_T_cell_2024-04-22.h5ad']}

In [25]:
import session_info
session_info.show()