# Partition L3 Cell Types

For review post-doublet filtering, we'll partition cell types based on their CellTypist L3 label designations. Later, we'll perform clustering within each group for inspection.

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import re
import scanpy as sc

In [2]:
out_dir = 'output/l3_types'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

## Helper functions

In [3]:
def cache_uuid_path(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        hise_res = hisepy.reader.cache_files([uuid])
    filename = os.listdir(cache_path)[0]
    cache_file = '{p}/{f}'.format(p = cache_path, f = filename)
    return cache_file

In [4]:
def read_parquet_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = pd.read_parquet(cache_file)
    return res

In [5]:
def read_adata_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = sc.read_h5ad(cache_file)
    return res

In [6]:
def rm_cache_uuid(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    rm_call = 'rm -r {d}'.format(d = cache_path)
    os.system(rm_call)

In [7]:
def format_cell_type(cell_type):
    cell_type = re.sub('\\+', 'pos', cell_type)
    cell_type = re.sub('-', 'neg', cell_type)
    cell_type = re.sub(' ', '_', cell_type)
    return cell_type

In [8]:
def element_id(n = 3):
    import periodictable
    from random import randrange
    rand_el = []
    for i in range(n):
        el = randrange(0,118)
        rand_el.append(periodictable.elements[el].name)
    rand_str = '-'.join(rand_el)
    return rand_str

## Read L3 labels from HISE

In [9]:
l3_uuids = ['20e97e8f-2cb9-4fa6-bd84-ddcaf6a2c28b',
            'd595ab6f-d1ad-4c7c-a8f6-395642927262',
            '56e0840c-d432-45e3-ac57-4302b0e350a4',
            '8bfe1a92-35c9-433a-9640-4d7cdd7cbad6']
l3_list = []
for l3_uuid in l3_uuids:
    l3_list.append(read_parquet_uuid(l3_uuid))
l3_labels = pd.concat(l3_list)

In [10]:
l3_labels = l3_labels[['barcodes', 'AIFI_L3', 'AIFI_L3_score']]

## Identify files for use in HISE

Cells that were filtered in a previous notebook to remove most doublets and low-quality cells

In [11]:
search_id = 'lawrencium-chromium-vanadium'

Retrieve files stored in our HISE project store

In [12]:
ps_df = hisepy.list_files_in_project_store('cohorts')
ps_df = ps_df[['id', 'name']]

Filter for files from the previous notebook using our search_id

In [13]:
search_df = ps_df[ps_df['name'].str.contains(search_id)]
search_df = search_df.sort_values('name')

In [14]:
search_df.shape

(65, 2)

In [15]:
h5ad_uuids = {}
for i in range(search_df.shape[0]):
    name = search_df['name'].tolist()[i]
    if '.h5ad' in name:
        group_name = re.sub('.+diha_','',name)
        group_name = re.sub('_filtered.+','',group_name)
        h5ad_uuids[group_name] = search_df['id'].tolist()[i]

In [16]:
h5ad_uuids

{'ASDC': '559cbab1-e3cd-438b-b951-ff7874555625',
 'BR1_Female_Negative_CD14_monocyte': '40dfd96a-5795-479e-8ae7-5608f91db36d',
 'BR1_Female_Negative_CD56dim_NK_cell': 'ceca06b2-f11c-40bf-988c-62ff51063c61',
 'BR1_Female_Negative_Memory_CD4_T_cell': '5cd5c776-e44a-4b84-8869-48abce644ed7',
 'BR1_Female_Negative_Memory_CD8_T_cell': 'c5d24858-9703-42ae-97cd-6834290b18a4',
 'BR1_Female_Negative_Naive_CD4_T_cell': '50fe1d88-49a4-4eb0-bcc8-4ce983646163',
 'BR1_Female_Positive_CD14_monocyte': 'e0270c3a-b428-4dd7-8c8e-f965814776c2',
 'BR1_Female_Positive_CD56dim_NK_cell': '11b3c89c-8157-4b0e-bab3-e3198e8b1544',
 'BR1_Female_Positive_Memory_CD4_T_cell': 'ac35ceb8-1272-477a-8e6b-ed33538ea4cc',
 'BR1_Female_Positive_Memory_CD8_T_cell': '4a44b740-b280-4f5b-9aad-d0521fbb6fe2',
 'BR1_Female_Positive_Naive_CD4_T_cell': '14dd7450-8773-4256-92b6-0b1084d187cb',
 'BR1_Male_Negative_CD14_monocyte': '0fee75d4-6a98-47ec-b1f2-23ff7ad1c18e',
 'BR1_Male_Negative_CD56dim_NK_cell': 'cb6f833b-f286-478f-acd2-989ea4

## Separate files

In [19]:
for group_name, uuid in h5ad_uuids.items():
    adata = read_adata_uuid(uuid)

    # Integrate L3 labels
    obs = adata.obs
    obs = obs.reset_index(drop = True)
    obs = obs.merge(l3_labels, on = 'barcodes', how = 'left')
    obs = obs.set_index('barcodes', drop = False)
    adata.obs = obs
    
    l3_types = adata.obs['AIFI_L3'].unique().tolist()
    
    for l3_type in l3_types:
        out_type = format_cell_type(l3_type)
        type_dir = 'output/l3_types/{ct}'.format(ct = out_type)
        if not os.path.isdir(type_dir):
            os.makedirs(type_dir)
        out_file = '{td}/diha_celltypist_L3_{g}_{ct}.h5ad'.format(td = type_dir, g = group_name, ct = out_type)
        
        type_adata = adata[adata.obs['AIFI_L3'] == l3_type].copy()
        type_adata.write_h5ad(out_file)

    rm_cache_uuid(uuid)

downloading fileID: 559cbab1-e3cd-438b-b951-ff7874555625
Files have been successfully downloaded!
downloading fileID: 40dfd96a-5795-479e-8ae7-5608f91db36d
Files have been successfully downloaded!
downloading fileID: ceca06b2-f11c-40bf-988c-62ff51063c61
Files have been successfully downloaded!
downloading fileID: 5cd5c776-e44a-4b84-8869-48abce644ed7
Files have been successfully downloaded!
downloading fileID: c5d24858-9703-42ae-97cd-6834290b18a4
Files have been successfully downloaded!
downloading fileID: 50fe1d88-49a4-4eb0-bcc8-4ce983646163
Files have been successfully downloaded!
downloading fileID: e0270c3a-b428-4dd7-8c8e-f965814776c2
Files have been successfully downloaded!
downloading fileID: 11b3c89c-8157-4b0e-bab3-e3198e8b1544
Files have been successfully downloaded!
downloading fileID: ac35ceb8-1272-477a-8e6b-ed33538ea4cc
Files have been successfully downloaded!
downloading fileID: 4a44b740-b280-4f5b-9aad-d0521fbb6fe2
Files have been successfully downloaded!
downloading fileID: 

## Merge files for the same type

In [20]:
type_dirs = os.listdir('output/l3_types')
type_h5ads = []
for type_dir in type_dirs:
    type_path = 'output/l3_types/{td}'.format(td = type_dir)
    type_files = os.listdir(type_path)

    adata_list = []
    for type_file in type_files:
        adata = sc.read_h5ad('{tp}/{tf}'.format(tp = type_path, tf = type_file))
        adata_list.append(adata)

    type_adata = sc.concat(adata_list)

    cell_type = type_adata.obs['AIFI_L3'][0]
    out_type = format_cell_type(cell_type)
    
    out_file = 'output/diha_celltypist_L3_{ct}.h5ad'.format(ct = out_type)
    type_adata.write_h5ad(out_file)
    type_h5ads.append(out_file)

## Upload assembled results to HISE

In [26]:
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA CellTypist L3 .h5ads {d}'.format(d = date.today())

In [27]:
search_id = element_id()
search_id

'nitrogen-rhenium-hafnium'

In [28]:
in_files = []
for group_name, uuid in h5ad_uuids.items():
    in_files.append(uuid)
in_files

['559cbab1-e3cd-438b-b951-ff7874555625',
 '40dfd96a-5795-479e-8ae7-5608f91db36d',
 'ceca06b2-f11c-40bf-988c-62ff51063c61',
 '5cd5c776-e44a-4b84-8869-48abce644ed7',
 'c5d24858-9703-42ae-97cd-6834290b18a4',
 '50fe1d88-49a4-4eb0-bcc8-4ce983646163',
 'e0270c3a-b428-4dd7-8c8e-f965814776c2',
 '11b3c89c-8157-4b0e-bab3-e3198e8b1544',
 'ac35ceb8-1272-477a-8e6b-ed33538ea4cc',
 '4a44b740-b280-4f5b-9aad-d0521fbb6fe2',
 '14dd7450-8773-4256-92b6-0b1084d187cb',
 '0fee75d4-6a98-47ec-b1f2-23ff7ad1c18e',
 'cb6f833b-f286-478f-acd2-989ea4a4c7b8',
 '8d566aba-47de-452c-b9f1-bad3d5713135',
 'b2ef0d39-8861-406b-936a-bb242e6809b7',
 '943f09d1-3190-4422-b050-b355daa29360',
 'af58fd39-0156-450f-a38c-3213c3cffc66',
 'aeb088bf-bcef-4170-b07c-0e3282615066',
 'f1c45e38-12d6-4b77-89eb-51bb46de485b',
 '982f64fc-bfb0-476a-808a-77df2828091d',
 '10e95fd0-85e2-40b5-a0f1-3bb1f885ab15',
 '6621dac9-f9b9-422b-90f9-3ea31c6f9545',
 'f01ebaa1-aaab-44b0-83b9-4ce40b89acb4',
 '18661b9d-7973-46ba-a0c7-22d21cd1a70a',
 '461db2c7-2f5a-

In [29]:
out_files = type_h5ads

In [30]:
out_files

['output/diha_celltypist_L3_GZMKpos_Vd2_gdT.h5ad',
 'output/diha_celltypist_L3_GZMKneg_CD27pos_EM_CD8_T_cell.h5ad',
 'output/diha_celltypist_L3_Intermediate_monocyte.h5ad',
 'output/diha_celltypist_L3_ILC.h5ad',
 'output/diha_celltypist_L3_HLAnegDRhi_cDC2.h5ad',
 'output/diha_celltypist_L3_Naive_CD4_Treg.h5ad',
 'output/diha_celltypist_L3_KLRF1pos_effector_Vd1_gdT.h5ad',
 'output/diha_celltypist_L3_GZMKpos_CD27pos_EM_CD8_T_cell.h5ad',
 'output/diha_celltypist_L3_Platelet.h5ad',
 'output/diha_celltypist_L3_ISGpos_CD14_monocyte.h5ad',
 'output/diha_celltypist_L3_Early_memory_B_cell.h5ad',
 'output/diha_celltypist_L3_Memory_CD8_Treg.h5ad',
 'output/diha_celltypist_L3_Core_CD14_monocyte.h5ad',
 'output/diha_celltypist_L3_ISGpos_CD16_monocyte.h5ad',
 'output/diha_celltypist_L3_cDC1.h5ad',
 'output/diha_celltypist_L3_CD8_MAIT.h5ad',
 'output/diha_celltypist_L3_Transitional_B_cell.h5ad',
 'output/diha_celltypist_L3_CLP_cell.h5ad',
 'output/diha_celltypist_L3_SOX4pos_naive_CD4_T_cell.h5ad',
 '

In [31]:
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = search_id
)

Cannot determine the current notebook.
1) /home/jupyter/IH-A-Aging-Analysis-Notebooks/scrna-seq_analysis/02-reference_labeling/09-Python_partition_L3_data.ipynb
2) /home/jupyter/IH-A-Aging-Analysis-Notebooks/scrna-seq_analysis/02-reference_labeling/07a-R_filter_review_plots.ipynb
3) /home/jupyter/IH-A-Aging-Analysis-Notebooks/scrna-seq_analysis/02-reference_labeling/07-Python_filter_cell_classes.ipynb
Please select (1-3) 


 1


you are trying to upload file_ids... ['output/diha_celltypist_L3_GZMKpos_Vd2_gdT.h5ad', 'output/diha_celltypist_L3_GZMKneg_CD27pos_EM_CD8_T_cell.h5ad', 'output/diha_celltypist_L3_Intermediate_monocyte.h5ad', 'output/diha_celltypist_L3_ILC.h5ad', 'output/diha_celltypist_L3_HLAnegDRhi_cDC2.h5ad', 'output/diha_celltypist_L3_Naive_CD4_Treg.h5ad', 'output/diha_celltypist_L3_KLRF1pos_effector_Vd1_gdT.h5ad', 'output/diha_celltypist_L3_GZMKpos_CD27pos_EM_CD8_T_cell.h5ad', 'output/diha_celltypist_L3_Platelet.h5ad', 'output/diha_celltypist_L3_ISGpos_CD14_monocyte.h5ad', 'output/diha_celltypist_L3_Early_memory_B_cell.h5ad', 'output/diha_celltypist_L3_Memory_CD8_Treg.h5ad', 'output/diha_celltypist_L3_Core_CD14_monocyte.h5ad', 'output/diha_celltypist_L3_ISGpos_CD16_monocyte.h5ad', 'output/diha_celltypist_L3_cDC1.h5ad', 'output/diha_celltypist_L3_CD8_MAIT.h5ad', 'output/diha_celltypist_L3_Transitional_B_cell.h5ad', 'output/diha_celltypist_L3_CLP_cell.h5ad', 'output/diha_celltypist_L3_SOX4pos_naive_C

(y/n) y


{'trace_id': '50f40bcc-79de-4910-846a-beac19f11799',
 'files': ['output/diha_celltypist_L3_GZMKpos_Vd2_gdT.h5ad',
  'output/diha_celltypist_L3_GZMKneg_CD27pos_EM_CD8_T_cell.h5ad',
  'output/diha_celltypist_L3_Intermediate_monocyte.h5ad',
  'output/diha_celltypist_L3_ILC.h5ad',
  'output/diha_celltypist_L3_HLAnegDRhi_cDC2.h5ad',
  'output/diha_celltypist_L3_Naive_CD4_Treg.h5ad',
  'output/diha_celltypist_L3_KLRF1pos_effector_Vd1_gdT.h5ad',
  'output/diha_celltypist_L3_GZMKpos_CD27pos_EM_CD8_T_cell.h5ad',
  'output/diha_celltypist_L3_Platelet.h5ad',
  'output/diha_celltypist_L3_ISGpos_CD14_monocyte.h5ad',
  'output/diha_celltypist_L3_Early_memory_B_cell.h5ad',
  'output/diha_celltypist_L3_Memory_CD8_Treg.h5ad',
  'output/diha_celltypist_L3_Core_CD14_monocyte.h5ad',
  'output/diha_celltypist_L3_ISGpos_CD16_monocyte.h5ad',
  'output/diha_celltypist_L3_cDC1.h5ad',
  'output/diha_celltypist_L3_CD8_MAIT.h5ad',
  'output/diha_celltypist_L3_Transitional_B_cell.h5ad',
  'output/diha_celltypist_L

In [32]:
import session_info
session_info.show()