# Retrieve coordinates and metadata for UMAP projections

To review our separate AIFI_L3 data, we'll retrieve the .h5ad files generated for each cell type and extract the UMAP coordinates and metadata within each file.

In [18]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
import re
import tarfile

In [2]:
# Create the output directory if needed; exist_ok avoids the
# check-then-create race and makes the cell safe to re-run.
os.makedirs('output', exist_ok = True)

## Helper functions

Cache a HISE file by its UUID (if it is not already cached) and return the path to the cached file

In [3]:
def cache_uuid_path(uuid):
    """Return the path of the cached file for a HISE file UUID.

    The cache directory is '/home/jupyter/cache/<uuid>'. If that directory
    does not exist yet, the file is fetched with hisepy.reader.cache_files().

    Parameters
    ----------
    uuid : str
        HISE file UUID.

    Returns
    -------
    str
        Path to the first entry listed in the UUID's cache directory.

    Raises
    ------
    FileNotFoundError
        If the cache directory exists but contains no files.
    """
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        # Return value is not needed; the call is made for its caching side effect
        hisepy.reader.cache_files([uuid])

    filenames = os.listdir(cache_path)
    if not filenames:
        # An empty directory would otherwise surface as an opaque IndexError
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = cache_path)
        )

    return os.path.join(cache_path, filenames[0])

Connect to a cached .h5ad file in file-backed mode

In [4]:
def connect_h5ad_uuid(uuid):
    """Open the cached .h5ad file for a UUID with scanpy in backed read mode.

    The file path is resolved by cache_uuid_path(); the result of
    sc.read_h5ad(..., backed = 'r') is returned unchanged.
    """
    return sc.read_h5ad(cache_uuid_path(uuid), backed = 'r')

Read observations and UMAP coordinates and join these results

In [5]:
def obs_with_umap(adata):
    """Return the observation metadata with UMAP coordinates attached.

    The index of adata.obs is moved into a regular column via reset_index(),
    and the two columns of adata.obsm['X_umap'] are added as 'umap_1' and
    'umap_2'. Rows are matched by position. adata.obs itself is not modified
    because reset_index() returns a new DataFrame.
    """
    meta = adata.obs.reset_index()

    coords = pd.DataFrame(adata.obsm['X_umap'], columns = ['umap_1', 'umap_2'])
    for axis_name in ('umap_1', 'umap_2'):
        meta[axis_name] = coords[axis_name]

    return meta

Delete cached files to free up disk space

In [6]:
def delete_cache_uuid(uuid):
    """Remove the cache directory '/home/jupyter/cache/<uuid>' if it exists.

    Uses shutil.rmtree instead of a shell 'rm -r' command so that the path is
    never interpreted by a shell, and so that a failed removal raises instead
    of being silently ignored.

    Parameters
    ----------
    uuid : str
        HISE file UUID whose cache directory should be removed.

    Raises
    ------
    ValueError
        If uuid is empty, which would otherwise target the whole cache root.
    """
    import shutil

    if not uuid:
        raise ValueError('uuid must be a non-empty string')

    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    # Nothing to do when the directory was never cached or is already gone
    if os.path.isdir(cache_path):
        shutil.rmtree(cache_path)

Put all of these steps together so that each cell type can be processed with a single call

In [7]:
def process_uuid(cell_type, uuid):
    """Write metadata + UMAP coordinates for one cell type to CSV and parquet.

    If both output files already exist, no work is done. Otherwise the .h5ad
    file is opened via connect_h5ad_uuid(), the table from obs_with_umap() is
    written to both formats, the backed file handle is closed, and the cached
    input is deleted.

    Parameters
    ----------
    cell_type : str
        Cell type label; used in the output file names and printed as progress.
    uuid : str
        HISE file UUID of the .h5ad file for this cell type.

    Returns
    -------
    list of str
        [csv_path, parquet_path] for this cell type.
    """
    out_csv = 'output/diha_AIFI_L3_{c}_meta_umap.csv'.format(c = cell_type)
    out_parquet = 'output/diha_AIFI_L3_{c}_meta_umap_parquet.parquet'.format(c = cell_type)
    print(cell_type)

    # Require both outputs: a missing CSV would otherwise never be regenerated
    # and would break the later tar step.
    if not (os.path.isfile(out_csv) and os.path.isfile(out_parquet)):
        adata = connect_h5ad_uuid(uuid)
        try:
            out_df = obs_with_umap(adata)

            out_df.to_csv(out_csv)
            out_df.to_parquet(out_parquet)
        finally:
            # Release the backed file handle; while it stays open, deleting
            # the cached file does not actually free the disk space.
            adata.file.close()

        delete_cache_uuid(uuid)

    return [out_csv, out_parquet]

This function generates a quasi-random ID to make searching for outputs easier

In [8]:
def element_id(n = 3):
    """Build a hyphen-joined label from n randomly chosen periodictable entries.

    Each part is the .name of periodictable.elements[i], where i is drawn
    with randrange(0,118), i.e. an index from 0 to 117 inclusive.
    """
    import periodictable
    from random import randrange
    names = [periodictable.elements[randrange(0,118)].name for _ in range(n)]
    return '-'.join(names)

## Identify UUIDs to use from HISE

In [9]:
# Map of cell type label -> HISE file UUID of that cell type's .h5ad file.
#
# Two labels were listed twice in the original literal. Python keeps only the
# last value for a repeated key, so the earlier UUIDs were never used. They are
# kept below as comments so the literal matches what actually runs.
h5ad_uuids = {
    'Activated_memory_B_cell': '6a9c5a03-8204-4ec3-9540-0ca6cc6c2e64',
    'Adaptive_NK_cell': '6dbd1697-983e-4f63-a03e-bb3f12abbb76',
    # Shadowed duplicate of 'ASDC' (never used at runtime):
    # 'ASDC': '42df79a9-3b14-49ac-ad0a-0102b0900945',
    'ASDC': 'ce61e453-80c8-438f-9977-e3d5137186ab',
    'BaEoMaP_cell': 'da2ed67e-6dda-4a94-8718-1ddc79a2571e',
    'C1Qpos_CD16_monocyte': '68987028-f587-4cf6-b055-c624097f7145',
    'CD14pos_cDC2': '20e94e77-aaa4-48f9-bae7-669a10826377',
    'CD27neg_effector_B_cell': 'd73d6756-f497-4fa5-9ced-36be50633961',
    'CD27pos_effector_B_cell': '5fb43708-450a-45a8-8ad2-f945d22f646a',
    'CD4_MAIT': '96fa2384-4241-4f61-a37f-1dfbfe29dba6',
    'CD56bright_NK_cell': '54be0702-f9b0-47c2-99d6-d7723e622696',
    'CD8_MAIT': 'b0e24c46-b4dc-4621-bf80-3ac20eee7c12',
    'CD8aa': '5451f439-2ee7-49ac-a817-461f9e700319',
    'CD95_memory_B_cell': 'a385dabb-0f46-4aa2-aee4-ea3c1a6366f0',
    'cDC1': '47bff835-2572-4edc-b421-63a0adadc2a0',
    'CLP_cell': 'dbe08d85-1697-446a-bf16-6cf99618b870',
    'CM_CD4_T_cell': 'a6196f00-da2e-4261-ae92-9ca30e9057b3',
    'CM_CD8_T_cell': '8d5a6137-0643-40a0-93bb-8e877d67ae83',
    'CMP_cell': '90e382b5-f032-4145-82f4-4f1ece05cc7f',
    'Core_CD14_monocyte': '0f37be28-2311-4e2d-bf26-328d2a8c55a3',
    'Core_CD16_monocyte': '5619713d-47ad-485c-bb8f-c3adbe7d1f18',
    'Core_memory_B_cell': 'a5070a3f-19b3-4b15-84ba-2297791935b7',
    'Core_naive_B_cell': 'c02465d1-1db5-46f1-b645-ec3927bcf302',
    # Shadowed duplicate of 'Core_naive_CD4_T_cell' (never used at runtime):
    # 'Core_naive_CD4_T_cell': '2e355ab5-a605-408a-aab5-8532d571929d',
    'Core_naive_CD4_T_cell': '59fe6ac1-2a2b-4926-8a46-356b99f0623c',
    'Core_naive_CD8_T_cell': '289c7887-579c-4ec4-9e84-671dabefa9dd',
    'DN_T_cell': '5b9dcd89-b718-47c2-8849-18f9d3daf18d',
    'Early_memory_B_cell': '53a0fa60-ae88-4583-8296-ae7075397f1d',
    'Erythrocyte': '27292a34-3540-46c1-bc16-71ee2f336d02',
    'GZMBneg_CD27neg_EM_CD4_T_cell': '8f724d05-9397-4110-a8d3-46dba8ef8004',
    'GZMBneg_CD27pos_EM_CD4_T_cell': '9aa686af-4675-4e47-977a-2569e259588f',
    'GZMBpos_Vd2_gdT': '4b3e28a3-7e02-4cb9-9dfd-d77b6f2f1e89',
    'GZMKneg_CD27pos_EM_CD8_T_cell': 'c5a5d117-8cb4-4bdf-9e09-e9d1cd00e985',
    'GZMKneg_CD56dim_NK_cell': 'ef27ec4a-4523-46a5-b203-91ad2c440c78',
    'GZMKpos_CD27pos_EM_CD8_T_cell': '44b790d4-ee6d-436c-b055-f2a21bf120fa',
    'GZMKpos_CD56dim_NK_cell': 'd02ed96b-ebe0-4728-99ba-d93fbba418a8',
    'GZMKpos_memory_CD4_Treg': 'dc45ab2d-a722-482d-bbc1-ff4659e1850c',
    'GZMKpos_Vd2_gdT': '3f4c5821-117a-4383-9425-7a5695beca39',
    'HLAnegDRhi_cDC2': 'd0fe7d73-edeb-4bd5-960c-e675102b523b',
    'IL1Bpos_CD14_monocyte': '51de3fb2-498c-4e22-b64e-e24c0f6c4ec5',
    'ILC': '44829f73-c319-40ed-bf29-3dabeade4035',
    'Intermediate_monocyte': 'cef25006-e06b-492a-9001-189c4f6174c0',
    'ISGpos_CD14_monocyte': 'cccbcc9c-252d-4ec1-a261-70a3d41b948b',
    'ISGpos_CD16_monocyte': 'a2c6765f-870a-4e43-a597-1cc26a941e13',
    'ISGpos_CD56dim_NK_cell': 'ecb371d8-9154-4305-8b18-faf5346eeed3',
    'ISGpos_cDC2': '87bda2e3-fdad-4745-9825-c5965226d811',
    'ISGpos_MAIT': '243fc806-fbbf-446b-a68e-5a40e0cfd3e4',
    'ISGpos_memory_CD4_T_cell': '6d0c8ae2-3b44-4784-ba7b-d54cbd888222',
    'ISGpos_memory_CD8_T_cell': '4d905695-a183-42af-8060-82714cbd517d',
    'ISGpos_naive_B_cell': 'ad6f083a-7e46-44b7-aab0-b12f3b28905b',
    'ISGpos_naive_CD4_T_cell': 'e4184f05-f65e-436a-b2af-6f93cfca724c',
    'ISGpos_naive_CD8_T_cell': 'ccd3ff87-c139-4408-8ae2-d1ed02096b5f',
    'KLRB1pos_memory_CD4_Treg': 'b50f6471-1f5e-47fc-bcc8-59817f3a6baa',
    'KLRB1pos_memory_CD8_Treg': 'ce93f6a2-807c-431d-b37d-ef6a753c0bcf',
    'KLRF1neg_effector_Vd1_gdT': '5cf49f0c-4228-4731-85e4-35632e8a43df',
    'KLRF1neg_GZMBpos_CD27neg_EM_CD8_T_cell': 'cedee7cc-c8a8-4488-9f23-b7ea7ce2b4de',
    'KLRF1neg_GZMBpos_CD27neg_memory_CD4_T_cell': 'c415a983-3ad0-433a-a85e-17528f7d0c2b',
    'KLRF1pos_effector_Vd1_gdT': '107db90d-0a9f-47c6-bfb5-3103fcac6ce1',
    'KLRF1pos_GZMBpos_CD27neg_EM_CD8_T_cell': 'eefec292-eac0-4fcc-9667-2af5c524cb9d',
    'marker-C1Qpos_CD16_monocyte': '86834c52-6ba2-4c80-9fd1-47e16be69259',
    'marker-Core_CD14_monocyte': 'c14f1cb3-c022-48ca-95bb-3b9d5d5ff9b5',
    'marker-Core_CD16_monocyte': 'a48036ff-cb4d-4a91-80bd-9dac514e3996',
    'marker-IL1Bpos_CD14_monocyte': 'bff78464-8c70-485d-a315-89810606261f',
    'marker-Intermediate_monocyte': 'e68a4ef4-f880-4eb7-b9bb-9d2cc46a75bd',
    'marker-ISGpos_CD14_monocyte': '62c6898c-cac7-4268-9317-2824bed358b3',
    'marker-ISGpos_CD16_monocyte': '641992b0-97a1-4f14-b9f3-f4cf59c83873',
    'Memory_CD4_Treg': 'f6409d32-ad76-4aae-9ebc-24b6724e4494',
    'Memory_CD8_Treg': '413922d9-c9d8-4f3f-8cf7-42bc38c8487c',
    'Naive_CD4_Treg': '707ba49d-25ef-44ce-a864-6c356b0afbbd',
    'Naive_Vd1_gdT': '6a8bb896-6e2b-4d11-85b8-6beb1460474c',
    'pDC': '94153b33-f362-49aa-a418-deefbceadbdd',
    'Plasma_cell': 'aa26b6d0-81f9-47e5-b255-8ea6eb9b463d',
    'Platelet': 'a4423569-4c27-43ca-b913-84ff0e5385e8',
    'Proliferating_NK_cell': '45488616-be38-435f-be7d-6494ac19a281',
    'Proliferating_T_cell': '4fc827c9-0e47-47d7-ac05-a59a9b4fb728',
    'SOX4pos_naive_CD4_T_cell': '6919a01a-11a3-41c6-a9ff-8a2ee8b5522e',
    'SOX4pos_naive_CD8_T_cell': '7998f705-6123-4436-8cba-ff031b8748a9',
    'SOX4pos_Vd1_gdT': 'fd1c7490-ae9a-4c4f-92cf-54d8e91e3944',
    'Transitional_B_cell': '2324fa54-0d49-4f0b-913c-7cabfa084b59',
    'Type_2_polarized_memory_B_cell': '04cf8a76-34cd-42f8-9bef-daaf24fdb56c',
}

## Pull and process files

In [10]:
# One [csv_path, parquet_path] pair per cell type, in dictionary order
out_file_list = [
    process_uuid(cell_type, uuid)
    for cell_type, uuid in h5ad_uuids.items()
]

Activated_memory_B_cell
Adaptive_NK_cell
ASDC
BaEoMaP_cell
C1Qpos_CD16_monocyte
CD14pos_cDC2
CD27neg_effector_B_cell
CD27pos_effector_B_cell
CD4_MAIT
CD56bright_NK_cell
CD8_MAIT
CD8aa
CD95_memory_B_cell
cDC1
CLP_cell
CM_CD4_T_cell
CM_CD8_T_cell
CMP_cell
Core_CD14_monocyte
Core_CD16_monocyte
Core_memory_B_cell
Core_naive_B_cell
Core_naive_CD4_T_cell
Core_naive_CD8_T_cell
DN_T_cell
Early_memory_B_cell
Erythrocyte
GZMBneg_CD27neg_EM_CD4_T_cell
GZMBneg_CD27pos_EM_CD4_T_cell
GZMBpos_Vd2_gdT
GZMKneg_CD27pos_EM_CD8_T_cell
GZMKneg_CD56dim_NK_cell
GZMKpos_CD27pos_EM_CD8_T_cell
GZMKpos_CD56dim_NK_cell
GZMKpos_memory_CD4_Treg
GZMKpos_Vd2_gdT
HLAnegDRhi_cDC2
IL1Bpos_CD14_monocyte
ILC
Intermediate_monocyte
ISGpos_CD14_monocyte
ISGpos_CD16_monocyte
ISGpos_CD56dim_NK_cell
ISGpos_cDC2
ISGpos_MAIT
ISGpos_memory_CD4_T_cell
ISGpos_memory_CD8_T_cell
ISGpos_naive_B_cell
ISGpos_naive_CD4_T_cell
ISGpos_naive_CD8_T_cell
KLRB1pos_memory_CD4_Treg
KLRB1pos_memory_CD8_Treg
KLRF1neg_effector_Vd1_gdT
KLRF1neg_GZMBp

In [19]:
# Split the [csv_path, parquet_path] pairs into one list per file format
out_csvs = [pair[0] for pair in out_file_list]
out_parquets = [pair[1] for pair in out_file_list]

In [None]:
# Bundle all per-cell-type CSV files into one dated, gzip-compressed tarball.
csv_tar = 'output/diha_L3_meta_umap_csv_{d}.tar.gz'.format(d = date.today())
# The context manager closes the archive even if adding a file fails,
# so a partial run does not leak an open file handle.
with tarfile.open(csv_tar, 'w:gz') as tar:
    for csv_file in out_csvs:
        tar.add(csv_file)

In [26]:
# Bundle all per-cell-type parquet files into one dated, gzip-compressed tarball.
parquet_tar = 'output/diha_L3_meta_umap_parquet_{d}.tar.gz'.format(d = date.today())
# The context manager closes the archive even if adding a file fails,
# so a partial run does not leak an open file handle.
with tarfile.open(parquet_tar, 'w:gz') as tar:
    for parquet_file in out_parquets:
        tar.add(parquet_file)

## Upload Cell Type data to HISE

Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

In [27]:
# Study space that receives the upload, and a dated title for the upload
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA Separate L3 Plot Data {d}'.format(
    d = date.today().isoformat()
)

In [28]:
# Random element-name label; it is passed as `destination` in the upload call
search_id = element_id()
search_id

'xenon-cesium-technetium'

In [29]:
# UUIDs of every input .h5ad file, passed as `input_file_ids` in the upload call
in_files = list(h5ad_uuids.values())
in_files

['6a9c5a03-8204-4ec3-9540-0ca6cc6c2e64',
 '6dbd1697-983e-4f63-a03e-bb3f12abbb76',
 'ce61e453-80c8-438f-9977-e3d5137186ab',
 'da2ed67e-6dda-4a94-8718-1ddc79a2571e',
 '68987028-f587-4cf6-b055-c624097f7145',
 '20e94e77-aaa4-48f9-bae7-669a10826377',
 'd73d6756-f497-4fa5-9ced-36be50633961',
 '5fb43708-450a-45a8-8ad2-f945d22f646a',
 '96fa2384-4241-4f61-a37f-1dfbfe29dba6',
 '54be0702-f9b0-47c2-99d6-d7723e622696',
 'b0e24c46-b4dc-4621-bf80-3ac20eee7c12',
 '5451f439-2ee7-49ac-a817-461f9e700319',
 'a385dabb-0f46-4aa2-aee4-ea3c1a6366f0',
 '47bff835-2572-4edc-b421-63a0adadc2a0',
 'dbe08d85-1697-446a-bf16-6cf99618b870',
 'a6196f00-da2e-4261-ae92-9ca30e9057b3',
 '8d5a6137-0643-40a0-93bb-8e877d67ae83',
 '90e382b5-f032-4145-82f4-4f1ece05cc7f',
 '0f37be28-2311-4e2d-bf26-328d2a8c55a3',
 '5619713d-47ad-485c-bb8f-c3adbe7d1f18',
 'a5070a3f-19b3-4b15-84ba-2297791935b7',
 'c02465d1-1db5-46f1-b645-ec3927bcf302',
 '59fe6ac1-2a2b-4926-8a46-356b99f0623c',
 '289c7887-579c-4ec4-9e84-671dabefa9dd',
 '5b9dcd89-b718-

In [30]:
# The two tarballs (CSV and parquet) are uploaded together
out_files = [csv_tar, parquet_tar]
out_files

['output/diha_L3_meta_umap_csv_2024-04-10.tar.gz',
 'output/diha_L3_meta_umap_parquet_2024-04-11.tar.gz']

In [32]:
# Upload both tarballs to the study space, recording the input file UUIDs.
# As the captured output shows, this call asks for interactive (y/n)
# confirmation before uploading.
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = search_id
)

you are trying to upload file_ids... ['output/diha_L3_meta_umap_csv_2024-04-10.tar.gz', 'output/diha_L3_meta_umap_parquet_2024-04-11.tar.gz']. Do you truly want to proceed?


(y/n) y


{'trace_id': 'ac396924-f3a0-45f6-9377-034c420a3e0d',
 'files': ['output/diha_L3_meta_umap_csv_2024-04-10.tar.gz',
  'output/diha_L3_meta_umap_parquet_2024-04-11.tar.gz']}

In [33]:
# Report session information at the end of the notebook for reproducibility
import session_info
session_info.show()