# Assemble Cell Class .h5ad files

To release our scRNA-seq data, we'll assemble labeled data from samples in groups based on major cell class. Due to the size of some cell classes, we'll divide them slightly differently from AIFI_L1 levels:

- CD4 Naive T cells
- CD4 Memory T cells
- CD8 T cells
- Other T cells (MAIT, Treg, dnT, gdT, Proliferating)
- NK cells
- B cells and Plasmablasts
- Monocytes and DCs
- Other cell types (including ILC and ASDC, which the mapping below assigns to this group)

Users who want the whole set will be able to download each of these files and assemble them.

In [1]:
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
import re
import tarfile

In [2]:
# Directory that receives the final per-group .h5ad files; a no-op if it
# already exists.
os.makedirs('output', exist_ok = True)

In [3]:
# Working directory for the per-sample, per-group split files; a no-op if it
# already exists.
os.makedirs('split_h5ad', exist_ok = True)

## Helper functions

In [4]:
def find_stored_files(search_id, store = 'Service_Core'):
    """
    List files in a HISE project store whose name matches a search pattern.

    Parameters
    ----------
    search_id : str
        Pattern to look for in the file names. It is passed to
        `Series.str.contains`, so it is interpreted as a regular expression.
    store : str
        Name of the project store to list (default 'Service_Core').

    Returns
    -------
    DataFrame with the 'id' and 'name' columns of the matching files.
    """
    ps_df = hisepy.list_files_in_project_store(store)
    ps_df = ps_df[['id', 'name']]
    # na = False: entries without a name count as non-matching instead of
    # putting NaN into the mask, which boolean indexing would reject.
    name_matches = ps_df['name'].str.contains(search_id, na = False)

    return ps_df[name_matches]

In [5]:
def cache_uuid_path(uuid):
    """
    Return the local path of a HISE file, caching it first if needed.

    Parameters
    ----------
    uuid : str
        HISE file UUID.

    Returns
    -------
    str: path of the first entry found in '/home/jupyter/cache/<uuid>'.

    Raises
    ------
    FileNotFoundError
        If the cache directory does not exist or is empty after caching.
    """
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    # Only call HISE when the per-UUID cache directory is not there yet.
    if not os.path.isdir(cache_path):
        hisepy.reader.cache_files([uuid])

    cached_names = os.listdir(cache_path)
    if not cached_names:
        # Fail with a clear message rather than an IndexError on [0].
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = cache_path)
        )

    return '{p}/{f}'.format(p = cache_path, f = cached_names[0])

In [32]:
def element_id(n = 3):
    """
    Build a random identifier from `n` names in the periodic table.

    Parameters
    ----------
    n : int
        Number of names to draw (default 3). Names may repeat.

    Returns
    -------
    str: the drawn names joined with '-'.
    """
    import periodictable
    from random import randrange

    # NOTE(review): randrange(0, 118) draws indices 0-117; confirm that
    # index 0 of periodictable.elements is an entry you want in the pool.
    names = [periodictable.elements[randrange(0, 118)].name for _ in range(n)]
    return '-'.join(names)

In [6]:
# Cohort GUID -> compact cohort label (presumably intended for file names).
cohort_file_dict = dict(
    BR1 = 'SoundLife_YoungAdult',
    BR2 = 'SoundLife_OlderAdult',
)

# Cohort GUID -> human-readable cohort name, written to 'subject.ageGroup'.
cohort_name_dict = dict(
    BR1 = 'Sound Life Young Adult',
    BR2 = 'Sound Life Older Adult',
)

## Find labeled data in HISE

In [7]:
# Name fragment that identifies the labeled input files in the project store.
search_id = 'polonium-tin-curium'

In [8]:
# Files in the 'cohorts' store whose name matches search_id ('id' and 'name').
search_df = find_stored_files(search_id, 'cohorts')

## Assign cell type groups

In [9]:
# Select the stored files whose name matches '.parquet' and renumber the rows.
# NOTE(review): str.contains treats the pattern as a regex by default, so the
# leading '.' matches any character; confirm whether a literal dot is meant.
parquet_mask = search_df['name'].str.contains('.parquet')
pq_df = search_df.loc[parquet_mask].reset_index(drop = True)

In [10]:
# Narrow the parquet files down to those with 'all' in their name.
all_df = pq_df[pq_df['name'].str.contains('all')]

In [11]:
# Cache the first matching parquet file locally and load it as the cell
# metadata table. Raises IndexError if all_df is empty.
meta_uuid = all_df['id'].iloc[0]
meta_file = cache_uuid_path(meta_uuid)
meta = pd.read_parquet(meta_file)

In [12]:
# AIFI_L2 cell type label -> output file group, listed group by group.
# NOTE(review): 'ILC' and 'ASDC' are assigned to 'other' here; confirm this
# matches the grouping described at the top of the notebook.
l2_groups = {
    # CD4 T cells
    'Naive CD4 T cell': 't_cd4_naive',
    'Memory CD4 T cell': 't_cd4_memory',
    # CD8 T cells
    'Memory CD8 T cell': 't_cd8',
    'Naive CD8 T cell': 't_cd8',
    'CD8aa': 't_cd8',
    # Other T cells
    'MAIT': 't_other',
    'gdT': 't_other',
    'Treg': 't_other',
    'Proliferating T cell': 't_other',
    'DN T cell': 't_other',
    # NK cells
    'CD56dim NK cell': 'nk',
    'CD56bright NK cell': 'nk',
    'Proliferating NK cell': 'nk',
    # B cells and plasma cells
    'Naive B cell': 'b_plasma',
    'Memory B cell': 'b_plasma',
    'Transitional B cell': 'b_plasma',
    'Effector B cell': 'b_plasma',
    'Plasma cell': 'b_plasma',
    # Monocytes and DCs
    'CD14 monocyte': 'dc_monocyte',
    'CD16 monocyte': 'dc_monocyte',
    'Intermediate monocyte': 'dc_monocyte',
    'cDC1': 'dc_monocyte',
    'cDC2': 'dc_monocyte',
    'pDC': 'dc_monocyte',
    # Everything else
    'Platelet': 'other',
    'Erythrocyte': 'other',
    'Progenitor cell': 'other',
    'ILC': 'other',
    'ASDC': 'other',
}

In [13]:
# Translate every cell's AIFI_L2 label into its output file group.
# Direct dict indexing is deliberate: an AIFI_L2 label that is missing from
# l2_groups raises KeyError instead of silently producing an unassigned cell.
cell_groups = [l2_groups[l2_type] for l2_type in meta['AIFI_L2']]

meta['cell_group'] = cell_groups

In [14]:
# Number of cells assigned to each output group.
meta['cell_group'].value_counts()

cell_group
t_cd4_naive     2839099
dc_monocyte     2643674
t_cd4_memory    2640499
t_cd8           2264102
b_plasma        1210764
nk              1114975
t_other          987870
other             94244
Name: count, dtype: int64

In [15]:
# Total number of cells across all groups.
sum(meta['cell_group'].value_counts())

13795227

In [16]:
# Group the cell metadata by specimen so each sample can be looked up on its own.
sample_meta = meta.groupby('specimen.specimenGuid')

In [17]:
# Specimen GUID -> that specimen's rows of the metadata table.
# Iterating the groupby yields (key, group) pairs, which dict() consumes.
sample_meta_dict = dict(iter(sample_meta))

In [18]:
# Specimen GUID -> two-column table (barcodes, cell_group) used to label cells.
# Built with a comprehension so its loop variables stay local and do not
# overwrite the `sample_meta` groupby object, keeping earlier cells re-runnable.
cell_group_dict = {
    sample_id: sample_df[['barcodes', 'cell_group']]
    for sample_id, sample_df in sample_meta_dict.items()
}

## Iterate over tar files to split .h5ads

In [19]:
# Distinct group names, in order of first appearance in the metadata.
cell_groups = meta['cell_group'].unique().tolist()

In [20]:
# One sub-directory per cell group for the per-sample split files.
# makedirs(..., exist_ok = True) is safe to re-run and also creates the
# parent 'split_h5ad' directory if it is missing.
for cell_group in cell_groups:
    os.makedirs(f'split_h5ad/{cell_group}', exist_ok = True)

In [21]:
# Stored files whose name matches '.h5ad'; the loop below opens them as tar archives.
# NOTE(review): str.contains treats the pattern as a regex by default, so the
# leading '.' matches any character; confirm against the stored file names
# before making it a literal match.
tar_df = search_df[search_df['name'].str.contains('.h5ad')].copy()

In [22]:
 for i in range(tar_df.shape[0]):
    tar_uuid = tar_df['id'].iloc[i]
    tar_path = cache_uuid_path(tar_uuid)

    tar = tarfile.TarFile(tar_path)
    tar.extractall()

    sample_h5ad_files = ['sample_h5ad/' + f for f in os.listdir('sample_h5ad')]
    
    for h5ad_file in sample_h5ad_files:
        sample_adata = sc.read_h5ad(h5ad_file)
        sample_id = sample_adata.obs['specimen.specimenGuid'].tolist()[0]
        cell_group_df = cell_group_dict[sample_id]

        obs = sample_adata.obs.copy()
        obs = obs.reset_index(drop = True)
        obs['subject.ageGroup'] = [cohort_name_dict[x] for x in obs['cohort.cohortGuid']]
        obs = obs.merge(cell_group_df, how = 'left', on = 'barcodes')
        obs = obs.set_index('barcodes', drop = False)
        
        sample_adata.obs = obs
        
        for cell_group in cell_groups:
            out_file = f'split_h5ad/{cell_group}/{sample_id}.h5ad'
            group_adata = sample_adata[sample_adata.obs['cell_group'] == cell_group].copy()
            group_adata.write_h5ad(out_file)

    os.system('rm -r sample_h5ad')



## Assemble data for each cell group

In [23]:
# Display the group names that will be assembled below.
cell_groups

['b_plasma',
 't_cd8',
 'nk',
 'other',
 't_other',
 't_cd4_memory',
 't_cd4_naive',
 'dc_monocyte']

In [36]:
# Concatenate the per-sample files of each cell group into a single .h5ad in
# output/ and collect the written paths in out_files.
out_files = []
for cell_group in cell_groups:
    group_path = f'split_h5ad/{cell_group}/'
    # Sort for a reproducible cell order and skip anything that is not an
    # .h5ad file (e.g. Jupyter checkpoint directories).
    group_h5ad_files = sorted(
        group_path + f
        for f in os.listdir(group_path)
        if f.endswith('.h5ad')
    )

    group_adata_list = [sc.read_h5ad(sample_h5ad) for sample_h5ad in group_h5ad_files]
    group_adata = sc.concat(group_adata_list)
    # Every cell in the file shares the same group, so the helper column is dropped.
    group_adata.obs = group_adata.obs.drop('cell_group', axis = 1)

    out_file = f'output/SoundLife_{cell_group}.h5ad'
    group_adata.write_h5ad(out_file)
    out_files.append(out_file)


## Upload .h5ad data to HISE

Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

In [37]:
# Study space that receives the upload, and a title stamped with today's date.
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = f'DIHA .h5ad file sets: Cell Class Groups {date.today()}'

In [38]:
# Draw a fresh random identifier for this upload; it is passed as the upload
# destination below. This rebinds search_id, which earlier held the
# identifier of the input files.
search_id = element_id()
search_id

'uranium-dubnium-actinium'

In [39]:
# UUIDs of the input tar files, passed to the upload as input_file_ids.
in_files = tar_df['id'].tolist()
in_files

['04666e28-8443-4a51-8670-f409a7b5afe5',
 'ae2996c3-eab5-4d61-a997-084351727413',
 'b8f48340-ec96-4ed9-bad1-23fcb1a64e70',
 '11d754d9-0323-400b-8c47-8b9193d254d6',
 '6c6c9bbb-ac49-42f1-9e4f-f6a00766f331',
 '712082ed-2fe2-4121-9f89-7f732b4a58a7',
 'e1fe73c4-44d1-4092-ba72-72c5efe657d1',
 'dd3c4973-439f-4987-ac52-12cd86b31021']

In [40]:
# Display the output paths that will be uploaded.
out_files

['output/SoundLife_b_plasma.h5ad',
 'output/SoundLife_t_cd8.h5ad',
 'output/SoundLife_nk.h5ad',
 'output/SoundLife_other.h5ad',
 'output/SoundLife_t_other.h5ad',
 'output/SoundLife_t_cd4_memory.h5ad',
 'output/SoundLife_t_cd4_naive.h5ad',
 'output/SoundLife_dc_monocyte.h5ad']

In [41]:
# Number of output files to upload.
len(out_files)

8

In [42]:
# Upload the assembled files to the study space, listing the input tar UUIDs
# as inputs and using the freshly drawn identifier as the destination.
# The call prompts for (y/n) confirmation, as the captured output shows.
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = search_id
)

you are trying to upload file_ids... ['output/SoundLife_b_plasma.h5ad', 'output/SoundLife_t_cd8.h5ad', 'output/SoundLife_nk.h5ad', 'output/SoundLife_other.h5ad', 'output/SoundLife_t_other.h5ad', 'output/SoundLife_t_cd4_memory.h5ad', 'output/SoundLife_t_cd4_naive.h5ad', 'output/SoundLife_dc_monocyte.h5ad']. Do you truly want to proceed?


(y/n) y


{'trace_id': 'b636c8e4-cc32-45fb-ba5c-30eda7303442',
 'files': ['output/SoundLife_b_plasma.h5ad',
  'output/SoundLife_t_cd8.h5ad',
  'output/SoundLife_nk.h5ad',
  'output/SoundLife_other.h5ad',
  'output/SoundLife_t_other.h5ad',
  'output/SoundLife_t_cd4_memory.h5ad',
  'output/SoundLife_t_cd4_naive.h5ad',
  'output/SoundLife_dc_monocyte.h5ad']}

In [43]:
# Show session information (via the session_info package) as a record of the
# environment this notebook ran in.
import session_info
session_info.show()