# Assemble Subject Group .h5ad files

To release our scRNA-seq data, we'll assemble labeled data from samples in groups based on cohort, biological sex, and CMV status to generate usable chunks of data for download.

Users who want the whole set will be able to download each of these files and assemble them.

In [1]:
from datetime import date
import hisepy
import os
import scanpy as sc
import re
import tarfile

In [2]:
if not os.path.isdir('output'):
    os.mkdir('output')

## Helper functions

In [3]:
def find_stored_files(search_id, store = 'Service_Core'):
    ps_df = hisepy.list_files_in_project_store(store)
    ps_df = ps_df[['id', 'name']]
    search_df = ps_df[ps_df['name'].str.contains(search_id)]
    
    return search_df

In [4]:
def cache_uuid_path(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        hise_res = hisepy.reader.cache_files([uuid])
    filename = os.listdir(cache_path)[0]
    cache_file = '{p}/{f}'.format(p = cache_path, f = filename)
    return cache_file

In [14]:
def element_id(n = 3):
    import periodictable
    from random import randrange
    rand_el = []
    for i in range(n):
        el = randrange(0,118)
        rand_el.append(periodictable.elements[el].name)
    rand_str = '-'.join(rand_el)
    return rand_str

## Find labeled data in HISE

In [5]:
search_id = 'polonium-tin-curium'

In [6]:
search_df = find_stored_files(search_id, 'cohorts')

In [7]:
tar_df = search_df[search_df['name'].str.contains('.h5ad')].copy()

In [8]:
cohort_file_dict = {
    'BR1': 'SoundLife_YoungAdult',
    'BR2': 'SoundLife_OlderAdult'
}
cohort_name_dict = {
    'BR1': 'Sound Life Young Adult',
    'BR2': 'Sound Life Older Adult'
}

In [9]:
file_groups = []
for name in tar_df['name']:
    group = re.sub('.+diha_','',name)
    group = re.sub('_h5ad.+','',group)
    
    if 'BR1' in group:
        group = re.sub('BR1', cohort_file_dict['BR1'], group)
    if 'BR2' in group:
        group = re.sub('BR2', cohort_file_dict['BR2'], group)
    if 'Negative' in group:
        group = re.sub('Negative', 'CMVneg', group)
    if 'Positive' in group:
        group = re.sub('Positive', 'CMVpos', group)
        
    file_groups.append(group)

tar_df['group'] = file_groups

In [10]:
tar_df = tar_df.reset_index(drop = True)
tar_df

Unnamed: 0,id,name,group
0,04666e28-8443-4a51-8670-f409a7b5afe5,polonium-tin-curium/diha_BR1_Female_Negative_h...,SoundLife_YoungAdult_Female_CMVneg
1,ae2996c3-eab5-4d61-a997-084351727413,polonium-tin-curium/diha_BR1_Female_Positive_h...,SoundLife_YoungAdult_Female_CMVpos
2,b8f48340-ec96-4ed9-bad1-23fcb1a64e70,polonium-tin-curium/diha_BR1_Male_Negative_h5a...,SoundLife_YoungAdult_Male_CMVneg
3,11d754d9-0323-400b-8c47-8b9193d254d6,polonium-tin-curium/diha_BR1_Male_Positive_h5a...,SoundLife_YoungAdult_Male_CMVpos
4,6c6c9bbb-ac49-42f1-9e4f-f6a00766f331,polonium-tin-curium/diha_BR2_Female_Negative_h...,SoundLife_OlderAdult_Female_CMVneg
5,712082ed-2fe2-4121-9f89-7f732b4a58a7,polonium-tin-curium/diha_BR2_Female_Positive_h...,SoundLife_OlderAdult_Female_CMVpos
6,e1fe73c4-44d1-4092-ba72-72c5efe657d1,polonium-tin-curium/diha_BR2_Male_Negative_h5a...,SoundLife_OlderAdult_Male_CMVneg
7,dd3c4973-439f-4987-ac52-12cd86b31021,polonium-tin-curium/diha_BR2_Male_Positive_h5a...,SoundLife_OlderAdult_Male_CMVpos


## Assemble .h5ads from each .tar file

In [11]:
out_files = []
for i in range(tar_df.shape[0]):
    group = tar_df['group'].iloc[i]
    out_file = f'output/{group}.h5ad'
    
    if os.path.isfile(out_file):
        print(f'{group} previously processed. Skipping.')
        out_files.append(out_file)
    else:
        print(f'Processing {group}.')
        
        tar_uuid = tar_df['id'].iloc[i]
        tar_path = cache_uuid_path(tar_uuid)
        
        tar = tarfile.TarFile(tar_path)
        tar.extractall()
        
        sample_h5ad_files = ['sample_h5ad/' + f for f in os.listdir('sample_h5ad')]
        
        adata_list = []
        for h5ad_file in sample_h5ad_files:
            sample_adata = sc.read_h5ad(h5ad_file)
            adata_list.append(sample_adata)
        adata = sc.concat(adata_list)
        
        adata.obs['subject.ageGroup'] = [cohort_name_dict[x] for x in adata.obs['cohort.cohortGuid']]
        
        n_samples = len(adata.obs['sample.sampleKitGuid'].unique().tolist())
        n_cells = adata.shape[0]
        
        adata.write_h5ad(out_file)
        out_files.append(out_file)

        os.system('rm -r sample_h5ad')

SoundLife_YoungAdult_Female_CMVneg previously processed. Skipping.
Processing SoundLife_YoungAdult_Female_CMVpos.
Processing SoundLife_YoungAdult_Male_CMVneg.
Processing SoundLife_YoungAdult_Male_CMVpos.
downloading fileID: 11d754d9-0323-400b-8c47-8b9193d254d6
Files have been successfully downloaded!
Processing SoundLife_OlderAdult_Female_CMVneg.
downloading fileID: 6c6c9bbb-ac49-42f1-9e4f-f6a00766f331
Files have been successfully downloaded!
Processing SoundLife_OlderAdult_Female_CMVpos.
downloading fileID: 712082ed-2fe2-4121-9f89-7f732b4a58a7
Files have been successfully downloaded!
Processing SoundLife_OlderAdult_Male_CMVneg.
downloading fileID: e1fe73c4-44d1-4092-ba72-72c5efe657d1
Files have been successfully downloaded!
Processing SoundLife_OlderAdult_Male_CMVpos.
downloading fileID: dd3c4973-439f-4987-ac52-12cd86b31021
Files have been successfully downloaded!


## Upload .h5ad data to HISE

Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

In [12]:
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA .h5ad file sets: Subject Groups {d}'.format(d = date.today())

In [15]:
search_id = element_id()
search_id

'nihonium-seaborgium-technetium'

In [16]:
in_files = tar_df['id'].tolist()
in_files

['04666e28-8443-4a51-8670-f409a7b5afe5',
 'ae2996c3-eab5-4d61-a997-084351727413',
 'b8f48340-ec96-4ed9-bad1-23fcb1a64e70',
 '11d754d9-0323-400b-8c47-8b9193d254d6',
 '6c6c9bbb-ac49-42f1-9e4f-f6a00766f331',
 '712082ed-2fe2-4121-9f89-7f732b4a58a7',
 'e1fe73c4-44d1-4092-ba72-72c5efe657d1',
 'dd3c4973-439f-4987-ac52-12cd86b31021']

In [17]:
out_files

['output/SoundLife_YoungAdult_Female_CMVneg.h5ad',
 'output/SoundLife_YoungAdult_Female_CMVpos.h5ad',
 'output/SoundLife_YoungAdult_Male_CMVneg.h5ad',
 'output/SoundLife_YoungAdult_Male_CMVpos.h5ad',
 'output/SoundLife_OlderAdult_Female_CMVneg.h5ad',
 'output/SoundLife_OlderAdult_Female_CMVpos.h5ad',
 'output/SoundLife_OlderAdult_Male_CMVneg.h5ad',
 'output/SoundLife_OlderAdult_Male_CMVpos.h5ad']

In [18]:
len(out_files)

8

In [19]:
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = search_id
)

you are trying to upload file_ids... ['output/SoundLife_YoungAdult_Female_CMVneg.h5ad', 'output/SoundLife_YoungAdult_Female_CMVpos.h5ad', 'output/SoundLife_YoungAdult_Male_CMVneg.h5ad', 'output/SoundLife_YoungAdult_Male_CMVpos.h5ad', 'output/SoundLife_OlderAdult_Female_CMVneg.h5ad', 'output/SoundLife_OlderAdult_Female_CMVpos.h5ad', 'output/SoundLife_OlderAdult_Male_CMVneg.h5ad', 'output/SoundLife_OlderAdult_Male_CMVpos.h5ad']. Do you truly want to proceed?


(y/n) y


{'trace_id': '7ddfb232-59f5-4879-a4df-13c298db984a',
 'files': ['output/SoundLife_YoungAdult_Female_CMVneg.h5ad',
  'output/SoundLife_YoungAdult_Female_CMVpos.h5ad',
  'output/SoundLife_YoungAdult_Male_CMVneg.h5ad',
  'output/SoundLife_YoungAdult_Male_CMVpos.h5ad',
  'output/SoundLife_OlderAdult_Female_CMVneg.h5ad',
  'output/SoundLife_OlderAdult_Female_CMVpos.h5ad',
  'output/SoundLife_OlderAdult_Male_CMVneg.h5ad',
  'output/SoundLife_OlderAdult_Male_CMVpos.h5ad']}

In [20]:
import session_info
session_info.show()