# Assemble data per sample

In this notebook, we'll read the filtered L3 data from HISE, split each type by sample, then re-assemble sample-specific files for downstream analyses.

To help with portability, samples will be assembled into .tar files based on cohort, sex, and CMV status so that we have a final set of 8 groups of samples.

In [39]:
import anndata
from datetime import date
import hisepy
import itertools
import multiprocessing
import os
import pandas as pd
import re
import scanpy as sc
import tarfile

In [2]:
# Directory that collects the output files written by this notebook
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)

In [3]:
# Directory that receives the re-assembled per-sample .h5ad files
sample_dir = 'sample_h5ad'
os.makedirs(sample_dir, exist_ok = True)

## Helper functions

In [4]:
def cache_uuid_path(uuid, cache_root = '/home/jupyter/cache'):
    """
    Return the local path of the cached file for a HISE file UUID.

    Looks for a directory named after `uuid` under `cache_root`. If that
    directory is missing or empty (for example after an interrupted
    download), the file is requested with hisepy.reader.cache_files() and
    the directory is checked again.

    Parameters
    ----------
    uuid : str
        HISE file UUID; also used as the name of the cache sub-directory.
    cache_root : str
        Directory expected to hold one sub-directory per cached UUID.
        NOTE(review): the default is the location this notebook has always
        read from; hisepy decides where downloads are written, so confirm
        before pointing this anywhere else.

    Returns
    -------
    str
        Path to the first file (in sorted name order) in the UUID's cache
        directory.

    Raises
    ------
    FileNotFoundError
        If the cache directory holds no file after the download attempt.
    """
    cache_path = os.path.join(cache_root, str(uuid))

    def cached_filenames():
        # Sorted so the selected file does not depend on os.listdir() order
        if not os.path.isdir(cache_path):
            return []
        return sorted(os.listdir(cache_path))

    filenames = cached_filenames()
    if not filenames:
        # Directory missing or empty: (re-)request the file from HISE
        hisepy.reader.cache_files([uuid])
        filenames = cached_filenames()

    if not filenames:
        raise FileNotFoundError(
            'No cached file found for {u} in {p}'.format(u = uuid, p = cache_path)
        )

    return os.path.join(cache_path, filenames[0])

In [5]:
def read_adata_uuid(uuid):
    """Read the cached file for a HISE file UUID with sc.read_h5ad()."""
    h5ad_path = cache_uuid_path(uuid)
    return sc.read_h5ad(h5ad_path)

In [6]:
def read_csv_uuid(uuid):
    """Read the cached file for a HISE file UUID with pd.read_csv()."""
    csv_path = cache_uuid_path(uuid)
    return pd.read_csv(csv_path)

In [7]:
def filename_cell_type(filename):
    """
    Convert a file name into a cell type label.

    Everything up to and including the last 'L3_' is removed, then everything
    from '_2024' onward. In what remains, underscores become spaces, 'pos'
    becomes '+', and 'neg' becomes '-'.
    """
    # Trim the leading prefix first, then the trailing '_2024...' suffix
    label = re.sub('_2024.+', '', re.sub('.+L3_', '', filename))

    # Apply the character substitutions in the same order as always
    for pattern, replacement in (('_', ' '), ('pos', '+'), ('neg', '-')):
        label = re.sub(pattern, replacement, label)

    return label

In [8]:
def prep_adata_uuid(uuid, keep_obs, meta):
    """
    Read an AnnData by HISE file UUID and rebuild it with merged metadata.

    The object is read with read_adata_uuid() and replaced by the result of
    `.raw.to_adata()`. Its obs table is reduced to the `keep_obs` columns,
    left-joined to `meta` on 'sample.sampleKitGuid', and indexed by
    'barcodes' (the column is also kept). A new AnnData is returned holding
    the X and var of the `.raw`-derived object together with the new obs.
    """
    raw_adata = read_adata_uuid(uuid).raw.to_adata()

    cell_obs = (
        raw_adata.obs
            .reset_index(drop = True)[keep_obs]
            .merge(meta, on = 'sample.sampleKitGuid', how = 'left')
            .set_index('barcodes', drop = False)
    )

    return anndata.AnnData(
        X = raw_adata.X,
        obs = cell_obs,
        var = raw_adata.var
    )

In [9]:
def split_type_adata_per_sample(adata, out_path):
    """
    Write one .h5ad file per specimen found in `adata`.

    Cells are grouped by obs['specimen.specimenGuid']. Each non-empty group
    is written to
    '<out_path>/<cell type>/<specimen>_<subject>_<visit>_<cell type>.h5ad'.
    Subject, visit, and cell type (obs['AIFI_L3']) are taken from the first
    cell of the group, and spaces in the visit name are replaced by '-'.
    The cell type directory is created when it does not exist.
    """
    specimen_key = 'specimen.specimenGuid'

    def first_value(sample, column):
        # Value of `column` for the first cell of `sample`
        return sample.obs[column].tolist()[0]

    for sample_id in adata.obs[specimen_key].unique():
        sample_adata = adata[adata.obs[specimen_key] == sample_id].copy()

        # Nothing to write when no cells match this id
        if sample_adata.shape[0] == 0:
            continue

        subject = first_value(sample_adata, 'subject.subjectGuid')
        visit = re.sub(' ', '-', first_value(sample_adata, 'sample.visitName'))
        cell_type = first_value(sample_adata, 'AIFI_L3')

        type_dir = '{op}/{ct}'.format(op = out_path, ct = cell_type)
        os.makedirs(type_dir, exist_ok = True)

        out_file = '{td}/{si}_{su}_{vi}_{ct}.h5ad'.format(
            td = type_dir,
            si = sample_id,
            su = subject,
            vi = visit,
            ct = cell_type
        )
        sample_adata.write_h5ad(out_file)

In [10]:
def assemble_h5ad_per_sample(sample_id, in_files, out_path):
    """
    Concatenate the per-type .h5ad files of one sample into a single file.

    Parameters
    ----------
    sample_id : str
        Specimen id; files whose base name starts with '<sample_id>_' are
        treated as belonging to this sample.
    in_files : list of str
        Paths of candidate per-type .h5ad files.
    out_path : str
        Directory that receives '<sample_id>_<subject>_<visit>.h5ad'.

    Returns
    -------
    None
        The assembled file is written to disk. When no input file matches
        the sample, a message is printed and nothing is written.
    """
    # Match on the file-name prefix rather than anywhere in the path, so an
    # id cannot match inside a directory name or inside a longer id.
    prefix = '{si}_'.format(si = sample_id)
    sample_files = [
        file for file in in_files
        if os.path.basename(file).startswith(prefix)
    ]

    # Concatenating an empty list fails; skip such samples so one missing
    # sample does not abort a whole batch of assemblies.
    if not sample_files:
        print('No per-type files found for sample {si}; skipping'.format(si = sample_id))
        return None

    adata_list = [sc.read_h5ad(file) for file in sample_files]
    sample_adata = sc.concat(adata_list)

    # Subject and visit are taken from the first cell of the assembled data
    subject = sample_adata.obs['subject.subjectGuid'].tolist()[0]
    visit = sample_adata.obs['sample.visitName'].tolist()[0]
    visit = re.sub(' ', '-', visit)

    out_file = '{op}/{si}_{su}_{vi}.h5ad'.format(
        op = out_path,
        si = sample_id,
        su = subject,
        vi = visit
    )

    sample_adata.write_h5ad(out_file)

In [11]:
def element_id(n = 3):
    """
    Build a random identifier from `n` chemical element names joined by '-'.

    Parameters
    ----------
    n : int
        Number of element names to draw (with replacement).

    Returns
    -------
    str
        Names joined by '-', e.g. three names for the default n = 3; an
        empty string when n is 0.
    """
    import periodictable
    from random import randint

    # Draw atomic numbers 1..118 inclusive. Index 0 of periodictable.elements
    # is the neutron entry rather than an element, so it is excluded.
    rand_el = [periodictable.elements[randint(1, 118)].name for _ in range(n)]
    return '-'.join(rand_el)

## Prepare sample metadata

This metadata is used to group samples for output and to ensure that the final files contain all of the metadata required for analysis.

In [12]:
# HISE file UUID of the sample metadata CSV, read via read_csv_uuid()
sample_meta_uuid = 'd82c5c42-ae5f-4e67-956e-cd3b7bf88105'
sample_meta = read_csv_uuid(sample_meta_uuid)

downloading fileID: d82c5c42-ae5f-4e67-956e-cd3b7bf88105
Files have been successfully downloaded!


### Rename specimen and file-specific columns

In [13]:
# Rename the specimen and file columns to their dotted metadata names
sample_meta = sample_meta.rename(
    {
        'pbmc_sample_id': 'specimen.specimenGuid',
        'file.id': 'pipeline.fileGuid'
    },
    axis = 1
)

### Add age at sample draw and age at first draw (used as age at enrollment)

In [14]:
def drawDate_to_drawYear(drawDate):
    """
    Return the leading year of a draw date string as an int.

    Everything from the first '-' that is followed by at least one character
    is removed, and the remaining text is converted with int().
    """
    year_text = re.sub('-.+', '', drawDate)
    return int(year_text)

In [15]:
# Year of each draw, parsed from the draw date string
sample_meta['sample.drawYear'] = [
    drawDate_to_drawYear(draw_date)
    for draw_date in sample_meta['sample.drawDate']
]
# Age at draw is the difference between draw year and birth year
sample_meta['sample.subjectAgeAtDraw'] = sample_meta['sample.drawYear'].sub(
    sample_meta['subject.birthYear']
)

In [16]:
# One row per subject: the minimum age at draw across that subject's samples,
# stored under the column name 'subject.ageAtFirstDraw'
first_draw_age = (
    sample_meta
        .groupby('subject.subjectGuid', as_index = False)['sample.subjectAgeAtDraw']
        .min()
        .rename({'sample.subjectAgeAtDraw': 'subject.ageAtFirstDraw'}, axis = 1)
)

In [17]:
# Attach each subject's age at first draw to all of that subject's samples
sample_meta = sample_meta.merge(first_draw_age, on = 'subject.subjectGuid', how = 'left')

### Simplify drawDate

In [18]:
# Keep only the leading 'YYYY-MM' of each draw date; values that do not match
# the 'YYYY-MM-<rest>' pattern are left unchanged
sample_meta['sample.drawDate'] = [re.sub('([0-9]{4}-[0-9]{2})-.+', '\\1', d) for d in sample_meta['sample.drawDate']]

### Add CMV and BMI from clinical labs

In [19]:
# HISE file UUID of the CMV metadata CSV, read via read_csv_uuid()
cmv_meta_uuid = '9469f67c-b09a-454d-9fb9-f50ff3494d69'
cmv_meta = read_csv_uuid(cmv_meta_uuid)

downloading fileID: 9469f67c-b09a-454d-9fb9-f50ff3494d69
Files have been successfully downloaded!


In [20]:
# HISE file UUID of the BMI metadata CSV, read via read_csv_uuid()
bmi_meta_uuid = 'e507258c-d175-4d8e-a455-5229870dc991'
bmi_meta = read_csv_uuid(bmi_meta_uuid)

downloading fileID: e507258c-d175-4d8e-a455-5229870dc991
Files have been successfully downloaded!


### Combine sample-level metadata

In [21]:
# Reduce the CMV table to distinct (subject, CMV status) rows
cmv_meta = cmv_meta[['subject.subjectGuid', 'subject.cmv']].drop_duplicates()

In [22]:
# Left join on subject so every sample row is kept, with or without a CMV record
combined_sample_meta = sample_meta.merge(cmv_meta, on = 'subject.subjectGuid', how = 'left')

In [23]:
# Keep the join key and BMI only. Take an explicit copy so that the rounding
# below assigns into an independent frame rather than into a slice of the
# table that was read from disk.
bmi_meta = bmi_meta[['sample.sampleKitGuid', 'subject.bmi']].copy()
# Round BMI to the nearest whole number
bmi_meta['subject.bmi'] = bmi_meta['subject.bmi'].round(0)

In [24]:
# Left join on sample kit so every sample row is kept, with or without a BMI record
combined_sample_meta = combined_sample_meta.merge(bmi_meta, on = 'sample.sampleKitGuid', how = 'left')

We only need to keep some of the metadata columns that pertain to cohort, subject, and sample. We'll also keep the originating File GUID to help us keep track of provenance. Let's select just these columns:

In [25]:
# Columns retained in the combined sample metadata, listed in the order
# cohort, subject, sample, then specimen and originating file
keep_meta = [
    'cohort.cohortGuid',
    'subject.subjectGuid', 'subject.biologicalSex', 'subject.cmv', 'subject.bmi',
    'subject.race', 'subject.ethnicity', 'subject.birthYear', 'subject.ageAtFirstDraw',
    'sample.sampleKitGuid', 'sample.visitName', 'sample.drawDate', 'sample.subjectAgeAtDraw',
    'specimen.specimenGuid', 'pipeline.fileGuid'
]

In [26]:
# Select the keep_meta columns, in that order
combined_sample_meta = combined_sample_meta[keep_meta]

In [27]:
# (rows, columns) of the combined sample metadata
combined_sample_meta.shape

(868, 15)

In [28]:
# Preview the first rows of the combined sample metadata
combined_sample_meta.head()

Unnamed: 0,cohort.cohortGuid,subject.subjectGuid,subject.biologicalSex,subject.cmv,subject.bmi,subject.race,subject.ethnicity,subject.birthYear,subject.ageAtFirstDraw,sample.sampleKitGuid,sample.visitName,sample.drawDate,sample.subjectAgeAtDraw,specimen.specimenGuid,pipeline.fileGuid
0,BR1,BR1001,Female,Negative,23.0,Caucasian,Non-Hispanic origin,1987,32,KT00001,Flu Year 1 Day 0,2019-10,32,PB00001-01,fec489f9-9a74-4635-aa91-d2bf09d1faec
1,BR1,BR1002,Male,Negative,22.0,Caucasian,Non-Hispanic origin,1991,28,KT00002,Flu Year 1 Day 0,2019-10,28,PB00002-01,7c0c7979-eebd-4aba-b5b2-6e76b4643623
2,BR1,BR1003,Female,Negative,21.0,Caucasian,Non-Hispanic origin,1989,30,KT00003,Flu Year 1 Day 0,2019-10,30,PB00003-01,40efd03a-cb2f-4677-af42-a056cbfe5a17
3,BR1,BR1004,Male,Negative,22.0,Caucasian,Non-Hispanic origin,1989,30,KT00004,Flu Year 1 Day 0,2019-10,30,PB00004-01,68fbcd34-1d63-461d-8195-df5b8dc61b31
4,BR1,BR1005,Female,Negative,20.0,Caucasian,Non-Hispanic origin,1992,27,KT00006,Flu Year 1 Day 0,2019-10,27,PB00006-01,ea8d98e9-e99e-4dc6-9e78-9866e0deac68


## Identify files for use in HISE

In [29]:
# Element-name search ids, one per group of cell type files; the values are
# matched against file names in the project store
search_ids = {
    'b':       'aluminum-thorium-neon',
    't_cd4':   'mercury-polonium-zinc',
    't_cd8':   'fermium-chromium-gallium',
    't_other': 'magnesium-potassium-zinc',
    'myeloid': 'neptunium-cadmium-erbium',
    'nk':      'livermorium-copper-curium',
    'other':   'thorium-zirconium-nobelium'
}

Retrieve files stored in our HISE project store

In [30]:
# List the files in the 'cohorts' project store and keep only id and name
ps_df = hisepy.list_files_in_project_store('cohorts')
ps_df = ps_df[['id', 'name']]

Filter for `.h5ad` files from the previous notebook whose names contain any of our search_ids

In [31]:
# Regular expression that matches any one of the search ids
search_string = '|'.join(search_ids.values())

search_df = ps_df[ps_df['name'].str.contains(search_string)]
# Match '.h5ad' literally; as a regular expression the '.' would match any
# character, so names such as 'xh5ad' would also pass the filter.
search_df = search_df[search_df['name'].str.contains('.h5ad', regex = False)]

search_df = search_df.sort_values('name')

# Derive the cell type label from each file name
search_df['AIFI_L3'] = [filename_cell_type(f) for f in search_df['name']]

In [32]:
# Number of matched files, followed by the first few parsed cell type labels
print(len(search_df['AIFI_L3'].tolist()))
search_df['AIFI_L3'].head().tolist()

71


['Activated memory B cell',
 'CD27- effector B cell',
 'CD27+ effector B cell',
 'CD95 memory B cell',
 'Core memory B cell']

## Split cell type results

In [33]:
# obs columns retained from each cell type AnnData. 'sample.sampleKitGuid' is
# the key on which sample metadata is merged in prep_adata_uuid().
keep_adata_obs = [
    'barcodes', 'original_barcodes', 'cell_name',
    'batch_id', 'pool_id', 'chip_id', 'well_id', 
    'n_genes', 'n_reads', 'n_umis', 
    'total_counts_mito', 'pct_counts_mito', 'doublet_score',
    'predicted_AIFI_L1', 'AIFI_L1_score', 'AIFI_L1',
    'predicted_AIFI_L2', 'AIFI_L2_score', 'AIFI_L2',
    'predicted_AIFI_L3', 'AIFI_L3_score', 'AIFI_L3',
    'sample.sampleKitGuid'
]

In [34]:
# Per-type obs tables, collected for the combined cell metadata output
obs_list = []

n_types = search_df.shape[0]
type_ids = search_df['id'].tolist()
type_labels = search_df['AIFI_L3'].tolist()

for i, (uuid, cell_type) in enumerate(zip(type_ids, type_labels)):
    print('{k}/{n}: Splitting {ct}'.format(k = i+1, n = n_types, ct = cell_type))

    # Read the cell type file and attach the sample metadata
    type_adata = prep_adata_uuid(uuid, keep_adata_obs, combined_sample_meta)

    type_obs = type_adata.obs
    obs_list.append(type_obs)

    # Write one file per sample for this cell type
    split_type_adata_per_sample(type_adata, out_dir)

1/71: Splitting Activated memory B cell
downloading fileID: 1e8c6893-2294-41f8-97a7-f0855e2c7a24
Files have been successfully downloaded!
2/71: Splitting CD27- effector B cell
downloading fileID: 196ed0aa-dd8f-4d20-b890-43df425dbbb2
Files have been successfully downloaded!
3/71: Splitting CD27+ effector B cell
downloading fileID: 1c1dc7a5-ce1c-43b7-9532-f857720f5125
Files have been successfully downloaded!
4/71: Splitting CD95 memory B cell
downloading fileID: abee0c76-8900-4648-9fa9-e044d601e669
Files have been successfully downloaded!
5/71: Splitting Core memory B cell
downloading fileID: 8af856b5-1758-4623-9fe2-a9f27cf92b5c
Files have been successfully downloaded!
6/71: Splitting Core naive B cell
downloading fileID: 5db86bfd-b109-4d54-972b-4d02e9f6bdc2
Files have been successfully downloaded!
7/71: Splitting Early memory B cell
downloading fileID: c1785810-e96f-43ab-9e0c-2328519c33f9
Files have been successfully downloaded!
8/71: Splitting ISG+ naive B cell
downloading fileID: b858

In [35]:
# Collect the per-sample, per-type .h5ad files written by the split step.
# out_dir also receives the metadata tables and .tar archives written later
# in this notebook, so only sub-directories are treated as cell type
# directories; otherwise a re-run would call os.listdir() on a regular file.
type_dirs = [
    out_dir + '/' + x
    for x in sorted(os.listdir(out_dir))
    if os.path.isdir(out_dir + '/' + x)
]
all_type_files = []
for type_dir in type_dirs:
    # Only .h5ad files are inputs for the per-sample assembly
    type_files = [
        type_dir + '/' + x
        for x in sorted(os.listdir(type_dir))
        if x.endswith('.h5ad')
    ]
    all_type_files = all_type_files + type_files

## Assemble sample results

In [44]:
# Each specimen is assembled once: missing ids cannot be matched to a file,
# and a repeated id would have two workers writing the same output file.
samples = combined_sample_meta['specimen.specimenGuid'].dropna().unique().tolist()

# One (sample_id, in_files, out_path) argument tuple per sample
param_list = zip(
    samples,
    itertools.repeat(all_type_files),
    itertools.repeat(sample_dir)
)

# starmap() blocks until every sample is done; the context manager then
# shuts the workers down, including when a worker raised an exception.
with multiprocessing.Pool(40) as pool:
    pool.starmap(assemble_h5ad_per_sample, param_list)

## Assemble metadata from all types

In [45]:
# Paths of the output files; the cells below append to this list
out_files = []

In [46]:
# Stack the obs tables of all cell types and renumber the rows
all_obs = pd.concat(obs_list).reset_index(drop = True)

# Write the combined table as CSV and as parquet, recording both paths
all_obs_csv = '{od}/diha_all_cell_meta_{d}.csv'.format(od = out_dir, d = date.today())
all_obs.to_csv(all_obs_csv)
out_files += [all_obs_csv]

all_obs_parquet = '{od}/diha_all_cell_meta_{d}.parquet'.format(od = out_dir, d = date.today())
all_obs.to_parquet(all_obs_parquet)
out_files += [all_obs_parquet]

  values = values.astype(str)


### Assemble metadata per group

In [47]:
# Samples and cells are grouped by the same three keys
group_cols = ['cohort.cohortGuid', 'subject.biologicalSex', 'subject.cmv']
split_samples = combined_sample_meta.groupby(group_cols)
split_obs = all_obs.groupby(group_cols)

In [48]:
for (group, group_obs) in split_obs:
    # Group label, e.g. the three key values joined by '_'
    out_group = '_'.join(group)

    group_csv = '{od}/diha_{g}_meta_{d}.csv'.format(od = out_dir, g = out_group, d = date.today())
    group_parquet = '{od}/diha_{g}_meta_{d}.parquet'.format(od = out_dir, g = out_group, d = date.today())

    # Write each format and record its path once the write has succeeded
    group_obs.to_csv(group_csv)
    out_files += [group_csv]

    group_obs.to_parquet(group_parquet)
    out_files += [group_parquet]

  values = values.astype(str)


## Assemble .tar files per group

In [49]:
# List the assembled per-sample files once, outside the loop; the listing is
# the same for every group. Sorting makes the archive order reproducible.
all_sample_files = sorted(os.listdir(sample_dir))

for (group, df) in split_samples:
    out_group = '_'.join(group)
    print('Generating .tar for {g}'.format(g = out_group))

    # A set gives constant-time membership tests for the specimen ids
    group_ids = set(df['specimen.specimenGuid'].tolist())

    # The specimen id is the part of the file name before the first '_'
    sample_files = [
        sample_dir + '/' + file
        for file in all_sample_files
        if re.sub('_.+', '', file) in group_ids
    ]

    out_tar = '{od}/diha_{g}_h5ads_{d}.tar'.format(od = out_dir, g = out_group, d = date.today())
    with tarfile.open(out_tar,"w") as tar:
        for sample_file in sample_files:
            tar.add(sample_file)

    out_files.append(out_tar)

Generating .tar for BR1_Female_Negative
Generating .tar for BR1_Female_Positive
Generating .tar for BR1_Male_Negative
Generating .tar for BR1_Male_Positive
Generating .tar for BR2_Female_Negative
Generating .tar for BR2_Female_Positive
Generating .tar for BR2_Male_Negative
Generating .tar for BR2_Male_Positive


## Upload Sample data to HISE

Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

In [50]:
# Study space id and title passed to hisepy.upload.upload_files();
# the title carries today's date
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA Assembled scRNA-seq Data per Sample {d}'.format(d = date.today())

In [51]:
# Random element-name id, passed as the upload destination; it changes on
# every run because element_id() draws at random
search_id = element_id()
search_id

'polonium-tin-curium'

In [52]:
# Input file ids for the upload: every cell type file that was read, plus
# the three metadata tables
in_files = search_df['id'].tolist()
in_files = in_files + [sample_meta_uuid, cmv_meta_uuid, bmi_meta_uuid]

In [53]:
# Drop duplicate paths (e.g. after re-running a cell that appends to out_files)
out_files = list(set(out_files))

In [54]:
# Number of distinct output files to upload
len(out_files)

26

In [55]:
# Sort the paths in place, then display them for review before uploading
out_files.sort()
out_files

['output/diha_BR1_Female_Negative_h5ads_2024-05-05.tar',
 'output/diha_BR1_Female_Negative_meta_2024-05-05.csv',
 'output/diha_BR1_Female_Negative_meta_2024-05-05.parquet',
 'output/diha_BR1_Female_Positive_h5ads_2024-05-05.tar',
 'output/diha_BR1_Female_Positive_meta_2024-05-05.csv',
 'output/diha_BR1_Female_Positive_meta_2024-05-05.parquet',
 'output/diha_BR1_Male_Negative_h5ads_2024-05-05.tar',
 'output/diha_BR1_Male_Negative_meta_2024-05-05.csv',
 'output/diha_BR1_Male_Negative_meta_2024-05-05.parquet',
 'output/diha_BR1_Male_Positive_h5ads_2024-05-05.tar',
 'output/diha_BR1_Male_Positive_meta_2024-05-05.csv',
 'output/diha_BR1_Male_Positive_meta_2024-05-05.parquet',
 'output/diha_BR2_Female_Negative_h5ads_2024-05-05.tar',
 'output/diha_BR2_Female_Negative_meta_2024-05-05.csv',
 'output/diha_BR2_Female_Negative_meta_2024-05-05.parquet',
 'output/diha_BR2_Female_Positive_h5ads_2024-05-05.tar',
 'output/diha_BR2_Female_Positive_meta_2024-05-05.csv',
 'output/diha_BR2_Female_Positive_

In [56]:
# Upload the output files to HISE. in_files is passed as the input file ids
# and search_id as the destination; the call asks for (y/n) confirmation.
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = search_id
)

you are trying to upload file_ids... ['output/diha_BR1_Female_Negative_h5ads_2024-05-05.tar', 'output/diha_BR1_Female_Negative_meta_2024-05-05.csv', 'output/diha_BR1_Female_Negative_meta_2024-05-05.parquet', 'output/diha_BR1_Female_Positive_h5ads_2024-05-05.tar', 'output/diha_BR1_Female_Positive_meta_2024-05-05.csv', 'output/diha_BR1_Female_Positive_meta_2024-05-05.parquet', 'output/diha_BR1_Male_Negative_h5ads_2024-05-05.tar', 'output/diha_BR1_Male_Negative_meta_2024-05-05.csv', 'output/diha_BR1_Male_Negative_meta_2024-05-05.parquet', 'output/diha_BR1_Male_Positive_h5ads_2024-05-05.tar', 'output/diha_BR1_Male_Positive_meta_2024-05-05.csv', 'output/diha_BR1_Male_Positive_meta_2024-05-05.parquet', 'output/diha_BR2_Female_Negative_h5ads_2024-05-05.tar', 'output/diha_BR2_Female_Negative_meta_2024-05-05.csv', 'output/diha_BR2_Female_Negative_meta_2024-05-05.parquet', 'output/diha_BR2_Female_Positive_h5ads_2024-05-05.tar', 'output/diha_BR2_Female_Positive_meta_2024-05-05.csv', 'output/diha_

(y/n) y


{'trace_id': 'bbab4457-c85a-4577-8c2a-5312bebdf852',
 'files': ['output/diha_BR1_Female_Negative_h5ads_2024-05-05.tar',
  'output/diha_BR1_Female_Negative_meta_2024-05-05.csv',
  'output/diha_BR1_Female_Negative_meta_2024-05-05.parquet',
  'output/diha_BR1_Female_Positive_h5ads_2024-05-05.tar',
  'output/diha_BR1_Female_Positive_meta_2024-05-05.csv',
  'output/diha_BR1_Female_Positive_meta_2024-05-05.parquet',
  'output/diha_BR1_Male_Negative_h5ads_2024-05-05.tar',
  'output/diha_BR1_Male_Negative_meta_2024-05-05.csv',
  'output/diha_BR1_Male_Negative_meta_2024-05-05.parquet',
  'output/diha_BR1_Male_Positive_h5ads_2024-05-05.tar',
  'output/diha_BR1_Male_Positive_meta_2024-05-05.csv',
  'output/diha_BR1_Male_Positive_meta_2024-05-05.parquet',
  'output/diha_BR2_Female_Negative_h5ads_2024-05-05.tar',
  'output/diha_BR2_Female_Negative_meta_2024-05-05.csv',
  'output/diha_BR2_Female_Negative_meta_2024-05-05.parquet',
  'output/diha_BR2_Female_Positive_h5ads_2024-05-05.tar',
  'output/di

In [57]:
# Report session information for this run
import session_info
session_info.show()