In [1]:
import os

import pandas as pd
import requests
import scanpy as sc

In [2]:
#specify the Collection to extract metadata from
#(this id is interpolated into the API URL and into every output file name)
collection_id = 'bcb61471-2a44-4d00-a0af-ff085512674c'

In [3]:
#schema fields that will not change Dataset-to-Dataset

#obs columns reported once per sample
sample_schema = [
    'organism',
    'organism_ontology_term_id',
    'donor_id',
    'sex',
    'sex_ontology_term_id',
    'development_stage',
    'development_stage_ontology_term_id',
    'self_reported_ethnicity',
    'self_reported_ethnicity_ontology_term_id',
    'disease',
    'disease_ontology_term_id',
    'tissue',
    'tissue_ontology_term_id',
    'tissue_type',
    'assay',
    'assay_ontology_term_id',
    'suspension_type',
]

#obs columns that describe the cell annotation
cell_schema = [
    'cell_type',
    'cell_type_ontology_term_id',
]

#keys read from the Collection record for the study-level report
collection_schema = [
    'name',
    'collection_url',
    'visibility',
    'doi',
    'consortia',
    'contact_name',
    'contact_email',
    'protocols',
]

In [4]:
#optionally, define additional fields that are specific to this Dataset
#('library' must stay listed: the per-library table written at the end indexes on it)
sample_nonschema = [
    'BMI','condition.l1','condition.l2','condition.long',
    'diabetes_history','eGFR','experiment','hypertension',
    'id','library','region.l1','region.l2','specimen',
    'percent.cortex','percent.medulla'
]
cell_nonschema = [
    'class','state','state.l2','structure','subclass.full',
    'subclass.l1','subclass.l2','subclass.l3'
]
#append only the fields that are not already present, so re-running this cell
#does not add the same column names to the schema lists a second time
sample_schema.extend([f for f in sample_nonschema if f not in sample_schema])
cell_schema.extend([f for f in cell_nonschema if f not in cell_schema])

In [5]:
#query the CELLxGENE API for Collection metadata
cxg_api = 'https://api.cellxgene.cziscience.com/curation/v1'
headers = {'Content-Type': 'application/json'}
url = f'{cxg_api}/collections/{collection_id}'
#bound the wait so an unresponsive server cannot hang the notebook indefinitely
response = requests.get(url, headers=headers, timeout=60)
#stop here on an HTTP error instead of failing later on a missing key in the error payload
response.raise_for_status()
collection = response.json()

#add a 'protocols' entry built from the links typed as PROTOCOL so it is reported with the other fields
collection['protocols'] = [link['link_url'] for link in collection['links'] if link['link_type'] == 'PROTOCOL']

#one value per reported field; list values are flattened to a comma-separated string
coll_report = {}
for field in collection_schema:
    value = collection.get(field)
    if isinstance(value, list):
        #str() keeps the join from raising if a list ever holds a non-string item
        value = ','.join(str(item) for item in value)
    coll_report[field] = value

In [6]:
#list the Datasets of this Collection with the columns needed to pick one
dataset_columns = ['dataset_id','cell_count','title']
pd.DataFrame(collection['datasets'])[dataset_columns]

Unnamed: 0,dataset_id,cell_count,title
0,32b9bdce-2481-4c85-ba1b-6ad5fcea844c,107344,Single-cell RNA-seq of the Adult Human Kidney ...
1,0b75c598-0893-4216-afe8-5414cab7739d,304652,Integrated Single-nucleus and Single-cell RNA-...
2,07854d9c-5375-4a9b-ac34-fa919d3c3686,172847,Single-nucleus RNA-seq of the Adult Human Kidn...


In [7]:
#specify the Dataset to extract metadata from
#(compared against each entry's dataset_id in the Collection to select the H5AD to download)
dataset_id = '0b75c598-0893-4216-afe8-5414cab7739d'

In [8]:
#write the Collection-level report as one "field,value" line per field
#create the output directory first so this cell also works on a fresh checkout
os.makedirs('metadata', exist_ok=True)
study_report = pd.DataFrame(coll_report, index=[0]).transpose()
study_report.to_csv(f'metadata/{collection_id}_{dataset_id}_study_metadata.csv', header=False)

In [9]:
#download the H5AD file
mx_file = f'h5ads/{collection_id}_{dataset_id}.h5ad'
#create the target directory first so this cell also works on a fresh checkout
os.makedirs(os.path.dirname(mx_file), exist_ok=True)

#gather the H5AD asset URLs of the chosen Dataset; the list is rebuilt on every run,
#so an unmatched dataset_id raises here instead of reusing or missing h5ad_url
h5ad_urls = [
    asset['url']
    for dataset in collection['datasets'] if dataset['dataset_id'] == dataset_id
    for asset in dataset['assets'] if asset['filetype'] == 'H5AD'
]
if not h5ad_urls:
    raise ValueError(f'No H5AD asset found for Dataset {dataset_id} in Collection {collection_id}')
h5ad_url = h5ad_urls[0]

#stream the file to disk in 1 MiB chunks, printing progress on a single rewritten line
with requests.get(h5ad_url, stream=True, timeout=60) as res:
    res.raise_for_status()
    #Content-Length may be absent; 0 switches the progress display to a byte count
    filesize = int(res.headers.get('Content-Length', 0))
    with open(mx_file, 'wb') as h5ad_out:
        total_bytes_received = 0
        for chunk in res.iter_content(chunk_size=1024 * 1024):
            h5ad_out.write(chunk)
            total_bytes_received += len(chunk)
            if filesize:
                progress = f'{total_bytes_received / filesize * 100:.1f}%'
            else:
                progress = f'{total_bytes_received / 1024 ** 2:.1f} MiB'
            print(f'\033[1m\033[38;5;10m{progress} downloaded {mx_file}\033[0m\r', end='')

[1m[38;5;10m100.0% downloaded h5ads/bcb61471-2a44-4d00-a0af-ff085512674c_0b75c598-0893-4216-afe8-5414cab7739d.h5ad[0m

In [10]:
#extract metadata
adata = sc.read_h5ad(mx_file, backed='r')

#one CSV per field group: each row is a distinct combination of values plus its number of obs
#NOTE(review): value_counts() defaults to dropna=True, so obs with a missing value in any
#listed field are left out of these counts - confirm that this is intended
for group, fields in (('sample', sample_schema), ('cell', cell_schema)):
    (
        adata.obs[fields]
        .value_counts()
        #name the column explicitly: the counts Series is unnamed (column 0) before pandas 2.0
        #but named 'count' from 2.0 on, where renaming column 0 silently does nothing
        .to_frame('obs count')
        .to_csv(f'metadata/{collection_id}_{dataset_id}_{group}_metadata.csv')
    )

In [11]:
#one row per distinct combination of the sample-level fields, indexed by library
obs_table = (
    adata.obs[sample_schema]
    .drop_duplicates()
    .set_index('library')
    #counts are keyed by library and aligned to the index, so rows sharing a library get the same total
    .assign(**{'obs count': adata.obs['library'].value_counts()})
)
obs_table.to_csv(f'metadata/{collection_id}_{dataset_id}_metadata.csv')