Allen Institute resources for API access:
- http://help.brain-map.org/display/api/Allen+Brain+Atlas+API
- http://help.brain-map.org/display/api/Downloading+3-D+Expression+Grid+Data
- http://help.brain-map.org/display/api/Example+Queries+for+Experiment+Metadata
- http://help.brain-map.org/pages/viewpage.action?pageId=5308449

XML parsing:
- https://www.datacamp.com/community/tutorials/python-xml-elementtree
- https://www.kite.com/python/answers/how-to-download-a-csv-file-from-a-url-in-python

Python parallel processing:
- https://www.machinelearningplus.com/python/parallel-processing-python


In [2]:
import os
import argparse
import numpy as np
import pandas as pd
import requests
from pyminc.volumes.factory import *
from zipfile import ZipFile

In [3]:
def fetch_metadata(dataset = 'coronal', outdir='./', outfile = 'AMBA_metadata.csv'):

    """ """        

    abi_query_metadata = "http://api.brain-map.org/api/v2/data/SectionDataSet/query.csv?"+\
"criteria=[failed$eqfalse],plane_of_section[name$eq{}],products[abbreviation$eqMouse],treatments[name$eqISH],genes&".format(dataset)+\
"tabular=data_sets.id+as+experiment_id,data_sets.section_thickness,data_sets.specimen_id,"+\
"plane_of_sections.name+as+plane,"+\
"genes.acronym+as+gene,genes.name+as+gene_name,genes.chromosome_id,genes.entrez_id,genes.genomic_reference_update_id,genes.homologene_id,genes.organism_id&"+\
"start_row=0&num_rows=all"

    pd.read_csv(abi_query_metadata).to_csv(outdir+outfile, index=False)

    print('Metadata downloaded at: {}'.format(outdir+outfile))
    
    return 

In [4]:
def fetch_expression(experiment_id, outdir = './tmp/'):

    """ """

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    abi_query_expr = 'http://api.brain-map.org/grid_data/download/{}'.format(experiment_id)

    amba_request = requests.get(abi_query_expr)

    tmpfile = outdir+str(experiment_id)+'.zip'
    with open(tmpfile, 'wb') as file:
        file.write(amba_request.content)

    with ZipFile(tmpfile, 'r') as file:
        try:
            file.extract('energy.raw', path = outdir)
            os.rename(outdir+'energy.raw', outdir+str(experiment_id)+'.raw')
            success = 1
        except KeyError as err:
            print('Error for experiment {}: {}'.format(experiment_id, err))
            success = 0
            
    os.remove(tmpfile)
 
    return success

In [5]:
def transform_space(infile, outfile, voxel_orientation = 'RAS', world_space = 'MICe', expansion_factor = 1.0, volume_type = None, data_type = None, labels = False):

    def reorient_to_standard(dat):
        dat = np.rot90(dat, k=1, axes=(0, 2))
        dat = np.rot90(dat, k=1, axes=(0, 1))

        shape = dat.shape
        dat = np.ravel(dat)
        dat = np.reshape(dat, shape)

        return(dat)

    def do_nothing(dat):
        return(dat)

    # %% Coordinate definitions

    # Centers are listed as x,y,z; reverse these when writing out
    # Centers listed in um in CCFv3 coordinates
    centers_RAS = {"MICe"   :   [5700, 7900, 2700],
                   "CCFv3"  :   [0, 13200, 8000]}
    centers_PIR = {"MICe"   :   [5300, 5300, 5700],
                   "CCFv3"  :   [0, 0, 0]}

    # Direction cosines
    direction_cosines_RAS = {"MICe"     :   [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                             "CCFv3"    :   [[0, 0, 1], [-1, 0, 0], [0, -1, 0]]}

    direction_cosines_PIR = {"MICe"     :   [[0, -1, 0], [0, 0, -1], [1, 0, 0]],
                             "CCFv3"    :   [[1, 0, 0], [0, 1, 0], [0, 0, 1]]}

    # Map arguments to functions/dicts/values
    map_voxel_orientations = {"RAS" :   reorient_to_standard,
                              "PIR":    do_nothing}

    map_centers = {"RAS"    :   centers_RAS,
                   "PIR"    :   centers_PIR}

    map_dir_cosines = {"RAS"    :   direction_cosines_RAS,
                       "PIR"    :   direction_cosines_PIR}

    size_10 = 1320*800*1140
    size_25 = 528*320*456
    size_50 = 264*160*228
    size_100 = 132*80*114
    size_200 = 58*41*67

    map_resolutions = {size_10: 10,
                       size_25: 25,
                       size_50: 50,
                       size_100: 100,
                       size_200: 200}

    vol = volumeFromFile(infile)

    res = map_resolutions[vol.data.size]

    # Voxel orientation
    if voxel_orientation in map_voxel_orientations:
        new_data = map_voxel_orientations[voxel_orientation](vol.data)
    else:
        print("Invalid voxel orientation")
        sys.exit(1)

    # World coordinate system
    centers = [expansion_factor*c/(1000) 
               for c in map_centers[voxel_orientation][world_space]]
    steps = [expansion_factor*res/1000] * 3
    xdc = map_dir_cosines[voxel_orientation][world_space][0]
    ydc = map_dir_cosines[voxel_orientation][world_space][1]
    zdc = map_dir_cosines[voxel_orientation][world_space][2]

    # Types
    vtype = vol.volumeType if volume_type is None else volume_type
    dtype = vol.dtype if data_type is None else data_type
    labels = vol.labels if labels is None else labels

    outvol = volumeFromDescription(outputFilename=outfile,
                                   dimnames=["zspace", "yspace", "xspace"],
                                   sizes=new_data.shape,
                                   starts=[-c for c in reversed(centers)],
                                   steps=[s for s in reversed(steps)],
                                   x_dir_cosines=xdc,
                                   y_dir_cosines=ydc,
                                   z_dir_cosines=zdc,
                                   volumeType=vtype,
                                   dtype=dtype,
                                   labels=labels)

    outvol.data = new_data
    outvol.writeFile()
    outvol.closeVolume()

In [6]:
def download_data(metadata, outdir):
    
    """ """
    
    metadata['success'] = 0
    
    for index, row in metadata.iterrows():
        
        experiment_id = row['experiment_id']
        
        success = fetch_expression(experiment_id = experiment_id, 
                                   outdir = outdir)
        
        if bool(success):
            
            metadata.loc[index,'success'] = 1
        
            infile = outdir+'{}.raw'.format(experiment_id)
            outfile = outdir+'{}_tmp.mnc'.format(experiment_id)

            cmd = 'cat {} | rawtominc {} -signed -float -ounsigned -oshort -xstep 0.2 -ystep 0.2 -zstep 0.2 -clobber 58 41 67'.format(infile, outfile)

            os.system(cmd)
            os.remove(infile)

            infile = outfile
            outfile = outdir+'{}_tmp2.mnc'.format(experiment_id)
            transform_space(infile = infile,
                            outfile = outfile,
                            voxel_orientation = 'RAS',
                            world_space = 'MICe',
                            expansion_factor = 1.0)
            os.remove(infile)
    
    
            gene_id = row['gene']
    
            infile = outfile
            outfile = outdir+'{}_{}.mnc'.format(gene_id, experiment_id)
            os.rename(infile, outfile)
            
    return metadata

In [7]:

dataset = 'coronal'
datadir = 'data/expression/'
metadata = 'AMBA_metadata.csv'

if os.path.isfile(datadir+metadata) == False:
    print('AMBA metadata file {} not found in {}. Fetching from API.'.format(metadata,datadir))
    fetch_metadata(dataset = 'coronal',
                   outdir = datadir,
                   outfile = metadata)

dfMetadata = pd.read_csv(datadir+metadata, index_col=None)

    


dfTemp = dfMetadata.loc[:50].copy()

outdir = datadir+dataset+'/'

nproc = 4

nrows = dfTemp.shape[0]

chunksize = int(nrows/nproc)



metadata_chunks = [dfTemp.iloc[dfTemp.index[i:i+chunksize]] for i in range(0, nrows, chunksize)]

In [8]:
len(metadata_chunks)

5

In [22]:
len(metadata_chunks[:4])

4

In [32]:
from multiprocessing import Pool
from functools import partial

In [33]:
pool = Pool(processes = nproc)
download_data_partial = partial(download_data, outdir = outdir)

Process ForkPoolWorker-5:
Process ForkPoolWorker-6:
Process ForkPoolWorker-7:
Process ForkPoolWorker-8:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3

In [31]:
# download_data_partial(dfTemp)

Unnamed: 0,experiment_id,section_thickness,specimen_id,plane,gene,gene_name,chromosome_id,entrez_id,genomic_reference_update_id,homologene_id,organism_id,success
0,112646890,25,710945,coronal,Nr2f2,"nuclear receptor subfamily 2, group F, member 2",52.0,11819.0,491928275.0,7628.0,2,1
1,71836747,25,71035669,coronal,Tbc1d8,"TBC1 domain family, member 8",34.0,54610.0,491928275.0,31421.0,2,1
2,71717451,25,71019378,coronal,Kcng3,"potassium voltage-gated channel, subfamily G, ...",43.0,225030.0,491928275.0,15168.0,2,1
3,2480,25,702794,coronal,Phox2b,paired-like homeobox 2b,50.0,18935.0,491928275.0,68371.0,2,1
4,73929578,25,71903222,coronal,Thbs2,thrombospondin 2,43.0,21826.0,491928275.0,2438.0,2,1
5,2107,25,702831,coronal,Cldn12,claudin 12,50.0,64945.0,491928275.0,40809.0,2,1
6,71836810,25,71035725,coronal,Acsl6,acyl-CoA synthetase long-chain family member 6,36.0,216739.0,491928275.0,100939.0,2,1
7,2493,25,702795,coronal,Pomc,pro-opiomelanocortin-alpha,37.0,18976.0,491928275.0,723.0,2,1
8,1510,25,702587,coronal,Mtf2,metal response element binding transcription f...,50.0,17765.0,491928275.0,7207.0,2,1
9,112646675,25,710947,coronal,Oprd1,"opioid receptor, delta 1",49.0,18386.0,491928275.0,20252.0,2,1


In [None]:
result = pool.map(download_data_partial, metadata_chunks)