Allen Institute resources for API access:
- http://help.brain-map.org/display/api/Allen+Brain+Atlas+API
- http://help.brain-map.org/display/api/Downloading+3-D+Expression+Grid+Data
- http://help.brain-map.org/display/api/Example+Queries+for+Experiment+Metadata
- http://help.brain-map.org/pages/viewpage.action?pageId=5308449

XML parsing:
- https://www.datacamp.com/community/tutorials/python-xml-elementtree
- https://www.kite.com/python/answers/how-to-download-a-csv-file-from-a-url-in-python

Python parallel processing:
- https://www.machinelearningplus.com/python/parallel-processing-python


In [1]:
import os
import argparse
import numpy as np
import pandas as pd
import requests
import multiprocessing as mp
from pyminc.volumes.factory import *
from zipfile import ZipFile
from functools import partial
from itertools import starmap
from tqdm import tqdm

In [2]:
def fetch_metadata(dataset = 'coronal', outdir='./', outfile = 'AMBA_metadata.csv'):

    """ """        

    abi_query_metadata = "http://api.brain-map.org/api/v2/data/SectionDataSet/query.csv?"+\
"criteria=[failed$eqfalse],plane_of_section[name$eq{}],products[abbreviation$eqMouse],treatments[name$eqISH],genes&".format(dataset)+\
"tabular=data_sets.id+as+experiment_id,data_sets.section_thickness,data_sets.specimen_id,"+\
"plane_of_sections.name+as+plane,"+\
"genes.acronym+as+gene,genes.name+as+gene_name,genes.chromosome_id,genes.entrez_id,genes.genomic_reference_update_id,genes.homologene_id,genes.organism_id&"+\
"start_row=0&num_rows=all"

    pd.read_csv(abi_query_metadata).to_csv(outdir+outfile, index=False)

    print('Metadata downloaded at: {}'.format(outdir+outfile))
    
    return

In [24]:
def fetch_expression(experiment_id, outdir = './tmp/'):

    """ """
    
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    tmpdir = outdir+str(experiment_id)+'/' 
    os.mkdir(tmpdir)
        
    abi_query_expr = 'http://api.brain-map.org/grid_data/download/{}'.format(experiment_id)
    amba_request = requests.get(abi_query_expr)

    tmpfile = tmpdir+str(experiment_id)+'.zip'
    with open(tmpfile, 'wb') as file:
        file.write(amba_request.content)

    outfile = outdir+str(experiment_id)+'.raw'
    with ZipFile(tmpfile, 'r') as file:
        try:
            file.extract('energy.raw', path = tmpdir)
            os.rename(tmpdir+'energy.raw', outfile)
            success = 1
        except KeyError as err:
            print('Error for experiment {}: {}. Ignoring.'.format(experiment_id, err))
            success = 0
            
    os.remove(tmpfile)
    os.rmdir(tmpdir)
 
    return outfile, success

In [4]:
def rawtominc_wrapper(infile, outfile = None, keep_raw = False):
    
    """ """
    
    if outfile is None:
        outfile = infile.replace('.raw', '.mnc')
    
    try: 
        rawtominc = 'cat {} | rawtominc {} -signed -float -ounsigned -oshort -xstep 0.2 -ystep 0.2 -zstep 0.2 -clobber 58 41 67'.format(infile, outfile)
        success = 1
    except: 
        success = 0
    
    os.system(rawtominc)
    
    if keep_raw is not True:
        os.remove(infile)
        
    return outfile, success

In [5]:
def transform_space(infile, outfile = None, voxel_orientation = 'RAS', world_space = 'MICe', expansion_factor = 1.0, volume_type = None, data_type = None, labels = False):

    """ """
    
    def reorient_to_standard(dat):
        dat = np.rot90(dat, k=1, axes=(0, 2))
        dat = np.rot90(dat, k=1, axes=(0, 1))

        shape = dat.shape
        dat = np.ravel(dat)
        dat = np.reshape(dat, shape)

        return(dat)

    def do_nothing(dat):
        return(dat)

    # %% Coordinate definitions

    # Centers are listed as x,y,z; reverse these when writing out
    # Centers listed in um in CCFv3 coordinates
    centers_RAS = {"MICe"   :   [5700, 7900, 2700],
                   "CCFv3"  :   [0, 13200, 8000]}
    centers_PIR = {"MICe"   :   [5300, 5300, 5700],
                   "CCFv3"  :   [0, 0, 0]}

    # Direction cosines
    direction_cosines_RAS = {"MICe"     :   [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                             "CCFv3"    :   [[0, 0, 1], [-1, 0, 0], [0, -1, 0]]}

    direction_cosines_PIR = {"MICe"     :   [[0, -1, 0], [0, 0, -1], [1, 0, 0]],
                             "CCFv3"    :   [[1, 0, 0], [0, 1, 0], [0, 0, 1]]}

    # Map arguments to functions/dicts/values
    map_voxel_orientations = {"RAS" :   reorient_to_standard,
                              "PIR":    do_nothing}

    map_centers = {"RAS"    :   centers_RAS,
                   "PIR"    :   centers_PIR}

    map_dir_cosines = {"RAS"    :   direction_cosines_RAS,
                       "PIR"    :   direction_cosines_PIR}

    size_10 = 1320*800*1140
    size_25 = 528*320*456
    size_50 = 264*160*228
    size_100 = 132*80*114
    size_200 = 58*41*67

    map_resolutions = {size_10: 10,
                       size_25: 25,
                       size_50: 50,
                       size_100: 100,
                       size_200: 200}
    
    vol = volumeFromFile(infile)

    res = map_resolutions[vol.data.size]

    # Voxel orientation
    if voxel_orientation in map_voxel_orientations:
        new_data = map_voxel_orientations[voxel_orientation](vol.data)
    else:
        print("Invalid voxel orientation")
        sys.exit(1)

    # World coordinate system
    centers = [expansion_factor*c/(1000) 
               for c in map_centers[voxel_orientation][world_space]]
    steps = [expansion_factor*res/1000] * 3
    xdc = map_dir_cosines[voxel_orientation][world_space][0]
    ydc = map_dir_cosines[voxel_orientation][world_space][1]
    zdc = map_dir_cosines[voxel_orientation][world_space][2]

    # Types
    vtype = vol.volumeType if volume_type is None else volume_type
    dtype = vol.dtype if data_type is None else data_type
    labels = vol.labels if labels is None else labels
    
    if outfile is None:
        outfile = infile
        tmpfile = infile.replace('.mnc', '')+'_tmp.mnc'
    else:
        tmpfile = outfile

    outvol = volumeFromDescription(outputFilename=tmpfile,
                                   dimnames=["zspace", "yspace", "xspace"],
                                   sizes=new_data.shape,
                                   starts=[-c for c in reversed(centers)],
                                   steps=[s for s in reversed(steps)],
                                   x_dir_cosines=xdc,
                                   y_dir_cosines=ydc,
                                   z_dir_cosines=zdc,
                                   volumeType=vtype,
                                   dtype=dtype,
                                   labels=labels)

    outvol.data = new_data
    outvol.writeFile()
    outvol.closeVolume()
    
    if outfile == infile:
        os.rename(tmpfile, infile)
        
    return outfile

In [26]:
def download_data(experiment, outdir):
    
    """ """
    
    experiment_id = experiment[0]
    gene = experiment[1]
    
    rawfile, success = fetch_expression(experiment_id, outdir = outdir)

    if success == 1:
        mincfile, success = rawtominc_wrapper(infile = rawfile)
    
        outfile = transform_space(infile = mincfile, voxel_orientation = 'RAS', world_space = 'MICe', expansion_factor = 1.0)
    
        os.rename(outfile, outdir+'{}_{}.mnc'.format(gene, experiment_id))
    
    return

In [None]:
#Get command line arguments
dataset = 'sagittal'
outdir = 'data/expression/'
metadata = 'AMBA_metadata_sagittal.csv'
parallel = False

#If outdir does not exist, create it
if os.path.exists(outdir) == False:
    print('Output directory {} not found. Creating it...'.format(outdir))
    os.mkdir(outdir)

#If AMBA metadata file not found, download it from the web
if os.path.isfile(outdir+metadata) == False:
    print('AMBA metadata file {} not found in {}. Fetching from API...'.format(metadata, outdir))
    fetch_metadata(dataset = dataset,
                   outdir = outdir,
                   outfile = metadata)

#Import AMBA metadata
dfMetadata = pd.read_csv(outdir+metadata, index_col=None)

#Extract experiment IDs and gene names from metadata
experiments = [(dfMetadata.loc[i, 'experiment_id'], dfMetadata.loc[i, 'gene']) for i in range(0, dfMetadata.shape[0])]

#Create output sub-directory based on data set specified
outdir = outdir+dataset+'/'
if os.path.exists(outdir) == False:
    os.mkdir(outdir)

In [29]:
experiments

[(375, 'Ebf1'),
 (386, 'Efnb1'),
 (81600550, 'Tmem30a'),
 (68269598, 'Csrp1'),
 (68844361, 'Slc4a1ap'),
 (564, 'Hist1h2bc'),
 (100144858, '1700101E01Rik'),
 (68203460, 'Zfp46'),
 (67979033, 'Arf6'),
 (75990519, 'Zfp647'),
 (71763900, 'Med22'),
 (100145378, 'Oprk1'),
 (695, 'Negr1'),
 (69782640, 'Pabpc5'),
 (71809286, 'Dock6'),
 (71808729, 'Map3k14'),
 (69783316, 'Prl7d1'),
 (81600160, 'Kirrel3os'),
 (71809385, 'Megf11'),
 (71809346, 'Samd12'),
 (69783089, 'Phip'),
 (68203496, 'Brox'),
 (100145312, 'Dgkk'),
 (71809259, 'Ppm1h'),
 (100145313, 'Alpl'),
 (68077015, 'Lcmt1'),
 (68443220, 'Ube2h'),
 (68193054, 'Tceal1'),
 (81600580, 'Wnt11'),
 (81600582, 'Dr1'),
 (69783220, 'Pira1'),
 (632488, 'Bspry'),
 (633293, 'Mrpl12'),
 (81790680, 'Crxos'),
 (81600604, 'Ppp1r12a'),
 (81790686, 'Figla'),
 (81790690, 'Gabrb2'),
 (81790692, 'Hcrtr2'),
 (81790670, 'Foxd1'),
 (81790662, 'Foxj1'),
 (81599628, 'Nmd3'),
 (81599624, 'Akirin1'),
 (81599610, 'Cdk9'),
 (81599612, 'Eif3g'),
 (678682, 'Mrps5'),
 (815

In [27]:
experiment_id = experiments[0][0]
gene = experiments[0][1]

# rawfile, success = fetch_expression(experiment_id, outdir = outdir)

download_data(experiment = experiments[0], outdir = outdir)



Error for experiment 375: "There is no item named 'energy.raw' in the archive". Ignoring.


In [30]:
#Partial version of function for iteration
download_data_partial = partial(download_data, outdir = outdir)

print('Downloading {} AMBA dataset to: {}'.format(dataset, outdir))
if parallel:

    nproc = args['nproc']
    pool = mp.Pool(nproc)

    print('Running in parallel on {} CPUs...'.format(nproc))

    #Download data in parallel. Show progress bar.
    results_tqdm = []
    for result in tqdm(pool.imap(download_data_partial, experiments), total = len(experiments)):
        results_tqdm.append(result)

    pool.close()
    pool.join()

else:

    experiments_tqdm = tqdm(experiments)
    results = list(map(download_data_partial, experiments_tqdm))

Downloading sagittal AMBA dataset to: data/expression/sagittal/


  0%|                                         | 1/21734 [00:00<56:36,  6.40it/s]

Error for experiment 375: "There is no item named 'energy.raw' in the archive". Ignoring.


  0%|                                         | 2/21734 [00:00<55:10,  6.56it/s]

Error for experiment 386: "There is no item named 'energy.raw' in the archive". Ignoring.


  0%|                                       | 6/21734 [00:02<2:05:35,  2.88it/s]

Error for experiment 564: "There is no item named 'energy.raw' in the archive". Ignoring.


  0%|                                      | 13/21734 [00:05<2:12:38,  2.73it/s]

Error for experiment 695: "There is no item named 'energy.raw' in the archive". Ignoring.


  0%|                                      | 41/21734 [00:21<3:09:53,  1.90it/s]


KeyboardInterrupt: 

In [34]:
outdir
outdir = 'data/expression'
os.path.join(outdir, dataset, '')

'data/expression/sagittal/'

This download will take GB of space. Proceed? (y/n)n
This download will take GB of space. Proceed? (y/n)y
