# Set up the Data Frame

In [2]:
from allensdk.core.cell_types_cache import CellTypesCache

# !! update this path so that it points to your external hard drive !!
ctc = CellTypesCache(manifest_file='e:/cell_types/manifest.json')

# get metadata for all the cells; later cells index each entry like a dict
cells_sdk = ctc.get_cells()

# print(...) with a single argument prints exactly the same text on Python 2
# and, unlike the bare print statement, also parses on Python 3.
print("There are %d cells in the cache" % len(cells_sdk))

There are 248 cells in the cache


In [3]:
import re

# helper function to break up structure into regions and layer
def split_region_layer(cell) :
    """Split a cell's structure acronym into a region part and a layer part.

    The acronym is cut at its first digit: everything before it is the
    region, the digit and everything after it is the layer (so 'VISp2/3'
    gives region 'VISp' and layer '2/3').

    Parameters
    ----------
    cell : mapping
        Must provide ``cell['structure']['acronym']`` as a string.

    Returns
    -------
    dict
        ``{'region': <text before first digit>, 'layer': <rest>}``.  When the
        acronym holds no digit at all, the whole acronym is the region and the
        layer is the empty string.
    """
    acronym = cell['structure']['acronym']

    # Default the cut to the end of the string.  Starting from 0 instead would
    # hand an acronym without any digit back as "no region, all layer".
    split_at = len(acronym)
    for idx, ch in enumerate(acronym) :
        if ch.isdigit() :
            split_at = idx
            break

    return {'region': acronym[:split_at], 'layer': acronym[split_at:]}
    
# Quick check on the first cell.  The parenthesised form prints the same dict
# on Python 2 and is also valid syntax on Python 3.
print(split_region_layer(cells_sdk[0]))

{'region': u'VISp', 'layer': u'4'}


In [4]:
# helper function to get the mouse line
def transgenic_drivers( cell ) :
    """Collect the names of the donor's driver-type transgenic lines.

    Walks ``cell['donor']['transgenic_lines']`` and keeps the ``'name'`` of
    every entry whose ``'transgenic_line_type_name'`` equals ``'driver'``.

    Returns a dict with the single key ``'mouse_line'`` whose value is those
    names, converted with ``str`` and joined with ``';'`` in their original
    order (an empty string when there is no driver line).
    """
    driver_names = []
    for line in cell['donor']['transgenic_lines'] :
        # skip every entry that is not a driver line
        if line['transgenic_line_type_name'] != 'driver' :
            continue
        driver_names.append( str(line['name']) )
    return {'mouse_line': ";".join( driver_names )}

# Quick check on the first cell.  The parenthesised form prints the same dict
# on Python 2 and is also valid syntax on Python 3.
print(transgenic_drivers(cells_sdk[0]))

{'mouse_line': 'Sst-IRES-Cre'}


In [5]:
# create temporary record dictionary with the metadata we want to keep

# identifier column; placed first when the column list is assembled
index_key = [
    'id',
]

# sample description: 'mouse_line', 'region' and 'layer' are produced by the
# helper functions above, 'hemisphere' is copied from the cell record itself
sample_keys = [
    'mouse_line',
    'region',
    'layer',
    'hemisphere',
]

# annotation fields copied straight from the cell record
annotation_keys = [
    'dendrite_type',
    'apical',
]

# flags copied straight from the cell record
modality_keys = [
    'has_morphology',
    'has_reconstruction',
]

def filter_list(keys,exclude_keys) :
    """Return the items of ``keys`` that do not occur in ``exclude_keys``.

    The order of ``keys`` is kept, repeated items stay repeated, and the
    result is always a new list.
    """
    kept = []
    for key in keys :
        if key in exclude_keys :
            continue
        kept.append(key)
    return kept

def clean_keys(keys) :
    """Return a new list holding ``str(x)`` for every item of ``keys``.

    Presumably used to turn the unicode keys returned by the SDK into plain
    strings - confirm against the data if that matters.
    """
    return list(map(str, keys))

# names dropped from each nested record before the remaining keys become columns
exclude_csl_keys = ['id','specimen_id']
exclude_ef_keys = ['id','specimen_id']
exclude_nr_keys = ['id','specimen_id','tags']

# cell soma location keys, read from the first cell's first location record
csl_keys = clean_keys(filter_list(
    cells_sdk[0]['cell_soma_locations'][0].keys(), exclude_csl_keys ))

# ephys feature keys, read from the first cell's first ephys record
ef_keys = clean_keys(filter_list(
    cells_sdk[0]['ephys_features'][0].keys(), exclude_ef_keys ))

# neuron reconstruction keys, read from the first cell flagged
# has_reconstruction (presumably other cells carry no reconstruction record)
c = [ x for x in cells_sdk if x['has_reconstruction']][0]
nr_keys = clean_keys(filter_list(
    c['neuron_reconstructions'][0].keys(), exclude_nr_keys ))

# full, ordered column list for the final table
columns = (index_key + sample_keys + annotation_keys + modality_keys
           + csl_keys + ef_keys + nr_keys)

In [6]:
def filter_dictonary_by_keys( d, k ) :
    """Build a new dict holding only the entries of ``d`` named in ``k``.

    Every name in ``k`` has to be present in ``d``; a missing one raises
    ``KeyError``.  ``d`` itself is not modified.
    """
    subset = {}
    for key in k :
        subset[key] = d[key]
    return subset
    
def fetch_data( c ) :
    """Flatten one cell's metadata into a single record dict.

    The record is assembled, in this order, from:
      * the top-level fields named by ``index_key``, ``annotation_keys`` and
        ``modality_keys``, plus ``'hemisphere'``;
      * the output of ``split_region_layer`` and ``transgenic_drivers``;
      * the ``csl_keys`` of the first ``'cell_soma_locations'`` entry;
      * the ``ef_keys`` of the first ``'ephys_features'`` entry;
      * the ``nr_keys`` of the first ``'neuron_reconstructions'`` entry, only
        when ``c['has_reconstruction']`` is truthy.

    Later sources overwrite earlier ones if they share a key.
    """
    top_level_keys = index_key + annotation_keys + modality_keys + ['hemisphere']
    record = filter_dictonary_by_keys( c, top_level_keys )

    # fields derived by the helper functions
    record.update( split_region_layer(c) )
    record.update( transgenic_drivers(c) )

    # nested records: only the first entry of each list is used
    soma_location = c['cell_soma_locations'][0]
    record.update( filter_dictonary_by_keys( soma_location, csl_keys) )
    ephys_features = c['ephys_features'][0]
    record.update( filter_dictonary_by_keys( ephys_features, ef_keys) )

    # the reconstruction record is only read for cells flagged as having one
    if not c['has_reconstruction'] :
        return record
    reconstruction = c['neuron_reconstructions'][0]
    record.update( filter_dictonary_by_keys( reconstruction, nr_keys ) )
    return record

# one flat record dict per cell, in the same order as cells_sdk
cells_records = [fetch_data(c) for c in cells_sdk]

# Spot-check the first record.  print(...) with a single argument prints the
# same text on Python 2 and is also valid syntax on Python 3.
print(cells_records[0]['mouse_line'])

Sst-IRES-Cre


In [7]:
# create pandas dataframe
import pandas as pd

# build the table from the per-cell record dicts; `columns` fixes the column order
cells_df = pd.DataFrame.from_records( data=cells_records, columns=columns )

# preview the first five rows (the default for head)
cells_df.head(5)

Unnamed: 0,id,mouse_line,region,layer,hemisphere,dendrite_type,apical,has_morphology,has_reconstruction,normalized_depth,...,number_stems,number_branches,average_fragmentation,average_contraction,average_bifurcation_angle_remote,hausdorff_dimension,total_surface,max_branch_order,soma_surface,overall_height
0,324257146,Sst-IRES-Cre,VISp,4,left,aspiny,,True,True,0.410398,...,6.0,30.0,34.2333,0.776329,77.4101,1.12107,1416.64,6.0,111.175,215.758
1,469622566,Scnn1a-Tg3-Cre,VISp,5,right,spiny,truncated,True,False,0.423992,...,,,,,,,,,,
2,328876201,Sst-IRES-Cre,VISp,5,left,aspiny,,False,False,0.510874,...,,,,,,,,,,
3,466431949,Scnn1a-Tg3-Cre,VISl,4,left,spiny,truncated,True,False,0.46447,...,,,,,,,,,,
4,396903227,Scnn1a-Tg3-Cre,VISp,5,right,spiny,truncated,False,False,0.497223,...,,,,,,,,,,


In [8]:
# remove these 2 specimen ids since they shouldn't be included in the dataset;
# a single mask keeps every row whose id is not one of them
cells_df = cells_df[~cells_df['id'].isin([319070795, 341016267])]

In [9]:
# DataFrame.save() was deprecated in favour of to_pickle() and is absent from
# current pandas releases.  to_pickle() writes the same pickle format, and the
# file name is left unchanged so that existing readers still find the file.
# NOTE(review): despite the .h5 extension this file is a pickle, not HDF5 -
# read it back with pd.read_pickle(), or switch to to_hdf() if HDF5 was meant.
cells_df.to_pickle('cells_dataframe.h5')

