In [116]:
import os
import numpy as np
import pandas as pd
from pyminc.volumes.factory import volumeFromFile
from glob import glob
from tqdm import tqdm
from functools import partial

In [117]:
def import_cluster_mask(cluster, mask):
    
    mask_vol = volumeFromFile(mask)
    mask_array = np.array(mask_vol.data.flatten())
    mask_vol.closeVolume()
    
    cluster_vol = volumeFromFile(cluster)
    cluster_array = np.array(cluster_vol.data.flatten())
    cluster_vol.closeVolume()
    
    cluster_masked = cluster_array[mask_array == 1]
    
    return cluster_masked

In [118]:
def aggregate_expression(cluster, expression):
    return np.array(expression
                    .loc[cluster == 1]
                    .mean())

In [119]:
def wrapper(cluster, mask, expression):
    
    cluster = import_cluster_mask(cluster = cluster,
                                  mask = mask)
    avg_expr = aggregate_expression(cluster = cluster,
                                    expression = expression)
    
    return avg_expr

In [2]:
infile = 'data/MouseExpressionMatrix_voxel_coronal_log2_grouped_imputed_homologs_scaled.csv'
dfExpression = pd.read_csv(infile)

In [8]:
dfExpression.shape

(63545, 2580)

In [94]:
wrapper_partial = partial(wrapper, 
                          mask = maskfile,
                          expression = dfExpression)

In [109]:
cluster_files = glob('data/mouse/cluster_masks/*.mnc')

In [110]:
arrays = list(map(wrapper_partial, tqdm(cluster_files)))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 432/432 [01:51<00:00,  3.89it/s]


In [114]:
df_avg = pd.DataFrame(np.asarray(arrays),
                     index = [os.path.basename(file) for file in cluster_files],
                     columns = dfExpression.columns)

In [115]:
df_avg

Unnamed: 0,A4GALT,AACS,AARS,ABAT,ABCB10,ABCB6,ABCD2,ABHD11,ABHD3,ABHD6,...,ZNF365,ZNF423,ZNF462,ZNF483,ZNF521,ZNF536,ZNF618,ZNF790,ZSWIM6,ZYX
Group_4_Clusternum_9_ES_abs_200_median_mask_threshold0.1.mnc,-0.004898,0.010841,-0.003427,-0.008275,0.005848,-0.001731,-0.020299,-0.008153,-0.012463,0.009670,...,0.007031,-0.021081,-0.016408,0.010716,-0.081891,-0.006610,-0.009774,0.000138,0.004549,-0.007603
Group_3_Clusternum_7_ES_abs_200_median_mask_threshold0.5.mnc,-0.087248,0.052223,-0.039435,-0.006251,-0.111652,0.062149,0.100238,0.060348,-0.144860,0.043906,...,0.149598,0.009961,-0.113680,-0.173589,-0.231182,-0.064090,-0.145468,-0.052464,0.142591,-0.130690
Group_5_Clusternum_10_ES_rel_200_mean_mask_threshold0.5.mnc,-0.015044,-0.020461,-0.069439,0.018821,0.078730,-0.168973,-0.124918,-0.003569,-0.053468,-0.027640,...,0.096626,-0.059513,0.081634,0.239696,-0.238893,0.029474,-0.044584,-0.008776,0.090201,-0.061500
Group_7_Clusternum_9_ES_abs_200_median_mask_threshold0.5.mnc,0.068855,-0.064739,-0.005274,0.010032,0.046309,0.052823,0.097043,0.024344,0.117323,0.051243,...,-0.104754,0.028590,0.043266,-0.163177,0.216465,-0.000889,-0.007157,0.007164,-0.073492,0.044331
Group_7_Clusternum_8_ES_rel_200_median_mask_threshold0.5.mnc,-0.272468,0.004310,-0.000134,0.031303,-0.068875,0.094875,0.113668,-0.012624,-0.045689,-0.098940,...,0.030215,-0.013735,-0.232062,0.144248,-0.019165,0.002752,-0.158415,0.086897,-0.035043,0.022719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Group_3_Clusternum_3_ES_abs_200_median_mask_threshold0.1.mnc,-0.028060,0.047853,-0.022992,-0.012707,-0.029811,0.004386,-0.050884,-0.022960,-0.085204,-0.018565,...,0.084075,-0.012098,-0.068323,0.056336,-0.167774,-0.013969,-0.033822,-0.025009,0.039772,-0.054316
Group_4_Clusternum_10_ES_rel_200_median_mask_threshold0.5.mnc,-0.029762,-0.016065,-0.024707,0.071561,-0.039510,0.023816,-0.018524,-0.033869,-0.032548,-0.131298,...,0.064136,0.104572,-0.003176,0.041647,0.021239,-0.018045,-0.013835,-0.008235,0.095382,-0.104096
Group_1_Clusternum_5_ES_rel_200_median_mask_threshold0.1.mnc,-0.004793,0.005075,0.005650,0.011425,-0.008992,-0.002996,-0.011121,0.002076,0.020099,-0.007067,...,-0.013174,0.005337,0.005766,0.016416,0.012773,-0.006085,0.022187,-0.005770,-0.006106,0.013806
Group_9_Clusternum_9_ES_abs_200_median_mask_threshold0.1.mnc,-0.004959,0.009815,-0.001357,0.003278,-0.009179,-0.000258,0.006958,-0.007763,-0.008990,-0.000603,...,0.014497,-0.001159,-0.018294,0.020642,-0.004024,-0.000444,0.012964,-0.003621,0.009740,-0.005962


In [2]:
from datatable import fread

In [13]:
mousefile = 'data/MouseExpressionMatrix_voxel_coronal_log2_grouped_imputed_homologs_GM_labelled_67_scaled.csv'
mouse = fread(mousefile, header = True).to_pandas()

In [19]:
mouse.shape

(52463, 2581)

In [16]:
mouse.Region

0             Piriform-amygdalar area
1             Cortical amygdalar area
2             Cortical amygdalar area
3             Cortical amygdalar area
4             Cortical amygdalar area
                     ...             
52458    Postpiriform transition area
52459    Postpiriform transition area
52460         Cortical subplate-other
52461         Cortical subplate-other
52462         Cortical amygdalar area
Name: Region, Length: 52463, dtype: object

In [17]:
humanfile = 'data/HumanExpressionMatrix_samples_pipeline_v1_homologs_GM_labelled_88_scaled.csv'
human = fread(humanfile, header = True).to_pandas()

In [18]:
human.shape

(3682, 2581)

In [20]:
human.Region

0       paracentral lobule, anterior part
1                               claustrum
2                               claustrum
3                                thalamus
4                               CA4 field
                      ...                
3677                             amygdala
3678                    cerebellar nuclei
3679                    cerebellar nuclei
3680                    cerebellar nuclei
3681                    cerebellar nuclei
Name: Region, Length: 3682, dtype: object