In [1]:
import sys
import os
import numpy as np
import pandas as pd

import h5py

import gc


In [None]:
#Load scores and auxiliary data, compute mean over replicates, and save new scores
#
#For every experiment directory in out_dirs, the per-replicate files
#'<out_dir>/scores_f<fold>c<cross>.h5' are read, their 'grads' datasets are
#summed element-wise and divided by the number of replicates, and the result is
#written to '<out_dir>/scores_mean.h5' together with the auxiliary datasets
#('seqs', 'gene', 'chr', 'start', 'end', 'strand' and, when present, 'masks',
#'utr_start', 'utr_end') copied from the first replicate that is read.

fold_index = [3]
cross_index = [0, 1, 2, 3]

out_dirs = [
    'saved_models/linnar_brain_astrocyte',
    'saved_models/linnar_brain_medium_spiny_neuron',
    'saved_models/linnar_brain_microglia',
    'saved_models/linnar_brain_tabula',
    'saved_models/linnar_brain_oligodendrocyte',
    'saved_models/linnar_brain_astrocyte_smooth_095',
    'saved_models/linnar_brain_medium_spiny_neuron_smooth_095',
    'saved_models/linnar_brain_microglia_smooth_095',
    'saved_models/linnar_brain_tabula_smooth_095',
    'saved_models/linnar_brain_oligodendrocyte_smooth_095',
    'saved_models/linnar_brain_astrocyte_smooth_098',
    'saved_models/linnar_brain_medium_spiny_neuron_smooth_098',
    'saved_models/linnar_brain_microglia_smooth_098',
    'saved_models/linnar_brain_tabula_smooth_098',
    'saved_models/linnar_brain_oligodendrocyte_smooth_098',
]

#Number of replicate files averaged per experiment
n_replicates = len(fold_index) * len(cross_index)
if n_replicates == 0 :
    raise ValueError("fold_index and cross_index must both be non-empty")

#Loop over experiments
for out_dir in out_dirs :

    print("-- " + str(out_dir) + " --")

    #Reset per-experiment state so nothing leaks from the previous directory
    seqs = None
    grads = None
    genes = None
    chrs = None
    starts = None
    ends = None
    utr_starts = None
    utr_ends = None
    strands = None
    masks = None
    for fold_ix in fold_index :
        for cross_ix in cross_index :

            score_path = '%s/scores_f%sc%s.h5' % (out_dir, fold_ix, cross_ix)

            #The context manager closes the file even if a dataset is missing
            with h5py.File(score_path, 'r') as score_file :

                #Only the very first replicate initializes the accumulators.
                #(Testing the fold position alone is not enough: with a single
                #fold every replicate would re-initialize grads and the sum
                #would never be accumulated.)
                if grads is None :
                    seqs = score_file['seqs'][()]
                    grads = score_file['grads'][()]
                    genes = score_file['gene'][()]
                    chrs = score_file['chr'][()]
                    starts = score_file['start'][()]
                    ends = score_file['end'][()]
                    strands = score_file['strand'][()]

                    #Read optional datasets into memory with [()]; a bare
                    #Dataset handle becomes unusable once the file is closed
                    if 'masks' in score_file.keys() :
                        masks = score_file['masks'][()]

                    if 'utr_start' in score_file.keys() :
                        utr_starts = score_file['utr_start'][()]
                        utr_ends = score_file['utr_end'][()]
                else :
                    #NOTE(review): the sum is accumulated in the dtype the
                    #grads were stored with - confirm its precision is adequate
                    grads += score_file['grads'][()]

            #Collect garbage
            gc.collect()

    grads /= float(n_replicates)

    #Re-save datasets in h5. The output is opened only after all inputs were
    #read, so a failed read does not leave a truncated scores_mean.h5 behind.
    with h5py.File('%s/scores_mean.h5' % out_dir, 'w') as scores_h5 :
        scores_h5.create_dataset('seqs', data=np.array(seqs, dtype='bool'))
        scores_h5.create_dataset('grads', data=np.array(grads, dtype='float16'))

        scores_h5.create_dataset('gene', data=np.array(genes, dtype='S'))
        scores_h5.create_dataset('chr', data=np.array(chrs, dtype='S'))
        scores_h5.create_dataset('start', data=np.array(starts))
        scores_h5.create_dataset('end', data=np.array(ends))
        scores_h5.create_dataset('strand', data=np.array(strands, dtype='S'))

        if masks is not None :
            scores_h5.create_dataset('masks', data=np.array(masks, dtype='bool'))

        if utr_starts is not None :
            scores_h5.create_dataset('utr_start', data=np.array(utr_starts))
            scores_h5.create_dataset('utr_end', data=np.array(utr_ends))
