In [1]:
import sys
import os
import numpy as np
import pandas as pd

import h5py

import gc


In [2]:
#Load scores and auxiliary data, compute mean, and save new scores
#
#For every experiment directory in out_dirs, the 'grads' dataset is averaged
#over the 'scores_mean.h5' files of all variants listed in grad_suffixes.
#All other datasets ('seqs', 'gene', 'chr', 'start', 'end', 'strand') are
#taken from the first variant; 'masks' is taken from the first variant that
#contains it. The result is written to '<out_dir><out_suffix>/scores_mean.h5'.

out_suffix = '_smooth'

out_dirs = [
    'saved_models/tabula_erytrocyte',
    'saved_models/tabula_hematopoietic',
    'saved_models/tabula_hepatocyte',
    'saved_models/tabula_muscle',
    'saved_models/tabula_tcell',
]

grad_suffixes = [
    '_smooth_095',
    '_smooth_098',
    '',
]

#Loop over experiments
for out_dir in out_dirs :

    print("-- " + str(out_dir) + " --")

    #Create the output directory; exist_ok makes re-running the cell harmless
    os.makedirs(out_dir + out_suffix, exist_ok=True)

    seqs = None
    grads = None
    genes = None
    chrs = None
    starts = None
    ends = None
    strands = None
    masks = None
    for grad_suffix in grad_suffixes :

        #Context manager closes the input file even if a read fails
        with h5py.File(('%s%s/scores_mean.h5' % (out_dir, grad_suffix)), 'r') as score_file :

            if seqs is None :
                seqs = score_file['seqs'][()]
                grads = score_file['grads'][()]
                #Accumulate in at least float32 so that summing (presumably
                #float16) inputs does not lose precision or overflow.
                #NOTE: this temporarily uses more memory than the stored dtype.
                grads = grads.astype(np.result_type(grads.dtype, np.float32))
                genes = score_file['gene'][()]
                chrs = score_file['chr'][()]
                starts = score_file['start'][()]
                ends = score_file['end'][()]
                strands = score_file['strand'][()]
            else :
                grads += score_file['grads'][()]
            if masks is None and 'masks' in score_file.keys() :
                #Read into memory with [()]; a bare dataset handle would
                #become invalid as soon as the file is closed
                masks = score_file['masks'][()]

        #Collect garbage
        gc.collect()

    grads /= float(len(grad_suffixes))

    #Re-save datasets in h5; the output file is only opened (and truncated)
    #once all inputs were read successfully, and is always closed on exit
    with h5py.File('%s%s/scores_mean.h5' % (out_dir, out_suffix), 'w') as scores_h5 :
        scores_h5.create_dataset('seqs', data=np.array(seqs, dtype='bool'))
        scores_h5.create_dataset('grads', data=np.array(grads, dtype='float16'))

        scores_h5.create_dataset('gene', data=np.array(genes, dtype='S'))
        scores_h5.create_dataset('chr', data=np.array(chrs, dtype='S'))
        scores_h5.create_dataset('start', data=np.array(starts))
        scores_h5.create_dataset('end', data=np.array(ends))
        scores_h5.create_dataset('strand', data=np.array(strands, dtype='S'))

        if masks is not None :
            scores_h5.create_dataset('masks', data=np.array(masks, dtype='bool'))


-- tabula_erytrocyte --
-- tabula_hematopoietic --
-- tabula_hepatocyte --
-- tabula_muscle --
-- tabula_tcell --
