In [1]:
import sys
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr


In [2]:
#Read the tab-separated targets table (first column as index) and keep only
#the identifier and description columns

targets_path = 'test_apa/f3c0/testg-0/acc.txt'

#The selection is chained directly onto read_csv so the full table is never bound to a name
targets_strand_df = pd.read_csv(
    targets_path,
    sep='\t',
    index_col=0,
)[['identifier', 'description']]


In [3]:
#Record each target's original row position (0..N-1, int32) in a 'row_index' column

n_targets = len(targets_strand_df)
targets_strand_df['row_index'] = np.arange(n_targets, dtype='int32')

#Collect the target identifiers as a plain list (used as dataframe column names)
identifier_values = targets_strand_df['identifier'].values
cols = identifier_values.tolist()


In [4]:
#Load the gene ids: the unnamed first column of the gene predictions table,
#renamed to 'gene_id'; every other column is dropped

gene_preds_path = 'test_apa/f3c0/testg-0/gene_preds.tsv'

gene_df = (
    pd.read_csv(gene_preds_path, sep='\t')
    .rename(columns={'Unnamed: 0' : 'gene_id'})
    [['gene_id']]
    .copy()
    .reset_index(drop=True)
)


In [5]:
#Ensemble the per-replicate gene-span coverage and score every gene per target

def _within_gene_log_pearson(span_preds, span_targets, n_targets) :
    """Pearson correlation of log2(x + 1) predictions vs. targets, per target column.

    Returns a float32 vector of length n_targets. An entry is NaN unless both
    the prediction column and the target column have variance above 1e-6
    (a NaN variance therefore also yields NaN).
    """
    corr_by_target = np.full(n_targets, np.nan, dtype='float32')
    
    for target_ix in range(n_targets) :
        pred_col = span_preds[:, target_ix]
        target_col = span_targets[:, target_ix]
        
        #Leave the NaN placeholder unless both columns actually vary
        if not (pred_col.var() > 1e-6 and target_col.var() > 1e-6) :
            continue
        
        corr_by_target[target_ix] = pearsonr(np.log2(pred_col + 1.), np.log2(target_col + 1.))[0]
    
    return corr_by_target


gene_ids = gene_df['gene_id'].values.tolist()
num_targets_strand = len(targets_strand_df)

#Fold to evaluate and the replicate indices averaged into the ensemble
fold_ix = 3
cross_index = [0, 1, 2, 3]

gene_within = []
gene_wvar = []

for gene_i, gene_id in enumerate(gene_ids) :
    
    #Progress message every 100 genes
    if gene_i % 100 == 0 :
        print('Processing gene ' + str(gene_i))
    
    #Cache sub-directory: last character of the gene id part before the first '.'
    hash_code = str(gene_id.split(".")[0][-1])
    
    preds_sum, targets_sum = None, None
    
    #Accumulate cached predictions/targets over the replicates
    for cross_ix in cross_index :
        rep_dir = 'test_apa/f' + str(fold_ix) + 'c' + str(cross_ix) + '/testgs-0'
        
        rep_preds = np.load('%s/gene_within/%s/preds/%s_preds.npy' % (rep_dir, hash_code, gene_id)).astype('float32')
        rep_targets = np.load('%s/gene_within/%s/targets/%s_targets.npy' % (rep_dir, hash_code, gene_id)).astype('float32')
        
        #First replicate seeds the running sums; later ones are added in place
        if preds_sum is None :
            preds_sum, targets_sum = rep_preds, rep_targets
            continue
        
        preds_sum += rep_preds
        targets_sum += rep_targets
    
    #Replicate mean
    n_replicates = float(len(cross_index))
    mean_preds = preds_sum / n_replicates
    mean_targets = targets_sum / n_replicates
    
    #Per-target within-gene correlation, plus per-target variance of the averaged targets
    gene_within.append(_within_gene_log_pearson(mean_preds, mean_targets, num_targets_strand))
    gene_wvar.append(mean_targets.var(axis=0))

gene_within = np.array(gene_within)
gene_wvar = np.array(gene_wvar)

#Results are written next to the first replicate (c0) of the fold
out_dir = 'test_apa/f' + str(fold_ix) + 'c0/testgs-0'

target_columns = targets_strand_df.identifier

#Store the within-gene correlations (genes x targets)
genes_within_df = pd.DataFrame(gene_within, index=gene_ids, columns=target_columns)
genes_within_df.to_csv('%s/gene_within_ens.tsv' % out_dir, sep='\t')

#Store the within-gene target variances (genes x targets)
genes_var_df = pd.DataFrame(gene_wvar, index=gene_ids, columns=target_columns)
genes_var_df.to_csv('%s/gene_var_ens.tsv' % out_dir, sep='\t')


Processing gene 0
Processing gene 100
Processing gene 200
Processing gene 300
Processing gene 400
Processing gene 500
Processing gene 600
Processing gene 700
Processing gene 800
Processing gene 900
Processing gene 1000
Processing gene 1100
Processing gene 1200
Processing gene 1300
Processing gene 1400
Processing gene 1500
Processing gene 1600
Processing gene 1700
Processing gene 1800
Processing gene 1900
