In [1]:
import sys
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr


In [2]:
#Read targets (first file column is the index; keep only identifier + description)

targets_file = 'test_apa/f3c0/testg-0/acc.txt'
target_fields = ['identifier', 'description']
targets_strand_df = pd.read_csv(targets_file, sep='\t', index_col=0)[target_fields]


In [3]:
#Store original target indices (0-based row position of every target)

n_targets = targets_strand_df.shape[0]
targets_strand_df['row_index'] = np.arange(n_targets, dtype='int32')

#Get dataframe columns (identifiers), as a plain python list
cols = targets_strand_df['identifier'].to_numpy().tolist()


In [16]:
#Load gene prediction dataframe(s), one per cross-validation replicate

num_targets_strand = targets_strand_df.shape[0]

#Fold and replicates that make up the ensemble
fold_ix = 3
cross_index = [0, 1, 2, 3]

#Loop over replicates (gene_dfs keeps the same order as cross_index)
gene_dfs = []
for cross_ix in cross_index :
    preds_path = 'test_apa/f%sc%s/testg-0/gene_preds.tsv' % (fold_ix, cross_ix)
    
    #An 'Unnamed: 0' column (header-less first column) holds the gene ids
    gene_df = pd.read_csv(preds_path, sep='\t')
    gene_df = gene_df.rename(columns={'Unnamed: 0' : 'gene_id'}).reset_index(drop=True)
    
    gene_dfs.append(gene_df)


In [17]:
#Compute ensemble gene expression predictions (mean over all loaded replicates)

#Replicates are averaged row-by-row, so every replicate must list the same
#genes in the same order as the first one; otherwise different genes would be
#silently averaged together.
ref_gene_ids = gene_dfs[0]['gene_id'].values
for rep_pos, rep_df in enumerate(gene_dfs[1:], start=1) :
    if not np.array_equal(rep_df['gene_id'].values, ref_gene_ids) :
        raise ValueError('Replicate at position %d does not list the same genes in the same order as replicate 0' % rep_pos)

#Start from the first replicate so the gene_id column (and any other
#non-target columns) are carried over unchanged
gene_df_ens = gene_dfs[0].copy()

#Sum target columns over the remaining replicates. Iterate over the loaded
#frames themselves rather than indexing gene_dfs by replicate id, so the
#result stays correct when cross_index is not exactly 0..n-1.
ens_sum = gene_dfs[0][cols].values.copy()
for rep_df in gene_dfs[1:] :
    ens_sum = ens_sum + rep_df[cols].values

#Divide by the number of frames actually summed
gene_df_ens[cols] = ens_sum / float(len(gene_dfs))

gene_df_ens.set_index('gene_id').to_csv('test_apa/f3c0/testg-0/gene_preds_ens.tsv', sep='\t')


In [6]:
#Re-load targets and ensemble predictions

out_dir = 'test_apa/f3c0/testg-0'

def _read_gene_table(file_name) :
    """Read a tab-separated table from out_dir, renaming an 'Unnamed: 0' column to 'gene_id'."""
    table_df = pd.read_csv('%s/%s' % (out_dir, file_name), sep='\t')
    return table_df.rename(columns={'Unnamed: 0' : 'gene_id'})

#Measured targets and ensemble predictions
genes_targets_df = _read_gene_table('gene_targets.tsv')
genes_preds_df = _read_gene_table('gene_preds_ens.tsv')

#Within-gene tables for the ensemble
genes_within_df = _read_gene_table('gene_within_ens.tsv')
genes_var_df = _read_gene_table('gene_var_ens.tsv')


In [14]:
# Re-calculate accuracy stats for ensemble
#
# For every target column this computes, across genes: Pearson correlation and
# explained variance on the raw values, the same two metrics on
# quantile-normalized / row-centered values, and the mean of the within-gene
# table over the genes whose variance-table entry is above that target's 80th
# percentile. Results are written to '<out_dir>/acc_ens.txt' and summarized.

from qnorm import quantile_normalize
from sklearn.metrics import explained_variance_score

#Get number of targets (after collapsing strands)
num_targets_strand = len(targets_strand_df)

#All metrics below pair rows by position, so tables that are compared with
#each other must list the same genes in the same order. Targets are only
#paired with predictions, and the within-gene table only with the variance
#table, so the check is done per pair.
for name_a, df_a, name_b, df_b in [
    ('gene_targets', genes_targets_df, 'gene_preds_ens', genes_preds_df),
    ('gene_within_ens', genes_within_df, 'gene_var_ens', genes_var_df),
] :
    if 'gene_id' in df_a.columns and 'gene_id' in df_b.columns :
        if not np.array_equal(df_a['gene_id'].values, df_b['gene_id'].values) :
            raise ValueError('%s and %s do not list the same genes in the same order' % (name_a, name_b))

#Extract (genes x targets) matrices, columns ordered like cols
gene_targets = np.array(genes_targets_df[cols].values)
gene_preds = np.array(genes_preds_df[cols].values)
gene_within = np.array(genes_within_df[cols].values)
gene_wvar = np.array(genes_var_df[cols].values)

#Quantile-normalize and subtract mean (targets)
#The mean is taken over the last axis, i.e. per gene across targets
gene_targets_norm = quantile_normalize(gene_targets, ncpus=2)
gene_targets_norm = gene_targets_norm - gene_targets_norm.mean(axis=-1, keepdims=True)

#Quantile-normalize and subtract mean (predictions)
gene_preds_norm = quantile_normalize(gene_preds, ncpus=2)
gene_preds_norm = gene_preds_norm - gene_preds_norm.mean(axis=-1, keepdims=True)

#Per-target 80th percentile of the variance table (threshold for the mask below)
wvar_t = np.percentile(gene_wvar, 80, axis=0)

acc_pearsonr = []
acc_r2 = []
acc_npearsonr = []
acc_nr2 = []
acc_wpearsonr = []

#Loop over targets
for ti in range(num_targets_strand) :
    
    #Calculate metrics
    
    #Pearson correlation across genes (raw values)
    r_ti = pearsonr(gene_targets[:,ti], gene_preds[:,ti])[0]
    acc_pearsonr.append(r_ti)
    
    #NOTE: the 'r2' columns hold sklearn's explained-variance score, not r2_score
    r2_ti = explained_variance_score(gene_targets[:,ti], gene_preds[:,ti])
    acc_r2.append(r2_ti)
    
    #Same two metrics on the normalized, centered values
    nr_ti = pearsonr(gene_targets_norm[:,ti], gene_preds_norm[:,ti])[0]
    acc_npearsonr.append(nr_ti)
    
    nr2_ti = explained_variance_score(gene_targets_norm[:,ti], gene_preds_norm[:,ti])
    acc_nr2.append(nr2_ti)
    
    #Keep only genes strictly above this target's variance threshold
    var_mask = (gene_wvar[:,ti] > wvar_t[ti])
    
    #Average the within-gene values over the selected genes
    wr_ti = gene_within[:,ti][var_mask].mean()
    acc_wpearsonr.append(wr_ti)

#Create new dataframe (one row per target)
acc_df = pd.DataFrame({
    'identifier': targets_strand_df.identifier,
    'pearsonr': acc_pearsonr,
    'r2': acc_r2,
    'pearsonr_norm': acc_npearsonr,
    'r2_norm': acc_nr2,
    'pearsonr_gene': acc_wpearsonr,
    'description': targets_strand_df.description
})

#Store dataframe
acc_df.to_csv('%s/acc_ens.txt' % out_dir, sep='\t')

#Print summary (each metric averaged over targets)
print('%d genes' % gene_targets.shape[0])
print('Overall PearsonR:     %.4f' % np.mean(acc_df.pearsonr))
print('Overall R2:           %.4f' % np.mean(acc_df.r2))
print('Normalized PearsonR:  %.4f' % np.mean(acc_df.pearsonr_norm))
print('Normalized R2:        %.4f' % np.mean(acc_df.r2_norm))
print('Within-gene PearsonR: %.4f' % np.mean(acc_df.pearsonr_gene))


1940 genes
Overall PearsonR:     0.8691
Overall R2:           0.7506
Normalized PearsonR:  0.5754
Normalized R2:        0.3308
Within-gene PearsonR: 0.8123
