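"""evaluate_with_deseq2.py

Evaluate DESeq2 (via pydeseq2) as a baseline predictor of sequence enrichment:
fit DESeq2 to pre-/post-selection replicate counts, then compare the resulting
log-fold-change estimates against enrichment scores, reporting standard and
top-K evaluation metrics per fold. (Summary inferred from the code below.)
"""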
import os
import argparse
from collections import defaultdict

import numpy as np
import pandas as pd
from pydeseq2.dds import DeseqDataSet

import data_prep
import evaluation_utils

def main(args):
    save_path = args.save_path
    if save_path is not None and os.path.exists(save_path):
        raise IOError('Saved file already exists at {}. Choose another save_path.'.format(save_path))

    data_path = args.data_path
    print('\nLoading data from {}'.format(data_path))
    df = pd.read_csv(data_path)

    if args.enrichment_column is not None:
        print('Using enrichment scores in {}.'.format(args.enrichment_column))
        enrich_scores = df[args.enrichment_column].values
    else:
        print('Computing enrichment scores from \'count_pre\' and \'count_post\'.')
        # Add a pseudo-count of 1 so unobserved sequences don't divide by zero.
        pre_counts = df['count_pre'].values + 1
        post_counts = df['count_post'].values + 1
        enrich_scores = data_prep.calculate_enrichment_scores(pre_counts, post_counts, pre_counts.sum(), post_counts.sum())
        # Keep only the first column of the returned scores.
        enrich_scores = enrich_scores[:, 0]
    n_replicates = args.n_replicates
    count_cols = ['pre_count_{}'.format(n) for n in range(n_replicates)] + ['post_count_{}'.format(n) for n in range(n_replicates)]
    df = df[count_cols]
    count_sum = df.sum(axis=1)
    df = df.set_axis(['seq{}'.format(i) for i in range(len(df))])
    # One row per replicate sample, labeling each as 'pre' or 'post' for the DESeq2 design.
    clinical_df = pd.DataFrame(data={'condition': ['pre'] * n_replicates + ['post'] * n_replicates}, index=count_cols)
    results = {}
    idx_paths = args.idx_paths
    n_folds = 1 if idx_paths is None else len(idx_paths)
    results['meta'] = {
        'data_path': data_path,
        'model_paths': ['DEseq2'] * n_folds,
    }
    if idx_paths is not None:
        idx_paths = list(idx_paths)
        results['meta']['idx_paths'] = idx_paths
    results['metrics'] = defaultdict(list)
    for i in range(n_folds):
        print('\nFold {}'.format(i))
        if idx_paths is not None:
            idx_path = idx_paths[i]
            print('\n\tUsing indices {}'.format(idx_path))
            test_idx = np.load(idx_path)
            # Invert the loaded test indices so that evaluation uses the observed sequences in the train set.
            test_idx = np.delete(np.arange(len(df)), test_idx)
            # Remove unobserved sequences with 0 counts in both conditions.
            mask = count_sum.values[test_idx] > 0
            test_idx = test_idx[mask]
        else:
            test_idx = np.arange(len(df))[count_sum.values > 0]
        truth = enrich_scores[test_idx]
        fold_df = df.iloc[test_idx]  # + 1  # pseudo-count to avoid divide by zero.
        # pydeseq2 expects samples as rows and sequences (genes) as columns.
        fold_df = fold_df.T
        # Note: newer pydeseq2 releases pass the sample annotations as `metadata` rather than `clinical`.
        dds = DeseqDataSet(counts=fold_df, clinical=clinical_df, design_factors='condition')
        dds.deseq2()
        # Negate the pre-vs-post log-fold change so larger values mean greater post-selection enrichment.
        preds = -1 * dds.varm['LFC'].condition_pre_vs_post.values
        metrics = evaluation_utils.get_eval_metrics(truth, preds)
        print('\n\tCurrent metrics:')
        evaluation_utils.print_eval_metrics(metrics)
        for k in metrics.keys():
            results['metrics'][k].append(metrics[k])

        print('\n\tComputing top-K metrics...')
        fracs = np.linspace(0, 1, args.num_fracs, endpoint=False)
        results['meta']['culled_fracs'] = fracs
        results['metrics']['culled_pearson'].append(
            evaluation_utils.calculate_culled_correlation(preds, truth, fracs))
        results['metrics']['culled_spearman'].append(
            evaluation_utils.calculate_culled_correlation(preds, truth, fracs, correlation_type='spearman'))
        results['metrics']['culled_mse'].append(
            evaluation_utils.calculate_culled_correlation(preds, truth, fracs, correlation_type='mse'))
        try:
            # NDCG expects positive ground truth, so pass enrichment instead of log-enrichment.
            culled_ndcg = evaluation_utils.calculate_culled_ndcg(np.exp(preds), np.exp(truth), fracs)
            results['metrics']['culled_ndcg'].append(culled_ndcg)
        except Exception:
            # NDCG can fail when predictions are very large and np.exp(.) overflows.
            print('\n\tUnable to compute culled NDCG.')
    print('\nFinal evaluation metrics:')
    evaluation_utils.print_eval_metrics(results['metrics'])
    if save_path is not None:
        print('\nSaving results to {}'.format(save_path))
        np.save(save_path, results)
    return
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('data_path', help='path to counts dataset', type=str)
    parser.add_argument('--enrichment_column', help='name of column in counts dataset containing precomputed enrichment scores', type=str)
    parser.add_argument('--idx_paths', help='paths to files of test indices, one per fold', nargs='+', type=str)
    parser.add_argument('--num_fracs', default=100, help='number of fractions at which to compute top-K performance metrics', type=int)
    parser.add_argument('--save_path', help='path to which to save output', type=str)
    parser.add_argument('--n_replicates', default=3, help='number of pre/post replicates in counts dataset', type=int)
    args = parser.parse_args()
    main(args)
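
# Example invocation (hypothetical file names, for illustration only):
#   python evaluate_with_deseq2.py counts.csv \
#       --idx_paths fold0_test_idx.npy fold1_test_idx.npy \
#       --n_replicates 3 --num_fracs 100 --save_path deseq2_results.npy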