-
Notifications
You must be signed in to change notification settings - Fork 68
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
33be6b6
commit 52d14d2
Showing
4 changed files
with
357 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# R scripts | ||
|
||
Here you can find R scripts and markdown file for analyzing and visualizing | ||
data. | ||
|
||
* [eval_perf_single.Rmd](./eval_perf_single.Rmd): For visualizing performance data from `dcpg_eval_perf.py` of a single model. | ||
* [eval_perf_mult.Rmd](./eval_perf_mult.Rmd): For visualizing performance data from `dcpg_eval_perf.py` of multiple models. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,264 @@ | ||
--- | ||
title: "Prediction performance evaluation" | ||
date: "`r format(Sys.time(), '%Y-%m-%d')`" | ||
output: | ||
html_document: | ||
toc: yes | ||
--- | ||
|
||
```{r echo=F} | ||
# You can use this Rmarkdown file to visualize the performance data from | ||
# `dcpg_eval_perf.py` of multiple models. Use `eval_perf_single.Rmd` to | ||
# visualize performances of a single model. | ||
# Copy the file to the output directory of `dcpg_eval_perf.py` and adapt the | ||
# options in the section below. | ||
``` | ||
|
||
```{r, include=F} | ||
library(knitr) | ||
opts_chunk$set(echo=F, fig.width=12, warning=F, message=F) | ||
``` | ||
|
||
```{r, include=F} | ||
library(ggplot2) | ||
library(dplyr) | ||
library(tidyr) | ||
library(xtable) | ||
library(grid) | ||
``` | ||
|
||
<style> | ||
img { | ||
max-width: none; | ||
} | ||
</style> | ||
|
||
```{r} | ||
options(xtable.type='html') | ||
``` | ||
|
||
```{r} | ||
# Options | ||
opts <- list() | ||
# ============================================================================== | ||
# Define here the name and output directory of the `dcpg_eval_perf.py` for the | ||
# models you want to compare. | ||
opts$data_dirs <- c('DNA model'='../eval2', | ||
'CpG model'='../eval2', | ||
'Joint model'='../eval2') | ||
# ============================================================================== | ||
# You probably do not have to change the following options. | ||
# Name of the file with performance metrics. | ||
opts$metrics_name <- 'metrics.tsv*' | ||
# Name of the file with performance curves. | ||
opts$curves_names <- 'curves.tsv*' | ||
# Name of the annotation that corresponds to the genome-wide performance. | ||
opts$anno_global <- 'global' | ||
# Performance metrics that are shown. | ||
opts$metrics <- c('AUC', 'ACC', 'F1', 'MCC', 'TPR', 'TNR') | ||
expand_filenames <- function(dirnames, filename) { | ||
# Append `filename` to each named directory in `dirname`. | ||
filenames <- c() | ||
for (name in names(dirnames)) { | ||
filenames[name] <- Sys.glob(file.path(dirnames[name], filename)) | ||
} | ||
return (filenames) | ||
} | ||
# Expand path of metrics and curves files. | ||
opts$metrics_files <- expand_filenames(opts$data_dirs, opts$metrics_name) | ||
opts$curves_files <- expand_filenames(opts$data_dirs, opts$curves_name) | ||
``` | ||
|
||
```{r} | ||
# ggplot theme | ||
my_theme <- function() { | ||
p <- theme( | ||
axis.text=element_text(size=rel(1.2), color='black'), | ||
axis.title.y=element_text(size=rel(1.8), margin=margin(0, 10, 0, 0)), | ||
axis.title.x=element_text(size=rel(1.8), margin=margin(10, 0, 0, 0)), | ||
axis.line = element_line(colour="black", size=1), | ||
axis.ticks.length = unit(.3, 'cm'), | ||
axis.ticks.margin = unit(.3, 'cm'), | ||
legend.position='right', | ||
legend.text=element_text(size=rel(1.3)), | ||
legend.title=element_text(size=rel(1.3), face='bold'), | ||
legend.key=element_rect(fill='transparent'), | ||
strip.text=element_text(size=rel(1.3)), | ||
panel.border=element_blank(), | ||
panel.grid.major=element_line(colour="grey60", size=0.1, linetype='solid'), | ||
panel.grid.minor=element_line(colour="grey60", size=0.1, linetype='dotted'), | ||
panel.background=element_rect(fill="transparent", colour = NA), | ||
plot.background=element_rect(fill="transparent", colour = NA) | ||
) | ||
return (p) | ||
} | ||
``` | ||
|
||
```{r} | ||
format_output <- function(d) { | ||
d <- factor(sub('cpg/', '', d)) | ||
return (d) | ||
} | ||
read_metrics <- function(filename) { | ||
d <- read.table(gzfile(filename), sep='\t', head=T) %>% tbl_df %>% | ||
select(anno, metric, output, value) %>% | ||
mutate(metric=toupper(metric), output=format_output(output)) | ||
return (d) | ||
} | ||
read_curves <- function(filename) { | ||
d <- read.table(filename, sep='\t', head=T) %>% tbl_df %>% | ||
select(anno, curve, output, x, y, thr) %>% | ||
mutate(curve=toupper(curve), output=format_output(output)) | ||
return (d) | ||
} | ||
read_files <- function(filenames, fun, ...) { | ||
d <- list() | ||
for (name in names(filenames)) { | ||
d[[name]] <- fun(filenames[name], ...) %>% mutate(model=name) | ||
} | ||
d <- do.call(rbind.data.frame, d) | ||
return (d) | ||
} | ||
# Read the data. | ||
dat <- list() | ||
dat$metrics <- read_files(opts$metrics_files, read_metrics) | ||
dat$curves <- read_files(opts$curves_files, read_curves) | ||
``` | ||
|
||
```{r} | ||
# Define order of models. | ||
model_order <- dat$metrics %>% | ||
filter(anno == opts$anno_global, metric=='AUC') %>% | ||
select(-c(anno, metric)) %>% | ||
group_by(model) %>% summarise(value=mean(value)) %>% ungroup %>% | ||
arrange(desc(value)) %>% | ||
select(model) %>% unlist | ||
dat$metrics <- dat$metrics %>% mutate(model=factor(model, levels=model_order)) | ||
dat$curves <- dat$curves %>% mutate(model=factor(model, levels=model_order)) | ||
``` | ||
|
||
|
||
## Genome-wide performances | ||
|
||
```{r fig.width=10, fig.height=8} | ||
plot_metrics <- function(d, metrics=opts$metrics, legend='top') { | ||
# Plot genome-wide performance metrics of multiple models as boxplots over | ||
# cells. | ||
if (is.null(metrics)) { | ||
metrics <- unique(d$metric) | ||
} | ||
d <- d %>% filter(metric %in% metrics) %>% | ||
mutate(metric=factor(metric, levels=metrics, labels=toupper(metrics))) %>% | ||
droplevels | ||
p <- ggplot(d, aes(x=model, y=value)) + | ||
geom_boxplot(aes(fill=model), alpha=1.0, outlier.shape=NA) + | ||
scale_fill_brewer(palette='Set1') + | ||
geom_jitter(position=position_jitter(width=0.1, height=0), size=1.2) + | ||
xlab('') + ylab('') + | ||
my_theme() + | ||
guides(fill=guide_legend(title='Model')) + | ||
theme(legend.position=legend) + | ||
theme(axis.text.x=element_text(angle=30, hjust=1)) | ||
if (length(unique(d$metric)) > 1) { | ||
p <- p + facet_wrap(~metric, ncol=3, scales='free') | ||
} else { | ||
p <- p + ylab(toupper(metrics[1])) | ||
} | ||
return (p) | ||
} | ||
d <- dat$metrics %>% filter(anno == opts$anno_global) | ||
p <- plot_metrics(d) | ||
print(p) | ||
``` | ||
|
||
```{r results='asis'} | ||
# Performance table. | ||
d <- dat$metrics %>% | ||
filter(anno == opts$anno_global) %>% select(-anno) %>% | ||
group_by(model, metric) %>% summarise(value=mean(value)) %>% ungroup %>% | ||
spread(metric, value) %>% | ||
arrange(desc(AUC)) | ||
xtable(d, digits=4) | ||
``` | ||
|
||
```{r fig.width=10, fig.height=5} | ||
plot_curves <- function(d) { | ||
# Plot performance curves (ROC, PR) of multiple models smoothed over cells. | ||
d <- d %>% filter(anno == opts$anno_global, (curve == 'ROC') | (x > 0.5)) | ||
p <- ggplot(d, aes(x=x, y=y, color=model)) + | ||
geom_smooth() + | ||
scale_color_brewer(palette='Set1') + | ||
my_theme() + | ||
theme(legend.position='top') + | ||
guides(color=guide_legend(title='Model')) + | ||
facet_wrap(~curve, ncol=2, scale='free') + | ||
xlab('') + ylab('') | ||
return (p) | ||
} | ||
d <- dat$curves %>% filter(anno == opts$anno_global) | ||
p <- plot_curves(d) | ||
print(p) | ||
``` | ||
|
||
## Context-specific performances | ||
|
||
```{r fig.width=10, fig.height=25} | ||
plot_annos <- function(d, metrics=opts$metrics, points=T) { | ||
# Plot performances of multiple models in different annotations as boxplots | ||
# over cells. | ||
annos <- d %>% filter(metric == 'AUC') %>% group_by(anno) %>% | ||
summarise(value=mean(value)) %>% | ||
arrange(desc(value)) %>% select(anno) %>% unlist | ||
d <- d %>% mutate(anno=factor(anno, levels=annos)) | ||
if (is.null(metrics)) { | ||
metrics <- unique(d$metric) | ||
} | ||
d <- d %>% filter(metric %in% metrics) %>% | ||
mutate(metric=factor(metric, levels=metrics, labels=toupper(metrics))) %>% | ||
droplevels | ||
p <- ggplot(d, aes(x=anno, y=value)) + | ||
geom_boxplot(aes(fill=model), outlier.shape=NA) + | ||
scale_fill_brewer(palette='Set1') + | ||
my_theme() + | ||
theme( | ||
panel.grid.major=element_line(colour="grey60", size=0.1, linetype='solid'), | ||
panel.grid.minor=element_line(colour="grey60", size=0.1, linetype='dotted'), | ||
axis.text.x=element_text(angle=30, hjust=1), | ||
axis.title.x=element_blank(), | ||
axis.title.y=element_blank(), | ||
legend.position='top') + | ||
facet_wrap(~metric, ncol=1, scale='free') | ||
if (points) { | ||
p <- p + geom_point(aes(fill=model), size=0.2, | ||
position=position_jitterdodge(jitter.width=0.1, jitter.height=0, | ||
dodge.width=0.8)) | ||
} | ||
return (p) | ||
} | ||
p <- plot_annos(dat$metrics) | ||
print(p) | ||
``` | ||
|
||
```{r results='asis'} | ||
# Performance table. | ||
d <- dat$metrics %>% | ||
group_by(anno, model, metric) %>% | ||
summarise(value=mean(value)) %>% | ||
ungroup %>% | ||
spread(metric, value) %>% | ||
arrange(desc(AUC)) | ||
xtable(d, digits=4) | ||
``` |
Oops, something went wrong.