In [2]:
source("utils/plot.R")

In [3]:
# Read an unfolded site frequency spectrum and return it folded, together with
# the ratio of observed counts to their neutral expectation.
#
# Arguments:
#   filename  Path to a TSV file with (at least) the columns `num_alternate`
#             and `num_sites`.
#   n         Sample size against which `num_alternate` is counted; rows with
#             `num_alternate` equal to 0 or `n` are discarded.
#
# Returns a tibble with one row per minor-allele count and the columns
#   num_minor  min(num_alternate, n - num_alternate)
#   num_sites  number of sites summed over each folded class
#   freq       num_minor / n
#   expected   expected folded count given theta (see below)
#   scaled     num_sites / expected
get_folded_sfs <- function(filename, n) {
    # Use the argument rather than a global `N`. It is copied to a distinctly
    # named local so that a column called `n` in the input file cannot shadow
    # it inside the dplyr verbs below.
    sample_size <- n

    sfs <- read_tsv(filename, col_types=cols()) %>%
        filter(num_alternate != 0, num_alternate != sample_size) %>%
        mutate(num_minor=pmin(num_alternate, sample_size - num_alternate)) %>%
        group_by(num_minor) %>%
        summarize(num_sites=sum(num_sites)) %>%
        mutate(freq=num_minor/sample_size)

    # theta is calibrated so that `expected` equals the observed count in the
    # first row: phi_1 is the coefficient of the expectation at num_minor == 1.
    # NOTE(review): assumes the first row is the num_minor == 1 class, i.e.
    # that the input contains singletons -- confirm against the data.
    reference <- sfs$num_sites[1]
    phi_1 <- 1 + 1/(sample_size - 1)
    theta <- reference/phi_1

    sfs %>% mutate(
        # Expected folded SFS from https://doi.org/10.1006/tpbi.1995.1025
        # The divisor is 2 only when num_minor == n - num_minor, where the two
        # terms of the sum describe the same class.
        expected=theta*(
            (1/num_minor + 1/(sample_size - num_minor)) /
                (1 + as.integer(num_minor == sample_size - num_minor))
        ),
        scaled=num_sites/expected
    )
}

In [4]:
# Value handed to get_folded_sfs() as its `n` argument for both datasets.
N <- 220

# Fold each spectrum; the input paths are supplied by the Snakemake rule.
empirical <- get_folded_sfs(snakemake@input$empirical, n = N)
simulated <- get_folded_sfs(snakemake@input$simulated, n = N)

# Stack both tables, recording each row's origin in a `Dataset` column.
sfs <- list(Empirical = empirical, Simulated = simulated) %>%
    bind_rows(.id = "Dataset")

In [5]:
# Two allele counts to mark on the plot, with their position on the frequency
# axis (count / N) and a text label of the form "n = <count>".
weirdness <- tibble(
    location = c(46, 108),
    freq = location / N,
    label = paste("n =", location)
)

In [6]:
# Observed/expected SFS ratio against allele frequency, one line per dataset,
# with dashed vertical markers (and labels) at the frequencies in `weirdness`.
fig <- ggplot(sfs) +
    # Marker lines and their labels use `weirdness`, not the plot's main data.
    geom_vline(
        data = weirdness,
        mapping = aes(xintercept = freq),
        linetype = "dashed"
    ) +
    geom_text(
        data = weirdness,
        mapping = aes(x = freq, y = 0.75, label = label),
        hjust = 1,       # right-align the label ...
        nudge_x = -0.01  # ... and shift it just left of its marker line
    ) +
    geom_line(
        mapping = aes(x = freq, y = scaled, colour = Dataset),
        size = 1
    ) +
    turkana_colour +
    labs(
        title = "Genome-wide SFS",
        x = "Frequency",
        y = "Observed/Expected"
    ) +
    turkana_theme +
    # Overrides applied after turkana_theme.
    theme(
        legend.position = "top",
        legend.title = element_blank(),
        panel.grid.major.y = element_blank()
    )

# Write the figure to the first output path declared by the Snakemake rule.
turkana_save(snakemake@output[[1]], fig, width = 4)