# Characterize predicted mutations

In [None]:
library(data.table)
library(tidyverse)

In [None]:
# Set the default width and height (both 8) that the notebook's `repr`
# plotting options use for every figure rendered below.
options(repr.plot.width=8, repr.plot.height=8)

In [None]:
# Load the mutation table. Later cells read its `mutation` column and treat the
# first rows as the "top" mutations, so the row order of this file matters
# (presumably it is already sorted by rank -- TODO confirm).
res_mutations <- fread('../notebooks/paper/mutations.tsv')

In [None]:
# Helper functions
# Split every element of `x` on `split` and return the n-th piece of each.
#
# x:     vector to split (coerced with as.character()).
# split: separator handed to strsplit(); a regular expression unless
#        fixed = TRUE.
# n:     index of the piece to keep (a single number).
# fixed: passed through to strsplit().
#
# Returns a character vector with one entry per element of `x`; elements that
# have fewer than `n` pieces (or are NA) yield NA.
strpart <- function(x, split, n, fixed=FALSE) {
    pieces <- strsplit(as.character(x), split, fixed = fixed)
    # vapply() instead of sapply(): the result is always a character vector,
    # including character(0) for zero-length input (sapply() returns list()).
    vapply(pieces, `[`, character(1), n)
}

# Reshape a matrix into long format: one output row per matrix cell, with the
# cell's row label in `rownames`, its column label in `colnames`, and the cell
# content in `value`.
matrix_to_long_df <- function(matrix) {
    row_labels <- rownames(matrix)
    wide <- as.data.frame(matrix)
    colnames(wide) <- colnames(matrix)
    # Carry the row labels along as an ordinary column so they survive the pivot.
    wide[["rownames"]] <- row_labels
    pivot_longer(wide, cols = !rownames, names_to = "colnames")
}

### Get summary of mutations at the top of the list

In [None]:
# Count amino-acid substitutions in a mutation table.
#
# mutations:   data frame / data.table with a `mutation` column whose entries
#              are colon-separated; the second field is the amino-acid change,
#              whose first character is the original residue and whose last
#              character is the new residue (e.g. "GENE:A123T").
# n_mutations: if not NULL, only the first `n_mutations` rows are counted;
#              NULL counts every row.
#
# Returns a 20 x 20 numeric matrix. Rows are the original residue, columns the
# new residue, both in alphabetical order of the one-letter code; each cell is
# the number of mutations with that substitution. Entries whose original or
# new residue is not one of the 20 standard amino acids are ignored.
get_mutation_summary <- function(mutations, n_mutations = NULL) {
    # TODO: Order mutations
    aa_chars <- sort(c('G','A','L','M','F','W','K','Q','E','S','P','V','I','C','Y','H','R','N','D','T'))

    # head() never pads with NA rows when n_mutations exceeds the table size
    # and returns zero rows for n_mutations = 0 (1:0 would select row 1).
    if (!is.null(n_mutations)) {
        mutations <- head(mutations, n_mutations)
    }
    mutation_strings <- mutations[["mutation"]]

    # Extract the residues on either side of the amino-acid change.
    aa_change <- strpart(mutation_strings, ':', 2)
    change_len <- nchar(aa_change)
    aa_from <- substr(aa_change, 1, 1)
    aa_to <- substr(aa_change, change_len, change_len)

    # Keep only substitutions between standard amino acids.
    keep <- aa_from %in% aa_chars & aa_to %in% aa_chars

    # Fixed factor levels make table() emit every residue in both dimensions,
    # already in order, with zeros for combinations that never occur.
    counts <- table(
        factor(aa_from[keep], levels = aa_chars),
        factor(aa_to[keep], levels = aa_chars)
    )

    # Return a plain numeric matrix (not a `table`) so that callers can use
    # as.data.frame(), rownames() and colnames() on it as on any matrix.
    matrix(
        as.numeric(counts),
        nrow = length(aa_chars),
        ncol = length(aa_chars),
        dimnames = list(aa_chars, aa_chars)
    )
}

In [None]:
# Heatmap of substitution counts among the first `n_mutations` rows of
# `res_mutations`: original residue on the x axis, new residue on the y axis,
# with the count printed in each tile.
n_mutations <- 1000

res_mutations %>%
    get_mutation_summary(n_mutations = n_mutations) %>%
    matrix_to_long_df() %>%
    ggplot(aes(x = rownames, y = colnames, fill = value)) +
    geom_tile() +
    theme_bw() +
    scale_fill_gradient(low = "white", high = "red") +
    geom_text(aes(label = value)) +
    ggtitle(paste0('Mutation Frequencies in Top ', n_mutations, ' mutations'))

### Test for enrichment of particular mutations

In [None]:
# Binomial test for enrichment of each amino-acid substitution among the first
# `n_mutations` rows of `res_mutations`, relative to the whole table.
#
# res_mutations: mutation table accepted by get_mutation_summary().
# n_mutations:   number of leading rows that form the "top" set.
# symmetric:     if TRUE, A->B and B->A are pooled into one undirected
#                substitution, so the result is a symmetric matrix.
#
# For every substitution the expected rate is its share of all counted
# mutations in the full table, and the observed count is taken from the top
# set. The returned value is the upper-tail binomial probability
# P(X >= observed) with size = number of counted mutations in the top set.
#
# Returns a 20 x 20 numeric matrix of p-values with the same row/column names
# as the matrices returned by get_mutation_summary().
test_mutation_enrichment <- function(res_mutations, n_mutations=1000, symmetric = TRUE) {
    top_mut <- get_mutation_summary(res_mutations, n_mutations)
    universe_mut <- get_mutation_summary(res_mutations, NULL)

    top_total <- sum(top_mut)
    universe_rate <- universe_mut / sum(universe_mut)

    if (symmetric) {
        # Pool each substitution with its reverse. The diagonal (from == to)
        # is its own reverse, so it must keep its value instead of being
        # doubled; a doubled rate could exceed 1 and produce NaN.
        pool_directions <- function(m) {
            pooled <- m + t(m)
            diag(pooled) <- diag(m)
            pooled
        }
        observed <- pool_directions(top_mut)
        expected_rate <- pool_directions(universe_rate)
    } else {
        observed <- top_mut
        expected_rate <- universe_rate
    }

    # Enrichment p-value: P(X >= observed) = P(X > observed - 1). A point
    # probability (dbinom) is not a tail probability and is small even for
    # counts that match the expectation exactly when the sample is large.
    tail_prob <- pbinom(observed - 1, top_total, expected_rate, lower.tail = FALSE)

    matrix(tail_prob, nrow = nrow(observed), dimnames = dimnames(observed))
}

In [None]:
# Run test_mutation_enrichment() for a range of top-set sizes and reduce each
# resulting p-value matrix to a single number.
#
# res_mutations: mutation table passed on to test_mutation_enrichment().
# n_start, n_end, step: the top-set sizes tried are seq(n_start, n_end, step).
# symmetric:     passed on to test_mutation_enrichment().
# summary_fn:    function that reduces a p-value matrix to one number
#                (e.g. min or mean).
#
# Returns a data frame with one row per size: `n` is the top-set size and
# `most_sign_pval` is the value of summary_fn for that size (the column keeps
# this name whatever summary_fn is).
test_mutation_enrichment_serial <- function(res_mutations, n_start=1, n_end=1000, step=1, symmetric=TRUE, summary_fn=min) {
    xs <- seq(n_start, n_end, step)
    # vapply() enforces that summary_fn yields exactly one number per size.
    ys <- vapply(xs, function(x) {
        summary_fn(test_mutation_enrichment(res_mutations, n_mutations = x, symmetric = symmetric))
    }, numeric(1))
    # Returned as a plain expression so the result is visible to the caller.
    data.frame(n = xs, most_sign_pval = ys)
}

In [None]:
# Compute the p-value summary for top-set sizes 1, 11, ..., up to 5000 under
# every combination of summary (min = 'most', mean = 'mean') and direction
# handling ('sym' / 'nonsym'), tag each series, and stack them for plotting.
most_sign_pval_sym <- res_mutations %>%
    test_mutation_enrichment_serial(n_end = 5000, step = 10, symmetric = TRUE, summary_fn = min) %>%
    mutate(type = 'most', sym = 'sym')

most_sign_pval_nonsym <- res_mutations %>%
    test_mutation_enrichment_serial(n_end = 5000, step = 10, symmetric = FALSE, summary_fn = min) %>%
    mutate(type = 'most', sym = 'nonsym')

mean_sign_pval_sym <- res_mutations %>%
    test_mutation_enrichment_serial(n_end = 5000, step = 10, symmetric = TRUE, summary_fn = mean) %>%
    mutate(type = 'mean', sym = 'sym')

mean_sign_pval_nonsym <- res_mutations %>%
    test_mutation_enrichment_serial(n_end = 5000, step = 10, symmetric = FALSE, summary_fn = mean) %>%
    mutate(type = 'mean', sym = 'nonsym')

plot_data <- rbind(
    most_sign_pval_sym,
    most_sign_pval_nonsym,
    mean_sign_pval_sym,
    mean_sign_pval_nonsym
)

In [None]:
# Plot -log10 of each summary series against the top-set size, one panel per
# direction handling. The horizontal line marks 0.05 divided by the 20 * 20
# cells of the substitution matrix.
plot_data %>%
    ggplot(aes(x = n, y = -log10(most_sign_pval), color = type, linetype = sym)) +
    geom_line() +
    geom_hline(yintercept = -log10(0.05 / (20 * 20))) +
    facet_wrap(~sym)