# Examine mutations

In [None]:
# load libraries
library(tidyverse)
library(gridExtra)

In [None]:
# Read the mutation table from disk (fread auto-detects the delimiter)
mutations_raw <- data.table::fread(input = "paper/mutations.tsv")

In [None]:
# Preview the first two rows of the raw table
mutations_raw %>% head(n = 2)

In [None]:
# Extract substitution information from the `mutation` column.
# Each value is split on ':'; the first field is stored as `protein` and the
# second as `aachange`. `aa_from` / `aa_to` are the first and last characters
# of `aachange`.
mutation_info <- mutations_raw %>%
    select(rank, mutation) %>%
    # split once and reuse the pieces for both derived columns
    mutate(mutation_parts = strsplit(x = mutation, split = ":", fixed = TRUE)) %>%
    mutate(
        # vapply() guarantees a character vector even for zero-row input,
        # where sapply() would silently return an empty list
        protein = vapply(mutation_parts, `[`, character(1), 1),
        aachange = vapply(mutation_parts, `[`, character(1), 2)
    ) %>%
    mutate(
        aa_from = substr(aachange, 1, 1),
        aa_to = substr(aachange, nchar(aachange), nchar(aachange))
    ) %>%
    # the helper list column is not part of the result
    select(-mutation_parts)

In [None]:
# Preview the first rows of the parsed substitution table
mutation_info %>% head()

In [None]:
# Count how often each (aa_from, aa_to) substitution occurs, most frequent first
substitution_count_data <- mutation_info %>%
    group_by(aa_from, aa_to) %>%
    summarise(count = n()) %>%
    # summarise() only peels off the last grouping level; drop the remaining
    # aa_from grouping so later verbs operate on a plain table
    ungroup() %>%
    arrange(desc(count))

In [None]:
# Show the ten most frequent substitutions
head(substitution_count_data, n = 10)

In [None]:
# Persist the overall substitution counts.
# NOTE(review): write.table() defaults to space-separated output with row
# names, although the target file is named .csv — confirm this is intended.
write.table(
    x = substitution_count_data,
    file = "paper/top_substitions/top_substitions.csv"
)

In [None]:
# Build a three-panel overview of amino-acid substitution counts.
#
# Args:
#   substitution_count_data: data frame with columns `aa_from`, `aa_to` and
#     `count`.
#
# Side effect: grid.arrange() draws the three panels side by side on the
# active graphics device (heatmap, counts by source residue, counts by
# target residue).
#
# Returns: the object produced by grid.arrange(), invisibly, so it can be
# passed to ggsave().
generate_substitions_plots <- function(substitution_count_data) {
    # Bar chart of counts per original amino acid
    p_from <- ggplot(substitution_count_data, aes(x = aa_from, y = count)) +
        geom_col() +
        ggtitle("Substitutions From")

    # Bar chart of counts per replacement amino acid
    # (title previously read "From" due to a copy-paste slip)
    p_to <- ggplot(substitution_count_data, aes(x = aa_to, y = count)) +
        geom_col() +
        ggtitle("Substitutions To")

    # Heatmap of every from/to pair, annotated with its count
    p_heat <- ggplot(substitution_count_data, aes(x = aa_from, y = aa_to, fill = count)) +
        geom_tile() +
        geom_text(aes(label = count), size = 1) +
        ggtitle("Heatmap of aa substitutions")

    p <- grid.arrange(p_heat, p_from, p_to, ncol = 3)
    invisible(p)
}

In [None]:
# Set the notebook plot size, then draw the overview for all substitutions
options(repr.plot.width = 12, repr.plot.height = 5)
p <- generate_substitions_plots(substitution_count_data)
p

In [None]:
# Save the overview figure
ggsave(
    filename = "paper/top_substitions/top_substitions.png",
    plot = p,
    width = 12,
    height = 5
)

# Stratify by gene

In [None]:
# Collect the distinct values of the `protein` column
gene_list <- unique(pull(mutation_info, protein))

## ORF1b

In [None]:
# For every entry in gene_list: count its substitutions, write the table,
# and save the three-panel figure.
for (g in gene_list) {
    gene_mutation_info <- mutation_info %>%
        filter(protein == g) %>%
        group_by(aa_from, aa_to) %>%
        summarise(count = n()) %>%
        # drop the aa_from grouping that summarise() leaves behind
        ungroup() %>%
        arrange(desc(count))

    # NOTE(review): write.table() defaults to space-separated output with row
    # names, although the file is named .csv — confirm this is intended.
    write.table(
        gene_mutation_info,
        file = paste0("paper/top_substitions/top_substitutions__", g, ".csv")
    )

    p_gene <- generate_substitions_plots(gene_mutation_info)

    ggsave(
        filename = paste0("paper/top_substitions/top_substitutions__", g, ".png"),
        plot = p_gene,
        width = 12,
        height = 5
    )
}