# descriptive stats (R)

### Requires: metadata.tsv (via Nextstrain) 

Goal: generate descriptive barplots (sequence counts by region, sample counts through time) for hmpxv1 subsampled genomes. 

### load libraries

In [3]:
library(tidyverse)
library(lubridate)
library(ggplot2)
library(RColorBrewer)
library(viridis)
library(scales)
library(ggbreak) 
library(patchwork)

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.0      [32m✔[39m [34mpurrr  [39m 0.3.5 
[32m✔[39m [34mtibble [39m 3.1.8      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.4.1 
[32m✔[39m [34mreadr  [39m 2.1.3      [32m✔[39m [34mforcats[39m 0.5.2 
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
Loading required package: timechange


Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union


Loading required package: viridisLite


Attaching package: ‘scales’


The 

In [4]:
metadata_path <- '../out/beauti/meta.tsv'
og_path <- '../monkeypox-build/results/hmpxv1/good_metadata.tsv'

meta <- read_tsv(metadata_path)
og <- read_tsv(og_path)

[1mRows: [22m[34m237[39m [1mColumns: [22m[34m30[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (24): strain, accession, genbank_accession_rev, strain_original, date, r...
[32mdbl[39m  (4): coverage, missing_data, divergence, nonACGTN
[33mlgl[39m  (2): reverse, is_reverse_complement

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m2790[39m [1mColumns: [22m[34m30[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (23): strain, accession, genbank_accession_rev, strain_original, date, ...
[32mdbl[39m   (4): cover

In [5]:
my_theme <- function() {
    theme_classic() +
    theme( 
        plot.title = element_text(color="black", size=12, face="bold"),
        axis.title.x = element_text(color="black", size=20, face="bold", vjust=2),
        axis.title.y = element_text(color="black", size=20, face="bold", vjust=2),
      #  axis.text.x = element_text(size= 10, vjust=0.5),
      #  axis.text.y = element_text(size = 10),
        axis.text=element_text(size=24),
        )
}

options(repr.plot.width=14, repr.plot.height=8)

In [10]:
lin_b <- c('B.1',
 'B.1.1',
 'B.1.10',
 'B.1.11',
 'B.1.12',
 'B.1.13',
 'B.1.14',
 'B.1.2',
 'B.1.3',
 'B.1.4',
 'B.1.5',
 'B.1.6',
 'B.1.7',
 'B.1.8',
 'B.1.9')

og <- og %>% filter(lineage %in% lin_b) %>%
    select('strain','date','region','country','lineage') %>%
    mutate(tmp = str_replace(country, 'UnitedKingdom', 'United Kingdom')) %>%
    filter(!grepl('-XX-XX', date)) %>% 
    mutate(tmp = str_replace(date, '-XX', '-01')) %>%
    mutate(date = ymd(tmp)) %>% 
    mutate(decimal = decimal_date(date)) %>% 
    mutate(month=month(date)) %>% 
    select(-tmp) 

meta <- meta %>% filter(lineage %in% lin_b) %>%
    select('strain','date','region','country','lineage') %>%
    mutate(tmp = str_replace(country, 'UnitedKingdom', 'United Kingdom')) %>%
    filter(!grepl('-XX-XX', date)) %>% 
    mutate(tmp = str_replace(date, '-XX', '-01')) %>%
    mutate(date = ymd(tmp)) %>% 
    mutate(decimal = decimal_date(date)) %>% 
    mutate(month=month(date)) %>% 
    select(-tmp) 


## plot counts over time 

In [12]:
meta_counts <- meta %>% group_by(date, month) %>% count()
og_counts <- og %>% group_by(date,month) %>% count()

In [13]:
og_month <- og %>% group_by(month) %>% summarise(counts=n())
meta_month <- meta %>% group_by(month) %>% summarise(counts=n())

### plot counts from original dataset

In [14]:
plot_og <- og_month %>% ggplot(aes(y=log(counts), x=month)) + 
         geom_histogram(position="stack", stat="identity", bins=8, width=0.8, fill='white', color='black') + 
         my_theme() + 
         scale_fill_manual(values=color_dic) + 
         ylab('log(Sample counts)') + xlab('Date') +
         scale_x_continuous(labels=function(x) month(x, label=TRUE), breaks = scales::pretty_breaks(n = 8))
                            
plot_og

“[1m[22mIgnoring unknown parameters: `binwidth`, `bins`, and `pad`”


ERROR: Error in is_missing(values): object 'color_dic' not found


### overlay counts from subsampled data

In [None]:
og_month %>% ggplot(aes(y=log(counts), x=month)) + 
         geom_histogram(position="stack", stat="identity", bins=8, width=0.8, fill='white', color='black') + 
         my_theme() + 
         scale_fill_manual(values=color_dic) + 
         ylab('log(Sample counts)') + xlab('2022') +
         scale_x_continuous(labels=function(x) month(x, label=TRUE), breaks = scales::pretty_breaks(n = 8)) + 
         geom_histogram(data=meta_month, stat = 'identity', fill='black',width=0.5)        

#ggsave('relative_counts.png', width = 14, height = 8)

### Sample counts by region 

In [None]:
meta$country <- factor(meta$country,
                       levels= c('Netherlands',
                                 'France',
                                 'United Kingdom',
                                 'Germany',
                                 'Switzerland',
                                 'Belgium',  
                                 'Austria',
                                
                                 'Slovenia',
                                 'Slovakia',
                                 
                                 'Finland',
                                 
                                 'Spain',
                                 'Portugal',
                                 'Italy',
                                 'Colombia',
                                 'Peru',
                                 
                                 'Canada',
                                 'USA'))

og$country <- factor(og$country)

In [None]:
country_month_og <- og %>% group_by(country, month) %>% summarise(counts=n()) %>% mutate(month=month(month, label=T)) 

### assign colors to regions

In [None]:
color_dic <- c('North America'= '#66c2a5',
               'South America'= '#fc8d62', 
               'Southern Europe'= '#8da0cb',
               'Northern Europe'= '#e78ac3',
               'Central/Eastern Europe'= '#a6d854',
               'Western Europe'= '#ffd92f')

loc <- tibble(country=levels(meta$country),
                   place= c(rep('Western Europe',7),
                              rep('Central/Eastern Europe',2),
                              'Northern Europe',
                              rep('Southern Europe', 3), 
                              rep('South America',2),
                              rep('North America',2)))

meta <- meta %>% left_join(loc, by = 'country')

### plot sample counts by region through time

In [None]:
meta_grouped <- meta %>% group_by(month, place) %>% count()

meta_month <- meta %>% group_by(month) %>% count()

plot_meta <- meta %>% group_by(month, place) %>%
         count() %>%
         ggplot(aes(fill=place, y=log(n), x=month)) + 
         geom_histogram(position="stack", stat="identity", bins=8) + 
         my_theme() + 
         scale_fill_manual(values=color_dic) + 
         ylab('Sample counts') + xlab('2020') + guides(fill=guide_legend("Region"))+
         scale_x_continuous(labels=function(x) month(x, label=TRUE), breaks = scales::pretty_breaks(n = 8))
  
plot_meta 

ggsave('sample_counts.png', width = 14, height = 8)