Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Weigh loci by read coverage #16

Merged
merged 3 commits into from
Feb 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@

## New features

- Within sample minor allele frequencies are now weighed by read coverage. If no
coverage is supplied, the coverage is assumed to be uniform across all loci
(#16.)
- New `theme_coiaf()` creates a custom theme for this package.

## Bug Fixes
Expand Down
7 changes: 7 additions & 0 deletions R/compute_coi.R
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,16 @@ compute_coi <- function(data,
bin_size <- processed$bin_size
cuts <- processed$cuts
} else if (data_type == "real") {
# If no coverage is provided, we assume that the coverage is uniform across
# all loci
if (!"coverage" %in% colnames(data)) {
data$coverage <- rep(100, length(data$wsmaf))
}

processed <- process_real(
data$wsmaf,
data$plmaf,
data$coverage,
seq_error,
bin_size,
coi_method
Expand Down
15 changes: 14 additions & 1 deletion R/optimize.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,20 @@ optimize_coi <- function(data,
bin_size <- processed$bin_size
cuts <- processed$cuts
} else if (data_type == "real") {
processed <- process_real(data$wsmaf, data$plmaf, seq_error, bin_size, coi_method)
# If no coverage is provided, we assume that the coverage is uniform across
# all loci
if (!"coverage" %in% colnames(data)) {
data$coverage <- rep(100, length(data$wsmaf))
}

processed <- process_real(
data$wsmaf,
data$plmaf,
data$coverage,
seq_error,
bin_size,
coi_method
)
processed_data <- processed$data
seq_error <- processed$seq_error
bin_size <- processed$bin_size
Expand Down
21 changes: 15 additions & 6 deletions R/process.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

process <- function(wsmaf,
plmaf,
coverage,
seq_error = NULL,
bin_size = 20,
coi_method = "variant") {
Expand Down Expand Up @@ -64,14 +65,16 @@ process <- function(wsmaf,
# accounting for sequence error
df <- data.frame(
plmaf_cut = suppressWarnings(Hmisc::cut2(plmaf, m = bin_size)),
variant = ifelse(wsmaf <= seq_error | wsmaf >= (1 - seq_error), 0, 1)
variant = ifelse(wsmaf <= seq_error | wsmaf >= (1 - seq_error), 0, 1),
coverage = coverage
)
} else if (coi_method == "frequency") {
# Subset to heterozygous sites
data <- data.frame(wsmaf = wsmaf, plmaf = plmaf) %>%
data <- data.frame(wsmaf = wsmaf, plmaf = plmaf, coverage = coverage) %>%
dplyr::filter(wsmaf > seq_error & wsmaf < (1 - seq_error))
wsmaf <- data$wsmaf
plmaf <- data$plmaf
coverage <- data$coverage

# If remove all data, need to return a pseudo result to not induce errors.
# Additionally, in order to define a cut, need at least 2 data points
Expand All @@ -93,7 +96,8 @@ process <- function(wsmaf,
# Isolate PLMAF, and keep WSMAF as is
df <- data.frame(
plmaf_cut = suppressWarnings(Hmisc::cut2(plmaf, m = bin_size)),
variant = wsmaf
variant = wsmaf,
coverage = coverage
)
}

Expand Down Expand Up @@ -126,7 +130,7 @@ process <- function(wsmaf,
df_grouped <- df %>%
dplyr::group_by(.data$plmaf_cut, .drop = FALSE) %>%
dplyr::summarise(
m_variant = mean(.data$variant),
m_variant = stats::weighted.mean(.data$variant, .data$coverage),
bucket_size = dplyr::n()
) %>%
stats::na.omit()
Expand Down Expand Up @@ -194,6 +198,7 @@ process_sim <- function(sim,
process(
wsmaf = sim$data$wsmaf,
plmaf = sim$data$plmaf,
coverage = sim$data$coverage,
seq_error = seq_error,
bin_size = bin_size,
coi_method = coi_method
Expand All @@ -211,6 +216,7 @@ process_sim <- function(sim,
#'
#' @param wsmaf The within-sample allele frequency.
#' @param plmaf The population-level allele frequency.
#' @param coverage The read coverage at each locus.
#' @inheritParams process_sim
#'
#' @return A list of the following:
Expand All @@ -228,7 +234,9 @@ process_sim <- function(sim,
#' @seealso [process_sim()] to process simulated data.
#' @export

process_real <- function(wsmaf, plmaf,
process_real <- function(wsmaf,
plmaf,
coverage,
seq_error = NULL,
bin_size = 20,
coi_method = "variant") {
Expand All @@ -240,7 +248,7 @@ process_real <- function(wsmaf, plmaf,
if (!is.null(seq_error)) assert_single_bounded(seq_error)
assert_single_pos_int(bin_size)

input <- tibble::tibble(wsmaf = wsmaf, plmaf = plmaf) %>%
input <- tibble::tibble(wsmaf = wsmaf, plmaf = plmaf, coverage = coverage) %>%
tidyr::drop_na()

# In some cases we are fed in the major allele so we ensure we only examine
Expand All @@ -258,6 +266,7 @@ process_real <- function(wsmaf, plmaf,
process(
wsmaf = minor$wsmaf,
plmaf = minor$plmaf,
coverage = minor$coverage,
seq_error = seq_error,
bin_size = bin_size,
coi_method = coi_method
Expand Down
11 changes: 10 additions & 1 deletion man/process.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions man/process_real.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.