From 7578ad73238c318e2f938f9d2dfab9bdd5c4a7e8 Mon Sep 17 00:00:00 2001 From: "Brian S. Yandell" Date: Wed, 12 Oct 2011 10:33:27 -0500 Subject: [PATCH] fixed big.phase0 --- DESCRIPTION | 4 +- R/big.R | 144 ++++++++++++++++++++++++++++++++++++++------------- R/highlod.R | 11 +++- R/parallel.R | 39 ++++++++------ 4 files changed, 140 insertions(+), 58 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9833095..7847ff7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: qtlhot -Version: 0.2.0 -Date: 2011-07-26 +Version: 0.3.1 +Date: 2011-10-12 Author: Elias Chaibub Neto and Brian S Yandell Title: Inference for QTL Hotspots Description: Functions to Determine significance of co-mapping trait hotspots diff --git a/R/big.R b/R/big.R index ec8392a..e5b1681 100644 --- a/R/big.R +++ b/R/big.R @@ -1,8 +1,69 @@ +big.phase0 <- function(dirpath, cross, trait.file, trait.matrix, + droptrait.names = NULL, + keeptrait.names = NULL, + lod.thrs, + sex = "Sex", trait.index, batch.effect = NULL, size.set = 250, + verbose = TRUE) +{ + ## Set up cross object and trait.data objects. + + ## Cross object with sex, trait.index and batch.effect as only phenotypes. + traits <- match(c(sex,trait.index, batch.effect), names(cross$pheno), nomatch = 0) + cross$pheno <- cross$pheno[, traits, drop = FALSE] + + ## Subset to individuals with batch if used. + if(!is.null(batch.effect)) + cross <- subset(cross, ind = !is.na(cross$pheno[[batch.effect]])) + + ## Save cross object and other pertinent information. + if(verbose) + cat("Saving cross object in cross.RData\n") + save(cross, lod.thrs, file = "cross.RData", compress = TRUE) + + ## Get trait values for selected traits. + attach(file.path(dirpath, trait.file)) + trait.names <- dimnames(get(trait.matrix))[[2]] + all.traits <- seq(trait.names) + if(!is.null(droptrait.names)) + all.traits[match(droptrait.names, trait.names, nomatch = 0)] <- 0 + if(!is.null(keeptrait.names)) + all.traits[-match(keeptrait.names, trait.names, nomatch = 0)] <- 0 + all.traits <- all.traits[all.traits > 0] + n.traits <- length(all.traits) + + num.sets <- ceiling(n.traits / size.set) + trait.nums <- seq(size.set) + + ## Save trait names, including those dropped, and size of sets. + tmp <- paste("Trait", 0, "RData", sep = ".") + if(verbose) + cat("Saving trait metadata in", tmp, "\n") + save(trait.file, trait.matrix, droptrait.names, size.set, + trait.names, all.traits, num.sets, + file = tmp, compress = TRUE) + + ## Cycle through all the phenotypes in sets of size size.set. Keeps R object smaller. + for(pheno.set in seq(num.sets)) { + traits <- all.traits[trait.nums[trait.nums <= n.traits]] + + trait.data <- get(trait.matrix)[, trait.names[traits], drop = FALSE] + + tmp <- paste("Trait", pheno.set, "RData", sep = ".") + if(verbose) + cat("Saving", length(traits), "trait data as set", pheno.set, "in", tmp, "\n") + save(trait.data, pheno.set, file = tmp, compress = TRUE) + + ## Get next size.set traits. + trait.nums <- trait.nums + size.set + } + detach() +} +############################################################################################# big.phase1 <- function(dirpath = ".", cross.index = 0, params.file, - nruns, cross, lod.thrs, Nmax = 2000, n.perm = 1, - trait.file, trait.matrix, trait.index, droptrait.names, batch.effect = NULL, + cross, lod.thrs, Nmax = 2000, n.perm = 1, n.split = 1, + batch.effect = NULL, drop.lod = 1.5, lod.min = min(lod.thrs), - size.set = 250, window = 5, seed = 0, ..., + window = 5, seed = 0, big = TRUE, ..., verbose = FALSE) { ## Want this to depend on parallel.phase1 as much as possible. @@ -13,7 +74,7 @@ big.phase1 <- function(dirpath = ".", cross.index = 0, params.file, ## Get any parameters in file. These will overwrite passed arguments. eval(parse(file.path(dirpath, params.file))) - + ## Calculate genotype probabilities. cross <- calc.genoprob(cross, step=0.5, err = 0.002, map.function = "c-f", stepwidth = "max") @@ -21,15 +82,9 @@ big.phase1 <- function(dirpath = ".", cross.index = 0, params.file, ## Subset to individuals with batch if used. if(!is.null(batch.effect)) cross <- subset(cross, ind = !is.na(cross$pheno[[batch.effect]])) - - ## Get trait values for selected traits. - attach(file.path(dirpath, trait.file)) - trait.names <- dimnames(get(trait.matrix))[[2]] - detach() - all.traits <- seq(trait.names)[-match(droptrait.names, trait.names)] - - cross.index <- as.numeric(cross.index) + ## Load trait. + load(file.path(dirpath, "Trait.0.RData")) ## Random numbers. if(n.perm > 1) { @@ -40,51 +95,62 @@ big.phase1 <- function(dirpath = ".", cross.index = 0, params.file, seeds <- sample(c(98765:987654321), n.perm, replace = FALSE) } else - seeds <- rep(0, n.perm) + seeds <- 0 + ## Big assumes n.split is n.perm and does one perm each run. + n.split <- n.perm + save(cross, n.perm, seeds, Nmax, drop.lod, lod.thrs, lod.min, batch.effect, - control.probes, cross, trait.file, trait.matrix, trait.index, - trait.names, all.traits, size.set, window, cross.index, + trait.index, trait.names, all.traits, size.set, ## From Trait.0.RData + window, cross.index, n.split, big, file = "Phase1.RData", compress = TRUE) invisible() } ############################################################################################# -big.phase2 <- function(dirpath = ".", index, ..., verbose = FALSE) +big.phase2 <- function(dirpath = ".", index, ..., remove.files = TRUE, verbose = FALSE) { ## Phase 2. ## Loop on sets of phenotypes, adding only what is needed. ## Loop on permutations internal to scanone.permutations. - load("Phase1.RData") - - covars <- sexbatch.covar(cross, batch.effect) + load(file.path(dirpath, "Phase1.RData")) + + if(verbose) + cat("compute covariates\n") + covars <- sexbatch.covar(cross, batch.effect, verbose = TRUE) n.ind <- nind(cross) n.phe <- nphe(cross) n.traits <- length(all.traits) - num.sets <- ceiling(n.traits / size.set) - trait.nums <- seq(size.set) if(n.perm > 1) { ## Random permutation. Use preset seed if provided. seed <- seeds[index] if(seed > 0) set.seed(seed[1]) + if(verbose) + cat("sample permutation", seed[1], "\n") perms <- sample(seq(n.ind), n.ind, replace = FALSE) cross$pheno <- cross$pheno[perms,] } ## Cycle through all the phenotypes in sets of size size.set. Keeps R object smaller. - for(pheno.set in seq(num.sets)) { - cat(pheno.set, "\n") - traits <- all.traits[trait.nums[trait.nums <= n.traits]] - - attach(file.path(dirpath, trait.file)) - perm.cross <- add.phenos(cross, get(trait.matrix)[, trait.names[traits], drop = FALSE], - index = trait.index) + ## Assume large trait matrix has been broken into Trait.i.RData, each with trait.data. + + filenames <- list.files(dirpath, "Trait.[1-9][0-9]*.RData") + if(!length(filenames)) + parallel.error(5, 5, index) + + for(pheno.set in seq(length(filenames))) { + if(verbose) + cat(pheno.set, "\n") + + ## This is not working correctly. Either Trait.n.RData are all the same or ??. + attach(file.path(dirpath, filenames[pheno.set])) + perm.cross <- add.phenos(cross, trait.data, index = trait.index) + pheno.col <- find.pheno(perm.cross, dimnames(trait.data)[[2]]) detach() - pheno.col <- find.pheno(perm.cross, trait.names[traits]) per.scan <- scanone(perm.cross, pheno.col=pheno.col, method="hk", addcovar=covars$addcovar, intcovar=covars$intcovar) @@ -92,8 +158,6 @@ big.phase2 <- function(dirpath = ".", index, ..., verbose = FALSE) save(per.scan.hl, perms, file=file.path(dirpath, paste("per.scan",pheno.set, index,"RData",sep="."))) - - trait.nums <- trait.nums + size.set } chr.pos <- per.scan[,1:2] @@ -101,21 +165,24 @@ big.phase2 <- function(dirpath = ".", index, ..., verbose = FALSE) filenames <- list.files(dirpath, paste("per.scan", "[0-9][0-9]*", index, "RData", sep = ".")) - scan.hl <- cat.scanone(".", filenames, chr.pos) + scan.hl <- cat.scanone(dirpath, filenames, chr.pos) + + if(remove.files) + file.remove(dirpath, filenames) ## Has some problem with missing values for (any(diff(pos) < 0)) in runningmean. ## problem is in calc of maxlod.hl[[k]]: NA should be 0 lod.sums <- lod.quantile.permutation(scan.hl,Nmax,lod.thrs,window,chr.pos,n.traits) - save(scan.hl, lod.sums, perm.set, + save(scan.hl, lod.sums, cross.index, index, file = paste("perm", ".", cross.index, "_", index, ".RData", sep = "")) } ############################################################################################# -big.phase3 <- function(dirpath = ".", index, ..., verbose = FALSE) +big.phase3 <- function(dirpath = ".", index, cross.index, ..., verbose = FALSE) { ## Phase 3. Merge back together. - load("Phase1.RData") + load(file.path(dirpath, "Phase1.RData")) n.thrs <- length(lod.thrs) @@ -123,11 +190,14 @@ big.phase3 <- function(dirpath = ".", index, ..., verbose = FALSE) max.N <- max.N.window <- matrix(NA, n.perm, n.thrs) for(i.perm in seq(n.perm)) { - attach(file.path(dirpath, "perm", ".", cross.index, "_", i.perm, ".RData", sep = "")) + attach(file.path(dirpath, paste("perm", ".", cross.index, "_", i.perm, ".RData", sep = ""))) n.quant <- length(lod.sums$max.lod.quant) max.lod.quant[i.perm, seq(n.quant)] <- lod.sums$max.lod.quant max.N[i.perm,] <- lod.sums$max.N$max.N max.N.window[i.perm,] <- lod.sums$max.N$max.N.win + detach() } - save(max.lod.quant, max.N, max.N.window, file = "Phase3.RData", compress = TRUE) + outfile <- paste("Phase3", ifelse(cross.index > 0, cross.index, ""), ".RData", sep = "") + + save(max.lod.quant, max.N, max.N.window, file = outfile, compress = TRUE) } diff --git a/R/highlod.R b/R/highlod.R index 043f62b..ea3565a 100644 --- a/R/highlod.R +++ b/R/highlod.R @@ -28,14 +28,20 @@ pull.highlods <- function(scans, pheno.col, lod=4.5, drop.lod = 1.5) cbind.data.frame(row = rr, phenos = pheno.col[cc], lod = lod) } -sexbatch.covar <- function(cross, batch.effect) +sexbatch.covar <- function(cross, batch.effect, verbose = FALSE) { ic <- getsex(cross)$sex if(!is.null(batch.effect)){ batch <- cross$pheno[,batch.effect, drop = FALSE] tmp <- formula(paste("~ factor(", batch.effect, ")")) + if(verbose) + cat("sexbatch.covar", names(tmp), levels(factor(batch[[1]])), "\n") + if(verbose) + cat("sexbatch.covar", dim(batch), "\n") batch <- model.matrix(tmp,batch)[,-1, drop = FALSE] + if(verbose) + cat("sexbatch.covar", dim(batch), "\n") ac <- cbind(batch,ic) } else @@ -131,7 +137,8 @@ lod.quantile.permutation <- function(cat.scan.hl,N,lod.thr,window,chr.pos,n.phe) for(j in 1:l.lod.thr){ XX <- cat.scan.hl$lod >= lod.thr[j] max.N$max.N[j] <- max(tapply(XX, cat.scan.hl$row,sum,na.rm=T),na.rm=T) - + + ## Is this working properly? neqtl.pos <- smooth.neqtl(cat.scan.hl, chr.pos, max.hl, lod.thr[j], window) max.N$max.N.win[j] <- max(neqtl.pos[,3]) diff --git a/R/parallel.R b/R/parallel.R index 3f50efd..63ae7e6 100644 --- a/R/parallel.R +++ b/R/parallel.R @@ -91,7 +91,6 @@ qtlhot.phase1 <- function(dirpath, index = 0, n.phe = nphe(cross), ## Number of traits. nruns = 1, big = FALSE, - droptrait.names = character(0), ...) { ## PHASE 1: Set up cross object. Needed in phases 2. @@ -107,27 +106,30 @@ qtlhot.phase1 <- function(dirpath, index = 0, ## Get any parameters in file. These will overwrite passed arguments. eval(parse(file.path(dirpath, params.file))) - n.perm <- ceiling(n.perm / n.split) - - ## cross.index is used when multiple phase1 jobs are spawned. - cross.index <- as.numeric(index) - ## Cross object. Load if not done already. if(!exists(cross.name)) load(file.path(dirpath, cross.file)) - + ## Change name of cross object to "cross" for internal use. if(cross.name != "cross") cross <- get(cross.name) + ## cross.index is used when multiple phase1 jobs are spawned. + cross.index <- as.numeric(index) + if(big) { ## Used for big data run. - big.phase1(dirpath, cross.index, params.file, nruns, cross, - lod.thrs, Nmax, n.perm, droptrait.names = droptrait.names, ...) + ## Each run is separate permutation. + ## Make sure n.split is equal to n.perm. n.perm set to 1 in big.phase1. n.split <- n.perm + big.phase1(dirpath, cross.index, params.file, cross, lod.thrs, Nmax, n.perm, + n.split, ...) } else { ## Used for studying properties of qtlhot. + + n.perm <- ceiling(n.perm / n.split) + if(cross.index > 0 & nruns > 1) { ## Keep markers but re-simulate genotypes each time. mymap <- pull.map(cross) @@ -156,7 +158,7 @@ qtlhot.phase1 <- function(dirpath, index = 0, row.names = FALSE, col.names = FALSE, quote = FALSE) } #################################################################################### -qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE) +qtlhot.phase2 <- function(dirpath, index = NULL, ..., big = FALSE, verbose = FALSE) { ## PHASE 2: NL,N and WW permutations. ## Slow. Run on condor nodes. Sized by n.perm. @@ -169,6 +171,12 @@ qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE) ## permi.RData ## + cross.index <- scan("groups.txt", 0)[1] + + ## Load Phase 1 computations. + infile <- "Phase1.RData" + load(file.path(dirpath, infile)) + ## Quality check of index. if(missing(index)) parallel.error(2, 2, index) @@ -181,12 +189,6 @@ qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE) if(big) return(big.phase2(dirpath, index)) - cross.index <- scan("groups.txt", 0)[1] - - ## Load Phase 1 computations. - infile <- "Phase1.RData" - load(file.path(dirpath, infile)) - outfile <- paste("perm", ".", cross.index, "_", index, ".RData", sep = "") ## Following is in NL.N.perm stuff. @@ -214,7 +216,7 @@ qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE) } #################################################################################### qtlhot.phase3 <- function(dirpath, index = NULL, ..., - dirpath.save = dirpath, verbose = FALSE) + dirpath.save = dirpath, big = FALSE, verbose = FALSE) { ## PHASE 3: Sample Markov chain (MCMC). Parallelize. ## Fast: Run on scheduler. @@ -234,6 +236,9 @@ qtlhot.phase3 <- function(dirpath, index = NULL, ..., ## Load Phase 1 computations. load(file.path(dirpath, "Phase1.RData")) + if(big) + return(big.phase3(dirpath, index, cross.index)) + ## This could be done once, but it would require splitting this phase in two. ## Besides, it is quite fast. ## Read in saved BIC scores and combine into one object.