Permalink
Browse files

fixed big.phase0

  • Loading branch information...
1 parent ceac8fe commit 7578ad73238c318e2f938f9d2dfab9bdd5c4a7e8 @byandell committed Oct 12, 2011
Showing with 140 additions and 58 deletions.
  1. +2 −2 DESCRIPTION
  2. +107 −37 R/big.R
  3. +9 −2 R/highlod.R
  4. +22 −17 R/parallel.R
View
@@ -1,6 +1,6 @@
Package: qtlhot
-Version: 0.2.0
-Date: 2011-07-26
+Version: 0.3.1
+Date: 2011-10-12
Author: Elias Chaibub Neto <echaibub@hotmail.com> and Brian S Yandell <byandell@wisc.edu>
Title: Inference for QTL Hotspots
Description: Functions to Determine significance of co-mapping trait hotspots
View
144 R/big.R
@@ -1,8 +1,69 @@
+big.phase0 <- function(dirpath, cross, trait.file, trait.matrix,
+ droptrait.names = NULL,
+ keeptrait.names = NULL,
+ lod.thrs,
+ sex = "Sex", trait.index, batch.effect = NULL, size.set = 250,
+ verbose = TRUE)
+{
+ ## Set up cross object and trait.data objects.
+
+ ## Cross object with sex, trait.index and batch.effect as only phenotypes.
+ traits <- match(c(sex,trait.index, batch.effect), names(cross$pheno), nomatch = 0)
+ cross$pheno <- cross$pheno[, traits, drop = FALSE]
+
+ ## Subset to individuals with batch if used.
+ if(!is.null(batch.effect))
+ cross <- subset(cross, ind = !is.na(cross$pheno[[batch.effect]]))
+
+ ## Save cross object and other pertinent information.
+ if(verbose)
+ cat("Saving cross object in cross.RData\n")
+ save(cross, lod.thrs, file = "cross.RData", compress = TRUE)
+
+ ## Get trait values for selected traits.
+ attach(file.path(dirpath, trait.file))
+ trait.names <- dimnames(get(trait.matrix))[[2]]
+ all.traits <- seq(trait.names)
+ if(!is.null(droptrait.names))
+ all.traits[match(droptrait.names, trait.names, nomatch = 0)] <- 0
+ if(!is.null(keeptrait.names))
+ all.traits[-match(keeptrait.names, trait.names, nomatch = 0)] <- 0
+ all.traits <- all.traits[all.traits > 0]
+ n.traits <- length(all.traits)
+
+ num.sets <- ceiling(n.traits / size.set)
+ trait.nums <- seq(size.set)
+
+ ## Save trait names, including those dropped, and size of sets.
+ tmp <- paste("Trait", 0, "RData", sep = ".")
+ if(verbose)
+ cat("Saving trait metadata in", tmp, "\n")
+ save(trait.file, trait.matrix, droptrait.names, size.set,
+ trait.names, all.traits, num.sets,
+ file = tmp, compress = TRUE)
+
+ ## Cycle through all the phenotypes in sets of size size.set. Keeps R object smaller.
+ for(pheno.set in seq(num.sets)) {
+ traits <- all.traits[trait.nums[trait.nums <= n.traits]]
+
+ trait.data <- get(trait.matrix)[, trait.names[traits], drop = FALSE]
+
+ tmp <- paste("Trait", pheno.set, "RData", sep = ".")
+ if(verbose)
+ cat("Saving", length(traits), "trait data as set", pheno.set, "in", tmp, "\n")
+ save(trait.data, pheno.set, file = tmp, compress = TRUE)
+
+ ## Get next size.set traits.
+ trait.nums <- trait.nums + size.set
+ }
+ detach()
+}
+#############################################################################################
big.phase1 <- function(dirpath = ".", cross.index = 0, params.file,
- nruns, cross, lod.thrs, Nmax = 2000, n.perm = 1,
- trait.file, trait.matrix, trait.index, droptrait.names, batch.effect = NULL,
+ cross, lod.thrs, Nmax = 2000, n.perm = 1, n.split = 1,
+ batch.effect = NULL,
drop.lod = 1.5, lod.min = min(lod.thrs),
- size.set = 250, window = 5, seed = 0, ...,
+ window = 5, seed = 0, big = TRUE, ...,
verbose = FALSE)
{
## Want this to depend on parallel.phase1 as much as possible.
@@ -13,23 +74,17 @@ big.phase1 <- function(dirpath = ".", cross.index = 0, params.file,
## Get any parameters in file. These will overwrite passed arguments.
eval(parse(file.path(dirpath, params.file)))
-
+
## Calculate genotype probabilities.
cross <- calc.genoprob(cross, step=0.5, err = 0.002,
map.function = "c-f", stepwidth = "max")
## Subset to individuals with batch if used.
if(!is.null(batch.effect))
cross <- subset(cross, ind = !is.na(cross$pheno[[batch.effect]]))
-
- ## Get trait values for selected traits.
- attach(file.path(dirpath, trait.file))
- trait.names <- dimnames(get(trait.matrix))[[2]]
- detach()
- all.traits <- seq(trait.names)[-match(droptrait.names, trait.names)]
-
- cross.index <- as.numeric(cross.index)
+ ## Load trait.
+ load(file.path(dirpath, "Trait.0.RData"))
## Random numbers.
if(n.perm > 1) {
@@ -40,94 +95,109 @@ big.phase1 <- function(dirpath = ".", cross.index = 0, params.file,
seeds <- sample(c(98765:987654321), n.perm, replace = FALSE)
}
else
- seeds <- rep(0, n.perm)
+ seeds <- 0
+ ## Big assumes n.split is n.perm and does one perm each run.
+ n.split <- n.perm
+
save(cross, n.perm, seeds, Nmax, drop.lod, lod.thrs, lod.min, batch.effect,
- control.probes, cross, trait.file, trait.matrix, trait.index,
- trait.names, all.traits, size.set, window, cross.index,
+ trait.index, trait.names, all.traits, size.set, ## From Trait.0.RData
+ window, cross.index, n.split, big,
file = "Phase1.RData", compress = TRUE)
invisible()
}
#############################################################################################
-big.phase2 <- function(dirpath = ".", index, ..., verbose = FALSE)
+big.phase2 <- function(dirpath = ".", index, ..., remove.files = TRUE, verbose = FALSE)
{
## Phase 2.
## Loop on sets of phenotypes, adding only what is needed.
## Loop on permutations internal to scanone.permutations.
- load("Phase1.RData")
-
- covars <- sexbatch.covar(cross, batch.effect)
+ load(file.path(dirpath, "Phase1.RData"))
+
+ if(verbose)
+ cat("compute covariates\n")
+ covars <- sexbatch.covar(cross, batch.effect, verbose = TRUE)
n.ind <- nind(cross)
n.phe <- nphe(cross)
n.traits <- length(all.traits)
- num.sets <- ceiling(n.traits / size.set)
- trait.nums <- seq(size.set)
if(n.perm > 1) {
## Random permutation. Use preset seed if provided.
seed <- seeds[index]
if(seed > 0)
set.seed(seed[1])
+ if(verbose)
+ cat("sample permutation", seed[1], "\n")
perms <- sample(seq(n.ind), n.ind, replace = FALSE)
cross$pheno <- cross$pheno[perms,]
}
## Cycle through all the phenotypes in sets of size size.set. Keeps R object smaller.
- for(pheno.set in seq(num.sets)) {
- cat(pheno.set, "\n")
- traits <- all.traits[trait.nums[trait.nums <= n.traits]]
-
- attach(file.path(dirpath, trait.file))
- perm.cross <- add.phenos(cross, get(trait.matrix)[, trait.names[traits], drop = FALSE],
- index = trait.index)
+ ## Assume large trait matrix has been broken into Trait.i.RData, each with trait.data.
+
+ filenames <- list.files(dirpath, "Trait.[1-9][0-9]*.RData")
+ if(!length(filenames))
+ parallel.error(5, 5, index)
+
+ for(pheno.set in seq(length(filenames))) {
+ if(verbose)
+ cat(pheno.set, "\n")
+
+ ## This is not working correctly. Either Trait.n.RData are all the same or ??.
+ attach(file.path(dirpath, filenames[pheno.set]))
+ perm.cross <- add.phenos(cross, trait.data, index = trait.index)
+ pheno.col <- find.pheno(perm.cross, dimnames(trait.data)[[2]])
detach()
- pheno.col <- find.pheno(perm.cross, trait.names[traits])
per.scan <- scanone(perm.cross, pheno.col=pheno.col, method="hk",
addcovar=covars$addcovar, intcovar=covars$intcovar)
per.scan.hl <- pull.highlods(per.scan, lod = lod.min, drop.lod = drop.lod)
save(per.scan.hl, perms,
file=file.path(dirpath, paste("per.scan",pheno.set, index,"RData",sep=".")))
-
- trait.nums <- trait.nums + size.set
}
chr.pos <- per.scan[,1:2]
filenames <- list.files(dirpath, paste("per.scan", "[0-9][0-9]*", index, "RData",
sep = "."))
- scan.hl <- cat.scanone(".", filenames, chr.pos)
+ scan.hl <- cat.scanone(dirpath, filenames, chr.pos)
+
+ if(remove.files)
+ file.remove(dirpath, filenames)
## Has some problem with missing values for (any(diff(pos) < 0)) in runningmean.
## problem is in calc of maxlod.hl[[k]]: NA should be 0
lod.sums <- lod.quantile.permutation(scan.hl,Nmax,lod.thrs,window,chr.pos,n.traits)
- save(scan.hl, lod.sums, perm.set,
+ save(scan.hl, lod.sums, cross.index, index,
file = paste("perm", ".", cross.index, "_", index, ".RData", sep = ""))
}
#############################################################################################
-big.phase3 <- function(dirpath = ".", index, ..., verbose = FALSE)
+big.phase3 <- function(dirpath = ".", index, cross.index, ..., verbose = FALSE)
{
## Phase 3. Merge back together.
- load("Phase1.RData")
+ load(file.path(dirpath, "Phase1.RData"))
n.thrs <- length(lod.thrs)
max.lod.quant <- matrix(NA, n.perm, Nmax)
max.N <- max.N.window <- matrix(NA, n.perm, n.thrs)
for(i.perm in seq(n.perm)) {
- attach(file.path(dirpath, "perm", ".", cross.index, "_", i.perm, ".RData", sep = ""))
+ attach(file.path(dirpath, paste("perm", ".", cross.index, "_", i.perm, ".RData", sep = "")))
n.quant <- length(lod.sums$max.lod.quant)
max.lod.quant[i.perm, seq(n.quant)] <- lod.sums$max.lod.quant
max.N[i.perm,] <- lod.sums$max.N$max.N
max.N.window[i.perm,] <- lod.sums$max.N$max.N.win
+ detach()
}
- save(max.lod.quant, max.N, max.N.window, file = "Phase3.RData", compress = TRUE)
+ outfile <- paste("Phase3", ifelse(cross.index > 0, cross.index, ""), ".RData", sep = "")
+
+ save(max.lod.quant, max.N, max.N.window, file = outfile, compress = TRUE)
}
View
@@ -28,14 +28,20 @@ pull.highlods <- function(scans, pheno.col, lod=4.5, drop.lod = 1.5)
cbind.data.frame(row = rr, phenos = pheno.col[cc], lod = lod)
}
-sexbatch.covar <- function(cross, batch.effect)
+sexbatch.covar <- function(cross, batch.effect, verbose = FALSE)
{
ic <- getsex(cross)$sex
if(!is.null(batch.effect)){
batch <- cross$pheno[,batch.effect, drop = FALSE]
tmp <- formula(paste("~ factor(", batch.effect, ")"))
+ if(verbose)
+ cat("sexbatch.covar", names(tmp), levels(factor(batch[[1]])), "\n")
+ if(verbose)
+ cat("sexbatch.covar", dim(batch), "\n")
batch <- model.matrix(tmp,batch)[,-1, drop = FALSE]
+ if(verbose)
+ cat("sexbatch.covar", dim(batch), "\n")
ac <- cbind(batch,ic)
}
else
@@ -131,7 +137,8 @@ lod.quantile.permutation <- function(cat.scan.hl,N,lod.thr,window,chr.pos,n.phe)
for(j in 1:l.lod.thr){
XX <- cat.scan.hl$lod >= lod.thr[j]
max.N$max.N[j] <- max(tapply(XX, cat.scan.hl$row,sum,na.rm=T),na.rm=T)
-
+
+ ## Is this working properly?
neqtl.pos <- smooth.neqtl(cat.scan.hl, chr.pos, max.hl, lod.thr[j], window)
max.N$max.N.win[j] <- max(neqtl.pos[,3])
View
@@ -91,7 +91,6 @@ qtlhot.phase1 <- function(dirpath, index = 0,
n.phe = nphe(cross), ## Number of traits.
nruns = 1,
big = FALSE,
- droptrait.names = character(0),
...)
{
## PHASE 1: Set up cross object. Needed in phases 2.
@@ -107,27 +106,30 @@ qtlhot.phase1 <- function(dirpath, index = 0,
## Get any parameters in file. These will overwrite passed arguments.
eval(parse(file.path(dirpath, params.file)))
- n.perm <- ceiling(n.perm / n.split)
-
- ## cross.index is used when multiple phase1 jobs are spawned.
- cross.index <- as.numeric(index)
-
## Cross object. Load if not done already.
if(!exists(cross.name))
load(file.path(dirpath, cross.file))
-
+
## Change name of cross object to "cross" for internal use.
if(cross.name != "cross")
cross <- get(cross.name)
+ ## cross.index is used when multiple phase1 jobs are spawned.
+ cross.index <- as.numeric(index)
+
if(big) {
## Used for big data run.
- big.phase1(dirpath, cross.index, params.file, nruns, cross,
- lod.thrs, Nmax, n.perm, droptrait.names = droptrait.names, ...)
+ ## Each run is separate permutation.
+ ## Make sure n.split is equal to n.perm. n.perm set to 1 in big.phase1.
n.split <- n.perm
+ big.phase1(dirpath, cross.index, params.file, cross, lod.thrs, Nmax, n.perm,
+ n.split, ...)
}
else {
## Used for studying properties of qtlhot.
+
+ n.perm <- ceiling(n.perm / n.split)
+
if(cross.index > 0 & nruns > 1) {
## Keep markers but re-simulate genotypes each time.
mymap <- pull.map(cross)
@@ -156,7 +158,7 @@ qtlhot.phase1 <- function(dirpath, index = 0,
row.names = FALSE, col.names = FALSE, quote = FALSE)
}
####################################################################################
-qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE)
+qtlhot.phase2 <- function(dirpath, index = NULL, ..., big = FALSE, verbose = FALSE)
{
## PHASE 2: NL,N and WW permutations.
## Slow. Run on condor nodes. Sized by n.perm.
@@ -169,6 +171,12 @@ qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE)
## permi.RData
##
+ cross.index <- scan("groups.txt", 0)[1]
+
+ ## Load Phase 1 computations.
+ infile <- "Phase1.RData"
+ load(file.path(dirpath, infile))
+
## Quality check of index.
if(missing(index))
parallel.error(2, 2, index)
@@ -181,12 +189,6 @@ qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE)
if(big)
return(big.phase2(dirpath, index))
- cross.index <- scan("groups.txt", 0)[1]
-
- ## Load Phase 1 computations.
- infile <- "Phase1.RData"
- load(file.path(dirpath, infile))
-
outfile <- paste("perm", ".", cross.index, "_", index, ".RData", sep = "")
## Following is in NL.N.perm stuff.
@@ -214,7 +216,7 @@ qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE)
}
####################################################################################
qtlhot.phase3 <- function(dirpath, index = NULL, ...,
- dirpath.save = dirpath, verbose = FALSE)
+ dirpath.save = dirpath, big = FALSE, verbose = FALSE)
{
## PHASE 3: Sample Markov chain (MCMC). Parallelize.
## Fast: Run on scheduler.
@@ -234,6 +236,9 @@ qtlhot.phase3 <- function(dirpath, index = NULL, ...,
## Load Phase 1 computations.
load(file.path(dirpath, "Phase1.RData"))
+ if(big)
+ return(big.phase3(dirpath, index, cross.index))
+
## This could be done once, but it would require splitting this phase in two.
## Besides, it is quite fast.
## Read in saved BIC scores and combine into one object.

0 comments on commit 7578ad7

Please sign in to comment.