From 7578ad73238c318e2f938f9d2dfab9bdd5c4a7e8 Mon Sep 17 00:00:00 2001
From: "Brian S. Yandell" <byandell@wisc.edu>
Date: Wed, 12 Oct 2011 10:33:27 -0500
Subject: [PATCH] fixed big.phase0

---
 DESCRIPTION  |   4 +-
 R/big.R      | 144 ++++++++++++++++++++++++++++++++++++++-------------
 R/highlod.R  |  11 +++-
 R/parallel.R |  39 ++++++++------
 4 files changed, 140 insertions(+), 58 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 9833095..7847ff7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: qtlhot
-Version: 0.2.0
-Date: 2011-07-26
+Version: 0.3.1
+Date: 2011-10-12
 Author: Elias Chaibub Neto <echaibub@hotmail.com> and Brian S Yandell <byandell@wisc.edu>
 Title: Inference for QTL Hotspots
 Description: Functions to Determine significance of co-mapping trait hotspots
diff --git a/R/big.R b/R/big.R
index ec8392a..e5b1681 100644
--- a/R/big.R
+++ b/R/big.R
@@ -1,8 +1,69 @@
+big.phase0 <- function(dirpath, cross, trait.file, trait.matrix,
+                       droptrait.names = NULL,
+                       keeptrait.names = NULL,
+                       lod.thrs,
+                       sex = "Sex", trait.index, batch.effect = NULL, size.set = 250,
+                       verbose = TRUE)
+{
+  ## Set up cross object and trait.data objects.
+
+  ## Cross object with sex, trait.index and batch.effect as only phenotypes.
+  traits <- match(c(sex,trait.index, batch.effect), names(cross$pheno), nomatch = 0)
+  cross$pheno <- cross$pheno[, traits, drop = FALSE]
+  
+  ## Subset to individuals with batch if used.
+  if(!is.null(batch.effect))
+    cross <- subset(cross, ind = !is.na(cross$pheno[[batch.effect]]))
+
+  ## Save cross object and other pertinent information.
+  if(verbose)
+    cat("Saving cross object in cross.RData\n")
+  save(cross, lod.thrs, file = "cross.RData", compress = TRUE)
+
+  ## Get trait values for selected traits.
+  attach(file.path(dirpath, trait.file))
+  trait.names <- dimnames(get(trait.matrix))[[2]]
+  all.traits <- seq(trait.names)
+  if(!is.null(droptrait.names))
+    all.traits[match(droptrait.names, trait.names, nomatch = 0)] <- 0
+  if(!is.null(keeptrait.names))
+    all.traits[-match(keeptrait.names, trait.names, nomatch = 0)] <- 0
+  all.traits <- all.traits[all.traits > 0]
+  n.traits <- length(all.traits)
+
+  num.sets <- ceiling(n.traits / size.set)
+  trait.nums <- seq(size.set)
+
+  ## Save trait names, including those dropped, and size of sets.
+  tmp <- paste("Trait", 0, "RData", sep = ".")
+  if(verbose)
+    cat("Saving trait metadata in", tmp, "\n")
+  save(trait.file, trait.matrix, droptrait.names, size.set,
+       trait.names, all.traits, num.sets,
+       file = tmp, compress = TRUE)
+
+  ## Cycle through all the phenotypes in sets of size size.set. Keeps R object smaller.
+  for(pheno.set in seq(num.sets)) {
+    traits <- all.traits[trait.nums[trait.nums <= n.traits]]
+    
+    trait.data <- get(trait.matrix)[, trait.names[traits], drop = FALSE]
+
+    tmp <- paste("Trait", pheno.set, "RData", sep = ".")
+    if(verbose)
+      cat("Saving", length(traits), "trait data as set", pheno.set, "in", tmp, "\n")
+    save(trait.data, pheno.set, file = tmp, compress = TRUE)
+
+    ## Get next size.set traits.
+    trait.nums <- trait.nums + size.set
+  }
+  detach()
+}
+#############################################################################################
 big.phase1 <- function(dirpath = ".", cross.index = 0, params.file,
-                       nruns, cross, lod.thrs, Nmax = 2000, n.perm = 1,
-                       trait.file, trait.matrix, trait.index, droptrait.names, batch.effect = NULL,
+                       cross, lod.thrs, Nmax = 2000, n.perm = 1, n.split = 1,
+                       batch.effect = NULL,
                        drop.lod = 1.5, lod.min = min(lod.thrs),
-                       size.set = 250, window = 5, seed = 0, ...,
+                       window = 5, seed = 0, big = TRUE, ...,
                        verbose = FALSE)
 {
   ## Want this to depend on parallel.phase1 as much as possible.
@@ -13,7 +74,7 @@ big.phase1 <- function(dirpath = ".", cross.index = 0, params.file,
   
   ## Get any parameters in file. These will overwrite passed arguments.
   eval(parse(file.path(dirpath, params.file)))
-
+  
   ## Calculate genotype probabilities.
   cross <- calc.genoprob(cross, step=0.5, err = 0.002,
                          map.function = "c-f", stepwidth = "max")
@@ -21,15 +82,9 @@ big.phase1 <- function(dirpath = ".", cross.index = 0, params.file,
   ## Subset to individuals with batch if used.
   if(!is.null(batch.effect))
     cross <- subset(cross, ind = !is.na(cross$pheno[[batch.effect]]))
-  
-  ## Get trait values for selected traits.
 
-  attach(file.path(dirpath, trait.file))
-  trait.names <- dimnames(get(trait.matrix))[[2]]
-  detach()
-  all.traits <- seq(trait.names)[-match(droptrait.names, trait.names)]
-
-  cross.index <- as.numeric(cross.index)
+  ## Load trait.
+  load(file.path(dirpath, "Trait.0.RData"))
 
   ## Random numbers.
   if(n.perm > 1) {
@@ -40,51 +95,62 @@ big.phase1 <- function(dirpath = ".", cross.index = 0, params.file,
     seeds <- sample(c(98765:987654321), n.perm, replace = FALSE)
   }
   else
-    seeds <- rep(0, n.perm)
+    seeds <- 0
 
+  ## Big assumes n.split is n.perm and does one perm each run.
+  n.split <- n.perm
+  
   save(cross, n.perm, seeds, Nmax, drop.lod, lod.thrs, lod.min, batch.effect,
-       control.probes, cross, trait.file, trait.matrix, trait.index,
-       trait.names, all.traits, size.set, window, cross.index,
+       trait.index, trait.names, all.traits, size.set, ## From Trait.0.RData
+       window, cross.index, n.split, big,
        file = "Phase1.RData", compress = TRUE)
   invisible()
 }
 #############################################################################################
-big.phase2 <- function(dirpath = ".", index, ..., verbose = FALSE)
+big.phase2 <- function(dirpath = ".", index, ..., remove.files = TRUE, verbose = FALSE)
 {
   ## Phase 2.
   ## Loop on sets of phenotypes, adding only what is needed.
   ## Loop on permutations internal to scanone.permutations.
   
-  load("Phase1.RData")
-  
-  covars <- sexbatch.covar(cross, batch.effect)
+  load(file.path(dirpath, "Phase1.RData"))
+
+  if(verbose)
+    cat("compute covariates\n")
+  covars <- sexbatch.covar(cross, batch.effect, verbose = TRUE)
 
   n.ind <- nind(cross)
   n.phe <- nphe(cross)
   n.traits <- length(all.traits)
-  num.sets <- ceiling(n.traits / size.set)
-  trait.nums <- seq(size.set)
 
   if(n.perm > 1) {
     ## Random permutation. Use preset seed if provided.
     seed <- seeds[index]
     if(seed > 0)
       set.seed(seed[1])
+    if(verbose)
+      cat("sample permutation", seed[1], "\n")
     perms <- sample(seq(n.ind), n.ind, replace = FALSE)
     cross$pheno <- cross$pheno[perms,]
   }
 
   ## Cycle through all the phenotypes in sets of size size.set. Keeps R object smaller.
-  for(pheno.set in seq(num.sets)) {
-    cat(pheno.set, "\n")
-    traits <- all.traits[trait.nums[trait.nums <= n.traits]]
-    
-    attach(file.path(dirpath, trait.file))
-    perm.cross <- add.phenos(cross, get(trait.matrix)[, trait.names[traits], drop = FALSE],
-                         index = trait.index)
+  ## Assume large trait matrix has been broken into Trait.i.RData, each with trait.data.
+
+  filenames <- list.files(dirpath, "Trait.[1-9][0-9]*.RData")
+  if(!length(filenames))
+    parallel.error(5, 5, index)
+  
+  for(pheno.set in seq(length(filenames))) {
+    if(verbose)
+      cat(pheno.set, "\n")
+
+    ## This is not working correctly. Either Trait.n.RData are all the same or ??.
+    attach(file.path(dirpath, filenames[pheno.set]))
+    perm.cross <- add.phenos(cross, trait.data, index = trait.index)
+    pheno.col <- find.pheno(perm.cross, dimnames(trait.data)[[2]])
     detach()
 
-    pheno.col <- find.pheno(perm.cross, trait.names[traits])
     per.scan <- scanone(perm.cross, pheno.col=pheno.col, method="hk", 
                         addcovar=covars$addcovar, intcovar=covars$intcovar)
 
@@ -92,8 +158,6 @@ big.phase2 <- function(dirpath = ".", index, ..., verbose = FALSE)
 
     save(per.scan.hl, perms,
          file=file.path(dirpath, paste("per.scan",pheno.set, index,"RData",sep=".")))
-
-    trait.nums <- trait.nums + size.set
   }
 
   chr.pos <- per.scan[,1:2]
@@ -101,21 +165,24 @@ big.phase2 <- function(dirpath = ".", index, ..., verbose = FALSE)
   filenames <- list.files(dirpath, paste("per.scan", "[0-9][0-9]*", index, "RData",
                                          sep = "."))
 
-  scan.hl <- cat.scanone(".", filenames, chr.pos)
+  scan.hl <- cat.scanone(dirpath, filenames, chr.pos)
+
+  if(remove.files)
+    file.remove(dirpath, filenames)
   
   ## Has some problem with missing values for (any(diff(pos) < 0)) in runningmean.
   ## problem is in calc of maxlod.hl[[k]]: NA should be 0
   lod.sums <- lod.quantile.permutation(scan.hl,Nmax,lod.thrs,window,chr.pos,n.traits)
   
-  save(scan.hl, lod.sums, perm.set,
+  save(scan.hl, lod.sums, cross.index, index,
        file = paste("perm", ".", cross.index, "_", index, ".RData", sep = ""))
 }
 #############################################################################################
-big.phase3 <- function(dirpath = ".", index, ..., verbose = FALSE)
+big.phase3 <- function(dirpath = ".", index, cross.index, ..., verbose = FALSE)
 {
   ## Phase 3. Merge back together.
 
-  load("Phase1.RData")
+  load(file.path(dirpath, "Phase1.RData"))
 
   n.thrs <- length(lod.thrs)
   
@@ -123,11 +190,14 @@ big.phase3 <- function(dirpath = ".", index, ..., verbose = FALSE)
   max.N <- max.N.window <- matrix(NA, n.perm, n.thrs)
   
   for(i.perm in seq(n.perm)) {
-    attach(file.path(dirpath, "perm", ".", cross.index, "_", i.perm, ".RData", sep = ""))
+    attach(file.path(dirpath, paste("perm", ".", cross.index, "_", i.perm, ".RData", sep = "")))
     n.quant <- length(lod.sums$max.lod.quant)
     max.lod.quant[i.perm, seq(n.quant)] <- lod.sums$max.lod.quant
     max.N[i.perm,] <- lod.sums$max.N$max.N
     max.N.window[i.perm,] <- lod.sums$max.N$max.N.win
+    detach()
   }
-  save(max.lod.quant, max.N, max.N.window, file = "Phase3.RData", compress = TRUE)
+  outfile <- paste("Phase3", ifelse(cross.index > 0, cross.index, ""), ".RData", sep = "")
+
+  save(max.lod.quant, max.N, max.N.window, file = outfile, compress = TRUE)
 }
diff --git a/R/highlod.R b/R/highlod.R
index 043f62b..ea3565a 100644
--- a/R/highlod.R
+++ b/R/highlod.R
@@ -28,14 +28,20 @@ pull.highlods <- function(scans, pheno.col, lod=4.5, drop.lod = 1.5)
   cbind.data.frame(row = rr, phenos = pheno.col[cc], lod = lod)
 }
 
-sexbatch.covar <- function(cross, batch.effect)
+sexbatch.covar <- function(cross, batch.effect, verbose = FALSE)
 {
   ic <- getsex(cross)$sex
   
   if(!is.null(batch.effect)){
     batch <- cross$pheno[,batch.effect, drop = FALSE]
     tmp <- formula(paste("~ factor(", batch.effect, ")"))
+    if(verbose)
+      cat("sexbatch.covar", names(tmp), levels(factor(batch[[1]])), "\n")
+    if(verbose)
+      cat("sexbatch.covar", dim(batch), "\n")
     batch <- model.matrix(tmp,batch)[,-1, drop = FALSE]
+    if(verbose)
+      cat("sexbatch.covar", dim(batch), "\n")
     ac <- cbind(batch,ic)
   }
   else
@@ -131,7 +137,8 @@ lod.quantile.permutation <- function(cat.scan.hl,N,lod.thr,window,chr.pos,n.phe)
   for(j in 1:l.lod.thr){
     XX <- cat.scan.hl$lod >= lod.thr[j]
     max.N$max.N[j] <- max(tapply(XX, cat.scan.hl$row,sum,na.rm=T),na.rm=T)
-    
+
+    ## Is this working properly?
     neqtl.pos <- smooth.neqtl(cat.scan.hl, chr.pos, max.hl, lod.thr[j], window)
     
     max.N$max.N.win[j] <- max(neqtl.pos[,3])
diff --git a/R/parallel.R b/R/parallel.R
index 3f50efd..63ae7e6 100644
--- a/R/parallel.R
+++ b/R/parallel.R
@@ -91,7 +91,6 @@ qtlhot.phase1 <- function(dirpath, index = 0,
                           n.phe = nphe(cross), ## Number of traits.
                           nruns = 1,
                           big = FALSE,
-                          droptrait.names = character(0),
                           ...)
 {
   ## PHASE 1: Set up cross object. Needed in phases 2.
@@ -107,27 +106,30 @@ qtlhot.phase1 <- function(dirpath, index = 0,
   ## Get any parameters in file. These will overwrite passed arguments.
   eval(parse(file.path(dirpath, params.file)))
 
-  n.perm <- ceiling(n.perm / n.split)
-
-  ## cross.index is used when multiple phase1 jobs are spawned.
-  cross.index <- as.numeric(index)
-  
   ## Cross object. Load if not done already.
   if(!exists(cross.name))
     load(file.path(dirpath, cross.file))
-
+  
   ## Change name of cross object to "cross" for internal use.
   if(cross.name != "cross")
     cross <- get(cross.name)
 
+  ## cross.index is used when multiple phase1 jobs are spawned.
+  cross.index <- as.numeric(index)
+
   if(big) {
     ## Used for big data run.
-    big.phase1(dirpath, cross.index, params.file, nruns, cross,
-               lod.thrs, Nmax, n.perm, droptrait.names = droptrait.names, ...)
+    ## Each run is separate permutation.
+    ## Make sure n.split is equal to n.perm. n.perm set to 1 in big.phase1.
     n.split <- n.perm
+    big.phase1(dirpath, cross.index, params.file, cross, lod.thrs, Nmax, n.perm,
+               n.split, ...)
   }
   else {
     ## Used for studying properties of qtlhot.
+
+    n.perm <- ceiling(n.perm / n.split)
+      
     if(cross.index > 0 & nruns > 1) {
       ## Keep markers but re-simulate genotypes each time.
       mymap <- pull.map(cross)
@@ -156,7 +158,7 @@ qtlhot.phase1 <- function(dirpath, index = 0,
               row.names = FALSE, col.names = FALSE, quote = FALSE)
 }
 ####################################################################################
-qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE)
+qtlhot.phase2 <- function(dirpath, index = NULL, ..., big = FALSE, verbose = FALSE)
 {
   ## PHASE 2: NL,N and WW permutations. 
   ##          Slow. Run on condor nodes. Sized by n.perm.
@@ -169,6 +171,12 @@ qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE)
   ##       permi.RData
   ##
 
+  cross.index <- scan("groups.txt", 0)[1]
+
+  ## Load Phase 1 computations.
+  infile <- "Phase1.RData"
+  load(file.path(dirpath, infile))
+
   ## Quality check of index.
   if(missing(index))
     parallel.error(2, 2, index)
@@ -181,12 +189,6 @@ qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE)
   if(big)
     return(big.phase2(dirpath, index))
 
-  cross.index <- scan("groups.txt", 0)[1]
-
-  ## Load Phase 1 computations.
-  infile <- "Phase1.RData"
-  load(file.path(dirpath, infile))
-
   outfile <- paste("perm", ".", cross.index, "_", index, ".RData", sep = "")
 
   ## Following is in NL.N.perm stuff.
@@ -214,7 +216,7 @@ qtlhot.phase2 <- function(dirpath, index = NULL, ..., verbose = FALSE)
 }
 ####################################################################################
 qtlhot.phase3 <- function(dirpath, index = NULL, ...,
-                          dirpath.save = dirpath, verbose = FALSE)
+                          dirpath.save = dirpath, big = FALSE, verbose = FALSE)
 {
   ## PHASE 3: Sample Markov chain (MCMC). Parallelize.
   ##          Fast: Run on scheduler.
@@ -234,6 +236,9 @@ qtlhot.phase3 <- function(dirpath, index = NULL, ...,
   ## Load Phase 1 computations.
   load(file.path(dirpath, "Phase1.RData"))
 
+  if(big)
+    return(big.phase3(dirpath, index, cross.index))
+
   ## This could be done once, but it would require splitting this phase in two.
   ## Besides, it is quite fast.
   ## Read in saved BIC scores and combine into one object.