Permalink
Browse files

really add the big files

  • Loading branch information...
byandell committed Jul 27, 2011
1 parent f61806b commit ceac8fed93c3f6e14f7b81b8581e44d3ffa54dfc
Showing with 642 additions and 0 deletions.
  1. +133 −0 R/big.R
  2. +259 −0 R/highlod.R
  3. +130 −0 R/ravel.R
  4. +120 −0 R/util.R
View
133 R/big.R
@@ -0,0 +1,133 @@
+big.phase1 <- function(dirpath = ".", cross.index = 0, params.file,
+ nruns, cross, lod.thrs, Nmax = 2000, n.perm = 1,
+ trait.file, trait.matrix, trait.index, droptrait.names, batch.effect = NULL,
+ drop.lod = 1.5, lod.min = min(lod.thrs),
+ size.set = 250, window = 5, seed = 0, ...,
+ verbose = FALSE)
+{
+ ## Want this to depend on parallel.phase1 as much as possible.
+ ## Just include new stuff here.
+
+ ## Phase 1 for big dataset.
+ ## Sets up initial permutations.
+
+ ## Get any parameters in file. These will overwrite passed arguments.
+ eval(parse(file.path(dirpath, params.file)))
+
+ ## Calculate genotype probabilities.
+ cross <- calc.genoprob(cross, step=0.5, err = 0.002,
+ map.function = "c-f", stepwidth = "max")
+
+ ## Subset to individuals with batch if used.
+ if(!is.null(batch.effect))
+ cross <- subset(cross, ind = !is.na(cross$pheno[[batch.effect]]))
+
+ ## Get trait values for selected traits.
+
+ attach(file.path(dirpath, trait.file))
+ trait.names <- dimnames(get(trait.matrix))[[2]]
+ detach()
+ all.traits <- seq(trait.names)[-match(droptrait.names, trait.names)]
+
+ cross.index <- as.numeric(cross.index)
+
+ ## Random numbers.
+ if(n.perm > 1) {
+ if(seed[1] > 0)
+ set.seed(seed[1])
+
+ n.ind <- nind(cross)
+ seeds <- sample(c(98765:987654321), n.perm, replace = FALSE)
+ }
+ else
+ seeds <- rep(0, n.perm)
+
+ save(cross, n.perm, seeds, Nmax, drop.lod, lod.thrs, lod.min, batch.effect,
+ control.probes, cross, trait.file, trait.matrix, trait.index,
+ trait.names, all.traits, size.set, window, cross.index,
+ file = "Phase1.RData", compress = TRUE)
+ invisible()
+}
+#############################################################################################
+big.phase2 <- function(dirpath = ".", index, ..., verbose = FALSE)
+{
+ ## Phase 2.
+ ## Loop on sets of phenotypes, adding only what is needed.
+ ## Loop on permutations internal to scanone.permutations.
+
+ load("Phase1.RData")
+
+ covars <- sexbatch.covar(cross, batch.effect)
+
+ n.ind <- nind(cross)
+ n.phe <- nphe(cross)
+ n.traits <- length(all.traits)
+ num.sets <- ceiling(n.traits / size.set)
+ trait.nums <- seq(size.set)
+
+ if(n.perm > 1) {
+ ## Random permutation. Use preset seed if provided.
+ seed <- seeds[index]
+ if(seed > 0)
+ set.seed(seed[1])
+ perms <- sample(seq(n.ind), n.ind, replace = FALSE)
+ cross$pheno <- cross$pheno[perms,]
+ }
+
+ ## Cycle through all the phenotypes in sets of size size.set. Keeps R object smaller.
+ for(pheno.set in seq(num.sets)) {
+ cat(pheno.set, "\n")
+ traits <- all.traits[trait.nums[trait.nums <= n.traits]]
+
+ attach(file.path(dirpath, trait.file))
+ perm.cross <- add.phenos(cross, get(trait.matrix)[, trait.names[traits], drop = FALSE],
+ index = trait.index)
+ detach()
+
+ pheno.col <- find.pheno(perm.cross, trait.names[traits])
+ per.scan <- scanone(perm.cross, pheno.col=pheno.col, method="hk",
+ addcovar=covars$addcovar, intcovar=covars$intcovar)
+
+ per.scan.hl <- pull.highlods(per.scan, lod = lod.min, drop.lod = drop.lod)
+
+ save(per.scan.hl, perms,
+ file=file.path(dirpath, paste("per.scan",pheno.set, index,"RData",sep=".")))
+
+ trait.nums <- trait.nums + size.set
+ }
+
+ chr.pos <- per.scan[,1:2]
+
+ filenames <- list.files(dirpath, paste("per.scan", "[0-9][0-9]*", index, "RData",
+ sep = "."))
+
+ scan.hl <- cat.scanone(".", filenames, chr.pos)
+
+ ## Has some problem with missing values for (any(diff(pos) < 0)) in runningmean.
+ ## problem is in calc of maxlod.hl[[k]]: NA should be 0
+ lod.sums <- lod.quantile.permutation(scan.hl,Nmax,lod.thrs,window,chr.pos,n.traits)
+
+ save(scan.hl, lod.sums, perm.set,
+ file = paste("perm", ".", cross.index, "_", index, ".RData", sep = ""))
+}
+#############################################################################################
+big.phase3 <- function(dirpath = ".", index, ..., verbose = FALSE)
+{
+ ## Phase 3. Merge back together.
+
+ load("Phase1.RData")
+
+ n.thrs <- length(lod.thrs)
+
+ max.lod.quant <- matrix(NA, n.perm, Nmax)
+ max.N <- max.N.window <- matrix(NA, n.perm, n.thrs)
+
+ for(i.perm in seq(n.perm)) {
+ attach(file.path(dirpath, "perm", ".", cross.index, "_", i.perm, ".RData", sep = ""))
+ n.quant <- length(lod.sums$max.lod.quant)
+ max.lod.quant[i.perm, seq(n.quant)] <- lod.sums$max.lod.quant
+ max.N[i.perm,] <- lod.sums$max.N$max.N
+ max.N.window[i.perm,] <- lod.sums$max.N$max.N.win
+ }
+ save(max.lod.quant, max.N, max.N.window, file = "Phase3.RData", compress = TRUE)
+}
View
@@ -0,0 +1,259 @@
+## see ~/p/private/diabetes1/diabetes10/scan.perm/Rcode files
+pull.highlods <- function(scans, pheno.col, lod=4.5, drop.lod = 1.5)
+{
+ if(missing(pheno.col))
+ pheno.col <- names(scans)[-(1:2)]
+
+ ## Extract matrix of lod scores.
+ x <- as.matrix(scans[,-(1:2), drop = FALSE])
+ ## Find which values are at or above l!od threshold.
+ wh <- which(x >= lod)
+
+ ## Get row and column indices.
+ rr <- row(x)[wh]
+ cc <- col(x)[wh]
+
+ ## Find which are within drop.lod of max lod per chr.
+ lod <- x[wh]
+ chr <- scans$chr[rr]
+ maxlod <- tapply(lod, chr, max)
+ wh <- which(maxlod[chr] <= lod + drop.lod)
+
+ ## Reget values.
+ rr <- rr[wh]
+ cc <- cc[wh]
+ lod <- lod[wh]
+
+ ## return data frame with genome row, trait column and lod value.
+ cbind.data.frame(row = rr, phenos = pheno.col[cc], lod = lod)
+}
+
+sexbatch.covar <- function(cross, batch.effect)
+{
+ ic <- getsex(cross)$sex
+
+ if(!is.null(batch.effect)){
+ batch <- cross$pheno[,batch.effect, drop = FALSE]
+ tmp <- formula(paste("~ factor(", batch.effect, ")"))
+ batch <- model.matrix(tmp,batch)[,-1, drop = FALSE]
+ ac <- cbind(batch,ic)
+ }
+ else
+ ac <- ic
+
+ list(addcovar = ac, intcovar = ic)
+}
+## Performs and saves scanone on permuted dataset
+scanone.permutations <- function(cross, pheno.col = seq(3, nphe(cross)),
+ n.perm, seed=123456789, batch.effect = NULL,
+ pheno.set = 1,
+ lod.min, drop.lod = 1.5)
+{
+ set.seed(seed[[1]])
+
+ if(!is.null(batch.effect))
+ cross <- subset(cross, ind = !is.na(cross$pheno[[batch.effect]]))
+
+ covars <- sexbatch.covar(cross, batch.effect)
+
+ n.ind <- nind(cross)
+
+ perms <- matrix(NA, n.ind, n.perm)
+
+ for(i in 1:n.perm){
+ perm.cross <- cross
+ perms[,i] <- tmp <- sample(c(1:n.ind), n.ind, replace=FALSE)
+ perm.cross$pheno <- cross$pheno[tmp,]
+
+ per.scan <- scanone(perm.cross, pheno.col=pheno.col, method="hk",
+ addcovar=covars$addcovar, intcovar=covars$intcovar)
+
+ per.scan.hl <- pull.highlods(per.scan, lod = lod.min, drop.lod = drop.lod)
+
+ save(per.scan.hl, perms,
+ file=paste("per.scan",pheno.set, i,"RData",sep="."))
+ }
+}
+
+## Folder should contain scanone highlods data across all traits for ONE permutation
+cat.scanone <- function(dirpath = ".", filenames = permfiles, chr.pos)
+{
+ permfiles <- list.files(dirpath, paste("per.scan", "*", "RData", sep = "."))
+
+ if(exists("per.scan.hl"))
+ rm(per.scan.hl)
+ for(i in 1:length(filenames)){
+ attach(filenames[i])
+ if(i==1) cat.scan.hl <- per.scan.hl else
+ cat.scan.hl <- rbind.data.frame(cat.scan.hl,per.scan.hl)
+ detach()
+ }
+ cbind.data.frame(chr.pos[cat.scan.hl$row,],cat.scan.hl)
+}
+
+## Get the 2000 highest values above lod=4 (see scan.perm.R)
+get.tails <- function(highs, n.quant = 2000)
+{
+ n.quant <- min(n.quant, max(table(highs[,"row"])))
+ tmpfn <- function(x, n) {
+ l <- length(x)
+ if(l < n)
+ x <- c(x, rep(NA, n - l))
+ rev(sort(x, na.last = FALSE))[seq(n)]
+ }
+ out <- tapply(highs[,"lod"], highs[,"row"], tmpfn, n.quant)
+ ## Turn list of items of length n.quant into matrix.
+ rows <- names(out)
+ out <- matrix(unlist(out), n.quant)
+ dimnames(out) <- list(seq(n.quant), rows)
+ t(out)
+}
+
+## cat.scan.hl = highlods() data.frame across all traits in a tissue
+## N = number of highest lod scores to save 2000 ~= 0.05 percentile
+lod.quantile.permutation <- function(cat.scan.hl,N,lod.thr,window,chr.pos,n.phe)
+{
+ n.chr <- levels(chr.pos$chr)
+ l.lod.thr <- length(lod.thr)
+
+ ## Elias quantiles.
+ quant <- get.tails(cat.scan.hl, n.quant = N)
+ N <- ncol(quant)
+ max.lod.quant <- apply(quant,2,max,na.rm=T)
+ names(max.lod.quant) <- paste(paste(round(1-as.numeric(dimnames(quant)[[2]])/n.phe,4)*100,
+ "%", sep=""),1:N, sep="_")
+
+ max.hl <- make.maxlod(cat.scan.hl, chr.pos)
+
+ max.N <- data.frame(max.N=vector(length=length(lod.thr)),
+ max.N.win=vector(length=length(lod.thr)),row.names=lod.thr,check.names=T)
+
+ for(j in 1:l.lod.thr){
+ XX <- cat.scan.hl$lod >= lod.thr[j]
+ max.N$max.N[j] <- max(tapply(XX, cat.scan.hl$row,sum,na.rm=T),na.rm=T)
+
+ neqtl.pos <- smooth.neqtl(cat.scan.hl, chr.pos, max.hl, lod.thr[j], window)
+
+ max.N$max.N.win[j] <- max(neqtl.pos[,3])
+ }
+
+ list(max.lod.quant=max.lod.quant,
+ max.N=max.N)
+}
+make.maxlod <- function(cat.scan.hl, chr.pos)
+{
+ n.chr <- levels(chr.pos$chr)
+
+ tmpfn <- function(x) {
+ if(is.null(x))
+ 0
+ else
+ max(x, na.rm = TRUE)
+ }
+ tmpfn2 <- function(a,b) {
+ if(is.null(a))
+ NA
+ else
+ mean(b[!is.nan(a) & a==max(a, na.rm=TRUE)])
+ }
+
+ maxlod.hl <- maxlod.pos.hl <- vector("list", length(n.chr))
+ names(maxlod.pos.hl) <- n.chr
+ for(k in seq(along=n.chr)) {
+ scan.out.bychr <- cat.scan.hl[cat.scan.hl$chr==n.chr[k],]
+ scan.out.bychr$phenos <- factor(scan.out.bychr$phenos)
+ maxlod.hl[[k]] <- tapply(scan.out.bychr$lod,scan.out.bychr$phenos, tmpfn)
+ maxlod.pos.hl[[k]] <- tapply(scan.out.bychr$lod, scan.out.bychr$phenos, tmpfn2,
+ scan.out.bychr$pos)
+ }
+ list(lod = maxlod.hl, pos = maxlod.pos.hl)
+}
+
+smooth.neqtl <- function(cat.scan.hl, chr.pos, max.hl = make.maxlod(cat.scan.hl, chr.pos),
+ lod.thr, window = 5)
+{
+ chr <- chr.pos$chr
+ pos <- chr.pos$pos
+ n.chr <- levels(chr.pos$chr)
+
+ maxlod.thr.pos <- max.hl$pos
+ for(k in seq(along=n.chr))
+ maxlod.thr.pos[[k]] <- max.hl$pos[[k]][max.hl$lod[[k]] >= lod.thr]
+
+ out <- smoothall(maxlod.thr.pos,thechr = chr.pos$chr, thepos = chr.pos$pos, window = window)
+ ## Recover marker information.
+ rownames(out) <- rownames(chr.pos)
+ out
+}
+
+## Run permuations
+lod.quantile.permutations.2 <- function(cross, pheno.col, N, n.perm, lod.thr,
+ batch.effect, window, seed = 123456789, verbose = FALSE)
+{
+
+ ## Set up pseudo-random number generation seeds.
+ set.seed(seed[[1]])
+ all.seeds <- sample(c(98765:987654), n.perm, replace=FALSE)
+
+ ## Set up matrices to record values.
+ T <- nphe(cross)
+ N <- N[N <= T]
+ quants <- 1 - (N - 1)/T
+ l.N <- length(N)
+ max.lod.quant <- matrix(NA, n.perm, l.N)
+ dimnames(max.lod.quant)[[2]] <- paste(paste(round(quants,4)*100, "%", sep=""),
+ N, sep="_")
+ l.lod.thr <- length(lod.thr)
+ max.N <- matrix(NA, n.perm, l.lod.thr)
+ max.lod.quant <- matrix(NA, n.perm, l.N)
+ max.N.window <- matrix(NA, n.perm, l.lod.thr)
+
+ if(!is.null(batch.effect))
+ cross <- subset(cross, ind = !is.na(cross$pheno[[batch.effect]]))
+
+ covars <- sexbatch.covar(cross, batch.effect)
+
+ n.ind <- nind(cross)
+
+ for(i in 1:n.perm){
+ perm.cross <- cross
+ set.seed(all.seeds[i])
+ tmp <- sample(c(1:n.ind), n.ind, replace=FALSE)
+ perm.cross$pheno <- cross$pheno[tmp,]
+
+ per.scan <- scanone(perm.cross, pheno.col=pheno.col, method="hk",
+ addcovar=covars$addcovar, intcovar=covars$intcovar)
+
+ ## Elias' quantiles.
+ quant <- apply(per.scan[,-(1:2)], 1, quantile, quants)
+ max.lod.quant[i,] <- apply(quant,1,max)
+
+ ## Jansen's count.
+ max.N[i,] <- apply(count.thr(per.scan, lod.thr, droptwo = TRUE), 1, sum)
+
+ ## Smoothed count.
+ maxlod <- apply(per.scan[,-(1:2)], 2, tapply, per.scan[,1], max)
+ chrs <- dimnames(maxlod)[[1]]
+ chr <- factor(per.scan[,1], levels = chrs)
+ pos <- as.numeric(per.scan[,2])
+
+ maxlod.pos <- maxlod.thr.pos <- vector("list", length(chrs))
+ names(maxlod.pos) <- names(maxlod.thr.pos) <- chrs
+ for(k in seq(length(chrs))) {
+ scan.out.bychr <- per.scan[per.scan[,1] == chrs[k], ]
+ maxlod.pos[[k]] <- apply(scan.out.bychr[,-(1:2)], 2, function(a,b)
+ mean(b[!is.nan(a) & a==max(a, na.rm=TRUE)]), scan.out.bychr[,2])
+ }
+
+ for(j in seq(along = lod.thr)){
+ for(k in chrs)
+ maxlod.thr.pos[[k]] <- maxlod.pos[[k]][maxlod[k,] >= lod.thr[j]]
+ neqtl.pos <- smoothall(maxlod.thr.pos, thechr = chr, thepos = pos, window = window)
+ max.N.window[i,j] <- max(neqtl.pos[,3])
+ }
+ print(i)
+ }
+ list(max.lod.quant=max.lod.quant,
+ max.N=max.N,
+ max.N.window=max.N.window)
+}
Oops, something went wrong.

0 comments on commit ceac8fe

Please sign in to comment.