From 2713632a45b24d65481a8f9cbe3959a3e23ef062 Mon Sep 17 00:00:00 2001 From: Aaron McKenna Date: Sat, 3 Aug 2013 15:44:00 -0700 Subject: [PATCH] cleaning up some of the parameters for tangent that plug through to CapSeg.scala --- R/tangent_normalize.R | 12 ++++++------ utils/CapSeg.scala | 40 ++++++++++++++++------------------------ 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/R/tangent_normalize.R b/R/tangent_normalize.R index 99a0280..fd53623 100755 --- a/R/tangent_normalize.R +++ b/R/tangent_normalize.R @@ -15,20 +15,17 @@ option.list <- list( make_option(c("--tumor.lane.data"),help="the tumor exome coverage: lanes as rows, exome targets (or baits) as rows",default="blank"), make_option(c("--target.list"),help="the list of targets we captured in sequencing",default="blank"), make_option(c("--script.dir"),help="where we can find the wesseg scripts - where you placed the checked out tool into",default="blank"), - make_option(c("--normal.sample.to.bam.file"),help="the file containing the mapping of the sample names (for normals) to bam files",default="blank"), - make_option(c("--tumor.sample.to.bam.file"),help="the file containing the mapping of the sample names (for tumors) to bam files",default="blank"), + make_option(c("--normal.sample.bams"),help="the file containing the mapping of the sample names (for normals) to bam files",default="blank"), + make_option(c("--tumor.sample.bams"),help="the file containing the mapping of the sample names (for tumors) to bam files",default="blank"), make_option(c("--output.location"),help="where to write output files to - the segmentation results plus any graphs",default="blank"), make_option(c("--tangent.database.location"),help="the directory of tangent planes to normalize against; this directory should contain only tangent planes",default="blank"), make_option(c("--output.tangent.database"),help="the directory where we put the output tangent data",default="blank"), make_option(c("--build"),help="are we running with hg18 and hg19",default="blank"), make_option(c("--analysis.set.name"),help="what was the name of the analysis set",default="blank"), - make_option(c("--bylane"),help="is the data coming in by lane? (if not it should be by sample)",default="blank"), - make_option(c("--parallel"),help="should we merge lanes to samples in parallel",default="blank"), make_option(c("--bait.factor"),help="the bait factor data file",default="blank"), make_option(c("--bam.file.listing"),help="the listing of bam files, by tumor and by normal",default="blank"), make_option(c("--signal.files"),help="the sample name to signal file",default="blank"), make_option(c("--use.histo.data"),help="should we use historical data",default="blank"), - make_option(c("--sex.calls"),help="a file mapping the id to the sex of each patient",default="blank"), make_option(c("--debug"),help="dump lots of debugging data to the /debug directory",default="blank") ) opt <- parse_args(OptionParser(option_list=option.list)) @@ -154,8 +151,11 @@ female.normals <- normal.data[,is.element(colnames(normal.data),sex.calls$sample male.calibrated <- calibrate.and.pi.tumors(male.tumors,male.normals) female.calibrated <- calibrate.and.pi.tumors(female.tumors,female.normals) +# put together the calibrated data +calibrated.tumors <- cbind(male.calibrated,female.calibrated) + # save off the data to a Rdata object for later -save.off.processed.data(log.normals,log.tumors,calibrated.tumors,baits,paste(tangent.database.output,build.version,sep="/"),analysis.set.name,build.version) +save.off.processed.data(log2(normal.data),log2(tumor.data),calibrated.tumors,baits,paste(tangent.database.output,build.version,sep="/"),analysis.set.name,build.version) # output the raw data and plots for each sample output.and.plot.data(calibrated.tumors,tumor.data,baits,output.location,signal.files) diff --git a/utils/CapSeg.scala b/utils/CapSeg.scala index c50363a..16a9e09 100755 --- a/utils/CapSeg.scala +++ b/utils/CapSeg.scala @@ -236,25 +236,22 @@ class CapSeg extends QScript { firehoseInport.close() // run the final R script which puts it all together - add(new PostProcessData(libraryDir, + add(new PostProcessData(samples, + libraryDir, normalSampleFile, tumorSampleFile, baitCSV, - true, - new File(outputDir.getAbsolutePath() + "/.cache"), - normalSampleToPGFile, - tumorSampleToPGFile, + normalBamToSampleFile, + normalBamToSampleFile, outputDir, tangentLocation, tangentOutputLocation, build, analysisSet, - perLane, finalNormalMatrixBF, signal, signalFile, - useHistData, - alleleOutput)) + useHistData)) } @@ -387,36 +384,31 @@ class SegmentSample(libraryDir: File, bamName: String, bamToSample: File, output // post process the data using the R script -class PostProcessData(libraryDir: File, normal_bait_coverage: File, tumor_bait_coverage: File, target_file: File, - useCachedData: Boolean, cachedLocation: File, sToRGFileNormals: File, sToRGFileTumors: File, - outputDir: File, tangentLocation: File, tangentOutputLocation: File, buildType: String, analysisSet: String, - byLane: Boolean, normalBaitFile: File, signalFLs: List[File], signalTSV: File, histoData: Boolean, acovFiles: List[File]) extends CommandLineFunction { - @Input(doc = "the normal bait output file") var normalFile = normal_bait_coverage - @Input(doc = "the tumor bait output file") var tumorFile = tumor_bait_coverage +class PostProcessData(indTable: File, libraryDir: File, normal_bams: File, tumor_bams: File, target_file: File, + normalBamListing: File, tumorBamListing: File, outputDir: File, tangentLocation: File, tangentOutputLocation: File, + buildType: String, analysisSet: String, + normalBaitFile: File, signalFLs: List[File], signalTSV: File, histoData: Boolean) extends CommandLineFunction { + @Input(doc = "the sample table containing individuals and their t/n bam files") var inds = indTable + @Input(doc = "the normal bait output file") var normalFile = normal_bams + @Input(doc = "the tumor bait output file") var tumorFile = tumor_bams @Input(doc = "the target list") var targetFile = target_file @Input(doc = "where can we find the WES segmentation algorithm") var libDir = libraryDir - @Input(doc = "the file containing the mapping of sample to read group (normals)") var normalSampleToReadGroupFile = sToRGFileNormals - @Input(doc = "the file containing the mapping of sample to read group (tumors)") var tumorSampleToReadGroupFile = sToRGFileTumors + @Input(doc = "the file containing the mapping of sample to bams (normals)") var normalBams = tumorBamListing + @Input(doc = "the file containing the mapping of sample to bams (tumors)") var tumorBams = normalBamListing @Input(doc = "the output location") var outputDirectory = outputDir @Input(doc = "tangent normalization database location") var tangentDatabase = tangentLocation @Input(doc = "tangent normalization database location") var tangentOutputDatabase = tangentOutputLocation @Input(doc = "signal tsv file") var signalTSVFile = signalTSV - @Input(doc = "coverage files from the allele balance pulldown") var coverageFiles = acovFiles @Output(doc = "the signal file outputs") var signalFiles = signalFLs - @Output(doc = "the cache location") var cacheLocation = cachedLocation @Argument(doc = "the build type, hg18 or hg19") var build = buildType @Argument(doc = "the analysis name") var analysisSetName = analysisSet - @Argument(doc = "are we running by lane?") var byLaneData = byLane @Argument(doc = "the normal bait factor file") var nbf = normalBaitFile @Argument(doc = "should we use historical data") var uhd = histoData - //@Output(doc="the sample information file") var sampleInfoFile = new File(outputDir.getAbsolutePath() + "/sampleInformation.txt") - - @Argument(doc = "should we use the cache file") var useCache = useCachedData - memoryLimit = Some(16) // change me - def commandLine = "Rscript %s/R/tangent_normalize.R --normal.lane.data %s --tumor.lane.data %s --target.list %s --use.cache %s --cache.location %s --script.dir %s --normal.sample.to.lanes.file %s --tumor.sample.to.lanes.file %s --output.location %s --tangent.database.location %s --output.tangent.database %s --build %s --analysis.set.name %s --bylane %s --bait.factor %s --signal.files %s --histo.data %s".format(libDir.getAbsolutePath(), normalFile.getAbsolutePath(), tumorFile.getAbsolutePath(), targetFile.getAbsolutePath(), useCache, cacheLocation.getAbsolutePath(), libDir.getAbsolutePath(), normalSampleToReadGroupFile.getAbsolutePath(), tumorSampleToReadGroupFile.getAbsolutePath(), outputDirectory.getAbsolutePath(), tangentDatabase.getAbsolutePath(), tangentOutputDatabase.getAbsolutePath(), build, analysisSetName,byLaneData, nbf.getAbsolutePath(), signalTSVFile.getAbsolutePath(), uhd) + memoryLimit = Some(8) + def commandLine = "Rscript %s/R/tangent_normalize.R --sample.table %s --normal.lane.data %s --tumor.lane.data %s --target.list %s --script.dir %s --normal.sample.bams %s --tumor.sample.bams %s --output.location %s --tangent.database.location %s --output.tangent.database %s --build %s --analysis.set.name %s --bait.factor %s --signal.files %s --histo.data %s".format(libDir.getAbsolutePath(), indTable.getAbsolutePath(), normalFile.getAbsolutePath(), tumorFile.getAbsolutePath(), targetFile.getAbsolutePath(), libDir.getAbsolutePath(), normalBams.getAbsolutePath(), tumorBams.getAbsolutePath(), outputDirectory.getAbsolutePath(), tangentDatabase.getAbsolutePath(), tangentOutputDatabase.getAbsolutePath(), build, analysisSetName, nbf.getAbsolutePath(), signalTSVFile.getAbsolutePath(), uhd) } // correct the output files for any dups lines and extra headers