From e7b0d87c7f795ac20ded697b929d6a0506a4a121 Mon Sep 17 00:00:00 2001 From: Zachary Heins Date: Fri, 16 Aug 2019 09:58:48 -0400 Subject: [PATCH] Tidying up work over summer (#34) Signed-off-by: Zachary Heins --- .../cbio/gdcpipeline/reader/CnaReader.java | 24 ++++++++++++- .../gdcpipeline/reader/ExpressionReader.java | 36 ++++++++++--------- .../gdcpipeline/writer/ClinicalWriter.java | 8 ++--- src/main/resources/application.properties | 6 ++-- 4 files changed, 49 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/cbio/gdcpipeline/reader/CnaReader.java b/src/main/java/org/cbio/gdcpipeline/reader/CnaReader.java index a92c82d..8102a41 100644 --- a/src/main/java/org/cbio/gdcpipeline/reader/CnaReader.java +++ b/src/main/java/org/cbio/gdcpipeline/reader/CnaReader.java @@ -31,6 +31,8 @@ public class CnaReader implements ItemStreamReader { private Map gdcAliquotIdToSampleId; private List cnaRecords = new ArrayList<>(); + private Set samplesSeen = new HashSet<>(); + private static Log LOG = LogFactory.getLog(CnaReader.class); @Override @@ -80,22 +82,39 @@ private void readFile(File cnaFile, ExecutionContext executionContext) throws Ex List headerList = new ArrayList<>(Arrays.asList(header.split("\t"))); headerList.replaceAll(String::toUpperCase); + // Keep these in order int geneIdIndex = headerList.indexOf("GENE ID"); if (geneIdIndex != -1) { headerList.remove(geneIdIndex); } + int geneSymbolIndex = headerList.indexOf("GENE SYMBOL"); + if (geneSymbolIndex != -1) { + headerList.remove(geneSymbolIndex); + } + headerList.add(geneSymbolIndex, "Hugo_Symbol"); int cytobandIndex = headerList.indexOf("CYTOBAND"); if (cytobandIndex != -1) { headerList.remove(cytobandIndex); } + List indicesToSkip = new ArrayList<>(); + // Replace case ids with sample ids for (int i = 0; i < headerList.size(); i++) { if (gdcAliquotIdToSampleId.containsKey(headerList.get(i))) { - headerList.set(i, gdcAliquotIdToSampleId.get(headerList.get(i))); + String sid = gdcAliquotIdToSampleId.get(headerList.get(i)); + if (samplesSeen.contains(sid)) { + indicesToSkip.add(0,i); + } + headerList.set(i, sid); + samplesSeen.add(sid); } } + for (Integer indexToSkip : indicesToSkip) { + headerList.remove(indexToSkip); + } + executionContext.put("cnaHeader", headerList); while ((line = br.readLine()) != null) { List fields = new ArrayList<>(Arrays.asList(line.split("\t"))); @@ -105,6 +124,9 @@ private void readFile(File cnaFile, ExecutionContext executionContext) throws Ex if (cytobandIndex != -1) { fields.remove(cytobandIndex); } + for (Integer indexToSkip : indicesToSkip) { + fields.remove(indexToSkip); + } // Assuming that only ensembl id is present for the gene symbol cnaRecords.add(new CnaRecord(fields.get(0), fields.subList(1, fields.size()))); } diff --git a/src/main/java/org/cbio/gdcpipeline/reader/ExpressionReader.java b/src/main/java/org/cbio/gdcpipeline/reader/ExpressionReader.java index a0e3c24..478b615 100644 --- a/src/main/java/org/cbio/gdcpipeline/reader/ExpressionReader.java +++ b/src/main/java/org/cbio/gdcpipeline/reader/ExpressionReader.java @@ -39,28 +39,29 @@ public class ExpressionReader implements ItemStreamReader { @Value("#{jobParameters[sourceDirectory]}") private String sourceDir; - + @Value("#{jobExecutionContext[gdcManifestData]}") private List gdcManifestData; - + @Value("#{jobExecutionContext[gdcAliquotIdToSampleId]}") private Map gdcAliquotIdToSampleId; - + @Value("#{jobExecutionContext[gdcIdToSampleId]}") private Map gdcIdToSampleId; @Value("#{jobExecutionContext[gdcUUIDToSampleId]}") - private Map gdcUUIDToSampleId; - + private Map gdcUUIDToSampleId; + @Autowired GenomeNexusCache genomeNexusCache; private List expressionRecords = new ArrayList<>(); private MultiKeyMap expressionMap = new MultiKeyMap(); private Set genes = new HashSet<>(); + private Set samplesSeen = new HashSet(); private List sampleIds = new ArrayList<>(); - private static Log LOG = LogFactory.getLog(ExpressionReader.class); - + private static Log LOG = LogFactory.getLog(ExpressionReader.class); + private static final Pattern FILENAME_PATTERN = Pattern.compile("(\\S+)\\.htseq\\.counts.*"); @Override @@ -77,21 +78,22 @@ public void open(ExecutionContext executionContext) throws ItemStreamException { LOG.error("Failed to extract file"); throw new ItemStreamException("Failed to process file"); } - + try { String sampleId = fileData.getSampleIds().get(0); - if (gdcAliquotIdToSampleId.containsKey(sampleId.toUpperCase())) { + if (gdcAliquotIdToSampleId.containsKey(sampleId.toUpperCase()) && !samplesSeen.contains(sampleId)) { readFile(expressionFile, executionContext, gdcAliquotIdToSampleId.get(sampleId)); } - else if (gdcIdToSampleId.containsKey(sampleId.toUpperCase())) { + else if (gdcIdToSampleId.containsKey(sampleId.toUpperCase()) && !samplesSeen.contains(sampleId)) { readFile(expressionFile, executionContext, gdcIdToSampleId.get(sampleId.toUpperCase())); } - else if (gdcUUIDToSampleId.containsKey(sampleId.toUpperCase())) { + else if (gdcUUIDToSampleId.containsKey(sampleId.toUpperCase()) && !samplesSeen.contains(sampleId)) { readFile(expressionFile, executionContext, gdcUUIDToSampleId.get(sampleId.toUpperCase())); - } + } else { LOG.error("Could not find id from filename " + fileData.getFilename()); } + samplesSeen.add(sampleId); } catch (Exception e) { LOG.error("Failed to read file " + fileData.getFilename()); @@ -104,7 +106,7 @@ else if (gdcUUIDToSampleId.containsKey(sampleId.toUpperCase())) { for (String sampleId : sampleIds) { headerList.add(sampleId); } - executionContext.put("expressionHeader", headerList); + executionContext.put("expressionHeader", headerList); } @Override @@ -118,9 +120,9 @@ public String read() throws Exception, UnexpectedInputException, ParseException, if (!expressionRecords.isEmpty()) { return expressionRecords.remove(0); } - return null; + return null; } - + private void readFile(File expressionFile, ExecutionContext e, String sampleId) throws Exception { //TODO: There should be a cache for gene id mappings BufferedReader br = new BufferedReader(new FileReader(expressionFile)); @@ -136,11 +138,11 @@ private void readFile(File expressionFile, ExecutionContext e, String sampleId) //genes.add(geneId); if (!sampleIds.contains(sampleId)) { sampleIds.add(sampleId); - } + } expressionMap.put(geneId, sampleId, value); } } - + private void generateExpressionRecords() { for (String gene : genes) { String expressionRecord = gene; diff --git a/src/main/java/org/cbio/gdcpipeline/writer/ClinicalWriter.java b/src/main/java/org/cbio/gdcpipeline/writer/ClinicalWriter.java index fddb046..316da81 100644 --- a/src/main/java/org/cbio/gdcpipeline/writer/ClinicalWriter.java +++ b/src/main/java/org/cbio/gdcpipeline/writer/ClinicalWriter.java @@ -102,10 +102,10 @@ private FlatFileHeaderCallback clinicalDataHeader(ClinicalDataModel data) { public void writeHeader(Writer writer) throws IOException { Map> headers = data.getHeaders(); StringBuilder sb = new StringBuilder(); - sb.append(StringUtils.join(headers.get("displayNames"), '\t')).append("\n"); - sb.append(StringUtils.join(headers.get("description"), '\t')).append("\n"); - sb.append(StringUtils.join(headers.get("datatype"), '\t')).append("\n"); - sb.append(StringUtils.join(headers.get("priority"), '\t')).append("\n"); + sb.append("#").append(StringUtils.join(headers.get("displayNames"), '\t')).append("\n"); + sb.append("#").append(StringUtils.join(headers.get("description"), '\t')).append("\n"); + sb.append("#").append(StringUtils.join(headers.get("datatype"), '\t')).append("\n"); + sb.append("#").append(StringUtils.join(headers.get("priority"), '\t')).append("\n"); List list = headers.get("headers"); for (int i = 0; i < list.size(); i++) { list.set(i, list.get(i).toUpperCase()); diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index a8c7b26..8691ff5 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -15,8 +15,8 @@ clinical.metadata.sample.file=meta_clinical_sample.txt #################################### Mutation ################################################### mutation.data.file.prefix=data_ -mutation.metadata.file=meta_mutation_extended.txt -mutation.default.merged.maf.file=data_mutation_extended.maf +mutation.metadata.file=meta_mutations_extended.txt +mutation.default.merged.maf.file=data_mutations_extended.txt #################################### Genome Nexus ################################################### genomenexus.base=http://annotation.genomenexus.org/ @@ -31,7 +31,7 @@ cna.geneId.field=geneId #################################### CNA ################################################### expression.data.file=data_expression.txt -expression.metadata.file=expression_cna.txt +expression.metadata.file=meta_expression.txt #################################### Test ####################################################### test.cancer.study.id=TCGA_BRCA