Skip to content

Commit

Permalink
Tidying up work over summer (#34)
Browse files Browse the repository at this point in the history
Signed-off-by: Zachary Heins <zackheins@gmail.com>
  • Loading branch information
zheins authored and ao508 committed Aug 16, 2019
1 parent 0eaf558 commit e7b0d87
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 25 deletions.
24 changes: 23 additions & 1 deletion src/main/java/org/cbio/gdcpipeline/reader/CnaReader.java
Expand Up @@ -31,6 +31,8 @@ public class CnaReader implements ItemStreamReader<CnaRecord> {
private Map<String, String> gdcAliquotIdToSampleId;

private List<CnaRecord> cnaRecords = new ArrayList<>();
private Set<String> samplesSeen = new HashSet<>();

private static Log LOG = LogFactory.getLog(CnaReader.class);

@Override
Expand Down Expand Up @@ -80,22 +82,39 @@ private void readFile(File cnaFile, ExecutionContext executionContext) throws Ex
List<String> headerList = new ArrayList<>(Arrays.asList(header.split("\t")));
headerList.replaceAll(String::toUpperCase);

// Keep these in order
int geneIdIndex = headerList.indexOf("GENE ID");
if (geneIdIndex != -1) {
headerList.remove(geneIdIndex);
}
int geneSymbolIndex = headerList.indexOf("GENE SYMBOL");
if (geneSymbolIndex != -1) {
headerList.remove(geneSymbolIndex);
}
headerList.add(geneSymbolIndex, "Hugo_Symbol");
int cytobandIndex = headerList.indexOf("CYTOBAND");
if (cytobandIndex != -1) {
headerList.remove(cytobandIndex);
}

List<Integer> indicesToSkip = new ArrayList<>();

// Replace aliquot ids with sample ids
for (int i = 0; i < headerList.size(); i++) {
if (gdcAliquotIdToSampleId.containsKey(headerList.get(i))) {
headerList.set(i, gdcAliquotIdToSampleId.get(headerList.get(i)));
String sid = gdcAliquotIdToSampleId.get(headerList.get(i));
if (samplesSeen.contains(sid)) {
indicesToSkip.add(0,i);
}
headerList.set(i, sid);
samplesSeen.add(sid);
}
}

for (Integer indexToSkip : indicesToSkip) {
headerList.remove(indexToSkip);
}

executionContext.put("cnaHeader", headerList);
while ((line = br.readLine()) != null) {
List<String> fields = new ArrayList<>(Arrays.asList(line.split("\t")));
Expand All @@ -105,6 +124,9 @@ private void readFile(File cnaFile, ExecutionContext executionContext) throws Ex
if (cytobandIndex != -1) {
fields.remove(cytobandIndex);
}
for (Integer indexToSkip : indicesToSkip) {
fields.remove(indexToSkip);
}
// Assuming that only ensembl id is present for the gene symbol
cnaRecords.add(new CnaRecord(fields.get(0), fields.subList(1, fields.size())));
}
Expand Down
36 changes: 19 additions & 17 deletions src/main/java/org/cbio/gdcpipeline/reader/ExpressionReader.java
Expand Up @@ -39,28 +39,29 @@ public class ExpressionReader implements ItemStreamReader<String> {

@Value("#{jobParameters[sourceDirectory]}")
private String sourceDir;

@Value("#{jobExecutionContext[gdcManifestData]}")
private List<ManifestFileData> gdcManifestData;

@Value("#{jobExecutionContext[gdcAliquotIdToSampleId]}")
private Map<String, String> gdcAliquotIdToSampleId;

@Value("#{jobExecutionContext[gdcIdToSampleId]}")
private Map<String, String> gdcIdToSampleId;

@Value("#{jobExecutionContext[gdcUUIDToSampleId]}")
private Map<String, String> gdcUUIDToSampleId;
private Map<String, String> gdcUUIDToSampleId;

@Autowired
GenomeNexusCache genomeNexusCache;

private List<String> expressionRecords = new ArrayList<>();
private MultiKeyMap expressionMap = new MultiKeyMap();
private Set<String> genes = new HashSet<>();
private Set<String> samplesSeen = new HashSet();
private List<String> sampleIds = new ArrayList<>();
private static Log LOG = LogFactory.getLog(ExpressionReader.class);
private static Log LOG = LogFactory.getLog(ExpressionReader.class);

private static final Pattern FILENAME_PATTERN = Pattern.compile("(\\S+)\\.htseq\\.counts.*");

@Override
Expand All @@ -77,21 +78,22 @@ public void open(ExecutionContext executionContext) throws ItemStreamException {
LOG.error("Failed to extract file");
throw new ItemStreamException("Failed to process file");
}

try {
String sampleId = fileData.getSampleIds().get(0);
if (gdcAliquotIdToSampleId.containsKey(sampleId.toUpperCase())) {
if (gdcAliquotIdToSampleId.containsKey(sampleId.toUpperCase()) && !samplesSeen.contains(sampleId)) {
readFile(expressionFile, executionContext, gdcAliquotIdToSampleId.get(sampleId));
}
else if (gdcIdToSampleId.containsKey(sampleId.toUpperCase())) {
else if (gdcIdToSampleId.containsKey(sampleId.toUpperCase()) && !samplesSeen.contains(sampleId)) {
readFile(expressionFile, executionContext, gdcIdToSampleId.get(sampleId.toUpperCase()));
}
else if (gdcUUIDToSampleId.containsKey(sampleId.toUpperCase())) {
else if (gdcUUIDToSampleId.containsKey(sampleId.toUpperCase()) && !samplesSeen.contains(sampleId)) {
readFile(expressionFile, executionContext, gdcUUIDToSampleId.get(sampleId.toUpperCase()));
}
}
else {
LOG.error("Could not find id from filename " + fileData.getFilename());
}
samplesSeen.add(sampleId);
}
catch (Exception e) {
LOG.error("Failed to read file " + fileData.getFilename());
Expand All @@ -104,7 +106,7 @@ else if (gdcUUIDToSampleId.containsKey(sampleId.toUpperCase())) {
for (String sampleId : sampleIds) {
headerList.add(sampleId);
}
executionContext.put("expressionHeader", headerList);
executionContext.put("expressionHeader", headerList);
}

@Override
Expand All @@ -118,9 +120,9 @@ public String read() throws Exception, UnexpectedInputException, ParseException,
if (!expressionRecords.isEmpty()) {
return expressionRecords.remove(0);
}
return null;
return null;
}

private void readFile(File expressionFile, ExecutionContext e, String sampleId) throws Exception {
//TODO: There should be a cache for gene id mappings
BufferedReader br = new BufferedReader(new FileReader(expressionFile));
Expand All @@ -136,11 +138,11 @@ private void readFile(File expressionFile, ExecutionContext e, String sampleId)
//genes.add(geneId);
if (!sampleIds.contains(sampleId)) {
sampleIds.add(sampleId);
}
}
expressionMap.put(geneId, sampleId, value);
}
}

private void generateExpressionRecords() {
for (String gene : genes) {
String expressionRecord = gene;
Expand Down
8 changes: 4 additions & 4 deletions src/main/java/org/cbio/gdcpipeline/writer/ClinicalWriter.java
Expand Up @@ -102,10 +102,10 @@ private FlatFileHeaderCallback clinicalDataHeader(ClinicalDataModel data) {
public void writeHeader(Writer writer) throws IOException {
Map<String, List<String>> headers = data.getHeaders();
StringBuilder sb = new StringBuilder();
sb.append(StringUtils.join(headers.get("displayNames"), '\t')).append("\n");
sb.append(StringUtils.join(headers.get("description"), '\t')).append("\n");
sb.append(StringUtils.join(headers.get("datatype"), '\t')).append("\n");
sb.append(StringUtils.join(headers.get("priority"), '\t')).append("\n");
sb.append("#").append(StringUtils.join(headers.get("displayNames"), '\t')).append("\n");
sb.append("#").append(StringUtils.join(headers.get("description"), '\t')).append("\n");
sb.append("#").append(StringUtils.join(headers.get("datatype"), '\t')).append("\n");
sb.append("#").append(StringUtils.join(headers.get("priority"), '\t')).append("\n");
List<String> list = headers.get("headers");
for (int i = 0; i < list.size(); i++) {
list.set(i, list.get(i).toUpperCase());
Expand Down
6 changes: 3 additions & 3 deletions src/main/resources/application.properties
Expand Up @@ -15,8 +15,8 @@ clinical.metadata.sample.file=meta_clinical_sample.txt

#################################### Mutation ###################################################
mutation.data.file.prefix=data_
mutation.metadata.file=meta_mutation_extended.txt
mutation.default.merged.maf.file=data_mutation_extended.maf
mutation.metadata.file=meta_mutations_extended.txt
mutation.default.merged.maf.file=data_mutations_extended.txt

#################################### Genome Nexus ###################################################
genomenexus.base=http://annotation.genomenexus.org/
Expand All @@ -31,7 +31,7 @@ cna.geneId.field=geneId

#################################### Expression ###################################################
expression.data.file=data_expression.txt
expression.metadata.file=expression_cna.txt
expression.metadata.file=meta_expression.txt

#################################### Test #######################################################
test.cancer.study.id=TCGA_BRCA

0 comments on commit e7b0d87

Please sign in to comment.