Merge pull request #341 from bento-platform/features/cbio_export_work…

…flow Features/cbio export workflow
bento-platform · Nov 5, 2022 · 31754ae · 31754ae
2 parents f2bae33 + ccb5d09
commit 31754ae
Show file tree

Hide file tree

Showing 16 changed files with 992 additions and 55 deletions.
diff --git a/chord_metadata_service/chord/data_types.py b/chord_metadata_service/chord/data_types.py
@@ -14,6 +14,7 @@
 DATA_TYPE_PHENOPACKET = "phenopacket"
 DATA_TYPE_MCODEPACKET = "mcodepacket"
 DATA_TYPE_READSET = "readset"
+DATA_TYPE_EXPERIMENT_RESULT = "experiment_result"
 
 DATA_TYPES = {
     DATA_TYPE_EXPERIMENT: {
@@ -41,5 +42,11 @@
         "metadata_schema": {
             "type": "object"
         }
+    },
+    DATA_TYPE_EXPERIMENT_RESULT: {
+        "schema": EXPERIMENT_RESULT_SCHEMA,
+        "metadata_schema": {
+            "type": "object"
+        }
     }
 }
diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py
@@ -1,13 +1,16 @@
 import logging
 import csv
 from typing import TextIO, Callable
+import re
+
 from django.db.models import F
 
 from .export_utils import ExportError
 
 from chord_metadata_service.chord.models import Dataset
 from chord_metadata_service.patients.models import Individual
 from chord_metadata_service.phenopackets import models as pm
+from chord_metadata_service.experiments.models import ExperimentResult
 
 __all__ = [
     "study_export",
@@ -21,18 +24,37 @@
 SAMPLE_META_FILENAME = "meta_clinical_sample.txt"
 PATIENT_DATA_FILENAME = "data_clinical_patient.txt"
 PATIENT_META_FILENAME = "meta_clinical_patient.txt"
+MUTATION_META_FILENAME = "meta_mutation.txt"
+MUTATION_DATA_FILENAME = "data_mutations_extended.txt"   # is generated during workflow from files coming from DRS
+MAF_LIST_FILENAME = 'maf_list.txt'  # Accessory file
+CASE_LIST_SEQUENCED = "case_lists/case_list_sequenced.txt"  # in a subfolder
+
 
 CBIO_FILES_SET = frozenset({
     STUDY_FILENAME,
     SAMPLE_DATA_FILENAME,
     SAMPLE_META_FILENAME,
     PATIENT_DATA_FILENAME,
-    PATIENT_META_FILENAME
+    PATIENT_META_FILENAME,
+    MUTATION_META_FILENAME,
+    # MUTATION_DATA_FILENAME is not part of the files generated here
+    CASE_LIST_SEQUENCED
 })
 
 PATIENT_DATATYPE = 'PATIENT'
 SAMPLE_DATATYPE = 'SAMPLE'
 
+# [     List
+#   ^   not in...
+#   a-z lowercase letter
+#   A-Z uppercase
+#   0-9 digit
+#   _   underscore
+#   \.  dot
+#   \-  hyphen
+# ]     Closing list
+REGEXP_INVALID_FOR_ID = re.compile(r"[^a-zA-Z0-9_\.\-]")
+
 
 def study_export(getPath: Callable[[str], str], dataset_id: str):
     """Export a given Project as a cBioPortal study"""
@@ -43,27 +65,40 @@ def study_export(getPath: Callable[[str], str], dataset_id: str):
     cbio_study_id = str(dataset.identifier)
 
     # Export study file
-    with open(getPath(STUDY_FILENAME), 'w') as file_study:
+    with open(getPath(STUDY_FILENAME), 'w', newline='\n') as file_study:
         study_export_meta(dataset, file_study)
 
     # Export patients.
-    with open(getPath(PATIENT_DATA_FILENAME), 'w') as file_patient:
+    with open(getPath(PATIENT_DATA_FILENAME), 'w', newline='\n') as file_patient:
         # Note: plural in `phenopackets` is intentional (related_name property in model)
         indiv = Individual.objects.filter(phenopackets__table__ownership_record__dataset_id=dataset.identifier)
         individual_export(indiv, file_patient)
 
-    with open(getPath(PATIENT_META_FILENAME), 'w') as file_patient_meta:
+    with open(getPath(PATIENT_META_FILENAME), 'w', newline='\n') as file_patient_meta:
         clinical_meta_export(cbio_study_id, PATIENT_DATATYPE, file_patient_meta)
 
     # Export samples
-    with open(getPath(SAMPLE_DATA_FILENAME), 'w') as file_sample:
+    with open(getPath(SAMPLE_DATA_FILENAME), 'w', newline='\n') as file_sample:
         sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier)\
             .annotate(phenopacket_subject_id=F("phenopacket__subject"))
         sample_export(sampl, file_sample)
 
-    with open(getPath(SAMPLE_META_FILENAME), 'w') as file_sample_meta:
+    with open(getPath(SAMPLE_META_FILENAME), 'w', newline='\n') as file_sample_meta:
         clinical_meta_export(cbio_study_id, SAMPLE_DATATYPE, file_sample_meta)
 
+    # .maf files stored
+    with open(getPath(MAF_LIST_FILENAME), 'w', newline='\n') as file_maf_list,\
+         open(getPath(CASE_LIST_SEQUENCED), 'w', newline='\n') as file_case_list:
+        exp_res = ExperimentResult.objects.filter(experiment__table__ownership_record__dataset_id=dataset.identifier) \
+            .filter(file_format='MAF') \
+            .annotate(biosample_id=F("experiment__biosample"))
+
+        maf_list(exp_res, file_maf_list)
+        case_list_export(cbio_study_id, exp_res, file_case_list)
+
+    with open(getPath(MUTATION_META_FILENAME), 'w', newline='\n') as file_mutation_meta:
+        mutation_meta_export(cbio_study_id, file_mutation_meta)
+
 
 def study_export_meta(dataset: Dataset, file_handle: TextIO):
     """
@@ -78,9 +113,9 @@ def study_export_meta(dataset: Dataset, file_handle: TextIO):
     # optional fields
     if len(dataset.primary_publications):
         lines['citation'] = dataset.primary_publications[0]
-    # pmid: unvailable
+    # pmid: unavailable
     # groups: unused for authentication
-    # add_global_case_list: ?
+    lines['add_global_case_list'] = 'true'  # otherwise causes an error at validation
     # tags_file: ?
     # reference_genome: ?
 
@@ -127,7 +162,7 @@ def individual_export(results, file_handle: TextIO):
     individuals = []
     for individual in results:
         ind_obj = {
-            'id': individual.id,
+            'id': sanitize_id(individual.id),
             'sex': individual.sex,
         }
         individuals.append(ind_obj)
@@ -136,7 +171,7 @@ def individual_export(results, file_handle: TextIO):
     headers = individual_to_patient_header(columns)
 
     file_handle.writelines([line + '\n' for line in headers])
-    dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t')
+    dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t', lineterminator='\n')
     dict_writer.writerows(individuals)
 
 
@@ -189,8 +224,8 @@ def sample_export(results, file_handle: TextIO):
             continue
 
         sample_obj = {
-            'individual_id': subject_id,
-            'id': sample.id
+            'individual_id': sanitize_id(subject_id),
+            'id': sanitize_id(sample.id)
         }
         if sample.sampled_tissue:
             sample_obj['tissue_label'] = sample.sampled_tissue.get('label', '')
@@ -201,10 +236,82 @@ def sample_export(results, file_handle: TextIO):
     headers = biosample_to_sample_header(columns)
 
     file_handle.writelines([line + '\n' for line in headers])
-    dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t')
+    dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t', lineterminator='\n')
     dict_writer.writerows(samples)
 
 
+def maf_list(results, file_handle: TextIO):
+    """
+    List of maf files associated with this dataset.
+    """
+    maf_uri = [experiment.extra_properties['uri'] + '\n' for experiment in results]
+    file_handle.writelines(maf_uri)
+
+
+def mutation_meta_export(study_id: str, file_handle: TextIO):
+    """
+    Mutation data, metadata file generation
+
+    specifications:
+    cancer_study_identifier: same value as specified in study meta file
+    genetic_alteration_type: MUTATION_EXTENDED
+    datatype: MAF
+    stable_id: mutations
+    show_profile_in_analysis_tab: true
+    profile_name: A name for the mutation data, e.g., "Mutations".
+    profile_description: A description of the mutation data, e.g., "Mutation data from whole exome sequencing.".
+    data_filename: your data file
+    gene_panel (optional): gene panel stable id. See Gene panels for mutation data.
+    swissprot_identifier (optional): accession or name, indicating the type of identifier in the SWISSPROT column
+    variant_classification_filter (optional): List of Variant_Classifications values to be filtered out.
+    namespaces (optional): Comma-delimited list of namespaces to import.
+    """
+    lines = dict()
+    lines['cancer_study_identifier'] = study_id
+    lines['genetic_alteration_type'] = 'MUTATION_EXTENDED'
+    lines['datatype'] = 'MAF'
+    lines['stable_id'] = 'mutations'
+    lines['show_profile_in_analysis_tab'] = 'true'
+    lines['profile_name'] = 'Mutations'
+    lines['profile_description'] = 'Mutation data from whole exome sequencing'  # TODO: extract from experiments
+    lines['data_filename'] = MUTATION_DATA_FILENAME
+    lines['swissprot_identifier'] = 'name'
+
+    for field, value in lines.items():
+        file_handle.write(f"{field}: {value}\n")
+
+
+def case_list_export(study_id: str, results, file_handle: TextIO):
+    """
+    Case list. For now, sequenced data only.
+
+    Specifications:
+    cancer_study_identifier: same value as specified in study meta file
+    stable_id: it must contain the cancer_study_identifier followed by an underscore.
+        Typically, after this a relevant suffix, e.g., _custom, is added.
+        There are some naming rules to follow if you want the case list to be
+        selected automatically in the query UI base on the selected sample
+        profiles. See subsection below.
+    case_list_name: A name for the patient list, e.g., "All Tumors".
+    case_list_description: A description of the patient list, e.g.,
+        "All tumor samples (825 samples).".
+    case_list_ids: A tab-delimited list of sample ids from the dataset.
+    case_list_category: Optional alternative way of linking your case list to a
+        specific molecular profile. E.g. setting this to all_cases_with_cna_data
+        will signal to the portal that this is the list of samples to be associated
+        with CNA data in some of the analysis.
+    """
+    lines = dict()
+    lines['cancer_study_identifier'] = study_id
+    lines['stable_id'] = f'{study_id}_sequenced'
+    lines['case_list_name'] = 'All samples'
+    lines['case_list_description'] = 'All samples'
+    lines['case_list_ids'] = '\t'.join([sanitize_id(exp_res.biosample_id) for exp_res in results])
+
+    for field, value in lines.items():
+        file_handle.write(f"{field}: {value}\n")
+
+
 class CbioportalClinicalHeaderGenerator():
     """
     Generates cBioPortal data files headers based on field names from katsu models.
@@ -284,3 +391,15 @@ def biosample_to_sample_header(fields: list):
 
     cbio_header = CbioportalClinicalHeaderGenerator(fields_mapping)
     return cbio_header.make_header(fields)
+
+
+def sanitize_id(id: str):
+    """Ensure IDs are compatible with cBioPortal specifications
+
+    IDs must contain alphanumeric characters, dot, hyphen or underscore
+    """
+    id = str(id)    # force casting to string
+    if REGEXP_INVALID_FOR_ID.search(id) is None:
+        return id
+
+    return REGEXP_INVALID_FOR_ID.sub('_', id)
diff --git a/chord_metadata_service/chord/export_utils.py b/chord_metadata_service/chord/export_utils.py
@@ -82,7 +82,14 @@ def get_path(self, filename: str = ''):
         Attributes:
             filename: optional filename to use
         """
-        return os.path.join(self.path, filename)
+        path = os.path.join(self.path, filename)
+
+        # if filename contains a subdirectory, ensure it is created
+        dirpath = os.path.dirname(path)
+        if not os.path.exists(dirpath):
+            os.makedirs(dirpath, 0o777)
+
+        return path
 
     def write_tar(self):
         """Creates a tar gzipped archive from the export directory content