Merge fb7ef21 into 456f3e9

biocore · Apr 17, 2024 · f1fb9de · f1fb9de
2 parents 456f3e9 + fb7ef21
commit f1fb9de
Show file tree

Hide file tree

Showing 6 changed files with 730 additions and 72 deletions.
diff --git a/metapool/__init__.py b/metapool/__init__.py
@@ -4,7 +4,8 @@
                    demux_pre_prep, pre_prep_needs_demuxing)
 from .sample_sheet import (sample_sheet_to_dataframe, make_sample_sheet,
                            AmpliconSampleSheet, MetagenomicSampleSheetv90,
-                           MetagenomicSampleSheetv100, AbsQuantSampleSheetv10,
+                           MetagenomicSampleSheetv100,
+                           MetagenomicSampleSheetv101, AbsQuantSampleSheetv10,
                            MetatranscriptomicSampleSheetv0, demux_sample_sheet,
                            sheet_needs_demuxing, KLSampleSheet,
                            load_sample_sheet, MetatranscriptomicSampleSheetv10)
@@ -29,7 +30,8 @@
            'requires_dilution', 'run_counts', 'sample_sheet_to_dataframe',
            'sheet_needs_demuxing', 'sum_lanes', 'validate_plate_metadata',
            'MetagenomicSampleSheetv90', 'MetagenomicSampleSheetv100',
-           'AmpliconSampleSheet', 'MetatranscriptomicSampleSheetv0',
+           'MetagenomicSampleSheetv101', 'AmpliconSampleSheet',
+           'MetatranscriptomicSampleSheetv0',
            'MetatranscriptomicSampleSheetv10', 'AbsQuantSampleSheetv10',
            # KLSampleSheet is needed for instance() calls.
            'KLSampleSheet', 'load_sample_sheet']

diff --git a/metapool/sample_sheet.py b/metapool/sample_sheet.py
@@ -308,7 +308,7 @@ def write_blank_lines(writer, n=blank_lines, width=csv_width):
                 if section is not None:
                     # these sections are represented as DataFrame objects
                     writer.writerow(pad_iterable(section.columns.tolist(),
-                                    csv_width))
+                                                 csv_width))
 
                     for _, row in section.iterrows():
                         writer.writerow(pad_iterable(row.values.tolist(),
@@ -377,73 +377,61 @@ def merge(self, sheets):
                 else:
                     pass
 
-    def _remap_table(self, table, strict=True):
-        if self.remapper is None:
-            raise ValueError("sample-sheet does not contain a valid Assay"
-                             " type.")
-
-        # Well_description column is now defined here as the concatenation
-        # of the following columns. If the column existed previously it will
-        # be overwritten, otherwise it will be created here. Alternate versions
-        # of the column name have already been resolved at this point.
-
-        # Note that the amplicon notebook currently generates the same values
-        # for this column. If the functionality in the notebook changes, the
-        # output will continue to be redfined with the current values here.
-        well_description = table['Project Plate'].astype(str) + "." + table[
-            'Sample'].astype(str) + "." + table['Well'].astype(str)
+    def _remap_table(self, table, strict):
+        result = table.copy(deep=True)
 
         if strict:
-            # legacy operation. All columns not defined in remapper will be
-            # filtered out.
-            out = table[self.remapper.keys()].copy()
-            out.rename(self.remapper, axis=1, inplace=True)
+            # All columns not defined in remapper will be filtered out.
+            result = table[self.remapper.keys()].copy()
+            result.rename(self.remapper, axis=1, inplace=True)
         else:
-            out = table.copy(deep=True)
-
             # if a column named 'index' is present in table, assume it is a
             # numeric index and not a sequence of bases, which is required in
             # the output. Assume the column that will become 'index' is
             # defined in remapper.
-            if 'index' in set(out.columns):
-                out.drop(columns=['index'], inplace=True)
-
-            # if an alternate form of a column name defined in
-            # _KL_SAMPLE_SHEET_COLUMN_ALTS is found in table, assume it should
-            # be renamed to its proper form and be included in the output e.g.:
-            # 'sample_plate' -> 'Sample_Plate'.
+            if 'index' in set(result.columns):
+                result.drop(columns=['index'], inplace=True)
 
-            # assume keys in _KL_SAMPLE_SHEET_COLUMN_ALTS do not overlap w/
-            # remapper (they currently do not). Define the full set of
-            # potential columns to rename in table.
-
-            # new syntax in 3.9 allows us to merge two dicts together w/OR.
             remapper = KLSampleSheet.column_alts | self.remapper
-            out.rename(remapper, axis=1, inplace=True)
+            result.rename(remapper, axis=1, inplace=True)
 
-            # out may contain additional columns that aren't allowed in the
+            # result may contain additional columns that aren't allowed in the
             # [Data] section of a sample-sheet e.g.: 'Extraction Kit Lot'.
-            # There may also be required columns that aren't defined in out.
-            subset = list(
-                set(self.data_columns) & set(
-                    out.columns))
+            # There may also be required columns that aren't defined in result.
 
-            out = out[subset]
+            # once all columns have been renamed to their preferred names, we
+            # must determine the proper set of column names for this sample-
+            # sheet. For legacy classes this is simply the list of columns
+            # defined in each sample-sheet version. For newer classes, this is
+            # defined at run-time and requires examining the metadata that
+            # will define the [Data] section.
+            required_columns = self._get_expected_columns(table=result)
+            subset = list(set(required_columns) & set(result.columns))
+            result = result[subset]
 
-        # append the new 'Well_description' column, now that alternates have
-        # been removed and non-essential columns have been dropped.
-        out['Well_description'] = well_description
+        return result
 
-        for column in self.data_columns:
-            if column not in out.columns:
-                warnings.warn('The column %s in the sample sheet is empty' %
-                              column)
-                out[column] = ''
+    def _add_data_to_sheet(self, table, sequencer, lanes, assay, strict=True):
+        if self.remapper is None:
+            raise ValueError("sample-sheet does not contain a valid Assay"
+                             " type.")
 
-        return out
+        # Well_description column is now defined here as the concatenation
+        # of the following columns. If the column existed previously it will
+        # be overwritten, otherwise it will be created here.
+        well_description = table['Project Plate'].astype(str) + "." + \
+            table['Sample'].astype(str) + "." + table['Well'].astype(str)
 
-    def _add_data_to_sheet(self, table, sequencer, lanes, assay, strict=True):
         table = self._remap_table(table, strict)
+
+        table['Well_description'] = well_description
+
+        for column in self._get_expected_columns():
+            if column not in table.columns:
+                warnings.warn('The column %s in the sample sheet is empty' %
+                              column)
+                table[column] = ''
+
         if assay != _AMPLICON:
             table['index2'] = sequencer_i5_index(sequencer, table['index2'])
 
@@ -455,6 +443,8 @@ def _add_data_to_sheet(self, table, sequencer, lanes, assay, strict=True):
                 sample['Lane'] = lane
                 self.add_sample(sample_sheet.Sample(sample))
 
+        return table
+
     def _add_metadata_to_sheet(self, metadata, sequencer):
         # set the default to avoid index errors if only one of the two is
         # provided.
@@ -537,6 +527,12 @@ def get_lane_number(self):
 
         return int(lanes[0])
 
+    def _get_expected_columns(self, table=None):
+        """Return the columns expected in the [Data] section."""
+        # the table parameter is unused in this base (general) version; it
+        # is present only for compatibility with child methods.
+        return self.data_columns
+
     def validate_and_scrub_sample_sheet(self, echo_msgs=True):
         """Validate the sample sheet and scrub invalid characters
 
@@ -582,7 +578,7 @@ def quiet_validate_and_scrub_sample_sheet(self):
 
         # we print an error return None and exit when this happens otherwise
         # we won't be able to run other checks
-        for column in self.data_columns:
+        for column in self._get_expected_columns():
             if column not in self.all_sample_keys:
                 msgs.append(ErrorMessage(f'The {column} column in the Data '
                                          'section is missing'))
@@ -625,6 +621,22 @@ def quiet_validate_and_scrub_sample_sheet(self):
             msgs.append(ErrorMessage("'SheetType' value is not "
                                      f"'{expected_sheet_type}'"))
 
+        expected_sheet_version = int(type(self)._HEADER['SheetVersion'])
+
+        # strip any enclosing quotes from sample-sheet SheetVersion before
+        # converting to int(). If conversion fails, the check below fails too.
+        sheet_version = ''.join(c for c in self.Header['SheetVersion']
+                                if c not in ('"', "'"))
+        try:
+            sheet_version = int(sheet_version)
+        except ValueError:
+            msgs.append(ErrorMessage(f"'{self.Header['SheetVersion']}' does "
+                                     "not look like a valid value"))
+
+        if sheet_version != expected_sheet_version:
+            msgs.append(ErrorMessage("'SheetVersion' value is not "
+                                     f"'{expected_sheet_version}'"))
+
         # if any errors are found up to this point then we can't continue with
         # the validation process.
         if msgs:
@@ -803,6 +815,106 @@ def __init__(self, path=None):
         }
 
 
+class MetagenomicSampleSheetv101(KLSampleSheet):
+    """Metagenomic sheet supporting optional KATHAROSEQ [Data] columns."""
+
+    _HEADER = {
+        'IEMFileVersion': '4',
+        'SheetType': _STANDARD_METAG_SHEET_TYPE,
+        'SheetVersion': '101',
+        'Investigator Name': 'Knight',
+        'Experiment Name': 'RKL_experiment',
+        'Date': None,
+        'Workflow': 'GenerateFASTQ',
+        'Application': 'FASTQ Only',
+        'Assay': _METAGENOMIC,
+        'Description': '',
+        'Chemistry': 'Default',
+    }
+
+    data_columns = ['Sample_ID', 'Sample_Name', 'Sample_Plate', 'well_id_384',
+                    'I7_Index_ID', 'index', 'I5_Index_ID', 'index2',
+                    'Sample_Project', 'Well_description']
+
+    # columns present in a pre-prep file (amplicon) that included katharoseq
+    # controls. Presumably we will need these same columns in a sample-sheet.
+    optional_katharoseq_columns = ['Kathseq_RackID', 'TubeCode',
+                                   'katharo_description',
+                                   'number_of_cells',
+                                   'platemap_generation_date',
+                                   'project_abbreviation',
+                                   'vol_extracted_elution_ul', 'well_id_96']
+
+    # sample-names beginning w/this, in any form of case, are katharoseq.
+    _KATHAROSEQ_PREFIX = 'katharo'
+
+    # For now, assume only MetagenomicSampleSheetv101 (v100, v95, v99) contains
+    # 'contains_replicates' column. Assume AbsQuantSampleSheetv10 doesn't.
+    _BIOINFORMATICS_COLUMNS = {'Sample_Project', 'QiitaID', 'BarcodesAreRC',
+                               'ForwardAdapter', 'ReverseAdapter',
+                               'HumanFiltering', 'contains_replicates',
+                               'library_construction_protocol',
+                               'experiment_design_description'}
+
+    CARRIED_PREP_COLUMNS = ['experiment_design_description', 'i5_index_id',
+                            'i7_index_id', 'index', 'index2',
+                            'library_construction_protocol', 'sample_name',
+                            'sample_plate', 'sample_project',
+                            'well_description', 'well_id_384']
+
+    def __init__(self, path=None):
+        super().__init__(path=path)
+        self.remapper = {
+            'sample sheet Sample_ID': 'Sample_ID',
+            'Sample': 'Sample_Name',
+            'Project Plate': 'Sample_Plate',
+            'Well': 'well_id_384',
+            'i7 name': 'I7_Index_ID',
+            'i7 sequence': 'index',
+            'i5 name': 'I5_Index_ID',
+            'i5 sequence': 'index2',
+            'Project Name': 'Sample_Project',
+            # NOTE(review): a strict remap selects every key listed here, so
+            # it presumably needs 'Kathseq_RackID' in every table - confirm.
+            'Kathseq_RackID': 'Kathseq_RackID'
+        }
+
+    def contains_katharoseq_samples(self):
+        """Return True if any sample added so far is a katharoseq control.
+
+        Checked on demand, not once at init, as samples can be added one at
+        a time w/add_sample() when the object is created w/out a file.
+        """
+        prefix = self._KATHAROSEQ_PREFIX
+        return any(sample.Sample_Name.lower().startswith(prefix)
+                   for sample in self.samples)
+
+    def _table_contains_katharoseq_samples(self, table):
+        """Return True if table's 'Sample_Name' column has a katharo-control.
+
+        For use when samples have not been added to this object yet. The
+        match is case-insensitive, as in contains_katharoseq_samples().
+        """
+        names = table['Sample_Name'].str.lower()
+        return names.str.startswith(self._KATHAROSEQ_PREFIX).any()
+
+    def _get_expected_columns(self, table=None):
+        """Return the list of columns expected in the [Data] section.
+
+        Samples already added are examined if table is None, else table is.
+        If katharoseq controls are found, the optional columns are appended.
+        """
+        if table is None:
+            found = self.contains_katharoseq_samples()
+        else:
+            found = self._table_contains_katharoseq_samples(table)
+
+        if found:
+            return self.data_columns + self.optional_katharoseq_columns
+
+        return self.data_columns
+
+
 class MetagenomicSampleSheetv100(KLSampleSheet):
     _HEADER = {
         'IEMFileVersion': '4',
@@ -821,19 +933,12 @@ class MetagenomicSampleSheetv100(KLSampleSheet):
     # Note that there doesn't appear to be a difference between 95, 99, and 100
     # beyond the value observed in 'Well_description' column. The real
     # difference is between standard_metag and abs_quant_metag.
-
-    # Marks change from 'Metagenomics' to 'Metagenomic' - encapsulate this
-    # change: TODO.
-
-    # Note: Remove syndna_pool_number as that was part of the purpose of
-    # making this change. Also, it's always going to be empty or worse have
-    # a value that won't be checked.
     data_columns = ['Sample_ID', 'Sample_Name', 'Sample_Plate', 'well_id_384',
                     'I7_Index_ID', 'index', 'I5_Index_ID', 'index2',
                     'Sample_Project', 'Well_description']
 
-    # For now, assume only MetagenomicSampleSheetv100 (and v95, v99) contains
-    # 'contains_replicates' column. Assume AbsQuantSampleSheetv10 doesn't.
+    # For now, assume only AbsQuantSampleSheetv10 doesn't contain
+    # 'contains_replicates' column, while the others do.
 
     _BIOINFORMATICS_COLUMNS = {'Sample_Project', 'QiitaID', 'BarcodesAreRC',
                                'ForwardAdapter', 'ReverseAdapter',
@@ -1081,21 +1186,19 @@ def load_sample_sheet(sample_sheet_path):
     if sheet.validate_and_scrub_sample_sheet(echo_msgs=False):
         return sheet
 
-    sheet = MetagenomicSampleSheetv100(sample_sheet_path)
+    sheet = MetagenomicSampleSheetv101(sample_sheet_path)
     if sheet.validate_and_scrub_sample_sheet(echo_msgs=False):
         return sheet
 
-    sheet = MetagenomicSampleSheetv90(sample_sheet_path)
+    sheet = MetagenomicSampleSheetv100(sample_sheet_path)
     if sheet.validate_and_scrub_sample_sheet(echo_msgs=False):
         return sheet
 
-    sheet = MetatranscriptomicSampleSheetv10(sample_sheet_path)
-
+    sheet = MetagenomicSampleSheetv90(sample_sheet_path)
     if sheet.validate_and_scrub_sample_sheet(echo_msgs=False):
         return sheet
 
-    sheet = MetatranscriptomicSampleSheetv0(sample_sheet_path)
-
+    sheet = MetatranscriptomicSampleSheetv10(sample_sheet_path)
     if sheet.validate_and_scrub_sample_sheet(echo_msgs=False):
         return sheet
 
@@ -1110,7 +1213,9 @@ def load_sample_sheet(sample_sheet_path):
 def _create_sample_sheet(sheet_type, sheet_version, assay_type):
     if sheet_type == _STANDARD_METAG_SHEET_TYPE:
         if assay_type == _METAGENOMIC:
-            if sheet_version == '90':
+            if sheet_version == '101':
+                sheet = MetagenomicSampleSheetv101()
+            elif sheet_version == '90':
                 sheet = MetagenomicSampleSheetv90()
             elif sheet_version in ['95', '99', '100']:
                 # 95, 99, and v100 are functionally the same type.

diff --git a/metapool/tests/data/test_katharoseq_sheet1.csv b/metapool/tests/data/test_katharoseq_sheet1.csv
@@ -0,0 +1,38 @@
+[Header],,,,,,,,,,
+IEMFileVersion,4,,,,,,,,,
+SheetType,standard_metag,,,,,,,,,
+SheetVersion,101,,,,,,,,,
+Investigator Name,Knight,,,,,,,,,
+Experiment Name,RKL0042,,,,,,,,,
+Date,2/26/24,,,,,,,,,
+Workflow,GenerateFASTQ,,,,,,,,,
+Application,FASTQ Only,,,,,,,,,
+Assay,Metagenomic,,,,,,,,,
+Description,,,,,,,,,,
+Chemistry,Default,,,,,,,,,
+,,,,,,,,,,
+[Reads],,,,,,,,,,
+150,,,,,,,,,,
+150,,,,,,,,,,
+,,,,,,,,,,
+[Settings],,,,,,,,,,
+ReverseComplement,0,,,,,,,,,
+MaskShortReads,1,,,,,,,,,
+OverrideCycles,Y151;I8N2;I8N2;Y151,,,,,,,,,
+,,,,,,,,,,
+[Data],,,,,,,,,,
+Lane,Sample_ID,Sample_Name,Sample_Plate,well_id_384,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Well_description
+1,SAMPLE_1,SAMPLE-1,MyProject_99999_P40,A1,iTru7_107_07,CCGACTAT,iTru5_01_A,ACCGACAA,MyProject_99999,this is a description
+1,SAMPLE_2,SAMPLE-2,MyProject_99999_P40,C1,iTru7_107_08,CCGACTAT,iTru5_02_A,CTTCGCAA,MyProject_99999,this is a description
+1,SAMPLE_3,SAMPLE-3,MyProject_99999_P40,E1,iTru7_107_09,GCCTTGTT,iTru5_03_A,AACACCAC,MyProject_99999,this is a description
+1,SAMPLE_4,SAMPLE-4,MyProject_99999_P40,G1,iTru7_107_10,AACTTGCC,iTru5_04_A,CGTATCTC,MyProject_99999,this is a description
+1,SAMPLE_5,SAMPLE-5,MyProject_99999_P40,I1,iTru7_107_11,CAATGTGG,iTru5_05_A,GGTACGAA,MyProject_99999,this is a description
+,,,,,,,,,,
+[Bioinformatics],,,,,,,,,,
+Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,,,
+MyProject_99999,11661,FALSE,AACC,GGTT,FALSE,Knight Lab Kapa HP,description1,,,
+,,,,,,,,,,
+[Contact],,,,,,,,,,
+Email,Sample_Project,,,,,,,,,
+foo@bar.com,MyProject_99999,,,,,,,,,
+,,,,,,,,,,