Skip to content

Commit

Permalink
Merge fb7ef21 into 456f3e9
Browse files Browse the repository at this point in the history
  • Loading branch information
charles-cowart committed Apr 17, 2024
2 parents 456f3e9 + fb7ef21 commit f1fb9de
Show file tree
Hide file tree
Showing 6 changed files with 730 additions and 72 deletions.
6 changes: 4 additions & 2 deletions metapool/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
demux_pre_prep, pre_prep_needs_demuxing)
from .sample_sheet import (sample_sheet_to_dataframe, make_sample_sheet,
AmpliconSampleSheet, MetagenomicSampleSheetv90,
MetagenomicSampleSheetv100, AbsQuantSampleSheetv10,
MetagenomicSampleSheetv100,
MetagenomicSampleSheetv101, AbsQuantSampleSheetv10,
MetatranscriptomicSampleSheetv0, demux_sample_sheet,
sheet_needs_demuxing, KLSampleSheet,
load_sample_sheet, MetatranscriptomicSampleSheetv10)
Expand All @@ -29,7 +30,8 @@
'requires_dilution', 'run_counts', 'sample_sheet_to_dataframe',
'sheet_needs_demuxing', 'sum_lanes', 'validate_plate_metadata',
'MetagenomicSampleSheetv90', 'MetagenomicSampleSheetv100',
'AmpliconSampleSheet', 'MetatranscriptomicSampleSheetv0',
'MetagenomicSampleSheetv101', 'AmpliconSampleSheet',
'MetatranscriptomicSampleSheetv0',
'MetatranscriptomicSampleSheetv10', 'AbsQuantSampleSheetv10',
# KLSampleSheet is needed for instance() calls.
'KLSampleSheet', 'load_sample_sheet']
Expand Down
241 changes: 173 additions & 68 deletions metapool/sample_sheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def write_blank_lines(writer, n=blank_lines, width=csv_width):
if section is not None:
# these sections are represented as DataFrame objects
writer.writerow(pad_iterable(section.columns.tolist(),
csv_width))
csv_width))

for _, row in section.iterrows():
writer.writerow(pad_iterable(row.values.tolist(),
Expand Down Expand Up @@ -377,73 +377,61 @@ def merge(self, sheets):
else:
pass

def _remap_table(self, table, strict=True):
if self.remapper is None:
raise ValueError("sample-sheet does not contain a valid Assay"
" type.")

# Well_description column is now defined here as the concatenation
# of the following columns. If the column existed previously it will
# be overwritten, otherwise it will be created here. Alternate versions
# of the column name have already been resolved at this point.

# Note that the amplicon notebook currently generates the same values
# for this column. If the functionality in the notebook changes, the
# output will continue to be redefined with the current values here.
well_description = table['Project Plate'].astype(str) + "." + table[
'Sample'].astype(str) + "." + table['Well'].astype(str)
def _remap_table(self, table, strict):
result = table.copy(deep=True)

if strict:
# legacy operation. All columns not defined in remapper will be
# filtered out.
out = table[self.remapper.keys()].copy()
out.rename(self.remapper, axis=1, inplace=True)
# All columns not defined in remapper will be filtered out.
result = table[self.remapper.keys()].copy()
result.rename(self.remapper, axis=1, inplace=True)
else:
out = table.copy(deep=True)

# if a column named 'index' is present in table, assume it is a
# numeric index and not a sequence of bases, which is required in
# the output. Assume the column that will become 'index' is
# defined in remapper.
if 'index' in set(out.columns):
out.drop(columns=['index'], inplace=True)

# if an alternate form of a column name defined in
# _KL_SAMPLE_SHEET_COLUMN_ALTS is found in table, assume it should
# be renamed to its proper form and be included in the output e.g.:
# 'sample_plate' -> 'Sample_Plate'.
if 'index' in set(result.columns):
result.drop(columns=['index'], inplace=True)

# assume keys in _KL_SAMPLE_SHEET_COLUMN_ALTS do not overlap w/
# remapper (they currently do not). Define the full set of
# potential columns to rename in table.

# new syntax in 3.9 allows us to merge two dicts together w/OR.
remapper = KLSampleSheet.column_alts | self.remapper
out.rename(remapper, axis=1, inplace=True)
result.rename(remapper, axis=1, inplace=True)

# out may contain additional columns that aren't allowed in the
# result may contain additional columns that aren't allowed in the
# [Data] section of a sample-sheet e.g.: 'Extraction Kit Lot'.
# There may also be required columns that aren't defined in out.
subset = list(
set(self.data_columns) & set(
out.columns))
# There may also be required columns that aren't defined in result.

out = out[subset]
# once all columns have been renamed to their preferred names, we
# must determine the proper set of column names for this sample-
# sheet. For legacy classes this is simply the list of columns
# defined in each sample-sheet version. For newer classes, this is
# defined at run-time and requires examining the metadata that
# will define the [Data] section.
required_columns = self._get_expected_columns(table=result)
subset = list(set(required_columns) & set(result.columns))
result = result[subset]

# append the new 'Well_description' column, now that alternates have
# been removed and non-essential columns have been dropped.
out['Well_description'] = well_description
return result

for column in self.data_columns:
if column not in out.columns:
warnings.warn('The column %s in the sample sheet is empty' %
column)
out[column] = ''
def _add_data_to_sheet(self, table, sequencer, lanes, assay, strict=True):
if self.remapper is None:
raise ValueError("sample-sheet does not contain a valid Assay"
" type.")

return out
# Well_description column is now defined here as the concatenation
# of the following columns. If the column existed previously it will
# be overwritten, otherwise it will be created here.
well_description = table['Project Plate'].astype(str) + "." + \
table['Sample'].astype(str) + "." + table['Well'].astype(str)

def _add_data_to_sheet(self, table, sequencer, lanes, assay, strict=True):
table = self._remap_table(table, strict)

table['Well_description'] = well_description

for column in self._get_expected_columns():
if column not in table.columns:
warnings.warn('The column %s in the sample sheet is empty' %
column)
table[column] = ''

if assay != _AMPLICON:
table['index2'] = sequencer_i5_index(sequencer, table['index2'])

Expand All @@ -455,6 +443,8 @@ def _add_data_to_sheet(self, table, sequencer, lanes, assay, strict=True):
sample['Lane'] = lane
self.add_sample(sample_sheet.Sample(sample))

return table

def _add_metadata_to_sheet(self, metadata, sequencer):
# set the default to avoid index errors if only one of the two is
# provided.
Expand Down Expand Up @@ -537,6 +527,12 @@ def get_lane_number(self):

return int(lanes[0])

def _get_expected_columns(self, table=None):
    """Return the column names expected in the [Data] section.

    This base (general) implementation simply returns ``self.data_columns``.

    :param table: Ignored here. It is accepted only so the signature stays
        compatible with child-class implementations of this method.
    :return: ``self.data_columns`` (the same object, not a copy).
    """
    expected_columns = self.data_columns
    return expected_columns

def validate_and_scrub_sample_sheet(self, echo_msgs=True):
"""Validate the sample sheet and scrub invalid characters
Expand Down Expand Up @@ -582,7 +578,7 @@ def quiet_validate_and_scrub_sample_sheet(self):

# we print an error return None and exit when this happens otherwise
# we won't be able to run other checks
for column in self.data_columns:
for column in self._get_expected_columns():
if column not in self.all_sample_keys:
msgs.append(ErrorMessage(f'The {column} column in the Data '
'section is missing'))
Expand Down Expand Up @@ -625,6 +621,22 @@ def quiet_validate_and_scrub_sample_sheet(self):
msgs.append(ErrorMessage("'SheetType' value is not "
f"'{expected_sheet_type}'"))

expected_sheet_version = int(type(self)._HEADER['SheetVersion'])

# sanitize sample-sheet SheetVersion before attempting to convert to
# int() type. Remove any additional enclosing quotes.
sheet_version = list(self.Header['SheetVersion'])
sheet_version = [c for c in sheet_version if c not in ['"', "'"]]
try:
sheet_version = int(''.join(sheet_version))
except ValueError:
msgs.append(ErrorMessage(f"'{self.Header['SheetVersion']}' does"
"not look like a valid value"))

if sheet_version != expected_sheet_version:
msgs.append(ErrorMessage("'SheetVersion' value is not "
f"'{expected_sheet_version}'"))

# if any errors are found up to this point then we can't continue with
# the validation process.
if msgs:
Expand Down Expand Up @@ -803,6 +815,106 @@ def __init__(self, path=None):
}


class MetagenomicSampleSheetv101(KLSampleSheet):
    """Standard metagenomic sample-sheet, SheetVersion 101.

    Adds support for optional KATHAROSEQ columns in the [Data] section. The
    set of expected [Data] columns is determined at run-time: when a
    katharoseq control sample is present, the columns listed in
    ``optional_katharoseq_columns`` are expected in addition to
    ``data_columns``.
    """

    _HEADER = {
        'IEMFileVersion': '4',
        'SheetType': _STANDARD_METAG_SHEET_TYPE,
        'SheetVersion': '101',
        'Investigator Name': 'Knight',
        'Experiment Name': 'RKL_experiment',
        'Date': None,
        'Workflow': 'GenerateFASTQ',
        'Application': 'FASTQ Only',
        'Assay': _METAGENOMIC,
        'Description': '',
        'Chemistry': 'Default',
    }

    data_columns = ['Sample_ID', 'Sample_Name', 'Sample_Plate', 'well_id_384',
                    'I7_Index_ID', 'index', 'I5_Index_ID', 'index2',
                    'Sample_Project', 'Well_description']

    # columns present in a pre-prep file (amplicon) that included katharoseq
    # controls. Presumably we will need these same columns in a sample-sheet.
    optional_katharoseq_columns = ['Kathseq_RackID', 'TubeCode',
                                   'katharo_description',
                                   'number_of_cells',
                                   'platemap_generation_date',
                                   'project_abbreviation',
                                   'vol_extracted_elution_ul', 'well_id_96']

    # For now, assume only MetagenomicSampleSheetv101 (v100, v95, v99) contains
    # 'contains_replicates' column. Assume AbsQuantSampleSheetv10 doesn't.
    _BIOINFORMATICS_COLUMNS = {'Sample_Project', 'QiitaID', 'BarcodesAreRC',
                               'ForwardAdapter', 'ReverseAdapter',
                               'HumanFiltering', 'contains_replicates',
                               'library_construction_protocol',
                               'experiment_design_description'}

    CARRIED_PREP_COLUMNS = ['experiment_design_description', 'i5_index_id',
                            'i7_index_id', 'index', 'index2',
                            'library_construction_protocol', 'sample_name',
                            'sample_plate', 'sample_project',
                            'well_description', 'well_id_384']

    # lower-case prefix that marks a sample-name as a katharoseq control.
    # Shared by both detection methods below so that their criteria cannot
    # drift apart.
    _KATHAROSEQ_PREFIX = 'katharo'

    def __init__(self, path=None):
        """Create the sheet (optionally from ``path``) and define remapper.

        :param path: Passed through unchanged to the parent constructor.
        """
        super().__init__(path=path)
        # maps input table column names to [Data] section column names.
        self.remapper = {
            'sample sheet Sample_ID': 'Sample_ID',
            'Sample': 'Sample_Name',
            'Project Plate': 'Sample_Plate',
            'Well': 'well_id_384',
            'i7 name': 'I7_Index_ID',
            'i7 sequence': 'index',
            'i5 name': 'I5_Index_ID',
            'i5 sequence': 'index2',
            'Project Name': 'Sample_Project',
            'Kathseq_RackID': 'Kathseq_RackID'
        }

    def contains_katharoseq_samples(self):
        """Return True if any sample in this sheet is a katharoseq control.

        When creating samples manually, as opposed to loading a sample-sheet
        from file, whether or not a sample-sheet contains katharoseq controls
        can change from add_sample() to add_sample() and won't be determined
        when MetagenomicSampleSheetv101() is created w/out a file. Hence,
        this check is performed on demand as opposed to once at init().
        """
        for sample in self.samples:
            name = sample.Sample_Name
            # assume any sample-name beginning with 'katharo' in any form of
            # case is a katharoseq sample. Presumably Sample_Name can be
            # None (or otherwise not a string) for a sample created without
            # one - TODO confirm; such samples are skipped rather than
            # raising AttributeError.
            if isinstance(name, str) and \
                    name.lower().startswith(self._KATHAROSEQ_PREFIX):
                return True

        return False

    def _table_contains_katharoseq_samples(self, table):
        """Return True if ``table`` holds a katharoseq control sample.

        For instances when a MetagenomicSampleSheetv101() object contains no
        samples, and the samples will be added in a single method call.

        The criteria are kept consistent w/contains_katharoseq_samples(): the
        match on 'Sample_Name' ignores case, and missing (NaN) names are
        treated as non-matching.

        :param table: Object whose 'Sample_Name' column supports the pandas
            ``.str`` accessor.
        """
        names = table['Sample_Name'].str.lower()
        return bool(names.str.startswith(self._KATHAROSEQ_PREFIX,
                                         na=False).any())

    def _get_expected_columns(self, table=None):
        """Return the [Data] section columns expected for this sheet.

        :param table: If None, the samples already added to this object are
            examined. Otherwise it is assumed that no samples have been added
            to this object yet and ``table`` is examined instead, as it holds
            samples that may or may not include katharoseq controls.
        :return: ``data_columns``, followed by ``optional_katharoseq_columns``
            when a katharoseq control sample is found.
        """
        if table is None:
            has_katharoseq = self.contains_katharoseq_samples()
        else:
            has_katharoseq = self._table_contains_katharoseq_samples(table)

        if has_katharoseq:
            # add the expected additional katharoseq columns to the official
            # list of expected columns before validation or other processing
            # begins.
            return self.data_columns + self.optional_katharoseq_columns

        return self.data_columns


class MetagenomicSampleSheetv100(KLSampleSheet):
_HEADER = {
'IEMFileVersion': '4',
Expand All @@ -821,19 +933,12 @@ class MetagenomicSampleSheetv100(KLSampleSheet):
# Note that there doesn't appear to be a difference between 95, 99, and 100
# beyond the value observed in 'Well_description' column. The real
# difference is between standard_metag and abs_quant_metag.

# Marks change from 'Metagenomics' to 'Metagenomic' - encapsulate this
# change: TODO.

# Note: Remove syndna_pool_number as that was part of the purpose of
# making this change. Also, it's always going to be empty or worse have
# a value that won't be checked.
data_columns = ['Sample_ID', 'Sample_Name', 'Sample_Plate', 'well_id_384',
'I7_Index_ID', 'index', 'I5_Index_ID', 'index2',
'Sample_Project', 'Well_description']

# For now, assume only MetagenomicSampleSheetv100 (and v95, v99) contains
# 'contains_replicates' column. Assume AbsQuantSampleSheetv10 doesn't.
# For now, assume only AbsQuantSampleSheetv10 doesn't contain
# 'contains_replicates' column, while the others do.

_BIOINFORMATICS_COLUMNS = {'Sample_Project', 'QiitaID', 'BarcodesAreRC',
'ForwardAdapter', 'ReverseAdapter',
Expand Down Expand Up @@ -1081,21 +1186,19 @@ def load_sample_sheet(sample_sheet_path):
if sheet.validate_and_scrub_sample_sheet(echo_msgs=False):
return sheet

sheet = MetagenomicSampleSheetv100(sample_sheet_path)
sheet = MetagenomicSampleSheetv101(sample_sheet_path)
if sheet.validate_and_scrub_sample_sheet(echo_msgs=False):
return sheet

sheet = MetagenomicSampleSheetv90(sample_sheet_path)
sheet = MetagenomicSampleSheetv100(sample_sheet_path)
if sheet.validate_and_scrub_sample_sheet(echo_msgs=False):
return sheet

sheet = MetatranscriptomicSampleSheetv10(sample_sheet_path)

sheet = MetagenomicSampleSheetv90(sample_sheet_path)
if sheet.validate_and_scrub_sample_sheet(echo_msgs=False):
return sheet

sheet = MetatranscriptomicSampleSheetv0(sample_sheet_path)

sheet = MetatranscriptomicSampleSheetv10(sample_sheet_path)
if sheet.validate_and_scrub_sample_sheet(echo_msgs=False):
return sheet

Expand All @@ -1110,7 +1213,9 @@ def load_sample_sheet(sample_sheet_path):
def _create_sample_sheet(sheet_type, sheet_version, assay_type):
if sheet_type == _STANDARD_METAG_SHEET_TYPE:
if assay_type == _METAGENOMIC:
if sheet_version == '90':
if sheet_version == '101':
sheet = MetagenomicSampleSheetv101()
elif sheet_version == '90':
sheet = MetagenomicSampleSheetv90()
elif sheet_version in ['95', '99', '100']:
# 95, 99, and v100 are functionally the same type.
Expand Down
38 changes: 38 additions & 0 deletions metapool/tests/data/test_katharoseq_sheet1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
[Header],,,,,,,,,,
IEMFileVersion,4,,,,,,,,,
SheetType,standard_metag,,,,,,,,,
SheetVersion,101,,,,,,,,,
Investigator Name,Knight,,,,,,,,,
Experiment Name,RKL0042,,,,,,,,,
Date,2/26/24,,,,,,,,,
Workflow,GenerateFASTQ,,,,,,,,,
Application,FASTQ Only,,,,,,,,,
Assay,Metagenomic,,,,,,,,,
Description,,,,,,,,,,
Chemistry,Default,,,,,,,,,
,,,,,,,,,,
[Reads],,,,,,,,,,
150,,,,,,,,,,
150,,,,,,,,,,
,,,,,,,,,,
[Settings],,,,,,,,,,
ReverseComplement,0,,,,,,,,,
MaskShortReads,1,,,,,,,,,
OverrideCycles,Y151;I8N2;I8N2;Y151,,,,,,,,,
,,,,,,,,,,
[Data],,,,,,,,,,
Lane,Sample_ID,Sample_Name,Sample_Plate,well_id_384,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Well_description
1,SAMPLE_1,SAMPLE-1,MyProject_99999_P40,A1,iTru7_107_07,CCGACTAT,iTru5_01_A,ACCGACAA,MyProject_99999,this is a description
1,SAMPLE_2,SAMPLE-2,MyProject_99999_P40,C1,iTru7_107_08,CCGACTAT,iTru5_02_A,CTTCGCAA,MyProject_99999,this is a description
1,SAMPLE_3,SAMPLE-3,MyProject_99999_P40,E1,iTru7_107_09,GCCTTGTT,iTru5_03_A,AACACCAC,MyProject_99999,this is a description
1,SAMPLE_4,SAMPLE-4,MyProject_99999_P40,G1,iTru7_107_10,AACTTGCC,iTru5_04_A,CGTATCTC,MyProject_99999,this is a description
1,SAMPLE_5,SAMPLE-5,MyProject_99999_P40,I1,iTru7_107_11,CAATGTGG,iTru5_05_A,GGTACGAA,MyProject_99999,this is a description
,,,,,,,,,,
[Bioinformatics],,,,,,,,,,
Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,,,
MyProject_99999,11661,FALSE,AACC,GGTT,FALSE,Knight Lab Kapa HP,description1,,,
,,,,,,,,,,
[Contact],,,,,,,,,,
Email,Sample_Project,,,,,,,,,
foo@bar.com,MyProject_99999,,,,,,,,,
,,,,,,,,,,

0 comments on commit f1fb9de

Please sign in to comment.