Skip to content

Commit

Permalink
Removed legacy atropos-and-bowtie2 setting.
Browse files Browse the repository at this point in the history
Tests reviewed and adjusted appropriately.
  • Loading branch information
charles-cowart committed Apr 18, 2024
1 parent 2d38a3e commit 6978944
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 137 deletions.
50 changes: 13 additions & 37 deletions metapool/prep.py
Expand Up @@ -156,7 +156,7 @@ def remove_qiita_id(project_name):
return matches[1]


def get_run_prefix(run_path, project, sample_id, lane, pipeline):
def get_run_prefix(run_path, project, sample_id, lane):
"""For a sample find the run prefix
Parameters
Expand All @@ -169,9 +169,6 @@ def get_run_prefix(run_path, project, sample_id, lane, pipeline):
Sample ID (was sample_name). Changed to reflect name used for files.
lane: str
Lane number
pipeline: str
The pipeline used to generate the data. Should be one of
`atropos-and-bowtie2` or `fastp-and-minimap2`.
Returns
-------
Expand All @@ -182,32 +179,17 @@ def get_run_prefix(run_path, project, sample_id, lane, pipeline):
base = os.path.join(run_path, project)
path = base

# each pipeline sets up a slightly different directory structure,
# importantly fastp-and-minimap2 won't save intermediate files
if pipeline == 'atropos-and-bowtie2':
qc = os.path.join(base, 'atropos_qc')
hf = os.path.join(base, 'filtered_sequences')

# If both folders exist and have sequence files always prefer the
# human-filtered sequences
if _exists_and_has_files(qc):
path = qc
if _exists_and_has_files(hf):
path = hf
elif pipeline == 'fastp-and-minimap2':
qc = os.path.join(base, 'trimmed_sequences')
hf = os.path.join(base, 'filtered_sequences')

if _exists_and_has_files(qc) and _exists_and_has_files(hf):
path = hf
elif _exists_and_has_files(qc):
path = qc
elif _exists_and_has_files(hf):
path = hf
else:
path = base
qc = os.path.join(base, 'trimmed_sequences')
hf = os.path.join(base, 'filtered_sequences')

if _exists_and_has_files(qc) and _exists_and_has_files(hf):
path = hf
elif _exists_and_has_files(qc):
path = qc
elif _exists_and_has_files(hf):
path = hf
else:
raise ValueError('Invalid pipeline "%s"' % pipeline)
path = base

search_me = '%s_S*_L*%s_R*.fastq.gz' % (sample_id, lane)

Expand Down Expand Up @@ -453,7 +435,7 @@ def process_sample(sample, prep_columns, run_center, run_date, run_prefix,


def preparations_for_run(run_path, sheet, generated_prep_columns,
carried_prep_columns, pipeline='fastp-and-minimap2'):
carried_prep_columns):
"""Given a run's path and sample sheet generates preparation files
Parameters
Expand All @@ -469,11 +451,6 @@ def preparations_for_run(run_path, sheet, generated_prep_columns,
carried_prep_columns: list
List of required columns for output that are expected in KLSampleSheet.
Varies w/different versions of KLSampleSheet.
pipeline: str, optional
Which pipeline generated the data. The important difference is that
`atropos-and-bowtie2` saves intermediate files, whereas
`fastp-and-minimap2` doesn't. Default is `fastp-and-minimap2`, the
latest version of the sequence processing pipeline.
Returns
-------
Expand Down Expand Up @@ -533,8 +510,7 @@ def preparations_for_run(run_path, sheet, generated_prep_columns,
else:
sample_id = sample.sample_id

run_prefix = get_run_prefix(run_path, project, sample_id, lane,
pipeline)
run_prefix = get_run_prefix(run_path, project, sample_id, lane)

# ignore the sample if there's no file
if run_prefix is not None:
Expand Down
29 changes: 9 additions & 20 deletions metapool/scripts/seqpro.py
Expand Up @@ -16,13 +16,9 @@
@click.argument('sample_sheet', type=click.Path(exists=True, dir_okay=False,
file_okay=True))
@click.argument('output_dir', type=click.Path(writable=True))
@click.option('--pipeline', help='Which pipeline generated the data',
show_default=True, default='fastp-and-minimap2',
type=click.Choice(['atropos-and-bowtie2', 'fastp-and-minimap2']))
@click.option('--verbose', help='list prep-file output paths, study_ids',
is_flag=True)
def format_preparation_files(run_dir, sample_sheet, output_dir, pipeline,
verbose):
def format_preparation_files(run_dir, sample_sheet, output_dir, verbose):
"""Generate the preparation files for the projects in a run
RUN_DIR: should be the directory where the results of running bcl2fastq are
Expand All @@ -41,13 +37,9 @@ def format_preparation_files(run_dir, sample_sheet, output_dir, pipeline,
sample_sheet = load_sample_sheet(sample_sheet)
df_sheet = sample_sheet_to_dataframe(sample_sheet)

if pipeline == 'fastp-and-minimap2':
stats = run_counts(run_dir, sample_sheet)
stats['sample_name'] = \
df_sheet.set_index('lane', append=True)['sample_name']
else:
click.echo('Stats collection is not supported for pipeline '
'atropos-and-bowtie2')
stats = run_counts(run_dir, sample_sheet)
stats['sample_name'] = \
df_sheet.set_index('lane', append=True)['sample_name']

# sample_sheet_to_dataframe() automatically lowercases the column names
# before returning df_sheet. Hence, sample_sheet.CARRIED_PREP_COLUMNS also
Expand All @@ -58,20 +50,17 @@ def format_preparation_files(run_dir, sample_sheet, output_dir, pipeline,
preps = preparations_for_run(run_dir,
df_sheet,
sample_sheet.GENERATED_PREP_COLUMNS,
c_prep_columns,
pipeline=pipeline)
c_prep_columns)

os.makedirs(output_dir, exist_ok=True)

for (run, project, lane), df in preps.items():
fp = os.path.join(output_dir, f'{run}.{project}.{lane}.tsv')

if pipeline == 'fastp-and-minimap2':
# stats are indexed by sample name and lane, lane is the first
# level index. When merging, make sure to select the lane subset
# that we care about, otherwise we'll end up with repeated rows
df = df.merge(stats.xs(lane, level=1), how='left',
on='sample_name')
# stats are indexed by sample name and lane; lane is index level 1
# (the second level). When merging, select only the lane subset
# that we care about, otherwise we'll end up with repeated rows
df = df.merge(stats.xs(lane, level=1), how='left', on='sample_name')

# strip qiita_id from project names in sample_project column
df['sample_project'] = df['sample_project'].map(
Expand Down
52 changes: 3 additions & 49 deletions metapool/scripts/tests/test_seqpro.py
Expand Up @@ -36,46 +36,6 @@ def setUp(self):
def tearDown(self):
rmtree(self.vf_test_dir, ignore_errors=True)

def test_atropos_run(self):
runner = CliRunner()

with runner.isolated_filesystem():
result = runner.invoke(
format_preparation_files,
args=[
self.run,
self.sheet,
"./",
"--pipeline",
"atropos-and-bowtie2",
],
)

# assert that expected error message appeared in stdout. we are
# not concerned w/warning messages that may also appear.
self.assertIn(
"Stats collection is not supported for pipeline "
"atropos-and-bowtie2",
result.output,
)
self.assertEqual(result.exit_code, 0)

exp_preps = [
"191103_D32611_0365_G00DHB5YXX.Baz_12345.1.tsv",
"191103_D32611_0365_G00DHB5YXX.Baz_12345.3.tsv",
"191103_D32611_0365_G00DHB5YXX.FooBar_666.3.tsv",
]

self.assertEqual(sorted(os.listdir("./")), exp_preps)

for prep, exp_lines in zip(exp_preps, [4, 4, 5]):
with open(prep) as f:
self.assertEqual(
len(f.read().split("\n")),
exp_lines,
"Assertion error in %s" % prep,
)

def test_fastp_run(self):
runner = CliRunner()

Expand All @@ -85,9 +45,7 @@ def test_fastp_run(self):
args=[
self.fastp_run,
self.fastp_sheet,
"./",
"--pipeline",
"fastp-and-minimap2",
"./"
],
)

Expand Down Expand Up @@ -396,9 +354,7 @@ def test_legacy_run(self):
args=[
self.fastp_run,
self.v90_test_sheet,
"./",
"--pipeline",
"fastp-and-minimap2",
"./"
],
)

Expand Down Expand Up @@ -442,9 +398,7 @@ def test_fastp_run(self):
args=[
self.temp_copy,
self.fastp_sheet,
"./",
"--pipeline",
"fastp-and-minimap2",
"./"
],
)
self.assertEqual(result.output, "")
Expand Down
Binary file not shown.
Binary file not shown.
53 changes: 22 additions & 31 deletions metapool/tests/test_prep.py
Expand Up @@ -75,6 +75,9 @@ def _check_run_191103_D32611_0365_G00DHB5YXX(self, obs):
# make sure the columns are in the same order before comparing
obs_df = obs_df[exp.columns].copy()

exp.to_csv('exp')
obs_df.to_csv('obs')

pd.testing.assert_frame_equal(obs_df, exp)

data = [['sample.1', 'Eqiiperiment', 'Knight Lab Kapa HP',
Expand Down Expand Up @@ -202,8 +205,7 @@ def test_preparations_for_run(self):
obs = preparations_for_run(self.good_run,
sample_sheet_to_dataframe(sheet),
sheet.GENERATED_PREP_COLUMNS,
sheet.CARRIED_PREP_COLUMNS,
pipeline='atropos-and-bowtie2')
sheet.CARRIED_PREP_COLUMNS)
self._check_run_191103_D32611_0365_G00DHB5YXX(obs)

def test_preparations_for_run_missing_columns(self):
Expand All @@ -217,8 +219,7 @@ def test_preparations_for_run_missing_columns(self):
with self.assertWarns(UserWarning) as cm:
obs = preparations_for_run(self.good_run, ss,
sheet.GENERATED_PREP_COLUMNS,
sheet.CARRIED_PREP_COLUMNS,
pipeline='atropos-and-bowtie2')
sheet.CARRIED_PREP_COLUMNS)

self.assertEqual(str(cm.warnings[0].message), "'well_description' "
"is not present in s"
Expand Down Expand Up @@ -287,67 +288,57 @@ def test_remove_qiita_id(self):

def test_get_run_prefix(self):
# project 1
obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '1',
'atropos-and-bowtie2')
obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '1')
self.assertEqual('sample_1_S11_L001', obs)

obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '3',
'atropos-and-bowtie2')
obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '3')
self.assertEqual('sample_1_S11_L003', obs)

obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_2', '1',
'atropos-and-bowtie2')
obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_2', '1')
self.assertEqual('sample_2_S10_L001', obs)

obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_2', '3',
'atropos-and-bowtie2')
self.assertIsNone(obs)

# project 2
obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_31', '3',
'atropos-and-bowtie2')
obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_31', '3')
self.assertEqual('sample_31_S13_L003', obs)

obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_32', '3',
'atropos-and-bowtie2')
obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_32', '3')
self.assertEqual('sample_32_S19_L003', obs)

obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_34', '3',
'atropos-and-bowtie2')
obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_34', '3')
self.assertEqual('sample_34_S33_L003', obs)

def test_get_run_prefix_fastp_minimap(self):
obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample1',
'1', 'fastp-and-minimap2')
'1')
self.assertEqual('sample1_S11_L001', obs)

obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample1',
'3', 'fastp-and-minimap2')
'3')
self.assertEqual('sample1_S11_L003', obs)

obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample2',
'1', 'fastp-and-minimap2')
'1')
self.assertEqual('sample2_S10_L001', obs)

obs = get_run_prefix(self.good_run_new_version, 'Baz_12345',
'sample44', '3', 'fastp-and-minimap2')
'sample44', '3')
self.assertIsNone(obs)

obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample2',
'3', 'fastp-and-minimap2')
'3')
self.assertIsNone(obs)

# project 2
obs = get_run_prefix(self.good_run_new_version, 'FooBar_666',
'sample31', '3', 'fastp-and-minimap2')
'sample31', '3')
self.assertEqual('sample31_S13_L003', obs)

obs = get_run_prefix(self.good_run_new_version, 'FooBar_666',
'sample32', '3', 'fastp-and-minimap2')
'sample32', '3')
self.assertEqual('sample32_S19_L003', obs)

obs = get_run_prefix(self.good_run_new_version, 'FooBar_666',
'sample34', '3', 'fastp-and-minimap2')
'sample34', '3')
self.assertIsNone(obs)

def test_get_run_prefix_more_than_forward_and_reverse(self):
Expand All @@ -365,15 +356,15 @@ def test_get_run_prefix_more_than_forward_and_reverse(self):
# project 2
with self.assertWarnsRegex(Warning, message):
obs = get_run_prefix(self.OKish_run_new_version, 'FooBar_666',
'sample31', '3', 'fastp-and-minimap2')
'sample31', '3')
self.assertIsNone(obs)

obs = get_run_prefix(self.OKish_run_new_version, 'FooBar_666',
'sample32', '3', 'fastp-and-minimap2')
'sample32', '3')
self.assertEqual('sample32_S19_L003', obs)

obs = get_run_prefix(self.OKish_run_new_version, 'FooBar_666',
'sample34', '3', 'fastp-and-minimap2')
'sample34', '3')
self.assertIsNone(obs)

def test_is_non_empty_gz_file(self):
Expand Down

0 comments on commit 6978944

Please sign in to comment.