Skip to content

Commit

Permalink
added unit test for bad sample names
Browse files Browse the repository at this point in the history
Sample names in the sample sheet can contain characters that are
illegal in filenames, causing Picard to fail when trying to write
files. A sanitization function (util.file.string_to_file_name()) is now
applied to sample names in illumina.py, and a unit test has been added
to test_illumina.py.
  • Loading branch information
tomkinsc committed Feb 26, 2016
1 parent 86b0eb4 commit b659360
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 1 deletion.
15 changes: 15 additions & 0 deletions test/input/TestDifficultSampleNames/RunInfo.xml
@@ -0,0 +1,15 @@
<?xml version="1.0"?>
<RunInfo xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Version="2">
<Run Id="150821_M04004_0006_000000000-AEF96" Number="6">
<Flowcell>000000000-AEF96</Flowcell>
<Instrument>M04004</Instrument>
<Date>150821</Date>
<Reads>
<Read Number="1" NumCycles="101" IsIndexedRead="N" />
<Read Number="2" NumCycles="8" IsIndexedRead="Y" />
<Read Number="3" NumCycles="8" IsIndexedRead="Y" />
<Read Number="4" NumCycles="101" IsIndexedRead="N" />
</Reads>
<FlowcellLayout LaneCount="1" SurfaceCount="2" SwathCount="1" TileCount="14" />
</Run>
</RunInfo>
40 changes: 40 additions & 0 deletions test/input/TestDifficultSampleNames/SampleSheet.csv
@@ -0,0 +1,40 @@
[Header],,,,,,,,,
IEMFileVersion,4,,,,,,,,
Investigator Name,James,,,,,,,,
Experiment Name,EBoV HS ,,,,,,,,
Date,8/18/2015,,,,,,,,
Workflow,GenerateFASTQ,,,,,,,,
Application,FASTQ Only,,,,,,,,
Assay,Nextera XT,,,,,,,,
Description,,,,,,,,,
Chemistry,Amplicon,,,,,,,,
,,,,,,,,,
[Reads],,,,,,,,,
101,,,,,,,,,
101,,,,,,,,,
,,,,,,,,,
[Settings],,,,,,,,,
ReverseComplement,0,,,,,,,,
,,,,,,,,,
[Data],,,,,,,,,
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
difficult/\//value+for'` -sample_name0.6,,,,N702,CGTACTAG,S502,CTCTCTAT,,
difficult/\//value+for'` -sample_name0.5,,,,N702,CGTACTAG,S503,TATCCTCT,,
difficult/\//value+for'` -sample_name0.4,,,,N702,CGTACTAG,S504,AGAGTAGA,,
difficult/\//value+for'` -sample_name0.3,,,,N702,CGTACTAG,S505,GTAAGGAG,,
difficult/\//value+for'` -sample_name0.1,,,,N702,CGTACTAG,S508,CTAAGCCT,,
difficult/\//value+for'` -sample_name4.6,,,,N703,AGGCAGAA,S502,CTCTCTAT,,
difficult/\//value+for'` -sample_name4.5,,,,N703,AGGCAGAA,S503,TATCCTCT,,
difficult/\//value+for'` -sample_name4.4,,,,N703,AGGCAGAA,S504,AGAGTAGA,,
difficult/\//value+for'` -sample_name4.3,,,,N703,AGGCAGAA,S505,GTAAGGAG,,
difficult/\//value+for'` -sample_name4.1,,,,N703,AGGCAGAA,S508,CTAAGCCT,,
difficult/\//value+for'` -sample_name20.6,,,,N704,TCCTGAGC,S502,CTCTCTAT,,
difficult/\//value+for'` -sample_name20.5,,,,N704,TCCTGAGC,S503,TATCCTCT,,
difficult/\//value+for'` -sample_name20.4,,,,N704,TCCTGAGC,S504,AGAGTAGA,,
difficult/\//value+for'` -sample_name20.3,,,,N704,TCCTGAGC,S505,GTAAGGAG,,
difficult/\//value+for'` -sample_name20.1,,,,N704,TCCTGAGC,S508,CTAAGCCT,,
difficult/\//value+for'` -sample_name48.6,,,,N705,GGACTCCT,S502,CTCTCTAT,,
difficult/\//value+for'` -sample_name48.5,,,,N705,GGACTCCT,S503,TATCCTCT,,
difficult/\//value+for'` -sample_name48.4,,,,N705,GGACTCCT,S504,AGAGTAGA,,
difficult/\//value+for'` -sample_name48.3,,,,N705,GGACTCCT,S505,GTAAGGAG,,
difficult/\//value+for'` -sample_name48.1,,,,N705,GGACTCCT,S508,CTAAGCCT,,
Binary file not shown.
Binary file not shown.
23 changes: 23 additions & 0 deletions test/unit/test_illumina.py
Expand Up @@ -150,6 +150,29 @@ def test_tarball_fail_missing_data(self):
self.assertRaises(Exception, idir.get_RunInfo())


class TestDifficultSampleNames(TestCaseWithTmp):
    """Exercise FASTQ-to-BAM conversion with hard-to-handle sample names.

    The sample sheet used here carries sample IDs containing slashes,
    backslashes, quotes, backticks and spaces. The main expectation is that
    the conversion completes and yields a single, well-formed read group.
    """

    def test_paired_1(self):
        # Paired-end conversion driven by the fixture sample sheet and
        # RunInfo.xml found in this test's input directory.
        input_dir = util.file.get_test_input_path(self)
        bam_path = util.file.mkstempfname('.bam')
        # A '.txt' temp file is still requested, as before; nothing below
        # reads it.
        util.file.mkstempfname('.txt')

        def fixture(name):
            # Resolve a fixture file name against the test input directory.
            return os.path.join(input_dir, name)

        sheet_path = fixture('SampleSheet.csv')
        run_info_path = fixture('RunInfo.xml')
        read1_path = fixture('mebv-0-1_S5_L001_R1_001.fastq.gz')
        read2_path = fixture('mebv-0-1_S5_L001_R2_001.fastq.gz')

        illumina.miseq_fastq_to_bam(
            bam_path,
            sheet_path,
            read1_path,
            inFastq2=read2_path,
            runInfo=run_info_path,
        )

        read_groups = list(
            tools.samtools.SamtoolsTool().getReadGroups(bam_path).values()
        )
        self.assertEqual(len(read_groups), 1)
        read_group = read_groups[0]

        # we care more about Picard not failing, but if we care about how
        # filenames are processed, the LB and SM tags could also be checked:
        #   ('LB', 'difficult-value+for_-sample_name0.1')
        #   ('SM', 'difficult-value+for_-sample_name0.1')
        expected_tags = (
            ('ID', 'AEF96'),
            ('PL', 'illumina'),
            ('PU', 'AEF96.1.CGTACTAG-CTAAGCCT'),
            ('CN', 'M04004'),
        )
        for tag, value in expected_tags:
            self.assertEqual(read_group.get(tag), value)

        # Only the date prefix is pinned, matching the original check.
        self.assertTrue(read_group.get('DT', '').startswith('2015-08-2'))

class TestMiseqToBam(TestCaseWithTmp):

def test_paired_1(self):
Expand Down
2 changes: 1 addition & 1 deletion util/file.py
Expand Up @@ -342,7 +342,7 @@ def temp_catted_files(input_files, prefix=None, suffix=None):

def string_to_file_name(string_value):
replacements_dict = {
"\\": "_", # win directory separator
"\\": "-", # win directory separator
"/": "-", # posix directory separator
"^": "_", # caret
"&": "_and_", # background
Expand Down

0 comments on commit b659360

Please sign in to comment.