Merge pull request #217 from broadinstitute/escape-sample-names

Escape sample names
broadinstitute · Feb 27, 2016 · 9f5ac1d · 9f5ac1d
2 parents 17d85f9 + 8ccebaa
commit 9f5ac1d
Show file tree

Hide file tree

Showing 9 changed files with 130 additions and 3 deletions.
diff --git a/illumina.py b/illumina.py
@@ -414,6 +414,7 @@ def _detect_and_load_sheet(self, infile):
 
         # populate library IDs, run IDs (ie BAM filenames)
         for row in self.rows:
+            row['sample'] = util.file.string_to_file_name(row['sample'])
             row['library'] = row['sample']
             if row.get('library_id_per_sample'):
                 row['library'] += '.l' + row['library_id_per_sample']

diff --git a/test/input/TestDifficultSampleNames/RunInfo.xml b/test/input/TestDifficultSampleNames/RunInfo.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+<RunInfo xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Version="2">
+  <Run Id="150821_M04004_0006_000000000-AEF96" Number="6">
+    <Flowcell>000000000-AEF96</Flowcell>
+    <Instrument>M04004</Instrument>
+    <Date>150821</Date>
+    <Reads>
+      <Read Number="1" NumCycles="101" IsIndexedRead="N" />
+      <Read Number="2" NumCycles="8" IsIndexedRead="Y" />
+      <Read Number="3" NumCycles="8" IsIndexedRead="Y" />
+      <Read Number="4" NumCycles="101" IsIndexedRead="N" />
+    </Reads>
+    <FlowcellLayout LaneCount="1" SurfaceCount="2" SwathCount="1" TileCount="14" />
+  </Run>
+</RunInfo>
diff --git a/test/input/TestDifficultSampleNames/SampleSheet.csv b/test/input/TestDifficultSampleNames/SampleSheet.csv
@@ -0,0 +1,40 @@
+[Header],,,,,,,,,
+IEMFileVersion,4,,,,,,,,
+Investigator Name,James,,,,,,,,
+Experiment Name,EBoV HS ,,,,,,,,
+Date,8/18/2015,,,,,,,,
+Workflow,GenerateFASTQ,,,,,,,,
+Application,FASTQ Only,,,,,,,,
+Assay,Nextera XT,,,,,,,,
+Description,,,,,,,,,
+Chemistry,Amplicon,,,,,,,,
+,,,,,,,,,
+[Reads],,,,,,,,,
+101,,,,,,,,,
+101,,,,,,,,,
+,,,,,,,,,
+[Settings],,,,,,,,,
+ReverseComplement,0,,,,,,,,
+,,,,,,,,,
+[Data],,,,,,,,,
+Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
+difficult/\//value+for'` -〠데이터Sénégalsample_name0.6,,,,N702,CGTACTAG,S502,CTCTCTAT,,
+difficult/\//value+for'` -Sénégalsample_name0.5,,,,N702,CGTACTAG,S503,TATCCTCT,,
+difficult/\//value+for'` -Sénégalsample_name0.4,,,,N702,CGTACTAG,S504,AGAGTAGA,,
+difficult/\//value+for'` -Sénégalsample_name0.3,,,,N702,CGTACTAG,S505,GTAAGGAG,,
+difficult/\//value+for'` -Sénégalsample_name0.1,,,,N702,CGTACTAG,S508,CTAAGCCT,,
+difficult/\//value+for'` -Sénégalsample_name4.6,,,,N703,AGGCAGAA,S502,CTCTCTAT,,
+difficult/\//value+for'` -Sénégalsample_name4.5,,,,N703,AGGCAGAA,S503,TATCCTCT,,
+difficult/\//value+for'` -Sénégalsample_name4.4,,,,N703,AGGCAGAA,S504,AGAGTAGA,,
+difficult/\//value+for'` -Sénégalsample_name4.3,,,,N703,AGGCAGAA,S505,GTAAGGAG,,
+difficult/\//value+for'` -Sénégalsample_name4.1,,,,N703,AGGCAGAA,S508,CTAAGCCT,,
+difficult/\//value+for'` -Sénégalsample_name20.6,,,,N704,TCCTGAGC,S502,CTCTCTAT,,
+difficult/\//value+for'` -Sénégalsample_name20.5,,,,N704,TCCTGAGC,S503,TATCCTCT,,
+difficult/\//value+for'` -Sénégalsample_name20.4,,,,N704,TCCTGAGC,S504,AGAGTAGA,,
+difficult/\//value+for'` -Sénégalsample_name20.3,,,,N704,TCCTGAGC,S505,GTAAGGAG,,
+difficult/\//value+for'` -Sénégalsample_name20.1,,,,N704,TCCTGAGC,S508,CTAAGCCT,,
+difficult/\//value+for'` -Sénégalsample_name48.6,,,,N705,GGACTCCT,S502,CTCTCTAT,,
+difficult/\//value+for'` -Sénégalsample_name48.5,,,,N705,GGACTCCT,S503,TATCCTCT,,
+difficult/\//value+for'` -Sénégalsample_name48.4,,,,N705,GGACTCCT,S504,AGAGTAGA,,
+difficult/\//value+for'` -Sénégalsample_name48.3,,,,N705,GGACTCCT,S505,GTAAGGAG,,
+difficult/\//value+for'` -Sénégalsample_name48.1,,,,N705,GGACTCCT,S508,CTAAGCCT,,
diff --git a/test/input/TestDifficultSampleNames/mebv-0-1_S5_L001_R1_001.fastq.gz b/test/input/TestDifficultSampleNames/mebv-0-1_S5_L001_R1_001.fastq.gz
diff --git a/test/input/TestDifficultSampleNames/mebv-0-1_S5_L001_R2_001.fastq.gz b/test/input/TestDifficultSampleNames/mebv-0-1_S5_L001_R2_001.fastq.gz
diff --git a/test/unit/test_illumina.py b/test/unit/test_illumina.py
@@ -1,4 +1,5 @@
 # Unit tests for illumina.py
+# -*- coding: utf-8 -*-
 
 __author__ = "dpark@broadinstitute.org"
 
@@ -150,6 +151,28 @@ def test_tarball_fail_missing_data(self):
             self.assertRaises(Exception, idir.get_RunInfo())
 
 
class TestDifficultSampleNames(TestCaseWithTmp):
    """Sample sheets whose sample names contain path separators, quotes,
    backticks, spaces and non-ASCII text must still convert to a BAM whose
    read group carries the sanitized name."""

    def test_paired_1(self):
        """Convert one paired MiSeq FASTQ and check every read-group field."""
        inDir = util.file.get_test_input_path(self)
        outBam = util.file.mkstempfname('.bam')
        sampleSheet = os.path.join(inDir, 'SampleSheet.csv')
        runInfo = os.path.join(inDir, 'RunInfo.xml')
        fastq = (os.path.join(inDir, 'mebv-0-1_S5_L001_R1_001.fastq.gz'),
                 os.path.join(inDir, 'mebv-0-1_S5_L001_R2_001.fastq.gz'))
        illumina.miseq_fastq_to_bam(outBam, sampleSheet, fastq[0], inFastq2=fastq[1], runInfo=runInfo)

        # Exactly one read group is expected in the output BAM.
        read_groups = list(tools.samtools.SamtoolsTool().getReadGroups(outBam).values())
        self.assertEqual(len(read_groups), 1)
        rg = read_groups[0]

        self.assertEqual(rg.get('ID'), 'AEF96')
        self.assertEqual(rg.get('PL'), 'illumina')
        self.assertEqual(rg.get('PU'), 'AEF96.1.CGTACTAG-CTAAGCCT')
        # LB and SM must hold the escaped form of the raw sample-sheet name:
        # separators collapsed to '-', quote dropped, backtick/space to '_'.
        self.assertEqual(rg.get('LB'), u'difficult-value+for_-Sénégalsample_name0.1')
        self.assertEqual(rg.get('SM'), u'difficult-value+for_-Sénégalsample_name0.1')
        self.assertEqual(rg.get('CN'), 'M04004')
        self.assertTrue(rg.get('DT', '').startswith('2015-08-2'))
+
 class TestMiseqToBam(TestCaseWithTmp):
 
     def test_paired_1(self):

diff --git a/tools/kraken.py b/tools/kraken.py
@@ -57,7 +57,7 @@ class Jellyfish(tools.Tool):
     def __init__(self, install_methods=None):
         if not install_methods:
             install_methods = []
-            install_methods.append(tools.CondaPackage("jellyfish1", version=JELLYFISH_VERSION))
+            install_methods.append(tools.CondaPackage("jellyfish", version=JELLYFISH_VERSION))
             install_methods.append(
                 DownloadAndInstallJellyfish(
                     JELLYFISH_URL, os.path.join(JELLYFISH_DIR, 'bin', 'jellyfish')

diff --git a/tools/samtools.py b/tools/samtools.py
@@ -120,8 +120,8 @@ def getHeader(self, inBam):
         ''' fetch BAM header as a list of tuples (already split on tabs) '''
         tmpf = util.file.mkstempfname('.txt')
         self.dumpHeader(inBam, tmpf)
-        with open(tmpf, 'rt') as inf:
-            header = list(line.rstrip('\n').split('\t') for line in inf)
+        with open(tmpf, 'rb') as inf:
+            header = list(line.decode("latin-1").rstrip('\n').split('\t') for line in inf)
         os.unlink(tmpf)
         return header
 

diff --git a/util/file.py b/util/file.py
@@ -339,3 +339,51 @@ def temp_catted_files(input_files, prefix=None, suffix=None):
         yield fn
     finally:
         os.remove(fn)
+
# Literal substrings that are unsafe or awkward in file names, mapped to the
# text that replaces them.
_FILE_NAME_REPLACEMENTS = {
    "\\": "-",  # win directory separator
    "/": "-",  # posix directory separator
    "^": "_",  # caret
    "&": "_and_",  # background
    "\"": "",  # double quotes
    r"'": "",  # single quotes
    r":": "_",  # colon (problem for ntfs)
    r" ": "_",  # spaces
    r"|": "-",  # shouldn't confuse a vertical bar for a shell pipe
    r"!": ".",  # not a bash operator
    r";": ".",  # not a terminator
    r"?": "_",  # could be mistaken for a wildcard
    r"*": "_",  # could be mistaken for a wildcard
    r"`": "_",  # no subshells
    r" -": "_-",  # could be mistaken for an argument
    r" --": "_--",  # could be mistaken for an argument
    r">": "]",  # no redirect chars
    r"<": "[",  # no redirect chars
    # NOTE(review): being raw strings, the next two keys are TWO literal
    # backslashes followed by a letter, not a single-backslash escape.
    # Confirm that is the intended spelling.
    r"\\x": "_",  # hex char
    r"\\o": "_",  # octal char
}

# ASCII control characters plus DEL and the C1 range (code points 0-31, 127-159).
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x1f\x7f-\x9f]')

# Regex alternation picks the first alternative that matches, not the longest,
# so longer keys must come first or " -" would always lose to " ".
_FILE_NAME_REPLACEMENTS_RE = re.compile(
    '|'.join(re.escape(key) for key in sorted(_FILE_NAME_REPLACEMENTS, key=len, reverse=True))
)

_UNDERSCORE_RUN_RE = re.compile(r'_{2,}')
_DASH_RUN_RE = re.compile(r'-{2,}')


def string_to_file_name(string_value):
    """Return a copy of string_value that is safe to use as a file name.

    Steps, in order:
      1. each control / non-printable character becomes "_";
      2. every substring listed in _FILE_NAME_REPLACEMENTS is swapped for its
         replacement (the longest matching key wins);
      3. runs of "_" collapse to one "_", then runs of "-" collapse to one "-";
      4. leading and trailing periods are removed (no hidden files on *NIX,
         no missing file extensions on NTFS).

    Characters not covered above, including non-ASCII letters, pass through
    unchanged. The result may be an empty string, e.g. for input "...".
    """
    string_value = _CONTROL_CHARS_RE.sub("_", string_value)
    string_value = _FILE_NAME_REPLACEMENTS_RE.sub(
        lambda match: _FILE_NAME_REPLACEMENTS[match.group()], string_value
    )
    string_value = _UNDERSCORE_RUN_RE.sub("_", string_value)
    string_value = _DASH_RUN_RE.sub("-", string_value)
    return string_value.strip(".")