Skip to content

Commit

Permalink
Merge pull request #217 from broadinstitute/escape-sample-names
Browse files Browse the repository at this point in the history
Escape sample names
  • Loading branch information
tomkinsc committed Feb 27, 2016
2 parents 17d85f9 + 8ccebaa commit 9f5ac1d
Show file tree
Hide file tree
Showing 9 changed files with 130 additions and 3 deletions.
1 change: 1 addition & 0 deletions illumina.py
Expand Up @@ -414,6 +414,7 @@ def _detect_and_load_sheet(self, infile):

# populate library IDs, run IDs (ie BAM filenames)
for row in self.rows:
row['sample'] = util.file.string_to_file_name(row['sample'])
row['library'] = row['sample']
if row.get('library_id_per_sample'):
row['library'] += '.l' + row['library_id_per_sample']
Expand Down
15 changes: 15 additions & 0 deletions test/input/TestDifficultSampleNames/RunInfo.xml
@@ -0,0 +1,15 @@
<?xml version="1.0"?>
<RunInfo xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Version="2">
<Run Id="150821_M04004_0006_000000000-AEF96" Number="6">
<Flowcell>000000000-AEF96</Flowcell>
<Instrument>M04004</Instrument>
<Date>150821</Date>
<Reads>
<Read Number="1" NumCycles="101" IsIndexedRead="N" />
<Read Number="2" NumCycles="8" IsIndexedRead="Y" />
<Read Number="3" NumCycles="8" IsIndexedRead="Y" />
<Read Number="4" NumCycles="101" IsIndexedRead="N" />
</Reads>
<FlowcellLayout LaneCount="1" SurfaceCount="2" SwathCount="1" TileCount="14" />
</Run>
</RunInfo>
40 changes: 40 additions & 0 deletions test/input/TestDifficultSampleNames/SampleSheet.csv
@@ -0,0 +1,40 @@
[Header],,,,,,,,,
IEMFileVersion,4,,,,,,,,
Investigator Name,James,,,,,,,,
Experiment Name,EBoV HS ,,,,,,,,
Date,8/18/2015,,,,,,,,
Workflow,GenerateFASTQ,,,,,,,,
Application,FASTQ Only,,,,,,,,
Assay,Nextera XT,,,,,,,,
Description,,,,,,,,,
Chemistry,Amplicon,,,,,,,,
,,,,,,,,,
[Reads],,,,,,,,,
101,,,,,,,,,
101,,,,,,,,,
,,,,,,,,,
[Settings],,,,,,,,,
ReverseComplement,0,,,,,,,,
,,,,,,,,,
[Data],,,,,,,,,
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
difficult/\//value+for'` -〠데이터Sénégalsample_name0.6,,,,N702,CGTACTAG,S502,CTCTCTAT,,
difficult/\//value+for'` -Sénégalsample_name0.5,,,,N702,CGTACTAG,S503,TATCCTCT,,
difficult/\//value+for'` -Sénégalsample_name0.4,,,,N702,CGTACTAG,S504,AGAGTAGA,,
difficult/\//value+for'` -Sénégalsample_name0.3,,,,N702,CGTACTAG,S505,GTAAGGAG,,
difficult/\//value+for'` -Sénégalsample_name0.1,,,,N702,CGTACTAG,S508,CTAAGCCT,,
difficult/\//value+for'` -Sénégalsample_name4.6,,,,N703,AGGCAGAA,S502,CTCTCTAT,,
difficult/\//value+for'` -Sénégalsample_name4.5,,,,N703,AGGCAGAA,S503,TATCCTCT,,
difficult/\//value+for'` -Sénégalsample_name4.4,,,,N703,AGGCAGAA,S504,AGAGTAGA,,
difficult/\//value+for'` -Sénégalsample_name4.3,,,,N703,AGGCAGAA,S505,GTAAGGAG,,
difficult/\//value+for'` -Sénégalsample_name4.1,,,,N703,AGGCAGAA,S508,CTAAGCCT,,
difficult/\//value+for'` -Sénégalsample_name20.6,,,,N704,TCCTGAGC,S502,CTCTCTAT,,
difficult/\//value+for'` -Sénégalsample_name20.5,,,,N704,TCCTGAGC,S503,TATCCTCT,,
difficult/\//value+for'` -Sénégalsample_name20.4,,,,N704,TCCTGAGC,S504,AGAGTAGA,,
difficult/\//value+for'` -Sénégalsample_name20.3,,,,N704,TCCTGAGC,S505,GTAAGGAG,,
difficult/\//value+for'` -Sénégalsample_name20.1,,,,N704,TCCTGAGC,S508,CTAAGCCT,,
difficult/\//value+for'` -Sénégalsample_name48.6,,,,N705,GGACTCCT,S502,CTCTCTAT,,
difficult/\//value+for'` -Sénégalsample_name48.5,,,,N705,GGACTCCT,S503,TATCCTCT,,
difficult/\//value+for'` -Sénégalsample_name48.4,,,,N705,GGACTCCT,S504,AGAGTAGA,,
difficult/\//value+for'` -Sénégalsample_name48.3,,,,N705,GGACTCCT,S505,GTAAGGAG,,
difficult/\//value+for'` -Sénégalsample_name48.1,,,,N705,GGACTCCT,S508,CTAAGCCT,,
Binary file not shown.
Binary file not shown.
23 changes: 23 additions & 0 deletions test/unit/test_illumina.py
@@ -1,4 +1,5 @@
# Unit tests for illumina.py
# -*- coding: utf-8 -*-

__author__ = "dpark@broadinstitute.org"

Expand Down Expand Up @@ -150,6 +151,28 @@ def test_tarball_fail_missing_data(self):
self.assertRaises(Exception, idir.get_RunInfo())


class TestDifficultSampleNames(TestCaseWithTmp):
    """Check read-group metadata when sample names need file-name escaping."""

    def test_paired_1(self):
        """Convert a paired MiSeq FASTQ fixture to BAM and verify its read group.

        The expected LB and SM values are the escaped form of the sample
        name, not the raw string from the sample sheet.
        """
        input_dir = util.file.get_test_input_path(self)
        bam_path = util.file.mkstempfname('.bam')
        # Reserved temp file name; none of the assertions below read it.
        header_path = util.file.mkstempfname('.txt')
        sample_sheet_path = os.path.join(input_dir, 'SampleSheet.csv')
        run_info_path = os.path.join(input_dir, 'RunInfo.xml')
        read1_path = os.path.join(input_dir, 'mebv-0-1_S5_L001_R1_001.fastq.gz')
        read2_path = os.path.join(input_dir, 'mebv-0-1_S5_L001_R2_001.fastq.gz')

        illumina.miseq_fastq_to_bam(
            bam_path,
            sample_sheet_path,
            read1_path,
            inFastq2=read2_path,
            runInfo=run_info_path
        )

        read_groups = list(tools.samtools.SamtoolsTool().getReadGroups(bam_path).values())
        self.assertEqual(len(read_groups), 1)
        read_group = read_groups[0]

        # (tag, expected value) pairs, checked in this order.
        expected_tags = (
            ('ID', 'AEF96'),
            ('PL', 'illumina'),
            ('PU', 'AEF96.1.CGTACTAG-CTAAGCCT'),
            ('LB', u'difficult-value+for_-Sénégalsample_name0.1'),
            ('SM', u'difficult-value+for_-Sénégalsample_name0.1'),
            ('CN', 'M04004'),
        )
        for tag, expected_value in expected_tags:
            self.assertEqual(read_group.get(tag), expected_value)

        # Only the date prefix is pinned.
        self.assertTrue(read_group.get('DT', '').startswith('2015-08-2'))

class TestMiseqToBam(TestCaseWithTmp):

def test_paired_1(self):
Expand Down
2 changes: 1 addition & 1 deletion tools/kraken.py
Expand Up @@ -57,7 +57,7 @@ class Jellyfish(tools.Tool):
def __init__(self, install_methods=None):
if not install_methods:
install_methods = []
install_methods.append(tools.CondaPackage("jellyfish1", version=JELLYFISH_VERSION))
install_methods.append(tools.CondaPackage("jellyfish", version=JELLYFISH_VERSION))
install_methods.append(
DownloadAndInstallJellyfish(
JELLYFISH_URL, os.path.join(JELLYFISH_DIR, 'bin', 'jellyfish')
Expand Down
4 changes: 2 additions & 2 deletions tools/samtools.py
Expand Up @@ -120,8 +120,8 @@ def getHeader(self, inBam):
''' fetch BAM header as a list of tuples (already split on tabs) '''
tmpf = util.file.mkstempfname('.txt')
self.dumpHeader(inBam, tmpf)
with open(tmpf, 'rt') as inf:
header = list(line.rstrip('\n').split('\t') for line in inf)
with open(tmpf, 'rb') as inf:
header = list(line.decode("latin-1").rstrip('\n').split('\t') for line in inf)
os.unlink(tmpf)
return header

Expand Down
48 changes: 48 additions & 0 deletions util/file.py
Expand Up @@ -339,3 +339,51 @@ def temp_catted_files(input_files, prefix=None, suffix=None):
yield fn
finally:
os.remove(fn)

# Replacement table for string_to_file_name(): characters (or short character
# sequences) that are awkward in file names or on a shell command line, mapped
# to safer stand-ins.
_FILE_NAME_REPLACEMENTS = {
    "\\": "-",  # win directory separator
    "/": "-",  # posix directory separator
    "^": "_",  # caret
    "&": "_and_",  # background
    "\"": "",  # double quotes
    r"'": "",  # single quotes
    r":": "_",  # colon (problem for ntfs)
    r" ": "_",  # spaces
    r"|": "-",  # shouldn't confuse a vertical bar for a shell pipe
    r"!": ".",  # not a bash operator
    r";": ".",  # not a terminator
    r"?": "_",  # could be mistaken for a wildcard
    r"*": "_",  # could be mistaken for a wildcard
    r"`": "_",  # no subshells
    r" -": "_-",  # could be mistaken for an argument
    r" --": "_--",  # could be mistaken for an argument
    r">": "]",  # no redirect chars
    r"<": "[",  # no redirect chars
    # NOTE: as raw strings these two keys are TWO literal backslashes
    # followed by the letter (presumably an escaped hex/octal prefix).
    r"\\x": "_",  # hex char
    r"\\o": "_",  # octal char
}

# ASCII control characters plus DEL and the C1 control range (code points
# 0-31 and 127-159).
_FILE_NAME_CONTROL_CHARS_RE = re.compile(r'[\x00-\x1f\x7f-\x9f]')

# Regex alternation takes the FIRST alternative that matches, so the keys are
# ordered longest-first.  Otherwise a one-character key such as "\\" or " "
# would always pre-empt the longer keys that start with it, and the outcome
# would depend on dict iteration order.
_FILE_NAME_REPLACEMENTS_RE = re.compile(
    r'|'.join(
        re.escape(key)
        for key in sorted(_FILE_NAME_REPLACEMENTS, key=len, reverse=True)
    )
)

_FILE_NAME_UNDERSCORE_RUN_RE = re.compile(r'_{2,}')
_FILE_NAME_DASH_RUN_RE = re.compile(r'-{2,}')


def string_to_file_name(string_value):
    """Return a copy of ``string_value`` made safer for use as a file name.

    The transformation is applied in this order:

    1. every control / non-printable character (code points 0-31 and
       127-159) becomes ``_``;
    2. every key of the replacement table is swapped for its stand-in
       (longest key first);
    3. runs of underscores collapse to a single ``_``;
    4. runs of dashes collapse to a single ``-``;
    5. leading and trailing periods are removed.

    Other characters, including non-ASCII letters, are left untouched.

    Args:
        string_value: the text to sanitize (e.g. a sample name).

    Returns:
        The sanitized string.  It can be empty, for instance when the input
        consists only of quotes and/or periods.
    """
    sanitized = _FILE_NAME_CONTROL_CHARS_RE.sub("_", string_value)

    # replacements from the table above
    sanitized = _FILE_NAME_REPLACEMENTS_RE.sub(
        lambda match: _FILE_NAME_REPLACEMENTS.get(match.group(), "_"),
        sanitized
    )

    # condense runs of underscores, then runs of dashes
    sanitized = _FILE_NAME_UNDERSCORE_RUN_RE.sub("_", sanitized)
    sanitized = _FILE_NAME_DASH_RUN_RE.sub("-", sanitized)

    # remove leading or trailing periods (no hidden files (*NIX) or missing
    # file extensions (NTFS))
    return sanitized.strip(".")

0 comments on commit 9f5ac1d

Please sign in to comment.