In [1]:
import glob
import os.path
import os
import tempfile

# Input samples: every gzipped FASTQ file under ./fastq.
# glob.glob makes no guarantee about ordering, so sort the paths to process
# the samples in the same order on every run.
fps = sorted(glob.glob('fastq/*fastq.gz'))

# Directory name used as the destination for per-sample output files.
output_dir = 'filtered-fastq'


In [2]:
for fp in fps:
    fn = os.path.split(fp)[1]
    basename = fn.replace('.fastq.gz', '')
    output_fp = os.path.join(output_dir, fn).replace('.gz', '')
    with tempfile.NamedTemporaryFile() as sam_f:
        sam_f_name = sam_f.name
        !bowtie2 -p 1 -x /Users/gregcaporaso/reference-data/hg19/hg19 --very-sensitive -U $fp -S $sam_f_name 2> /dev/null
        with tempfile.NamedTemporaryFile() as bam_f:
            bam_f_name = bam_f.name
            !samtools view -f 4 -F 256 -o $bam_f_name -b $sam_f_name 2> /dev/null
            !bedtools bamtofastq -i $bam_f_name -fq $output_fp
    !gzip -f $output_fp
    
    

In [5]:
# This cell just creates a log of how many reads were filtered per sample.
# It is very inefficient, but I decided I wanted this only after running the cell above.
# In a future workflow this will all happen in one step (e.g., determine how many and which
# reads are filtered from the .sam file, and log those so that a few can be confirmed).

import skbio.io

def _count_fastq_records(fastq_fp):
    """Return the number of records in the FASTQ file at ``fastq_fp``.

    The file is parsed with scikit-bio using the 'illumina1.8' quality
    variant, and the records are counted one at a time as they are read.
    """
    return sum(1 for _ in skbio.io.read(fastq_fp, format='fastq',
                                        variant='illumina1.8'))


# Tab-separated summary with one row per sample.
log_fp = 'human-filter.log'

with open(log_fp, 'w') as log_fh:
    log_fh.write('id\tinput-reads\toutput-reads\tfiltered-reads\n')
    for input_fp in fps:
        fn = os.path.split(input_fp)[1]
        # The filtered file has the same file name as the input, but lives in
        # output_dir.
        filtered_fp = os.path.join(output_dir, fn)
        n_input_seqs = _count_fastq_records(input_fp)
        n_output_seqs = _count_fastq_records(filtered_fp)
        log_fh.write('%s\t%d\t%d\t%d\n' % (fn, n_input_seqs, n_output_seqs, n_input_seqs - n_output_seqs))
        # Flush after every sample so progress can be followed while the
        # (slow) counting is still running.
        log_fh.flush()
    
    