In [1]:
import pandas as pd
import numpy as np
import os
import pysam
import pybedtools
import glob
from collections import defaultdict, OrderedDict
from subprocess import Popen, PIPE, check_call
from tqdm import tnrange, tqdm_notebook

In [2]:
prev_output = '/projects/ps-yeolab3/bay001/for_eric/scratch/B_samples/non_encode_batch19/results/'
curr_output = '/projects/ps-yeolab4/bay001/tmp/non_encode_batch19_2/results/'

prev_version = '0.4.0'
curr_version = '0.4.0'

In [3]:
def get_file_extensions_from_version(version):
    """
    Returns a dictionary containing the extensions used for each version.
    """
    extensions = defaultdict(str)
    if version == '0.2.2':
        extensions['trimmed_fastqs'] = '*.r1TrTr.fq.gz'
        extensions['repeat_alignments'] = '*.r1TrTr.sorted.STARAligned.out.bam'
        extensions['genome_alignments'] = '*.r1TrTr.sorted.STARUnmapped.out.sorted.STARAligned.outSo.bam'
        extensions['clipper_peaks'] = '*.r1TrTr.sorted.STARUnmapped.out.sorted.STARAligned.outSo.rmDupSo.peakClusters.bed'
        extensions['inputnorm_peaks'] = '*.r1TrTr.sorted.STARUnmapped.out.sorted.STARAligned.outSo.rmDupSo.peakClusters.normed.compressed.bed'
    elif version == '0.4.0':
        extensions['trimmed_fastqs'] = '*.r1.fqTrTr.sorted.fq.gz'
        extensions['repeat_alignments'] = '*.r1.fq.repeat-mapped.bam'
        extensions['genome_alignments'] = '*.r1.fq.genome-mappedSoSo.bam'
        extensions['clipper_peaks'] = '*.r1.fq.genome-mappedSoSo.rmDupSo.peakClusters.bed'
        extensions['inputnorm_peaks'] = '*.r1.fq.genome-mappedSoSo.rmDupSo.peakClusters.normed.compressed.bed'
    else:
        print("Version not supported yet")
        return 1
    return extensions

In [4]:
file_ext_dict = get_file_extensions_from_version('0.4.0')
prev_trimmed_fastqs = sorted(glob.glob(os.path.join(prev_output, file_ext_dict['trimmed_fastqs'])))
len(prev_trimmed_fastqs)

24

In [5]:
file_ext_dict = get_file_extensions_from_version('0.4.0')
curr_trimmed_fastqs = sorted(glob.glob(os.path.join(curr_output, file_ext_dict['trimmed_fastqs'])))
len(curr_trimmed_fastqs)

0

In [6]:
def get_num_reads_from_fastq(fastq):
    """
    Returns the number of 
    """
    cmd = "zcat {} | wc -l".format(fastq)
    x = Popen(cmd, stdout=PIPE, shell=True)
    num_lines = int(x.communicate()[0].strip())
    try:
        assert num_lines % 4 == 0 # otherwise this might be a truncated fastq.
        return int(num_lines/4)
    except AssertionError:
        print("Number of lines not divisible by 4!")
        return -1
    
def compare_trimmed_fastqs(fastqs):
    """
    returns the ratio of min/max reads for two fastq files. 
    also returns a dictionary of the actual reads for each fastq.
    """
    num_reads_dict = OrderedDict()
    num_reads_arr = []
    for fastq in fastqs:
        num_read = get_num_reads_from_fastq(fastq)
        num_reads_dict[os.path.basename(fastq)] = num_read
        num_reads_arr.append(num_read)
    
    try:
        assert num_reads_arr[0] == num_reads_arr[1]
        similarity = 1
    except AssertionError:
        similarity = min(num_reads_arr)/float(max(num_reads_arr))
        
    return similarity, num_reads_dict

In [7]:
compare_trimmed_fastqs([curr_trimmed_fastqs[1], curr_trimmed_fastqs[1]])

IndexError: list index out of range

In [None]:
file_ext_dict = get_file_extensions_from_version('0.4.0')
prev_genome_bams = sorted(glob.glob(os.path.join(prev_output, file_ext_dict['repeat_alignments'])))
prev_genome_bams

In [None]:
file_ext_dict = get_file_extensions_from_version('0.4.0')
curr_genome_bams = sorted(glob.glob(os.path.join(curr_output, file_ext_dict['repeat_alignments'])))
curr_genome_bams

In [None]:
TMP_DIR = "/home/bay001/scratch/.tmp/"
def get_mapped_num_from_bam_file(bam):
    """ 
    From a bam file, return the number of mapped reads. 
    If the bam file is unsorted or unindexed, this function will sort/index 
    """
    if not os.path.exists(bam + '.bai'):
        try:
            check_call(["samtools index {}".format(bam)], shell=True)
        except Exception as e:
            sorted_bam = os.path.join(TMP_DIR, os.path.basename(bam) + ".sorted.bam")
            check_call(["samtools sort -o {} {}".format(sorted_bam, bam)], shell=True)
            check_call(["samtools index {}".format(sorted_bam)], shell=True)
            bam = sorted_bam
    samfile = pysam.AlignmentFile(bam, "rb")
    print(samfile.mapped)
    return samfile.mapped

def compare_bam_files(bams):
    """
    Returns the ratio of min/max read numbers from two bam files
    Also returns a dictionary containing the mapped read numbers for each bam file.
    """
    num_mapped_reads_dict = OrderedDict()
    num_mapped_reads_arr = []
    for bam in bams:
        num_mapped_reads = get_mapped_num_from_bam_file(bam)
        num_mapped_reads_dict[os.path.basename(bam)] = num_mapped_reads
        num_mapped_reads_arr.append(num_mapped_reads)
    
    try:
        assert num_mapped_reads_arr[0] == num_mapped_reads_arr[1]
        similarity = 1
    except AssertionError:
        similarity = min(num_mapped_reads_arr)/float(max(num_mapped_reads_arr))
        
    return similarity, num_mapped_reads_dict

In [None]:
# compare_bam_files([prev_genome_bams[0], curr_genome_bams[0]])

In [None]:
def compare_bed_files(beds):
    num_intervals = []
    bedtools = []
    num_intervals_dict = defaultdict(int)
    for bed in sorted(beds):
        bedtool = pybedtools.BedTool(bed)
        bedtool = bedtool.merge(s=True, c='4,5,6', o='distinct,distinct,distinct')
        num_intervals.append(bedtool.count()) # after merge
        num_intervals_dict[os.path.basename(bed)] = bedtool.count()
        bedtools.append(bedtool)
    assert len(bedtools) == 2
    num_intersecting = bedtools[0].intersect(
        bedtools[1], f=0.50, r=True, s=True
    ).count()
    similarity = [
        (num_intersecting / float(num_intervals[0])),
        (num_intersecting / float(num_intervals[1]))
    ]
    return similarity, num_intervals_dict

In [None]:
prev_file_ext_dict = get_file_extensions_from_version(prev_version)
prev_beds = sorted(glob.glob(os.path.join(prev_output, prev_file_ext_dict['clipper_peaks'])))

curr_file_ext_dict = get_file_extensions_from_version(curr_version)
curr_beds = sorted(glob.glob(os.path.join(curr_output, curr_file_ext_dict['clipper_peaks'])))

curr_beds

In [None]:
prev_beds

In [None]:
compare_bed_files([prev_beds[0], curr_beds[0]])

In [None]:
def get_prefix(fn):
    """
    Assumes that the first two elements of a file correspond to the dataset and sample, which 
    in combination will be unique.
    """
    parts = os.path.basename(fn).split('.')
    return '.'.join([parts[0], parts[1]]) # returns the dataset (batch) name and the sample name.

def compare_outputs(prev_output, prev_version, curr_output, curr_version):
    """
    Returns the prefix of files from previous and current output directories. 
    This shouldn't change unless the filename structure changes.
    """
    prev_file_ext_dict = get_file_extensions_from_version(prev_version)
    curr_file_ext_dict = get_file_extensions_from_version(curr_version)
    
    prev_trimmed_fastqs = sorted(glob.glob(os.path.join(prev_output, prev_file_ext_dict['trimmed_fastqs'])))
    curr_trimmed_fastqs = sorted(glob.glob(os.path.join(curr_output, curr_file_ext_dict['trimmed_fastqs'])))
    
    prev_repeat_alignments = sorted(glob.glob(os.path.join(prev_output, prev_file_ext_dict['repeat_alignments'])))
    curr_repeat_alignments = sorted(glob.glob(os.path.join(curr_output, curr_file_ext_dict['repeat_alignments'])))
    
    prev_genome_alignments = sorted(glob.glob(os.path.join(prev_output, prev_file_ext_dict['genome_alignments'])))
    curr_genome_alignments = sorted(glob.glob(os.path.join(curr_output, curr_file_ext_dict['genome_alignments'])))
    
    prev_clipper_peaks = sorted(glob.glob(os.path.join(prev_output, prev_file_ext_dict['clipper_peaks'])))
    curr_clipper_peaks = sorted(glob.glob(os.path.join(curr_output, curr_file_ext_dict['clipper_peaks'])))
    
    prev_inputnorm_peaks = sorted(glob.glob(os.path.join(prev_output, prev_file_ext_dict['inputnorm_peaks'])))
    curr_inputnorm_peaks = sorted(glob.glob(os.path.join(curr_output, curr_file_ext_dict['inputnorm_peaks'])))
    
    
    try:
        assert len(prev_clipper_peaks) == len(curr_clipper_peaks)
    except AssertionError:
        print("WARN: list of clipper files are different! {} vs {}".format(len(prev_clipper_peaks), len(curr_clipper_peaks)))
    
    trimmed_files = pair_samples(prev_trimmed_fastqs, curr_trimmed_fastqs)
    repeat_mapped_files = pair_samples(prev_repeat_alignments, curr_repeat_alignments)
    genome_mapped_files = pair_samples(prev_genome_alignments, curr_genome_alignments)
    clipper_files = pair_samples(prev_clipper_peaks, curr_clipper_peaks)
    inputnormed_files = pair_samples(prev_inputnorm_peaks, curr_inputnorm_peaks)
    
    results = defaultdict(dict)
    progress = tnrange(len(trimmed_files.keys()))
    for sample_id in trimmed_files.keys():
        prev = trimmed_files[sample_id]['prev']
        curr = trimmed_files[sample_id]['curr']
        trimmed_files_similarity, trimmed_files_dict = compare_trimmed_fastqs([prev, curr])
        results[sample_id]["Cutadapt"] = trimmed_files_similarity
        progress.update(1)
    
    """
    progress = tnrange(len(genome_mapped_files.keys()))
    for sample_id in genome_mapped_files.keys():
        prev = genome_mapped_files[sample_id]['prev']
        curr = genome_mapped_files[sample_id]['curr']
        genome_mapped_files_similarity, genome_mapped_files_dict = compare_bam_files([prev, curr])
        results[sample_id]["Genome"] = genome_mapped_files_similarity
        progress.update(1)
    """
        
    progress = tnrange(len(clipper_files.keys()))
    for sample_id in clipper_files.keys():
        prev = clipper_files[sample_id]['prev']
        curr = clipper_files[sample_id]['curr']
        clipper_peaks_similarity, clipper_peaks_dict = compare_bed_files([prev, curr])
        results[sample_id]["Clipper"] = clipper_peaks_similarity
        progress.update(1)
        
    return results
    
def pair_samples(prev_samples, curr_samples):
    """
    Given two lists of files, group and return the pairs with matching prefixes.
    """
    samples_dict = defaultdict(dict)
    
    for prev_sample in prev_samples:
        samples_dict[get_prefix(prev_sample)]['prev'] = prev_sample
    
    for curr_sample in curr_samples:
        samples_dict[get_prefix(curr_sample)]['curr'] = curr_sample
        
    return samples_dict


In [None]:
df = compare_outputs(
    prev_output, prev_version, curr_output, curr_version
)

In [None]:
pd.DataFrame(df)