In [None]:
import os, fnmatch
from Bio import SeqIO
from Bio.Seq import Seq
import random
import pandas as pd
import difflib

# find all files, output names
def findFastqFiles(directory, pattern):
    """Recursively collect paths of files whose basename matches a glob pattern.

    directory -- root folder handed to os.walk
    pattern   -- fnmatch-style pattern tested against each file's basename

    Returns a list of paths (each one is the walked folder joined with the
    basename), in the order os.walk yields them.
    """
    matches = []
    for folder, _subdirs, names in os.walk(directory):
        # fnmatch.filter keeps only the basenames that satisfy the pattern
        matches.extend(os.path.join(folder, name)
                       for name in fnmatch.filter(names, pattern))
    return matches

# parse sequences, output percent
def parseFastqSeq(array):
    """Return the fraction of reads longer than 30 bases across FASTQ files.

    array -- iterable of FASTQ file paths

    Returns a float between 0.0 and 1.0 (a fraction, not a 0-100 percentage):
    the number of records whose sequence length is > 30 divided by the total
    number of records. Returns 0.0 when no records are found, instead of
    dividing by zero.
    """
    total_count = 0
    large_count = 0
    for filename in array:
        # 'with' closes each handle; the original left every file open
        with open(filename, "r") as handle:
            for sequence in SeqIO.parse(handle, "fastq"):
                # only counts are needed, so records are not kept in memory
                total_count += 1
                if len(sequence.seq) > 30:
                    large_count += 1
    if total_count == 0:
        return 0.0
    return large_count / float(total_count)

def freqFastaSeq(file, fmt='fastq'):
    """Parse a sequence file and return its 10 most common sequences.

    file -- path of the file to read
    fmt  -- format name passed to SeqIO.parse; defaults to 'fastq', which is
            what this function always used

    Returns a list of up to 10 (sequence_string, count) tuples, most frequent
    first, as produced by collections.Counter.most_common.
    """
    from collections import Counter
    with open(file) as handle:
        # Count the sequence text itself. str(record) renders the whole record
        # (id, description, ...), so counting it never groups equal sequences.
        c = Counter(str(record.seq) for record in SeqIO.parse(handle, fmt))
    return c.most_common(10)

def parse_fastq(filename, suffix):
    """Read a FASTQ file into a dict of three parallel lists.

    filename -- path of the FASTQ file; records are assumed to be exactly
                four lines each (header, sequence, separator, quality)
    suffix   -- appended (via str()) to each key name, e.g. "R1"

    Returns {'Header_<suffix>': [...], 'Sequence_<suffix>': [...],
             'QScore_<suffix>': [...]}. The separator line is dropped.
    """
    with open(filename) as f:
        # Strip only line terminators. Slicing off the last character would
        # eat a real base/quality character when the final line has no
        # trailing newline, and would leave '\r' behind on CRLF files.
        lines = [line.rstrip('\r\n') for line in f]
    key = str(suffix)
    return {'Header_' + key: lines[0::4],
            'Sequence_' + key: lines[1::4],
            'QScore_' + key: lines[3::4]}

def constrained_sum_sample_pos(n, total):
    """Return a randomly chosen list of n positive integers summing to total.
    Each such list is equally likely to occur.

    Works by picking n - 1 distinct cut points in 1..total-1 and returning the
    gaps between consecutive cuts. random.sample raises ValueError when
    n - 1 is negative or larger than total - 1.
    """
    # range (not the Python-2-only xrange) keeps this runnable on Python 3
    dividers = sorted(random.sample(range(1, total), n - 1))
    return [a - b for a, b in zip(dividers + [total], [0] + dividers)]

def gen_qscore(length, score):
    """Build a random quality string of `length` characters.

    The per-character values come from constrained_sum_sample_pos(length, score),
    so they are positive integers summing to `score`; each value v is written
    as the character chr(v + 33).
    """
    per_base = constrained_sum_sample_pos(length, score)
    return ''.join(chr(33 + value) for value in per_base)

def similar(a, b):
    """Return the difflib similarity ratio of a and b, a float in [0, 1]."""
    # The file only does `import difflib`, so the class must be reached through
    # the module; the bare name SequenceMatcher raised NameError.
    return difflib.SequenceMatcher(None, a, b).ratio()

def merge_dicts(*dict_args):
    '''
    Combine any number of dicts into one brand-new dict (shallow copy).
    The inputs are applied left to right, so when a key repeats the value
    from the later dict is the one that survives.
    '''
    merged = {}
    for mapping in dict_args:
        merged.update(mapping)
    return merged

def return_reverse_complement(seq):
    """Return the reverse complement of a nucleotide sequence.

    seq -- string made of A/T/C/G/N in either case; case is preserved

    Raises KeyError for any other character.
    """
    # Explicit lookup table. The previous index-arithmetic version mapped
    # 'N' to lowercase 'n' and had no entries for 'g' or 'n'.
    complement_of = dict(zip('ATCGNatcgn', 'TAGCNtagcn'))
    return "".join([complement_of[base] for base in reversed(seq)])

In [None]:
import random
import difflib

def parse_fastq(filename, suffix):
    """Read a FASTQ file into a dict of three parallel lists.

    filename -- path of the FASTQ file; records are assumed to be exactly
                four lines each (header, sequence, separator, quality)
    suffix   -- appended (via str()) to each key name, e.g. "R1"

    Returns {'Header_<suffix>': [...], 'Sequence_<suffix>': [...],
             'QScore_<suffix>': [...]}. The separator line is dropped.
    """
    with open(filename) as f:
        # Strip only line terminators. Slicing off the last character would
        # eat a real base/quality character when the final line has no
        # trailing newline, and would leave '\r' behind on CRLF files.
        lines = [line.rstrip('\r\n') for line in f]
    key = str(suffix)
    return {'Header_' + key: lines[0::4],
            'Sequence_' + key: lines[1::4],
            'QScore_' + key: lines[3::4]}

# Locate the FASTQ files to compare: the output of the "develop" and "master"
# builds of the de-duplicator, plus the raw reads both were run on.
# NOTE(review): these are absolute, user-specific (Mac) paths; the cell only
# runs on a machine with this exact directory layout.
sdDevelop_R1 = "/Users/ChrisM/Documents/workspace/iBest/testingSD/sdDevelop/no_dup_R1.fastq"
sdDevelop_R2 = "/Users/ChrisM/Documents/workspace/iBest/testingSD/sdDevelop/no_dup_R2.fastq"

sdMaster_R1 = "/Users/ChrisM/Documents/workspace/iBest/testingSD/sdMaster/output_nodup_PE1.fastq"
sdMaster_R2 = "/Users/ChrisM/Documents/workspace/iBest/testingSD/sdMaster/output_nodup_PE2.fastq"

rawReads_R1 = "/Users/ChrisM/Documents/workspace/iBest/testingSD/rawReads/tiny_R1.fastq"
rawReads_R2 = "/Users/ChrisM/Documents/workspace/iBest/testingSD/rawReads/tiny_R2.fastq"

# Load every file into a dict of parallel lists (header / sequence / quality).
# parse_fastq reads all lines of the file into memory; this is not a lazy index.
sdDevelopR1 = parse_fastq(sdDevelop_R1, "R1")
sdDevelopR2 = parse_fastq(sdDevelop_R2, "R2")

sdMasterR1 = parse_fastq(sdMaster_R1, "R1")
sdMasterR2 = parse_fastq(sdMaster_R2, "R2")

rawReadsR1 = parse_fastq(rawReads_R1, "R1")
rawReadsR2 = parse_fastq(rawReads_R2, "R2")
# The R1 and R2 dicts use different key suffixes, so merging keeps all six columns.
# merge_dicts and pd are not defined/imported in this cell; they must already
# exist in the kernel from an earlier cell.
rawReads = merge_dicts(rawReadsR1, rawReadsR2)

# One row per read pair, with R1 and R2 columns side by side.
# NOTE(review): presumably R1 and R2 list the same reads in the same order -- confirm.
raw_df = pd.DataFrame.from_dict(rawReads)

print("done")

In [None]:
# develop removes less than master so let's find out the difference
# Membership tests go through sets, so each lookup is O(1) instead of a scan
# of a list. The comprehensions still walk the raw headers in file order, so
# the resulting lists hold the same items in the same order as before.
rawHeaders = rawReadsR1['Header_R1']
devKeptHeaders = set(sdDevelopR1['Header_R1'])
masterKeptHeaders = set(sdMasterR1['Header_R1'])

# what did sdDevelop remove from rawReads
diffDev = [header for header in rawHeaders if header not in devKeptHeaders]

# what did sdMaster remove from rawReads
diffMaster = [header for header in rawHeaders if header not in masterKeptHeaders]

# this is what sdMaster removes that sdDevelop does not
devRemovedHeaders = set(diffDev)
diffRemoved = [header for header in diffMaster if header not in devRemovedHeaders]

# this is all of the headers
allHeaders = list(rawHeaders)

print("done")
print(len(diffMaster))
print(len(diffDev))
print(len(diffRemoved))
print(len(allHeaders))

In [None]:
# now we can find out why
# Flag every raw read that shares a paired window with a read in diffRemoved
# (reads sdMaster removed that sdDevelop did not). For an offset j the window is
# R1[j+s:j+l] + R2[j+s:j+l], compared at the same offset in both reads.
# NOTE(review): the original heading said "test if it's the reverse complement",
# but no reverse complement is taken here; only same-strand windows are compared.
# NOTE(review): a removed read always matches itself, so myRCKeys also contains
# the diffRemoved headers, not only reads that were kept.
# assumes s = 10 and l = 10
myRCKeys = []
flaggedRC = set()                  # mirror of myRCKeys for O(1) "already flagged" checks
removedHeaders = set(diffRemoved)  # O(1) lookup instead of scanning the list per row
s = 10
l = s + 10
# Pull the three needed columns once instead of building a Series per row in
# two nested iterrows() loops.
readTriples = list(zip(raw_df['Header_R1'], raw_df['Sequence_R1'], raw_df['Sequence_R2']))
for a_header, a_r1, a_r2 in readTriples:
    if a_header not in removedHeaders:
        continue
    # NOTE(review): this cell stops at len(a_r1)-l, while the diffDev and
    # diffMaster cells stop at len(a_r1)-(s+l); confirm which bound is intended.
    # parse_fastq already strips the line ending, so no EOL is present here.
    offsets = range(len(a_r1) - l)
    for b_header, b_r1, b_r2 in readTriples:
        if b_header in flaggedRC:
            continue  # already recorded; more matches would change nothing
        # any() stops at the first matching offset
        if any(a_r1[j+s:j+l] + a_r2[j+s:j+l] == b_r1[j+s:j+l] + b_r2[j+s:j+l]  # from Sam's code
               for j in offsets):
            flaggedRC.add(b_header)
            myRCKeys.append(b_header)

print("done")
print(len(myRCKeys))

In [None]:
# find duplicates of diffDev
# Flag every raw read that shares a paired window with a read sdDevelop removed.
# For an offset j the window is R1[j+s:j+l] + R2[j+s:j+l], compared at the same
# offset in both reads.
# NOTE(review): a removed read always matches itself, so myDevKeys also contains
# the diffDev headers, not only reads that were kept.
myDevKeys = []
flaggedDev = set()              # mirror of myDevKeys for O(1) "already flagged" checks
devRemovedSet = set(diffDev)    # O(1) lookup instead of scanning the list per row
s = 10
l = s + 10
# Pull the three needed columns once instead of building a Series per row in
# two nested iterrows() loops.
readTriples = list(zip(raw_df['Header_R1'], raw_df['Sequence_R1'], raw_df['Sequence_R2']))
for a_header, a_r1, a_r2 in readTriples:
    if a_header not in devRemovedSet:
        continue
    # Offsets stop at len(a_r1)-(s+l). parse_fastq already strips the line
    # ending, so this bound only limits how far along the read windows are taken.
    offsets = range(len(a_r1) - (s + l))
    for b_header, b_r1, b_r2 in readTriples:
        if b_header in flaggedDev:
            continue  # already recorded; more matches would change nothing
        # any() stops at the first matching offset
        if any(a_r1[j+s:j+l] + a_r2[j+s:j+l] == b_r1[j+s:j+l] + b_r2[j+s:j+l]  # from Sam's code
               for j in offsets):
            flaggedDev.add(b_header)
            myDevKeys.append(b_header)

print("done")
print(len(myDevKeys))

In [None]:
# find duplicates of diffMaster
# Flag every raw read that shares a paired window with a read sdMaster removed.
# For an offset j the window is R1[j+s:j+l] + R2[j+s:j+l], compared at the same
# offset in both reads.
# NOTE(review): a removed read always matches itself, so myMstrKeys also contains
# the diffMaster headers, not only reads that were kept.
myMstrKeys = []
flaggedMstr = set()                 # mirror of myMstrKeys for O(1) "already flagged" checks
masterRemovedSet = set(diffMaster)  # diffMaster = raw headers missing from sdMaster's output
s = 10
l = s + 10
# Pull the three needed columns once instead of building a Series per row in
# two nested iterrows() loops.
readTriples = list(zip(raw_df['Header_R1'], raw_df['Sequence_R1'], raw_df['Sequence_R2']))
for a_header, a_r1, a_r2 in readTriples:
    if a_header not in masterRemovedSet:
        continue
    # Offsets stop at len(a_r1)-(s+l). parse_fastq already strips the line
    # ending, so this bound only limits how far along the read windows are taken.
    offsets = range(len(a_r1) - (s + l))
    for b_header, b_r1, b_r2 in readTriples:
        if b_header in flaggedMstr:
            continue  # already recorded; more matches would change nothing
        # any() stops at the first matching offset
        if any(a_r1[j+s:j+l] + a_r2[j+s:j+l] == b_r1[j+s:j+l] + b_r2[j+s:j+l]  # from Sam's code
               for j in offsets):
            flaggedMstr.add(b_header)
            myMstrKeys.append(b_header)

print("done")
print(len(myMstrKeys))

In [None]:
# Print the R1 record (header, sequence, "+", quality) of every read whose
# R1 header is listed in diffRemoved.
for row_label, read in raw_df.iterrows():
    if read['Header_R1'] not in diffRemoved:
        continue
    for text in (read['Header_R1'], read['Sequence_R1'], "+", read['QScore_R1']):
        print(text)

In [None]:
# Print the R2 record (header, sequence, "+", quality) of every read whose
# R1 header is listed in myRCKeys.
for row_label, read in raw_df.iterrows():
    if read['Header_R1'] not in myRCKeys:
        continue
    for text in (read['Header_R2'], read['Sequence_R2'], "+", read['QScore_R2']):
        print(text)

In [None]:
# Print the R1 record (header, sequence, "+", quality) of every read whose
# R1 header is listed in diffDev.
for row_label, read in raw_df.iterrows():
    if read['Header_R1'] not in diffDev:
        continue
    for text in (read['Header_R1'], read['Sequence_R1'], "+", read['QScore_R1']):
        print(text)

In [None]:
# Print the R2 record (header, sequence, "+", quality) of every read whose
# R1 header is listed in myDevKeys.
for row_label, read in raw_df.iterrows():
    if read['Header_R1'] not in myDevKeys:
        continue
    for text in (read['Header_R2'], read['Sequence_R2'], "+", read['QScore_R2']):
        print(text)

In [None]:
import random
import pandas as pd
import difflib

def merge_dicts(*dict_args):
    '''
    Combine any number of dicts into one brand-new dict (shallow copy).
    The inputs are applied left to right, so when a key repeats the value
    from the later dict is the one that survives.
    '''
    merged = {}
    for mapping in dict_args:
        merged.update(mapping)
    return merged

def parse_fastq(filename, suffix):
    """Read a FASTQ file into a dict of three parallel lists.

    filename -- path of the FASTQ file; records are assumed to be exactly
                four lines each (header, sequence, separator, quality)
    suffix   -- appended (via str()) to each key name, e.g. "R1"

    Returns {'Header_<suffix>': [...], 'Sequence_<suffix>': [...],
             'QScore_<suffix>': [...]}. The separator line is dropped.
    """
    with open(filename) as f:
        # Strip only line terminators. Slicing off the last character would
        # eat a real base/quality character when the final line has no
        # trailing newline, and would leave '\r' behind on CRLF files.
        lines = [line.rstrip('\r\n') for line in f]
    key = str(suffix)
    return {'Header_' + key: lines[0::4],
            'Sequence_' + key: lines[1::4],
            'QScore_' + key: lines[3::4]}

# Only the duplicates of these three reads are of interest in the cells below.
# Each entry is a complete R1 header line, as it appears in the FASTQ file.

myHeaders = ["@M03610:8:000000000-AJJAD:1:1111:25298:9039 1:N:0:20",
            "@M03610:8:000000000-AJJAD:1:2114:19128:4795 1:N:0:20",
            "@M03610:8:000000000-AJJAD:1:1114:8658:23919 1:N:0:20"]

# Raw read files. The first (commented-out) pair is the Mac location; the
# active pair is the Windows/cygwin location of the same file names.
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
#rawReads_R1 = "/Users/ChrisM/Documents/workspace/iBest/testingSD/rawReads/15-03190-2_S20_L001_R1_001.fastq"
#rawReads_R2 = "/Users/ChrisM/Documents/workspace/iBest/testingSD/rawReads/15-03190-2_S20_L001_R2_001.fastq"
rawReads_R1 = "C:/cygwin64/home/chrism/workspace/testingSD/rawReads/15-03190-2_S20_L001_R1_001.fastq"
rawReads_R2 = "C:/cygwin64/home/chrism/workspace/testingSD/rawReads/15-03190-2_S20_L001_R2_001.fastq"

# Read both files fully into memory and combine them into one DataFrame
# (one row per read pair, R1 and R2 columns side by side) to make searching easier.
# This rebinds rawReadsR1, rawReadsR2, rawReads and raw_df, replacing the values
# an earlier cell built from the tiny_R1/tiny_R2 files.
# NOTE(review): presumably R1 and R2 list the same reads in the same order -- confirm.
rawReadsR1 = parse_fastq(rawReads_R1, "R1")
rawReadsR2 = parse_fastq(rawReads_R2, "R2")
rawReads = merge_dicts(rawReadsR1, rawReadsR2)
raw_df = pd.DataFrame.from_dict(rawReads)

print("done")

In [None]:
# find duplicates of myHeaders
# For each read listed in myHeaders, flag every OTHER raw read (one not in
# myHeaders) that shares a paired window with it. For an offset j the query
# window is a_R1[j+s:j+l] + a_R2[j+s:j+l]; it is compared against the candidate's
# windows in both orders (R1+R2 and R2+R1).
myMstrKeys = []
flaggedMstr = set()              # mirror of myMstrKeys for O(1) "already flagged" checks
targetHeaders = set(myHeaders)   # O(1) lookup instead of scanning the list per row
s = 10
l = s + 10
# Pull the three needed columns once instead of building a Series per row in
# two nested iterrows() loops.
readTriples = list(zip(raw_df['Header_R1'], raw_df['Sequence_R1'], raw_df['Sequence_R2']))
for a_header, a_r1, a_r2 in readTriples:
    if a_header not in targetHeaders:
        continue
    # Offsets stop at len(a_r1)-(s+l). parse_fastq already strips the line
    # ending, so this bound only limits how far along the read windows are taken.
    offsets = range(len(a_r1) - (s + l))
    for b_header, b_r1, b_r2 in readTriples:
        # skip the query reads themselves and anything already recorded
        if b_header in targetHeaders or b_header in flaggedMstr:
            continue
        for j in offsets:
            a_window = a_r1[j+s:j+l] + a_r2[j+s:j+l]
            b_w1 = b_r1[j+s:j+l]
            b_w2 = b_r2[j+s:j+l]
            # same-order match, or match with the candidate's R1/R2 windows swapped
            if a_window == b_w1 + b_w2 or a_window == b_w2 + b_w1:
                flaggedMstr.add(b_header)
                myMstrKeys.append(b_header)
                break  # one hit is enough; later offsets would add nothing

print("done")
print(len(myMstrKeys))

In [None]:
# Hand-picked R1 headers whose records should be printed.
myDevKeys = ['@M03610:8:000000000-AJJAD:1:1111:25298:9039 1:N:0:20',
            '@M03610:8:000000000-AJJAD:1:2114:19128:4795 1:N:0:20',
            '@M03610:8:000000000-AJJAD:1:1114:8658:23919 1:N:0:20',
            '@M03610:8:000000000-AJJAD:1:1112:17531:6370 1:N:0:20',
            '@M03610:8:000000000-AJJAD:1:2113:8433:13563 1:N:0:20',
            '@M03610:8:000000000-AJJAD:1:2112:10461:22345 1:N:0:20']

# First pass: the R1 record (header, sequence, "+", quality) of each selected read.
for row_label, read in raw_df.iterrows():
    if read['Header_R1'] not in myDevKeys:
        continue
    for text in (read['Header_R1'], read['Sequence_R1'], "+", read['QScore_R1']):
        print(text)

print("________________________________________")

# Second pass: the R2 record of the same reads (still selected by R1 header).
for row_label, read in raw_df.iterrows():
    if read['Header_R1'] not in myDevKeys:
        continue
    for text in (read['Header_R2'], read['Sequence_R2'], "+", read['QScore_R2']):
        print(text)