In [93]:
def bowtie(p, M, occ):
    num_mismatches = 0
    length = len(p)
    last_char = p[-1]
    sp = M[last_char]
    if sp == -1:
        return (None, 0)
    
    # find next(last_char)
    nxt = float('inf')
    nxt_item = None
    for item in M:
        if nxt > M[item] > M[last_char] and item != last_char:
            nxt = M[item]
            nxt_item = item
    if nxt_item == None:
        ep = len(occ['$']) - 1
    else:
        ep = nxt - 1
    
    first_mismatch = -1
    should_replace = False
    # changed for loop a bit, only works on strings len >= 2
    # for strings of len 1, it skips to the end and works
    for i in range(length-2,-1,-1):
        sp_ph = M[p[i]] + occ[p[i]][sp-1]
        ep_ph = M[p[i]] + occ[p[i]][ep]-1

        poss_switches = ALPHABET.copy()
        poss_switches.remove(p[i])
        is_mismatch = False
        while sp_ph > ep_ph and poss_switches:
            new_char = poss_switches.pop()
            sp_ph = M[new_char] + occ[new_char][sp-1]
            ep_ph = M[new_char] + occ[new_char][ep]-1
            is_mismatch = True

            if first_mismatch == -1:
                first_mismatch = i

        num_mismatches += is_mismatch
        sp = sp_ph
        ep = ep_ph

    if num_mismatches > 6:
        best_align_score = num_mismatches
        best_align = ((sp,ep+1), num_mismatches)
        for j in range(first_mismatch, len(p)):
            a = bowtie_offset(p, M, occ, [j])
            if a[1] < best_align_score:
                best_align = a
                best_align_score = a[1]
        return best_align
    else:
        return ((sp,ep+1), num_mismatches)

def bowtie_offset(p, M, occ, mismatches):
    num_mismatches = 0
    length = len(p)
    last_char = p[-1]
    sp = M[last_char]
    if sp == -1:
        return (None, 0)
    
    # find next(last_char)
    nxt = float('inf')
    nxt_item = None
    for item in M:
        if nxt > M[item] > M[last_char] and item != last_char:
            nxt = M[item]
            nxt_item = item
    if nxt_item == None:
        ep = len(occ['$']) - 1
    else:
        ep = nxt - 1

    # changed for loop a bit, only works on strings len >= 2
    # for strings of len 1, it skips to the end and works
    for i in range(length-2,-1,-1):
        if i in mismatches:
            # force a mismatch here
            poss_switches = list('$CGAT')
            poss_switches.remove(p[i])
            sp_ph = 1
            ep_ph = 0
            while sp_ph > ep_ph and poss_switches:
                new_char = poss_switches.pop()
                sp_ph = M[new_char] + occ[new_char][sp-1]
                ep_ph = M[new_char] + occ[new_char][ep]-1

            num_mismatches += 1
        else:
            sp_ph = M[p[i]] + occ[p[i]][sp-1]
            ep_ph = M[p[i]] + occ[p[i]][ep]-1

            poss_switches = ALPHABET.copy()
            poss_switches.remove(p[i])
            is_mismatch = False
            while sp_ph > ep_ph and poss_switches:
                new_char = poss_switches.pop()
                sp_ph = M[new_char] + occ[new_char][sp-1]
                ep_ph = M[new_char] + occ[new_char][ep]-1
                is_mismatch = True

            num_mismatches += is_mismatch
            sp = sp_ph
            ep = ep_ph

    return ((sp,ep+1), num_mismatches)

In [4]:
a = Aligner(None, None)

FM index of genome: 169.02813172340393
FM index of isoform, length 635: 1.1617658138275146
FM index of isoform, length 666: 0.00148773193359375
FM index of isoform, length 501: 0.0011560916900634766
FM index of isoform, length 1265: 0.0028409957885742188
FM index of isoform, length 107: 0.00023984909057617188
FM index of isoform, length 2191: 0.005011796951293945
FM index of isoform, length 490: 0.0010120868682861328
FM index of isoform, length 107: 0.00023508071899414062
FM index of isoform, length 851: 0.0019769668579101562
FM index of isoform, length 457: 0.0009362697601318359
FM index of isoform, length 89: 0.0001990795135498047
FM index of isoform, length 84: 0.00019884109497070312
FM index of isoform, length 756: 0.001641988754272461
FM index of isoform, length 479: 0.0009851455688476562
FM index of isoform, length 507: 0.001180887222290039
FM index of isoform, length 2148: 1.0746479034423828
FM index of isoform, length 601: 0.0012531280517578125
FM index of isoform, length 607: 

In [5]:
with open('genome.fa') as f:
    f.readline()
    genome = f.readline()[:-1]+'$'
reads = []
with open('reads.fa') as f:
    while f.readline():
        reads.append(f.readline()[:-1])        

In [6]:
known_genes = []
unknown_genes = []
from shared import *
with open('genes.tab') as f:
    curr = f.readline().split()
    gene_id = curr[1]
    isoform_id = ''
    isoforms = []
    exons = []
    first_gene = -1
    while curr:
        if curr[0] == 'gene':
            if exons:
                isoforms.append(Isoform(isoform_id,exons))
            if first_gene != -1:
                known_genes.append(Gene(gene_id, isoforms))
            else:
                first_gene = 0
            gene_id = curr[1]
            isoforms = []
            first_iso = -1
        elif curr[0] == 'isoform':
            if first_iso != -1:
                isoforms.append(Isoform(isoform_id, exons))
            else:
                first_iso = 0
            isoform_id = curr[1]
            exons = []
        elif curr[0] == 'exon':
            exons.append(Exon(curr[1],int(curr[2]),int(curr[3])))
        else:
            break
        curr = f.readline().split()
    isoforms.append(Isoform(isoform_id, exons))
    known_genes.append(Gene(gene_id,isoforms))
    isoforms = []
    first_gene = -1
    while curr:
        if curr[0] == 'unknown gene':
            if exons:
                isoforms.append(Isoform(isoform_id,exons))
            if first_gene != -1:
                genes.append(Gene(gene_id, isoforms))
            else:
                first_gene = 0
            gene_id = curr[1]
            isoforms = []
            first_iso = -1
        elif curr[0] == 'unknown isoform':
            if first_iso != -1:
                isoforms.append(Isoform(isoform_id, exons))
            else:
                first_iso = 0
            isoform_id = curr[1]
            exons = []
        elif curr[0] == 'unknown exon':
            exons.append(Exon(curr[1],int(curr[2]),int(curr[3])))
        curr = f.readline().split()
    isoforms.append(Isoform(isoform_id,exons))
    unknown_genes.append(Gene(gene_id, isoforms))

In [8]:
known_isoforms = set()
unknown_isoforms = set()
for gene in known_genes:
    for isoform in gene.isoforms:
        known_isoforms.add(isoform)
for gene in unknown_genes:
    for isoform in gene.isoforms:
        unknown_isoforms.add(isoform)
from evaluation import *
genome_isoform_offsets = index_isoform_locations(known_isoforms, unknown_isoforms)

In [9]:
import random
isoform = random.sample(known_isoforms, 1)[0]
iso_str = ''.join(map(lambda x: genome[x.start:x.end], isoform.exons))
iso_str = list(iso_str)
iso_str[7] = random.choice(ALPHABET)
iso_str[25] = random.choice(ALPHABET)
iso_str[56] = random.choice(ALPHABET)
iso_str[89] = random.choice(ALPHABET)
iso_str[101] = random.choice(ALPHABET)
iso_str[120] = random.choice(ALPHABET)
iso_str[140] = random.choice(ALPHABET)
iso_str[167] = random.choice(ALPHABET)
iso_str=''.join(iso_str)
print(iso_str)
#outer_align(iso_str)
a.align(iso_str)

CAATGCCCGAGGGAGGCTTGTGCAATTATTCTCATAGATTCTGGTGCTGATCCAAACATTGTAGATGTGTATGGCAACACAGCTGTCCACTATGCTGTTAATAGTGAGAATTTGTCAGTGATGGCAAAATTGCTGTCCTGCGGTACAGACATTAAAGTGAAGAACAATCTGGCCACACACCACTTTTATTGGCCATAAGGAAAAGAAGTGAGCAAATTGTGGAATTTTTACTGACAAAAAAATGCAAATGCAAATGGAGTTGATAAGTTTAAATGATTCATCAACAACTTTTGGAATATAAACAAAAGATATCTAAAAATTCTCAAAATAGTAATCCAGAGGAACATCTGAAGGAACACCTGACGAGGCTGCACCCTTGGCGGAAAGAACACCTGACACGGCTGAAAGCTTGGTGGAAAGAACACCTGACGAATAGGATACAGTGAATTCCTCTTCAAAGATTTTAGCCTGTAAACATCCTTTAAAATTCAAGAGGGGGGAAGATTAAGTACAATGAGTTCTGAGTTCCTCATCAAAGAACAAATATGTCAGTATGTTCAGCTTCTCCGTTCTTTGTTCTCCGTTTTAAAGTTTAACTTCCTCGTTCGTTATGCCTCCTTGCCCCTAGTTTCATTAAACAACCCCCTTCTAGCCTCTAACA


KeyboardInterrupt: 

In [161]:
genome[8542892:8542892+50]

'GCCACCACTGTGGTCTAAGATCTATCACCTGCAGAAGTGACCTGTGGCAA'

In [102]:
s = 'ATGATCGTCTATCGATCTGACGGTATGCAGTCATCGACTATGATTGCGTATAGCAGTCAGCTGA$'
sa = get_suffix_array(s)
L = get_bwt(s,sa)
M = get_M(get_F(L))
occ = get_occ(L)

In [103]:
bowtie('ATG', M, occ)

((13, 16), 0)

In [90]:
random.choice(ALPHABET)

'A'

In [138]:
a.align(reads[0])

[(0, 8542892, 50)]

In [140]:
reads[0]

'ATTACTCTTGGGAATGAAATCCTATCTATATAAGCTGTGGTTTGAAATCC'

In [19]:
genome[8542892:8542892+50]

'GCCACCACTGTGGTCTAAGATCTATCACCTGCAGAAGTGACCTGTGGCAA'

In [84]:
def outer_align(read_sequence):
        """
        Returns an alignment to the genome sequence. An alignment is a list of pieces. 
        Each piece consists of a start index in the read, a start index in the genome, and a length 
        indicating how many bases are aligned in this piece. Note that mismatches are count as "aligned".

        Note that <read_start_2> >= <read_start_1> + <length_1>. If your algorithm produces an alignment that 
        violates this, we will remove pieces from your alignment arbitrarily until consecutive pieces 
        satisfy <read_start_2> >= <read_start_1> + <length_1>

        Return value must be in the form (also see the project pdf):
        [(<read_start_1>, <reference_start_1, length_1), (<read_start_2>, <reference_start_2, length_2), ...]

        If no good matches are found: return the best match you can find or return []

        Time limit: 0.5 seconds per read on average on the provided data.
        """
        
        # bowtie, read to isoforms
        # first see if the read matches a known isoform for under 6 mismatches
        min_mismatches = float('inf')
        best_match = None
        for iso_name in a.iso_names:
            match_range, num_mismatches = bowtie(read_sequence, a.isos_M[iso_name], a.isos_occ[iso_name])
            if num_mismatches < min_mismatches:
                min_mismatches = num_mismatches
                best_match = (iso_name, match_range)
        
        alignment = []
        seqlength = len(read_sequence)
        
        if min_mismatches <= 6 and best_match:
            # we have a match, direct to transcriptome
            # if we have multiple matches, it'll be weird but just take the first one
            iso_name, match_range = best_match
            curr = a.isos_sa[iso_name][match_range[0]] # start pt in spliced isoform
            print(curr)
            exon_starts = a.iso_exon_starts[iso_name]
            exon_num = 0
            exon_offset = 0
            curr_pt = 0
            for i in range(len(exon_starts)):
                if curr < exon_starts[i]:
                    exon_num = i
                    break
                exon_offset = exon_starts[i]
            exons = a.isos[iso_name].exons
            print(exons)
            print(exon_starts)
            read_pt = 0
            curr_pt = exons[exon_num].start + curr - exon_offset
#             if not i:
#                 curr_pt = curr
#             else:
#                 curr_pt = exons[exon_num].start + curr
            while seqlength:
                if exon_num < len(exons):
                    offset = exons[exon_num].end-curr_pt
                else:
                    break
                if offset > 0:
                    diff = min(offset,seqlength)
                else:
                    diff = seqlength
                alignment.append((read_pt, curr_pt, diff))
                seqlength -= diff
                curr_pt += diff
                read_pt += diff
                exon_num += 1
#                 print(seqlength, diff, curr_pt, read_pt, exon_num)
        return alignment
            

In [94]:
import time
evaluations = []
counter = 0
total_time = 0
for read in reads:
    start_time = time.time()
    alignment = outer_align(read)
    evaluation = evaluate_alignment(genome, read, alignment, unknown_isoforms, genome_isoform_offsets)
    evaluations.append(evaluation)
    interval = time.time() - start_time
    counter += 1
    print(counter)
    print('Time taken: ' + str(interval))
    total_time += interval
    print('Average time: ' + str(total_time / counter))

1166
[exon	ENSE00003749383	10229576	10229652, exon	ENSE00003691834	10233194	10233400, exon	ENSE00003484151	10235269	10235412, exon	ENSE00003667449	10241104	10241241, exon	ENSE00003570848	10273383	10273500, exon	ENSE00003633427	10284505	10284665, exon	ENSE00001017033	10287389	10287534, exon	ENSE00001017041	10322863	10323013, exon	ENSE00001017045	10359274	10359461, exon	ENSE00001017052	10367473	10367569, exon	ENSE00001017031	10381108	10381281, exon	ENSE00003566319	10415606	10415725, exon	ENSE00003613335	10417662	10417783, exon	ENSE00001017035	10426291	10426471, exon	ENSE00001017048	10457851	10458055, exon	ENSE00001017044	10483537	10483656, exon	ENSE00001315726	10486846	10489195]
[76, 282, 425, 562, 679, 839, 984, 1134, 1321, 1417, 1590, 1709, 1830, 2010, 2214, 2333, 4682]
1
Time taken: 0.06486225128173828
Average time: 0.06486225128173828
210
[exon	ENSE00001788379	6387890	6387997, exon	ENSE00001712270	6390677	6390843, exon	ENSE00001679869	6393083	6393200, exon	ENSE00001713465	6397865	639

342
[exon	ENSE00003757355	5474919	5476019]
[1100]
60
Time taken: 0.061608076095581055
Average time: 0.0632312536239624
328
[exon	ENSE00001662293	2495033	2495123, exon	ENSE00001678745	2495980	2496128, exon	ENSE00001645333	2505393	2505511, exon	ENSE00001733572	2507317	2507461]
[90, 238, 356, 500]
61
Time taken: 0.0636899471282959
Average time: 0.06323877318960722
62
Time taken: 0.06133294105529785
Average time: 0.06320803396163448
391
[exon	ENSE00001721303	3910309	3910721, exon	ENSE00001679975	3913085	3913233]
[412, 560]
63
Time taken: 0.05941605567932129
Average time: 0.06314784383016919
64
Time taken: 0.06424283981323242
Average time: 0.06316495314240456
25
[exon	ENSE00001802701	8250613	8250877, exon	ENSE00001729938	8252369	8252739]
[264, 634]
65
Time taken: 0.06839895248413086
Average time: 0.06324547620920035
243
[exon	ENSE00001764290	8079523	8079691, exon	ENSE00001666620	8083124	8083423]
[168, 467]
66
Time taken: 0.06913399696350098
Average time: 0.06333469622062915
67
Time taken: 0

119
Time taken: 0.06098604202270508
Average time: 0.0629357209726542
9
[exon	ENSE00003733945	6390781	6390843, exon	ENSE00001679869	6393083	6393200, exon	ENSE00003744295	6397871	6397908]
[62, 179, 216]
120
Time taken: 0.06346011161804199
Average time: 0.0629400908946991
320
[exon	ENSE00001531603	2625022	2626116]
[1094]
121
Time taken: 0.06225466728210449
Average time: 0.06293442623674377
124
[exon	ENSE00001609710	10515763	10515963, exon	ENSE00001712433	10564030	10564437]
[200, 607]
122
Time taken: 0.06028318405151367
Average time: 0.06291269474342222
427
[exon	ENSE00001750992	9337334	9337845]
[511]
123
Time taken: 0.06459188461303711
Average time: 0.06292634669358169
567
[exon	ENSE00003298469	5592529	5594065]
[1536]
124
Time taken: 0.06426072120666504
Average time: 0.06293710777836461
143
[exon	ENSE00001531603	2625022	2626116]
[1094]
125
Time taken: 0.06545305252075195
Average time: 0.06295723533630371
4
[exon	ENSE00001499913	5488089	5488169]
[80]
126
Time taken: 0.060214996337890625
Av

722
[exon	ENSE00001646882	2775696	2776627]
[931]
171
Time taken: 0.06753087043762207
Average time: 0.06225734007985968
11
[exon	ENSE00001436923	5488828	5488911]
[83]
172
Time taken: 0.06257987022399902
Average time: 0.06225921525511631
462
[exon	ENSE00001727607	2069244	2069807]
[563]
173
Time taken: 0.06369709968566895
Average time: 0.06226752672581314
174
Time taken: 0.06123495101928711
Average time: 0.06226159238267219
1399
[exon	ENSE00001860371	3222880	3223157, exon	ENSE00003548683	3228596	3228657, exon	ENSE00003655188	3236550	3236697, exon	ENSE00001852478	3237404	3240747, exon	ENSE00001833363	3248438	3248685]
[277, 338, 485, 3828, 4075]
175
Time taken: 0.06254410743713379
Average time: 0.06226320675441197
176
Time taken: 0.06263399124145508
Average time: 0.06226531348445199
157
[exon	ENSE00003734790	9546953	9547163]
[210]
177
Time taken: 0.06197690963745117
Average time: 0.06226368408418645
24
[exon	ENSE00001439705	3019871	3019986]
[115]
178
Time taken: 0.06147313117980957
Average 

231
Time taken: 0.06359601020812988
Average time: 0.0616917372781993
232
Time taken: 0.0632016658782959
Average time: 0.06169824559113075
230
[exon	ENSE00001662293	2495033	2495123, exon	ENSE00001678745	2495980	2496128, exon	ENSE00001645333	2505393	2505511, exon	ENSE00001733572	2507317	2507461]
[90, 238, 356, 500]
233
Time taken: 0.06296777725219727
Average time: 0.06170369422486923
234
Time taken: 0.061077117919921875
Average time: 0.06170101654835236
176
[exon	ENSE00001657492	8542841	8543108]
[267]
235
Time taken: 0.06064271926879883
Average time: 0.06169651315567341
236
Time taken: 0.06433892250061035
Average time: 0.061707709805440096
40
[exon	ENSE00001807840	1996583	1996689]
[106]
237
Time taken: 0.06074810028076172
Average time: 0.0617036608201039
346
[exon	ENSE00001665029	5579827	5579972, exon	ENSE00001597851	5589230	5589633]
[145, 548]
238
Time taken: 0.05958414077758789
Average time: 0.06169475527370677
45
[exon	ENSE00001808578	4563283	4563389]
[106]
239
Time taken: 0.065894126

197
[exon	ENSE00001839054	3165154	3165282, exon	ENSE00003574247	3168563	3168725, exon	ENSE00001949402	3170097	3170453]
[128, 290, 646]
291
Time taken: 0.07170891761779785
Average time: 0.061537314116749973
248
[exon	ENSE00001682401	4016070	4016212, exon	ENSE00001752929	4016535	4016837]
[142, 444]
292
Time taken: 0.06307506561279297
Average time: 0.0615425803889967
146
[exon	ENSE00002057944	1987330	1987424, exon	ENSE00002049806	1994163	1995421]
[94, 1352]
293
Time taken: 0.05843520164489746
Average time: 0.061531975000791585
294
Time taken: 0.05097603797912598
Average time: 0.06149607045309884
295
Time taken: 0.06457901000976562
Average time: 0.06150652109566381
1247
[exon	ENSE00001845775	3165130	3165282, exon	ENSE00001926508	3168553	3168725, exon	ENSE00003601318	3170097	3170169, exon	ENSE00003592279	3173438	3173537, exon	ENSE00003473282	3175880	3177372]
[152, 324, 396, 495, 1987]
296
Time taken: 0.0620119571685791
Average time: 0.061508228649964206
71
[exon	ENSE00003758400	3999189	4001

445
[exon	ENSE00001605194	8193543	8195392]
[1849]
347
Time taken: 0.06731915473937988
Average time: 0.061341015681066154
367
[exon	ENSE00001802701	8250613	8250877, exon	ENSE00001729938	8252369	8252739]
[264, 634]
348
Time taken: 0.0648641586303711
Average time: 0.06135113965505841
8
[exon	ENSE00003716433	2673189	2673274]
[85]
349
Time taken: 0.0590510368347168
Average time: 0.061344549102564594
101
[exon	ENSE00001764290	8079523	8079691, exon	ENSE00001666620	8083124	8083423]
[168, 467]
350
Time taken: 0.060693979263305664
Average time: 0.061342690331595284
464
[exon	ENSE00001762640	5368402	5368741, exon	ENSE00001789031	5436490	5436538, exon	ENSE00001769076	5486379	5486439, exon	ENSE00001763221	5555994	5556087, exon	ENSE00003254110	5562450	5562481]
[339, 387, 447, 540, 571]
351
Time taken: 0.05561709403991699
Average time: 0.06132637809144805
352
[exon	ENSE00001782100	2741635	2742141]
[506]
352
Time taken: 0.05015087127685547
Average time: 0.06129462949254296
353
Time taken: 0.0606861114

125
[exon	ENSE00003734790	9546953	9547163]
[210]
403
Time taken: 0.06618499755859375
Average time: 0.06125279040845393
190
[exon	ENSE00002089431	9305653	9305896]
[243]
404
Time taken: 0.06110692024230957
Average time: 0.06125242934368624
175
[exon	ENSE00003648479	8569695	8570545]
[850]
405
Time taken: 0.05905771255493164
Average time: 0.06124701028988685
261
[exon	ENSE00001636044	7806779	7807276]
[497]
406
Time taken: 0.0635519027709961
Average time: 0.061252687364963476
461
[exon	ENSE00001608016	2976421	2976597, exon	ENSE00001648320	3032950	3033144, exon	ENSE00001645386	3092509	3093468]
[176, 370, 1329]
407
Time taken: 0.06154203414916992
Average time: 0.061253398290723196
451
[exon	ENSE00001782100	2741635	2742141]
[506]
408
Time taken: 0.06289529800415039
Average time: 0.061257422554726694
28
[exon	ENSE00002040703	2298265	2298368, exon	ENSE00001329082	2299457	2299648]
[103, 294]
409
Time taken: 0.06276297569274902
Average time: 0.061261103613743866
573
[exon	ENSE00001865382	3173375	3

209
[exon	ENSE00001601617	2956134	2956198, exon	ENSE00001732406	2957187	2957302, exon	ENSE00001671289	2958589	2958673, exon	ENSE00001747209	2959243	2959317, exon	ENSE00001796418	2959777	2959874, exon	ENSE00001765826	2960362	2960419, exon	ENSE00001776292	2960696	2960810, exon	ENSE00001683970	2961120	2961282, exon	ENSE00001787788	2962503	2962657, exon	ENSE00001783912	2962761	2962848, exon	ENSE00001405469	2963624	2963880]
[64, 179, 263, 337, 434, 491, 605, 767, 921, 1008, 1264]
467
Time taken: 0.0676569938659668
Average time: 0.06113776660322888
468
Time taken: 0.05252814292907715
Average time: 0.0611193699714465
5
[exon	ENSE00003298469	5592529	5594065]
[1536]
469
Time taken: 0.047801971435546875
Average time: 0.06109097466539981
470
Time taken: 0.05830717086791992
Average time: 0.06108505167859666
458
[exon	ENSE00001750992	9337334	9337845]
[511]
471
Time taken: 0.04973268508911133
Average time: 0.06106094898944701
8
[exon	ENSE00001439705	3019871	3019986]
[115]
472
Time taken: 0.059777021

343
[exon	ENSE00001782100	2741635	2742141]
[506]
515
Time taken: 0.06876373291015625
Average time: 0.0609666713233133
77
[exon	ENSE00001608016	2976421	2976597, exon	ENSE00001648320	3032950	3033144, exon	ENSE00001645386	3092509	3093468]
[176, 370, 1329]
516
Time taken: 0.05817699432373047
Average time: 0.06096126497253891
766
[exon	ENSE00001646882	2775696	2776627]
[931]
517
Time taken: 0.05672907829284668
Average time: 0.06095307892480256
23
[exon	ENSE00001807519	2570577	2570683]
[106]
518
Time taken: 0.06135296821594238
Average time: 0.06095385091185109
415
[exon	ENSE00001625311	3706588	3707352]
[764]
519
Time taken: 0.06144595146179199
Average time: 0.06095479908246755
520
Time taken: 0.06308579444885254
Average time: 0.06095889715047983
711
[exon	ENSE00001786136	3867534	3867772, exon	ENSE00001726405	3895555	3896096]
[238, 779]
521
Time taken: 0.055979013442993164
Average time: 0.06094933883242323
24
[exon	ENSE00001436923	5488828	5488911]
[83]
522
Time taken: 0.06051015853881836
Avera

481
[exon	ENSE00001608016	2976421	2976597, exon	ENSE00001648320	3032950	3033144, exon	ENSE00001645386	3092509	3093468]
[176, 370, 1329]
579
Time taken: 0.0643930435180664
Average time: 0.06092621913639787
165
[exon	ENSE00002089431	9305653	9305896]
[243]
580
Time taken: 0.06756877899169922
Average time: 0.060937671825803556
358
[exon	ENSE00001727607	2069244	2069807]
[563]
581
Time taken: 0.06248593330383301
Average time: 0.06094033664762461
582
Time taken: 0.06219077110290527
Average time: 0.06094248516043437
450
[exon	ENSE00001786136	3867534	3867772, exon	ENSE00001726405	3895555	3896096]
[238, 779]
583
Time taken: 0.06323504447937012
Average time: 0.0609464175091804
75
[exon	ENSE00001839054	3165154	3165282, exon	ENSE00003574247	3168563	3168725, exon	ENSE00001949402	3170097	3170453]
[128, 290, 646]
584
Time taken: 0.062165021896362305
Average time: 0.06094850416052831
31
[exon	ENSE00001609710	10515763	10515963, exon	ENSE00001712433	10564030	10564437]
[200, 607]
585
Time taken: 0.0613567

480
[exon	ENSE00003749383	10229576	10229652, exon	ENSE00003691834	10233194	10233400, exon	ENSE00003484151	10235269	10235412, exon	ENSE00003667449	10241104	10241241, exon	ENSE00003570848	10273383	10273500, exon	ENSE00003633427	10284505	10284665, exon	ENSE00001017033	10287389	10287534, exon	ENSE00001017041	10322863	10323013, exon	ENSE00001017045	10359274	10359461, exon	ENSE00001017052	10367473	10367569, exon	ENSE00001017031	10381108	10381281, exon	ENSE00003566319	10415606	10415725, exon	ENSE00003613335	10417662	10417783, exon	ENSE00001017035	10426291	10426471, exon	ENSE00001017048	10457851	10458055, exon	ENSE00001017044	10483537	10483656, exon	ENSE00001315726	10486846	10489195]
[76, 282, 425, 562, 679, 839, 984, 1134, 1321, 1417, 1590, 1709, 1830, 2010, 2214, 2333, 4682]
647
Time taken: 0.0652930736541748
Average time: 0.060937410897044166
229
[exon	ENSE00001682401	4016070	4016212, exon	ENSE00001752929	4016535	4016837]
[142, 444]
648
Time taken: 0.09193110466003418
Average time: 0.060985

128
[exon	ENSE00001657492	8542841	8543108]
[267]
697
Time taken: 0.06655693054199219
Average time: 0.0710067489054829
205
[exon	ENSE00001657492	8542841	8543108]
[267]
698
Time taken: 0.06539106369018555
Average time: 0.07099870351119164
231
[exon	ENSE00001643453	2718932	2719213, exon	ENSE00001800469	2720565	2720740]
[281, 456]
699
Time taken: 0.06366419792175293
Average time: 0.07098821065627113
34
[exon	ENSE00001807519	2570577	2570683]
[106]
700
Time taken: 0.0659489631652832
Average time: 0.070981011731284
269
[exon	ENSE00003748822	6245219	6245596]
[377]
701
Time taken: 0.06978487968444824
Average time: 0.07097930540882062
92
[exon	ENSE00003758460	7866213	7866429]
[216]
702
Time taken: 0.07169222831726074
Average time: 0.07098032096851925
729
[exon	ENSE00001601617	2956134	2956198, exon	ENSE00001732406	2957187	2957302, exon	ENSE00001671289	2958589	2958673, exon	ENSE00001747209	2959243	2959317, exon	ENSE00001796418	2959777	2959874, exon	ENSE00001765826	2960362	2960419, exon	ENSE0000177

317
[exon	ENSE00001718290	2949927	2950100, exon	ENSE00001786392	2950542	2950662, exon	ENSE00001717418	2950910	2950984, exon	ENSE00001754050	2954200	2954279]
[173, 293, 367, 446]
759
Time taken: 0.07151508331298828
Average time: 0.07099568262715898
545
[exon	ENSE00001802701	8250613	8250877, exon	ENSE00001729938	8252369	8252739]
[264, 634]
760
Time taken: 0.06547021865844727
Average time: 0.07098841227983174
761
Time taken: 0.07382011413574219
Average time: 0.07099213330723766
762
Time taken: 0.06222701072692871
Average time: 0.0709806305216992
763
Time taken: 0.058090925216674805
Average time: 0.0709637370678263
367
[exon	ENSE00001727607	2069244	2069807]
[563]
764
Time taken: 0.06659698486328125
Average time: 0.07095802142357951
765
Time taken: 0.06588506698608398
Average time: 0.07095139011058932
0
[exon	ENSE00001808578	4563283	4563389]
[106]
766
Time taken: 0.06952190399169922
Average time: 0.07094952394072132
409
[exon	ENSE00001681783	7563431	7563565, exon	ENSE00001717663	7589689	758

107
[exon	ENSE00001802701	8250613	8250877, exon	ENSE00001729938	8252369	8252739]
[264, 634]
814
Time taken: 0.07275700569152832
Average time: 0.0707812854054519
232
[exon	ENSE00001627710	2711338	2713332]
[1994]
815
Time taken: 0.0610508918762207
Average time: 0.07076934627228719
816
Time taken: 0.06283688545227051
Average time: 0.07075962511932149
136
[exon	ENSE00003712820	2603073	2603356]
[283]
817
Time taken: 0.06329989433288574
Average time: 0.07075049448188399
1
[exon	ENSE00003759741	4295652	4295738]
[86]
818
Time taken: 0.06945514678955078
Average time: 0.07074891092724789
6
[exon	ENSE00003716433	2673189	2673274]
[85]
819
Time taken: 0.06427597999572754
Average time: 0.07074100747067705
338
[exon	ENSE00003742672	7205502	7205818, exon	ENSE00001016990	7205970	7206127, exon	ENSE00001016989	7209200	7209286, exon	ENSE00001633443	7214954	7216365]
[316, 473, 559, 1970]
820
Time taken: 0.06300020217895508
Average time: 0.07073156746422372
821
Time taken: 0.06453299522399902
Average time: 

464
[exon	ENSE00003756290	2254152	2254221, exon	ENSE00003755158	2263408	2263944]
[69, 605]
869
Time taken: 0.06719374656677246
Average time: 0.07054133519478854
1796
[exon	ENSE00003754996	6654408	6656455]
[2047]
870
Time taken: 0.0703439712524414
Average time: 0.0705411083396824
44
[exon	ENSE00001689198	6757945	6757976, exon	ENSE00001713116	6759495	6759845]
[31, 381]
871
Time taken: 0.0647737979888916
Average time: 0.07053448685822339
28
[exon	ENSE00001759787	8688709	8689150]
[441]
872
Time taken: 0.06584620475769043
Average time: 0.07052911038792462
516
[exon	ENSE00001609710	10515763	10515963, exon	ENSE00001712433	10564030	10564437]
[200, 607]
873
Time taken: 0.06901001930236816
Average time: 0.07052737030649786
17
[exon	ENSE00001807840	1996583	1996689]
[106]
874
Time taken: 0.060921669006347656
Average time: 0.07051637980157778
107
[exon	ENSE00003287532	5537689	5537747, exon	ENSE00003444578	5538353	5538540, exon	ENSE00001541870	5543340	5543454, exon	ENSE00001763221	5555994	5556087, e

157
[exon	ENSE00002057944	1987330	1987424, exon	ENSE00002049806	1994163	1995421]
[94, 1352]
936
Time taken: 0.060816049575805664
Average time: 0.07005719980622968
937
Time taken: 0.0664069652557373
Average time: 0.07005330414502318
938
Time taken: 0.06146526336669922
Average time: 0.07004414845122965
538
[exon	ENSE00001770393	10695985	10696043, exon	ENSE00001592495	10697395	10697684, exon	ENSE00001772789	10733973	10734053, exon	ENSE00001633794	10735299	10735391, exon	ENSE00001613133	10746179	10746415]
[58, 347, 427, 519, 755]
939
Time taken: 0.06450676918029785
Average time: 0.0700382513487047
347
[exon	ENSE00001643453	2718932	2719213, exon	ENSE00001800469	2720565	2720740]
[281, 456]
940
Time taken: 0.07161188125610352
Average time: 0.07003992542307427
331
[exon	ENSE00001750992	9337334	9337845]
[511]
941
Time taken: 0.06389188766479492
Average time: 0.07003339190792202
1091
[exon	ENSE00001785486	1987208	1987424, exon	ENSE00001740253	1991502	1991616, exon	ENSE00001782233	1991779	1991952

154
[exon	ENSE00003734790	9546953	9547163]
[210]
1000
Time taken: 0.06673407554626465
Average time: 0.06966915249824523
191
[exon	ENSE00001718290	2949927	2950100, exon	ENSE00001786392	2950542	2950662, exon	ENSE00001717418	2950910	2950984, exon	ENSE00001754050	2954200	2954279]
[173, 293, 367, 446]
1001
Time taken: 0.0616152286529541
Average time: 0.06966110662027791
1
[exon	ENSE00001807840	1996583	1996689]
[106]
1002
Time taken: 0.0650320053100586
Average time: 0.06965648675869086
98
[exon	ENSE00002040703	2298265	2298368, exon	ENSE00001329082	2299457	2299648]
[103, 294]
1003
Time taken: 0.0626680850982666
Average time: 0.06964951925952793
1004
Time taken: 0.05122041702270508
Average time: 0.06963116358000919
106
[exon	ENSE00001645429	4791518	4791540, exon	ENSE00001736182	4796663	4796776, exon	ENSE00001020318	4798777	4798872, exon	ENSE00001020323	4813268	4813446, exon	ENSE00001020284	4815266	4815344, exon	ENSE00001020332	4819068	4819147, exon	ENSE00001747540	4823403	4823518]
[22, 135, 23

31
[exon	ENSE00003716433	2673189	2673274]
[85]
1045
Time taken: 0.06486821174621582
Average time: 0.0695293458454917
284
[exon	ENSE00001689198	6757945	6757976, exon	ENSE00001713116	6759495	6759845]
[31, 381]
1046
Time taken: 0.06567597389221191
Average time: 0.06952566193349047
186
[exon	ENSE00001689198	6757945	6757976, exon	ENSE00001713116	6759495	6759845]
[31, 381]
1047
Time taken: 0.06672477722167969
Average time: 0.06952298678094815
494
[exon	ENSE00003757355	5474919	5476019]
[1100]
1048
Time taken: 0.0681300163269043
Average time: 0.06952165761066757
50
[exon	ENSE00002088930	6380547	6380647]
[100]
1049
Time taken: 0.06893086433410645
Average time: 0.06952109441402643
419
[exon	ENSE00003737220	2653853	2654518]
[665]
1050
Time taken: 0.06199383735656738
Average time: 0.06951392559778123
1051
Time taken: 0.06367731094360352
Average time: 0.06950837220610266
1052
Time taken: 0.06668615341186523
Average time: 0.06950568948861764
1053
Time taken: 0.06067800521850586
Average time: 0.06949

316
[exon	ENSE00001531603	2625022	2626116]
[1094]
1106
Time taken: 0.06572484970092773
Average time: 0.06945044093278606
205
[exon	ENSE00003756290	2254152	2254221, exon	ENSE00003755158	2263408	2263944]
[69, 605]
1107
Time taken: 0.06286883354187012
Average time: 0.06944449548798848
1108
Time taken: 0.06308388710021973
Average time: 0.06943875486669988
1109
Time taken: 0.06393098831176758
Average time: 0.06943378844059084
1110
Time taken: 0.06557989120483398
Average time: 0.06943031646109916
406
[exon	ENSE00001601617	2956134	2956198, exon	ENSE00001732406	2957187	2957302, exon	ENSE00001671289	2958589	2958673, exon	ENSE00001747209	2959243	2959317, exon	ENSE00001796418	2959777	2959874, exon	ENSE00001765826	2960362	2960419, exon	ENSE00001776292	2960696	2960810, exon	ENSE00001683970	2961120	2961282, exon	ENSE00001787788	2962503	2962657, exon	ENSE00001783912	2962761	2962848, exon	ENSE00001405469	2963624	2963880]
[64, 179, 263, 337, 434, 491, 605, 767, 921, 1008, 1264]
1111
Time taken: 0.05526

1165
Time taken: 0.05206775665283203
Average time: 0.06931890131577913
1375
[exon	ENSE00003761532	6560773	6563113]
[2340]
1166
Time taken: 0.05814099311828613
Average time: 0.06930931477358575
1167
Time taken: 0.054566144943237305
Average time: 0.06929668138041492
54
[exon	ENSE00001764290	8079523	8079691, exon	ENSE00001666620	8083124	8083423]
[168, 467]
1168
Time taken: 0.0539088249206543
Average time: 0.06928350684577472
30
[exon	ENSE00001436923	5488828	5488911]
[83]
1169
Time taken: 0.06616401672363281
Average time: 0.0692808383341219
115
[exon	ENSE00001759787	8688709	8689150]
[441]
1170
Time taken: 0.07167696952819824
Average time: 0.06928288630950145
38
[exon	ENSE00001500134	5539237	5539325]
[88]
1171
Time taken: 0.0664970874786377
Average time: 0.0692805073181856
1172
Time taken: 0.0643000602722168
Average time: 0.06927625778998939
93
[exon	ENSE00002089431	9305653	9305896]
[243]
1173
Time taken: 0.06317687034606934
Average time: 0.0692710579711966
13
[exon	ENSE00002088930	6380547	

166
[exon	ENSE00003648479	8569695	8570545]
[850]
1235
Time taken: 0.06992793083190918
Average time: 0.0689316790113565
1236
Time taken: 0.06841111183166504
Average time: 0.06893125784049914
124
[exon	ENSE00002089431	9305653	9305896]
[243]
1237
Time taken: 0.05405306816101074
Average time: 0.06891923020130795
1238
Time taken: 0.06322622299194336
Average time: 0.06891463164944256
332
[exon	ENSE00003754996	6654408	6656455]
[2047]
1239
Time taken: 0.05808687210083008
Average time: 0.06890589253761963
52
[exon	ENSE00001807519	2570577	2570683]
[106]
1240
Time taken: 0.0595250129699707
Average time: 0.06889832731216185
958
[exon	ENSE00001627710	2711338	2713332]
[1994]
1241
Time taken: 0.05810093879699707
Average time: 0.0688896267573551
576
[exon	ENSE00001786136	3867534	3867772, exon	ENSE00001726405	3895555	3896096]
[238, 779]
1242
Time taken: 0.05927705764770508
Average time: 0.06888188716869999
526
[exon	ENSE00001609710	10515763	10515963, exon	ENSE00001712433	10564030	10564437]
[200, 607]
1

167
[exon	ENSE00003748822	6245219	6245596]
[377]
1293
Time taken: 0.05386805534362793
Average time: 0.0686547420602018
326
[exon	ENSE00001662293	2495033	2495123, exon	ENSE00001678745	2495980	2496128, exon	ENSE00001645333	2505393	2505511, exon	ENSE00001733572	2507317	2507461]
[90, 238, 356, 500]
1294
Time taken: 0.06801390647888184
Average time: 0.06865424682404932
1295
Time taken: 0.06112098693847656
Average time: 0.0686484296349485
1296
Time taken: 0.06743502616882324
Average time: 0.06864749336684192
1297
Time taken: 0.06822085380554199
Average time: 0.06864716442346389
376
[exon	ENSE00003648479	8569695	8570545]
[850]
1298
Time taken: 0.059745073318481445
Average time: 0.06864030610982369
94
[exon	ENSE00001718290	2949927	2950100, exon	ENSE00001786392	2950542	2950662, exon	ENSE00001717418	2950910	2950984, exon	ENSE00001754050	2954200	2954279]
[173, 293, 367, 446]
1299
Time taken: 0.06187105178833008
Average time: 0.0686350949825554
615
[exon	ENSE00001531603	2625022	2626116]
[1094]
130

6
[exon	ENSE00001436923	5488828	5488911]
[83]
1344
Time taken: 0.0747218132019043
Average time: 0.06844264233396166
300
[exon	ENSE00001643453	2718932	2719213, exon	ENSE00001800469	2720565	2720740]
[281, 456]
1345
Time taken: 0.07648086547851562
Average time: 0.06844861870804683
1272
[exon	ENSE00001845775	3165130	3165282, exon	ENSE00001926508	3168553	3168725, exon	ENSE00003601318	3170097	3170169, exon	ENSE00003592279	3173438	3173537, exon	ENSE00003473282	3175880	3177372]
[152, 324, 396, 495, 1987]
1346
Time taken: 0.07766103744506836
Average time: 0.06845546300131357
940
[exon	ENSE00003298469	5592529	5594065]
[1536]
1347
Time taken: 0.07642388343811035
Average time: 0.06846137868092515
1348
Time taken: 0.06732487678527832
Average time: 0.06846053557862868
31
[exon	ENSE00001595132	10695973	10696043, exon	ENSE00001772789	10733973	10734053, exon	ENSE00001633794	10735299	10735391, exon	ENSE00001613133	10746179	10746415]
[70, 150, 242, 478]
1349
Time taken: 0.06600785255432129
Average time: 

214
[exon	ENSE00001727607	2069244	2069807]
[563]
1400
Time taken: 0.17865228652954102
Average time: 0.06869668739182608
105
[exon	ENSE00003748822	6245219	6245596]
[377]
1401
Time taken: 0.09360098838806152
Average time: 0.06871446348104539
1402
Time taken: 0.0650169849395752
Average time: 0.0687118261924994
152
[exon	ENSE00001718290	2949927	2950100, exon	ENSE00001786392	2950542	2950662, exon	ENSE00001717418	2950910	2950984, exon	ENSE00001754050	2954200	2954279]
[173, 293, 367, 446]
1403
Time taken: 0.06600713729858398
Average time: 0.06870989840283873
413
[exon	ENSE00001609710	10515763	10515963, exon	ENSE00001712433	10564030	10564437]
[200, 607]
1404
Time taken: 0.07567691802978516
Average time: 0.06871486066753028
721
[exon	ENSE00001646882	2775696	2776627]
[931]
1405
Time taken: 0.06639909744262695
Average time: 0.06871321243747698
560
[exon	ENSE00001742871	2976451	2976597, exon	ENSE00001648320	3032950	3033144, exon	ENSE00001805372	3037609	3038046]
[146, 340, 777]
1406
Time taken: 0.0

1371
[exon	ENSE00001605194	8193543	8195392]
[1849]
1465
Time taken: 0.06349706649780273
Average time: 0.06874528305522411
1466
Time taken: 0.06551599502563477
Average time: 0.06874308026666368
221
[exon	ENSE00002040703	2298265	2298368, exon	ENSE00001329082	2299457	2299648]
[103, 294]
1467
Time taken: 0.06819295883178711
Average time: 0.06874270526909389
270
[exon	ENSE00003748822	6245219	6245596]
[377]
1468
Time taken: 0.06656002998352051
Average time: 0.0687412184330683
1469
Time taken: 0.06537318229675293
Average time: 0.06873892569233561
37
[exon	ENSE00001643453	2718932	2719213, exon	ENSE00001800469	2720565	2720740]
[281, 456]
1470
Time taken: 0.0616450309753418
Average time: 0.06873409991361656
15
[exon	ENSE00001718290	2949927	2950100, exon	ENSE00001786392	2950542	2950662, exon	ENSE00001717418	2950910	2950984, exon	ENSE00001754050	2954200	2954279]
[173, 293, 367, 446]
1471
Time taken: 0.06803703308105469
Average time: 0.0687336260408548
1472
Time taken: 0.06917119026184082
Average t

58
[exon	ENSE00002089431	9305653	9305896]
[243]
1522
Time taken: 0.07164287567138672
Average time: 0.06864808188474758
999
[exon	ENSE00003754996	6654408	6656455]
[2047]
1523
Time taken: 0.0699918270111084
Average time: 0.0686489641862094
646
[exon	ENSE00003758400	3999189	4001379]
[2190]
1524
Time taken: 0.06568503379821777
Average time: 0.06864701934999681
72
[exon	ENSE00001807640	5667998	5668157]
[159]
1525
Time taken: 0.06253194808959961
Average time: 0.06864300946720311
230
[exon	ENSE00003648479	8569695	8570545]
[850]
1526
Time taken: 0.06739091873168945
Average time: 0.06864218896213396
326
[exon	ENSE00001665029	5579827	5579972, exon	ENSE00001597851	5589230	5589633]
[145, 548]
1527
Time taken: 0.06655406951904297
Average time: 0.0686408214968798
176
[exon	ENSE00001636044	7806779	7807276]
[497]
1528
Time taken: 0.0675053596496582
Average time: 0.06864007839357665
76
[exon	ENSE00003758460	7866213	7866429]
[216]
1529
Time taken: 0.07349395751953125
Average time: 0.06864325293845956
17

1568
Time taken: 0.06739997863769531
Average time: 0.06861310771533422
726
[exon	ENSE00001646882	2775696	2776627]
[931]
1569
Time taken: 0.06606388092041016
Average time: 0.06861148296912967
560
[exon	ENSE00003648479	8569695	8570545]
[850]
1570
Time taken: 0.06748628616333008
Average time: 0.06861076628326611
292
[exon	ENSE00001722185	3723301	3724262]
[961]
1571
Time taken: 0.061199188232421875
Average time: 0.06860604853784863
1572
Time taken: 0.06497788429260254
Average time: 0.06860374054532621
465
[exon	ENSE00001531603	2625022	2626116]
[1094]
1573
Time taken: 0.059303998947143555
Average time: 0.06859782844005083
1574
Time taken: 0.05908679962158203
Average time: 0.06859178585503274
51
[exon	ENSE00001657492	8542841	8543108]
[267]
1575
Time taken: 0.06030631065368652
Average time: 0.06858652523585729


In [96]:
from collections import Counter
Counter(evaluations.copy())

Counter({('gene', 6): 44,
         ('gene', 0): 67,
         ('unaligned', 0): 556,
         ('gene', 5): 86,
         ('gene', 3): 229,
         ('gene', 2): 282,
         ('gene', 4): 150,
         ('gene', 1): 161})

In [117]:
counter = 0
def suffix_replacer(hits):
    (start, end), length = hits
    genome_seqs = []
    for i in range(start,end):
        index = a.genome_sa[i]
        genome_seqs.append((index,index+length))
    return genome_seqs
def genome_align():
    min_mis = 7
    best_hits = []
    hits = bowtie(read, a.genome_M, a.genome_occ)
    curr_mis = hits[1]
    if curr_mis < min_mis:
        min_mis = curr_mis
        best_hits = [(0, suffix_replacer(hits)[0][0], 50)]
    for offset in range(10, 41):
        seed1, seed2 = read[:offset], read[offset:]
        hits1 = bowtie(seed1, a.genome_M, a.genome_occ)
        hits2 = bowtie(seed2, a.genome_M, a.genome_occ)
        curr_mis = hits1[1] + hits2[1]
        if curr_mis < min_mis:
            hits1 = suffix_replacer(hits1)
            hits2 = suffix_replacer(hits2)
            for seq1 in hits1:
                for seq2 in hits2:
                    if MIN_INTRON_SIZE < seq2[0] - seq1[1] < MAX_INTRON_SIZE:
                        min_mis = curr_mis
                        seq1len = len(seq1)
                        seq2len = len(seq2)
                        best_hits = [(0, seq1[0], seq1len), (seq1len, seq2[0], seq2len)]
                        break
    for offset1 in range(10,31):
        for offset2 in range(10+offset1, 41):
            seed1, seed2,seed3 = read[:offset1], read[offset1:offset2], read[offset2:]
            hits1 = bowtie(seed1, a.genome_M, a.genome_occ)
            hits2 = bowtie(seed2, a.genome_M, a.genome_occ)
            hits3 = bowtie(seed3, a.genome_M, a.genome_occ)
            curr_mis = hits1[1] + hits2[1] + hits3[1]
            if curr_mis < min_mis:
                hits1 = suffix_replacer(hits1)
                hits2 = suffix_replacer(hits2)
                hits3 = suffix_replacer(hits3)
                for seq1 in hits1:
                    for seq2 in hits2:
                        for seq3 in hits3:
                            if MIN_INTRON_SIZE < seq2[0] - seq1[1] < MAX_INTRON_SIZE and MIN_INTRON_SIZE < seq3[0] - seq2[1] < MAX_INTRON_SIZE:
                                min_mis = curr_mis
                                seq1len = len(seq1)
                                seq2len = len(seq2)
                                seq3len = len(seq3)
                                best_hits = [(0, seq1[0], seq1len), (seq1len, seq2[0], seq2len), (seq1len+seq2len, seq3[0], seq3len)]
                                break
    return best_hits

723


In [None]:
counter = 0
def suffix_replacer(hits):
    (start, end), length = hits
    genome_seqs = []
    for i in range(start,end):
        index = a.genome_sa[i]
        genome_seqs.append((index,index+length))
    return genome_seqs
iteration = 0
for read in reads:
    min_mis = 7
    best_hits = []
    hits = bowtie(read, a.genome_M, a.genome_occ)
    curr_mis = hits[1]
    if curr_mis < min_mis:
        min_mis = curr_mis
        best_hits = suffix_replacer(hits)
    for offset in range(8, 43):
        seed1, seed2 = read[:offset], read[offset:]
        hits1 = bowtie(seed1, a.genome_M, a.genome_occ)
        hits2 = bowtie(seed2, a.genome_M, a.genome_occ)
        curr_mis = hits1[1] + hits2[1]
        if curr_mis < min_mis:
            hits1 = suffix_replacer(hits1)
            hits2 = suffix_replacer(hits2)
            for seq1 in hits1:
                for seq2 in hits2:
                    if MIN_INTRON_SIZE < seq2[0] - seq1[1] < MAX_INTRON_SIZE:
                        min_mis = curr_mis
                        best_hits = hits
    for offset1 in range(8,35):
        for offset2 in range(8+offset1, 43):
            seed1, seed2,seed3 = read[:offset1], read[offset1:offset2], read[offset2:]
            hits1 = bowtie(seed1, a.genome_M, a.genome_occ)
            hits2 = bowtie(seed2, a.genome_M, a.genome_occ)
            hits3 = bowtie(seed3, a.genome_M, a.genome_occ)
            curr_mis = hits1[1] + hits2[1] + hits3[1]
            if curr_mis < min_mis:
                hits1 = suffix_replacer(hits1)
                hits2 = suffix_replacer(hits2)
                hits3 = suffix_replacer(hits3)
                for seq1 in hits1:
                    for seq2 in hits2:
                        for seq3 in hits3:
                            if MIN_INTRON_SIZE < seq2[0] - seq1[1] < MAX_INTRON_SIZE and MIN_INTRON_SIZE < seq3[0] - seq2[1] < MAX_INTRON_SIZE:
                                min_mis = curr_mis
                                best_hits = hits
    iteration += 1
    if best_hits:
        counter += 1
    print('iteration: ', iteration, ', counter: ', counter, ', time taken: ', time.time()-start)



In [None]:
counter = 0
def suffix_replacer(hits):
    (start, end), length = hits
    genome_seqs = []
    for i in range(start,end):
        index = a.genome_sa[i]
        genome_seqs.append((index,index+length))
    return genome_seqs
iteration = 0
for read in reads:
    start = time.time()
    min_mis = 7
    best_hits = []
    before_single = time.time()
    hits = bowtie(read, a.genome_M, a.genome_occ)
    curr_mis = hits[1]
    if curr_mis < min_mis:
        min_mis = curr_mis
        best_hits = suffix_replacer(hits)
    print('single: ', time.time()-before_single)
    before_double = time.time()
    for offset in range(5, 46):
        seed1, seed2 = read[:offset], read[offset:]
        hits1 = bowtie(seed1, a.genome_M, a.genome_occ)
        hits2 = bowtie(seed2, a.genome_M, a.genome_occ)
        curr_mis = hits1[1] + hits2[1]
        if curr_mis < min_mis:
            hits1 = suffix_replacer(hits1)
            hits2 = suffix_replacer(hits2)
            for seq1 in hits1:
                for seq2 in hits2:
                    if MIN_INTRON_SIZE < seq2[0] - seq1[1] < MAX_INTRON_SIZE:
                        min_mis = curr_mis
                        best_hits = [hits1, hits2]
                        break
    print('double: ', time.time()-before_double)
    before_triple = time.time()
    for offset1 in range(5,41):
        for offset2 in range(5+offset1, 46):
            seed1, seed2,seed3 = read[:offset1], read[offset1:offset2], read[offset2:]
            hits1 = bowtie(seed1, a.genome_M, a.genome_occ)
            hits2 = bowtie(seed2, a.genome_M, a.genome_occ)
            hits3 = bowtie(seed3, a.genome_M, a.genome_occ)
            curr_mis = hits1[1] + hits2[1] + hits3[1]
            if curr_mis < min_mis:
                hits1 = suffix_replacer(hits1)
                hits2 = suffix_replacer(hits2)
                hits3 = suffix_replacer(hits3)
                print('suffix replacing: ', time.time()-before_triple)
                for seq1 in hits1:
                    for seq2 in hits2:
                        for seq3 in hits3:
                            if MIN_INTRON_SIZE < seq2[0] - seq1[1] < MAX_INTRON_SIZE and MIN_INTRON_SIZE < seq3[0] - seq2[1] < MAX_INTRON_SIZE:
                                min_mis = curr_mis
                                best_hits = [hits1, hits2, hits3]
                                break
    print('triple: ', time.time()-before_triple)
    iteration += 1
    if best_hits:
        counter += 1
    print('iteration: ', iteration, ', counter: ', counter, ', time taken: ', time.time()-start)


In [None]:
a