In [1]:
#2017-05-05
#A. Pendleton
#This script is used to make an AGP file for Zoey. Can be built upon as new forms
#   of contigs become assembled and incorporated into the AGP.

#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import subprocess

#import genutils
import os
import sys
import numpy as np
import re
import scipy 
import matplotlib.patches as patches
import glob

#This script is used to make an AGP file for Zoey. Can be built upon as new contigs become assembled and incorporated into the available set for assembly.

In [34]:
wkDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/'
print('Current working directory is:\n', wkDir)

logFile = open(wkDir + 'temp/iPythonNotebook_LogFile.txt','w')

Current working directory is:
 /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/


In [35]:
#####READING IN PRIMARY CONTIG COORDINATES FROM MUMMER ALIGNMENT#######
#Primary contig alignments (no canu processing)
primaryContigCoord = open(wkDir + 'input/primary.2017-04-10.txt', 'r')
primaryContigCoordBed = open(wkDir + 'input/primary.2017-04-10.bed', 'w')

contigDict,index = {}, 0

for line in primaryContigCoord:
    if line.startswith("#") is True: #skips header line
        continue
    line = line.rstrip().split('\t')
    contigID, length, Dir, chrom, start, end = line[0],line[1],line[2],line[3],int(line[4]),int(line[5])    
    """if 'chr18' not in chrom: #FILTERING ONLY FOR CHROM 18 FOR NOW!!! REMOVE LATER 
        continue"""
    index += 1
    contigDict[index] = [chrom,start,end,contigID,Dir,length]
    primaryContigCoordBed.write('%s\t%i\t%i\t%s\n' % (chrom,int(start-1),end,contigID))
    #if index > 15: #FOR TESTING--- REMOVE LATER
    #    break
    
primaryContigCoord.close()
primaryContigCoordBed.close()
print ('Identified coordinates for %i primary contigs' % len(contigDict))
logFile.write('Identified coordinates for %i primary contigs\n\n' % len(contigDict))

Identified coordinates for 2688 primary contigs


49

Step 1. Read in coordinates from alignment file based on Mummer of primary contigs (per Jeff)
Step 2. BLAT each contig against the contig proximal to that proximal to it

In [36]:
def read_line(Dict,curr_index,i):
    #Contig 1
    chrom1,start1,end1,contigID1,Dir1,length1 = Dict[curr_index][0],int(Dict[curr_index][1]),int(Dict[curr_index][2]), Dict[curr_index][3], Dict[curr_index][4], int(Dict[curr_index][5])
    #Contig 2
    chrom2,start2,end2,contigID2,Dir2,length2 = Dict[i+1][0],int(Dict[i+1][1]),int(Dict[i+1][2]), Dict[i+1][3], Dict[i+1][4], int(Dict[i+1][5])
    return chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2
###################################################################################################
def read_last_line(Dict):
    #Contig 1
    chrom1,start1,end1,contigID1,Dir1,length1 = Dict[i-1][0],int(Dict[i-1][1]),int(Dict[i-1][2]), Dict[i-1][3], Dict[i-1][4], int(Dict[i-1][5])
    #Contig 2
    chrom2,start2,end2,contigID2,Dir2,length2 = Dict[i][0],int(Dict[i][1]),int(Dict[i][2]), Dict[i][3], Dict[i][4], int(Dict[i][5])
    return chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2
###################################################################################################
def process_last_contig(contigDict,curr_index,i):
    chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2 = read_last_line(contigDict)
    print ('\n#%s (%i) -- %s (%i)' % (contigID1,i-1,contigID2,i))
    logFile.write(('\n#%s (%i) -- %s (%i)\n' % (contigID1,i-1,contigID2,i)))
    if start1 == start2 or end1 == end2:
        return
    if end2 > end1:
        curr_index+=1 
        curated_contigDict[i] = [chrom2,start2,end2,contigID2,Dir2,length2]
        inList.append(contigID2)
        print('PASS - Last contig on chromosome overlap or is spaced correctly')
        logFile.write('PASS - Last contig on chromosome overlap or is spaced correctly\n')
        return
###################################################################################################
def process_first_contig(contigDict,curr_index,i):
    curr_index+=1
    #Always save first and contig in the dictionary/dataset
    curated_contigDict[i] = [chrom1,start1,end1,contigID1,Dir1,length1]
    curated_contigDict[i+1] = [chrom2,start2,end2,contigID2,Dir2,length2]
    inList.append(contigID2)
    print('PASS - First call in dataset')
    logFile.write('PASS - First call in dataset\n')
    return curr_index 
###################################################################################################
def process_same_starts(contigDict,curr_index,i):
    if start1 == start2 and end1 > end2:
        print('FAIL -- Share start, contig1 longer' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        logFile.write('FAIL -- Share start, contig1 longer' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]) + '\n')
        return
    if start1 == start2 and end1 < end2: #overwrite previous position if contig 2 is longer than contig 1
        curated_contigDict[i] = [chrom2,start2,end2,contigID2,Dir2,length2]
        print('FAIL -- Share start, contig2 longer' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        logFile.write('FAIL -- Share start, contig2 longer' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]) + '\n')
        return
###################################################################################################
def process_same_ends(contigDict,curr_index,i):    
    if start1 < start2 and end1 == end2:
        print('Ends same, kept contig 1 only' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        logFile.write('Ends same, kept contig 1 only' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]) + '\n')
        return 
    if start2 < start1 and end1 == end2: 
        print('ERROR - is this contig list sorted??!!')
        logFile.write('ERROR - is this contig list sorted??!!\n\n\n')
        sys.exit(1)

In [37]:
curated_contigDict, curr_index, inList = {}, 1, []
chrom1,chrom2 = '0', '0' #initializing values


for key, value in contigDict.items(): 
    i = int(key)
    #1. If last contig in dataset 
    if i == len(contigDict):
        continue
    
    #2. If contigs are on different chromosomes:
    if chrom1 != chrom2:
        process_last_contig(contigDict,curr_index,i)
        continue
        
    chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2 = read_line(contigDict,curr_index,i)
    print ('\n#%s (%i) -- %s (%i)' % (contigID1,curr_index,contigID2,i+1))
    logFile.write('\n#%s (%i) -- %s (%i)\n' % (contigID1,curr_index,contigID2,i+1))

    #3. Automatically saves first contig in dataset
    if curr_index == 1 and len(curated_contigDict) == 0:
        curr_index = process_first_contig(contigDict,curr_index,i)
        continue
    
    #4. If contigs have the same start site, choose the longest contig
    if start1 == start2:
        process_same_starts(contigDict,curr_index,i)
        continue

    #5. If contigs have same end coordinate, choose longest contig (i.e. contig #1 if sorted)
    if end1 == end2: 
        process_same_ends(contigDict,curr_index,i)
        continue
        
    #6. If contigs #1 and #2 have same start AND end, keep contig #1
    if start1 == start2 and end1 == end2:
        print('FAIL -- Same start and end coordinate' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        logFile.write('FAIL -- Same start and end coordinate' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]) + '\n')
        continue #because contig1 is already in the dictionary
        
    #7. If contig #2 is fully within contig #1 -- continue
    if start1 < start2 and end1 > end2: # contig2 is fully within contig1
        #curated_contigDict[curr_index] = [chrom1,start1,end1,contigID1,Dir1,length1]
        print('FAIL -- Contig2 fully within contig1' + '\nKeeping contig: ' + contigID1 +  '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        logFile.write('FAIL -- Contig2 fully within contig1' + '\nKeeping contig: ' + contigID1 +  '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]) +'\n')
        continue
    if start1 > start2 and end1 < end2:
        print('ERROR - is this contig list sorted??!!')
        logFile.write('ERROR - is this contig list sorted??!!\n\n\n')
        sys.exit(1)
    
    #8. If contig passes all these - then automatically saves
    curr_index=i+1
    #if contigID2 not in inList:
    curated_contigDict[i] = [chrom2,start2,end2,contigID2,Dir2,length2]
    print('PASS - contigs overlap or are spaced correctly')
    logFile.write('PASS - contigs overlap or are spaced correctly\n')

print ('Identified CURATED coordinates for %i primary contigs' % len(curated_contigDict))
logFile.write('\n##Identified CURATED coordinates for %i primary contigs\n\n########\n\n' % len(curated_contigDict))


#Re-naming dictionary keys
count=0
for keys in curated_contigDict:
    count+=1
    curated_contigDict[count]=curated_contigDict.pop(keys)



#CTG-0076 (1) -- CTG-1722 (2)
PASS - First call in dataset

#CTG-1722 (2) -- CTG-0353 (3)
PASS - contigs overlap or are spaced correctly

#CTG-0353 (3) -- CTG-0052 (4)
PASS - contigs overlap or are spaced correctly

#CTG-0052 (4) -- CTG-0231 (5)
PASS - contigs overlap or are spaced correctly

#CTG-0231 (5) -- CTG-0230 (6)
FAIL -- Contig2 fully within contig1
Keeping contig: CTG-0231
['chr1', 17343875, 20909826, 'CTG-0231', 'rc', '3565775']
['chr1', 17356763, 17361463, 'CTG-0230', 'fwd', '4861']

#CTG-0231 (5) -- CTG-0143 (7)
PASS - contigs overlap or are spaced correctly

#CTG-0143 (7) -- CTG-2241 (8)
FAIL -- Contig2 fully within contig1
Keeping contig: CTG-0143
['chr1', 20906456, 25699815, 'CTG-0143', 'fwd', '4864237']
['chr1', 24634452, 24637932, 'CTG-2241', 'rc', '6265']

#CTG-0143 (7) -- CTG-1991 (9)
FAIL -- Contig2 fully within contig1
Keeping contig: CTG-0143
['chr1', 24634452, 24637932, 'CTG-2241', 'rc', '6265']
['chr1', 24635586, 24645896, 'CTG-1991', 'fwd', '10214']

#CTG-014

#CTG-0835 (708) -- CTG-2129 (709)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-2129 (709) -- CTG-0720 (710)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-0720 (710) -- CTG-1189 (711)

#CTG-1189 (711) -- CTG-1048 (712)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-1048 (712) -- CTG-0817 (713)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-0817 (713) -- CTG-0816 (714)

#CTG-0816 (714) -- CTG-0272 (715)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-0272 (715) -- CTG-0989 (716)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-0989 (716) -- CTG-0213 (717)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-0213 (717) -- CTG-2170 (718)

#CTG-2170 (718) -- CTG-1441 (719)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-1441 (719) -- CTG-1240 (720)

#CTG-1240 (720) -- CTG-0854 (721)
PASS - Last contig on chromosome o

#CTG-2615 (2187) -- CTG-0821 (2188)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-0821 (2188) -- CTG-2408 (2189)

#CTG-2408 (2189) -- CTG-1068 (2190)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-1068 (2190) -- CTG-0629 (2191)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-0629 (2191) -- CTG-1697 (2192)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-1697 (2192) -- CTG-0491 (2193)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-0491 (2193) -- CTG-1667 (2194)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-1667 (2194) -- CTG-1140 (2195)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-1140 (2195) -- CTG-0405 (2196)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-0405 (2196) -- CTG-0074 (2197)
PASS - Last contig on chromosome overlap or is spaced correctly

#CTG-0074 (2197) -- CTG-1783 (2198)

#CTG-1783 (2198) 

In [7]:
"""print('Non-curated:')
for i in contigDict:
    print(i,contigDict[i])
print( '\nCurated:')
for i in curated_contigDict:
    print(i, curated_contigDict[i])"""

"print('Non-curated:')\nfor i in contigDict:\n    print(i,contigDict[i])\nprint( '\nCurated:')\nfor i in curated_contigDict:\n    print(i, curated_contigDict[i])"

In [38]:
def process_first_contig_on_chrom(chrom1,start1,end1,contigID1,Dir1,length1):
    chrom2 = chrom1
    start2 = start1 - 10000
    end2 = start1 + 10000
    contigID2 = 'canFam3'
    Dir2 = 'fwd'
    length1 = end2-start2
    
    if start2 < 0:
        start2 = 0
    
    #Determine overlap 
    determine_contig_overlap(end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2)
 
    #Find coordinates to extract
    find_overlapping_coordinates(overlap,end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2)
    
    #Extract FASTA
    fastaRoot = '/home/ampend/links/kidd-lab/genomes/canFam3.1/'
    extract_fasta(fastaRoot,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2)
###################################################################################################
def determine_contig_overlap(end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2):
    #How much do the contigs overlap?
    global overlap
    overlap = end1 - start2
    print('\nOverlap = ', overlap)
    #OVERLAPPING CONTIGS
    if overlap > 0:
        find_overlapping_coordinates(overlap,end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2)
    #DIRECTLY ADJACENT CONTIGS
    if overlap == 0:
        print('Contigs are directly adjacent to one another')
        logFile.write('Contigs are directly adjacent to one another\n')
        #WRITE FUNCTION FOR THESE
    #CONTIGS WITH GAP BETWEEN THEM
    if overlap < 0:
        print('Gap between contigs')
        logFile.write('Gap between contigs\n')

###################################################################################################
def find_overlapping_coordinates(overlap,end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2):
    ###Determine coordinates to extract for BLAT
    global extract_coord1
    global extract_coord2
    #Contig #1
    if 'fwd' in Dir1:
        extract_coord1 = [length1 - overlap - 10000, length1]
    else:
        extract_coord1 = [0, overlap + 10000]
    #Contig #2
    if 'fwd' in Dir2:
        extract_coord2 = [0, overlap + 10000] 
    else:
        extract_coord2 = [length2 - overlap - 10000, length2]
    
    print ('Coordinates to extract for BLAT: ',extract_coord1,extract_coord2,'\n')
    logFile.write('Coordinates to extract for BLAT: ' + str(extract_coord1) + str(extract_coord2) + '\n')

    #safety check
    for i in range(0,1): #CHECKS 
        #= if the region to extract extends beyond the length of the contig, if so then the coordinate changes to the length of contig
        if extract_coord1[i] > length1:
            extract_coord1[i] = length1
        if extract_coord2[i] > length2:
            extract_coord2[i] = length2
        # If the region extended too far, and the value is negative -- then make starting extraction coordinate = 0
        if extract_coord1[i] < 0: 
            extract_coord1[i] = 0
        if extract_coord2[i] < 0:
            extract_coord2[i] = 0
            
    if overlap < 0: #### SEND TO DIFFERENT FUNCTION LATER -- FOLLOW OVERLAPPING CONTIGS FOR NOW
        print('Contigs do not overlap')
        logFile.write('Contigs do not overlap\n')
    #Where to find fasta of contigs
    fastaRoot = '/home/ampend/links/kidd-lab/jmkidd-projects/zoey/contig-assignment/kmer-matches/eval1/'
    extract_fasta(fastaRoot,contigID1,Dir1,extract_coord1, contigID2,Dir2,extract_coord2)
    
###################################################################################################
def extract_fasta(fastaRoot,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2):
    #Contig 1
    fasta_path1 = fastaRoot + contigID1 + '/' + contigID1 + '.fa'
    cmd = 'samtools faidx %s %s:%i-%i  > %stemp/contig1.fa' % (fasta_path1,contigID1,extract_coord1[0],extract_coord1[1],wkDir)
    logFile.write(cmd + '\n')
    subprocess.call(cmd,shell=True)
    
    #Contig 2
    fasta_path2 = fastaRoot + contigID2 + '/' + contigID2 + '.fa'
    cmd = 'samtools faidx %s %s:%i-%i > %stemp/contig2.fa' % (fasta_path2,contigID2,extract_coord2[0],extract_coord2[1],wkDir)
    logFile.write(cmd + '\n')
    subprocess.call(cmd,shell=True)
    
###################################################################################################
def run_blat(wkDir):
    blatcmd = 'blat %stemp/contig1.fa %stemp/contig2.fa %stemp/temp.blat' % (wkDir,wkDir,wkDir)
    logFile.write(blatcmd + '\n')
    subprocess.call(blatcmd,shell=True)
    
    
###################################################################################################
def parse_blat(wkDir,overlap,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2):
    inFile = open('%stemp/temp.blat' % (wkDir),'r')
    lineCount, cleanAlignment, offset = 0, False, 0
    for line in inFile:
        line=line.rstrip().split('\t')
        lineCount+=1
        if lineCount > 5:#skips the BLAT results headers
            score,strand = int(line[0]),line[8]
            percent_of_overlap = float(score)/overlap
            if percent_of_overlap < float(0.75):
                continue
            print('\n#Parsing BLAT results','Percent of overlap matched in BLAT: ',format(percent_of_overlap, '.3f'))
            logFile.write('\n#Parsing BLAT results\n' + 'Percent of overlap matched in BLAT: ' + format(percent_of_overlap, '.3f') + ' \n')
            #CHECKS TO MAKE SURE WE DID THIS RIGHT - correct end vs correct end
            #if '+' not in strand:
            #    print('ERROR: Top hit not in proper orientation.... SKIPPING -- PLEASE CHECK')
            #    return
            #parse contig #1 (left contig)
            blat_length1, blat_start1, blat_end1 = int(line[14]),int(line[15]),int(line[16])
            print(blat_length1, blat_start1, blat_end1)
            logFile.write('length = %i, start = %i, end = %i' % (blat_length1, blat_start1, blat_end1))
            #parse contig #2 (right contig)
            blat_length2, blat_start2, blat_end2 = int(line[10]),int(line[11]),int(line[12])
            print(blat_length2, blat_start2, blat_end2)
            logFile.write('length = %i, start = %i, end = %i' % (blat_length2, blat_start2, blat_end2))
            #Calculate offset  - to compensate for non-clean alignments (extension of left contig that does not overlap with adjacent contig)
            offset = calculate_offset(overlap,blat_length1, blat_start1, blat_end1,Dir1,blat_length2, blat_start2, blat_end2,Dir2)
            return offset
###################################################################################################
def calculate_offset(overlap,blat_length1, blat_start1, blat_end1,Dir1,blat_length2, blat_start2, blat_end2,Dir2):
    global offset
    if 'rc' in Dir1:
        offset = blat_start1 - 0
    else:
        offset = blat_length1 - blat_end1
    if offset < 3:
        offset = 0
    print('\nBLAT offset = ', offset)
    logFile.write('\nBLAT offset = ' + str(offset) + '\n')
    return offset

###################################################################################################
    

In [None]:
#write_AGP_header(agpFile)
posDict, offset_BLAT = {}, []

for i in range(1,len(curated_contigDict)): 
    curr_index = i
    offset = 0 #set equal to zero at beginning, will change if there is one from parsing the blat hit(s)
    #1. Reading in coordinates from contigList
    chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2 = read_line(curated_contigDict, curr_index, i)
    
    #2. Checking contigs are on same chromosome
    if chrom1 != chrom2:# or i == 1: #This is first contig on chromosome, need to process it first
        #continue
        process_first_contig_on_chrom(chrom1,start1,end1,contigID1,Dir1,length1) 
        continue
    print ('\n#',contigID1, contigID2,'\n',chrom1,start1,end1,contigID1,Dir1,length1,'\n',chrom2,start2,end2,contigID2,Dir2,length2)
    
    #3. Determines the overlap/orientation of the two contigs
    determine_contig_overlap(end1,contigID1,Dir1,length1,end2,contigID2,Dir2,length2)

    #4. BLAT the properly oriented contig ends against one another
    #     and parse the results
    run_blat(wkDir)
    parse_blat(wkDir,overlap,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2)
    if offset > 0:
        offset_BLAT.append([contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2,overlap])
    
    #save contig1 information to dictionary
    posDict[i] = [chrom1,start1,end1,contigID1,Dir1,length1,overlap,offset,contigID2]


    #print('###################\n')
    #if i > 2:
    #    break
        



# CTG-0076 CTG-0353 
 chr1 202580 7069329 CTG-0076 rc 6872240 
 chr1 7064688 9594107 CTG-0353 rc 2527416

Overlap =  4641
Coordinates to extract for BLAT:  [0, 14641] [2512775, 2527416] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.966
14641 0 4568
14642 10112 14642

BLAT offset =  0

# CTG-0353 CTG-0052 
 chr1 7064688 9594107 CTG-0353 rc 2527416 
 chr1 9575524 17361463 CTG-0052 fwd 7832328

Overlap =  18583
Coordinates to extract for BLAT:  [0, 28583] [0, 28583] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.996
28583 0 18559
28583 65 18675

BLAT offset =  0

# CTG-0052 CTG-0231 
 chr1 9575524 17361463 CTG-0052 fwd 7832328 
 chr1 17343875 20909826 CTG-0231 rc 3565775

Overlap =  17588
Coordinates to extract for BLAT:  [7804740, 7832328] [3538187, 3565775] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.986
27589 10057 27589
27589 10168 27589

BLAT offset =  0

# CTG-0231 CTG-0143 
 chr1 17343875 20909826 CTG-0231 rc 3565775 
 chr1 20


# CTG-1354 CTG-1183 
 chr1 104528420 104558658 CTG-1354 fwd 29968 
 chr1 104542988 104606793 CTG-1183 rc 63633

Overlap =  15670
Coordinates to extract for BLAT:  [4298, 29968] [37963, 63633] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.955
25671 10248 25671
25671 10021 25671

BLAT offset =  0

# CTG-1183 CTG-0366 
 chr1 104542988 104606793 CTG-1183 rc 63633 
 chr1 104568678 106945658 CTG-0366 fwd 2388817

Overlap =  38115
Coordinates to extract for BLAT:  [0, 48115] [0, 48115] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.964
48115 0 38000
48115 0 37981

BLAT offset =  0

# CTG-0366 CTG-0626 
 chr1 104568678 106945658 CTG-0366 fwd 2388817 
 chr1 106947491 107986272 CTG-0626 rc 1036859

Overlap =  -1833
Gap between contigs

# CTG-0626 CTG-1009 
 chr1 106947491 107986272 CTG-0626 rc 1036859 
 chr1 107971823 108201839 CTG-1009 rc 194951

Overlap =  14449
Coordinates to extract for BLAT:  [0, 24449] [170502, 194951] 


#Parsing BLAT results Percent of

#Parsing BLAT results Percent of overlap matched in BLAT:  0.992
16691 0 6705
16692 10028 16689

BLAT offset =  0

# CTG-1542 CTG-0838 
 chr10 25484567 25506164 CTG-1542 rc 21472 
 chr10 25488111 25926063 CTG-0838 fwd 460067

Overlap =  18053
Coordinates to extract for BLAT:  [0, 28053] [0, 28053] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.968
21472 0 17947
28053 0 17635

BLAT offset =  0

# CTG-0838 CTG-0701 
 chr10 25488111 25926063 CTG-0838 fwd 460067 
 chr10 25889999 26731033 CTG-0701 rc 826784

Overlap =  36064
Coordinates to extract for BLAT:  [414003, 460067] [780720, 826784] 


# CTG-0701 CTG-0037 
 chr10 25889999 26731033 CTG-0701 rc 826784 
 chr10 26727734 35678751 CTG-0037 rc 9019875

Overlap =  3299
Coordinates to extract for BLAT:  [0, 13299] [9006576, 9019875] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.997
13299 0 3297
13300 10000 13300

BLAT offset =  0

# CTG-0037 CTG-0118 
 chr10 26727734 35678751 CTG-0037 rc 9019875 
 chr10 35

In [8]:
for i in posDict:
    print (posDict[i][0:9])

['chr18', 2619, 26992, 'CTG-1462', 'rc', 24712, 4849, 0, 'CTG-0201']
['chr18', 22143, 3966426, 'CTG-0201', 'rc', 3968428, 1729, 0, 'CTG-1861']
['chr18', 3964697, 3978047, 'CTG-1861', 'fwd', 13154, 2637, 0, 'CTG-0743']
['chr18', 3975410, 4660269, 'CTG-0743', 'fwd', 681475, 25890, 0, 'CTG-0770']
['chr18', 4634379, 5281622, 'CTG-0770', 'rc', 639396, 21363, 0, 'CTG-0177']
['chr18', 5260259, 9645181, 'CTG-0177', 'rc', 4363384, 6495, 0, 'CTG-0767']
['chr18', 9638686, 10286737, 'CTG-0767', 'rc', 647068, 2059, 0, 'CTG-0615']
['chr18', 10284678, 11357644, 'CTG-0615', 'rc', 1078170, 37187, 0, 'CTG-1278']
['chr18', 11320457, 11367174, 'CTG-1278', 'fwd', 37673, 40908, 0, 'CTG-0400']
['chr18', 11326266, 13501863, 'CTG-0400', 'rc', 2190587, -4852, 0, 'CTG-0564']
['chr18', 13506715, 14852995, 'CTG-0564', 'rc', 1349347, -1291, 0, 'CTG-0274']
['chr18', 14854286, 17956069, 'CTG-0274', 'rc', 3082702, -8964, 0, 'CTG-0878']
['chr18', 17965033, 18359454, 'CTG-0878', 'rc', 395029, -30419, 0, 'CTG-1134']
['ch

In [None]:
def process_first_agp_contig(agp_chrom, start, end, contigID, direction, length, overlap, offset, pairedContig):
    agp_prev_chrom, agp_prev_overlap, prev_offset = agp_chrom, 0, 0
    
    if direction == 'fwd':
        direction = '+' #changes notation of the direction
    if direction == 'rc':
        direction = '-'  #changes notation of the direction
    
    #Specially process those with offsets > 0
    if offset > 0 or prev_offset > 0:
        info = process_blat_offsets(agp_prev_chrom, agp_prev_overlap, prev_offset, agp_start, agp_end, overlap, offset,direction)
    #Those with BLAT offsets = 0
    else:
        agp_start, agp_end = 1, length
        contig_start, contig_end = 1, length
        info = [agp_prev_chrom,agp_prev_overlap,agp_start, agp_end,contig_start,contig_end,direction,offset]
    
    return info
########################################################################
def process_blat_offsets(agp_prev_chrom, agp_prev_overlap, prev_offset, agp_start, agp_end, overlap, offset,direction):
    if agp_prev_overlap < 5 and agp_prev_overlap > 0:
        agp_prev_overlap = 0
        
    if prev_offset ==0  and offset > 0: #contig left of offset
        agp_start = agp_end + 1
        agp_end = agp_start + length + agp_prev_overlap

        #Determine contig coordinates that align
        if direction == 'fwd':
            direction = '+' #changes notation of the direction
            contig_start = 1 + agp_prev_overlap
            contig_end = length
        if direction == 'rc':
            direction = '-'  #changes notation of the direction
            contig_start = 1
            contig_end = length - agp_prev_overlap

    if prev_offset > 0 and offset == 0: #contig right of offset
        agp_start = agp_end + 500 + 2 #compensate for added gap between the two which has length = 500
        agp_end = agp_start + length + agp_prev_overlap

        if direction == 'fwd':
            direction = '+' #changes notation of the direction
            contig_start = 1 + agp_prev_overlap
            contig_end = length
        if direction == 'rc':
            direction = '-'  #changes notation of the direction
            contig_start = 1
            contig_end = length - agp_prev_overlap  
            
    if prev_offset > 0 and offset > 0:
        print('This type of alignment needs to be readdressed...\n')
        sys.exit()
        
        
    info = [agp_prev_chrom,agp_prev_overlap,agp_start, agp_end,contig_start,contig_end,direction,offset]
    
    return info

########################################################################
def process_next_agp_contig(agp_chrom, direction, length, overlap, offset, pairedContig, agp_start, agp_end):
    agp_prev_chrom, agp_prev_overlap, prev_offset = posDict[i-1][0], int(posDict[i-1][6]),int(posDict[i-1][7])
    
    #Specially address those with BLAT offsets (unaligned sequence at the junctions)
    if prev_offset > 0 or offset > 0:
        info = process_blat_offsets(agp_prev_chrom, agp_prev_overlap, prev_offset, agp_start, agp_end, overlap, offset,direction)
        return info
    
    #Junctions without unaligned sequence(s):
    else:
        if agp_prev_overlap < 5:
            agp_prev_overlap = 0

        agp_start = agp_end + 1
        agp_end = agp_start + length + agp_prev_overlap

        #Determine contig coordinates that align
        if direction == 'fwd':
            direction = '+' #changes notation of the direction
            contig_start = 1 + agp_prev_overlap
            contig_end = length
        if direction == 'rc':
            direction = '-'  #changes notation of the direction
            contig_start = 1
            contig_end = length - agp_prev_overlap

        info = [agp_prev_chrom,agp_prev_overlap,agp_start, agp_end,contig_start,contig_end,direction,offset]
        return info       
        
########################################################################  
def process_agp_GAP(agp_chrom, direction, length, overlap, offset, pairedContig, agp_start, agp_end):
    gap_length = 10000
    agp_start = agp_end + 1
    agp_end = agp_start + gap_length
    if 'CTG' in contigID:
        gap_type = 'contig'
    else:#To change once we add more types of contigs in here
        gap_type = 'OTHER'
    agp_prev_overlap = 0
    
    info = [agp_chrom,agp_start,agp_end,gap_type,gap_length,agp_prev_overlap]
    return info  

########################################################################  
def write_AGP_header(agpFile): #alter for each assembly
    agpFile.write('##agp-version 1.0\n# ORGANISM: Canis lupus familiaris\n# TAX_ID: 9615\n')
    agpFile.write('# ASSEMBLY NAME: Zoey_v1\n# ASSEMBLY DATE: 19-April-2017\n')
    agpFile.write('# GENOME CENTER: University of Michigan - J.M. Kidd Lab\n')
    agpFile.write('# DESCRIPTION: AGP specifying the assembly of chromosome 18 from primary PacBio contigs from FALCON assembly\n')      


Now putting the information in the curated contig set (contigs constituting the golden path) into the coordinates of Zoey, and adding gap positions, if present. 

In [9]:
#Defining AGP outfile
agpFile = open(wkDir + 'results/AGP_Zoey_Assembly_v1.txt', 'w')

#Now putting them in the coordinates of the Zoey genome!! :) 
count, agp = 0, [] 
print('agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction')

for i in posDict:
    count += 1
    agp_chrom, start, end, contigID, direction, length, overlap, offset, pairedContig = posDict[i][0:9]
    print('\n#',contigID)
    print(posDict[i])
    print('....')
    #print(count,agp_chrom, start, end, contigID, direction, length, overlap, offset, pairedContig)    
    
    #Processing first contig in dataset -OR- on a new chromosome
    if count == 1 or agp_chrom != agp_prev_chrom: 
        count = 1
        info = process_first_agp_contig(agp_chrom, start, end, contigID, direction, length, overlap, offset, pairedContig)
        agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction, offset = info[0:10]
        agp.append([agp_prev_chrom, agp_start, agp_end, count, 'D', contigID, contig_start, contig_end, direction ])
        print(agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction)
    
    #Process next contig (not first contig on chrom)
    else:
        if overlap > 0: #if contigs overlap --> NO GAP!
            info = process_next_agp_contig(agp_chrom, direction, length, overlap, offset, pairedContig, agp_start, agp_end)
            agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction,offset = info[0:10]
            if offset > 0:
                #Add contig to the left of the gap
                agp.append([agp_chrom, agp_start, agp_end, count, 'D', contigID, contig_start, contig_end, direction])
                print(agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction)
                #Add introduced gap from unaligned sequences from contig-contig junction BLAT
                count += 1
                agp.append([agp_chrom, agp_end+1, agp_end+501, count, 'U', '500','BLAT_gap','no','na'])
                print(agp_chrom, agp_end+1, agp_end+501, count, 'U', '500','BLAT_gap','no','na')
            else:
                agp.append([agp_chrom, agp_start, agp_end, count, 'D', contigID, contig_start, contig_end, direction])         
                print(agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction)
        else: #if contigs do not overlap --> GAP!
            #process contig to the left of the gap
            info = process_next_agp_contig(agp_chrom, direction, length, overlap, offset, pairedContig, agp_start, agp_end)
            agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction,offset = info[0:10]
            agp.append([agp_chrom, agp_start, agp_end, count, 'D', contigID, contig_start, contig_end, direction])
            print(agp_chrom, agp_start, agp_end, count, 'D', contigID, contig_start, contig_end, direction)
            
            #processing gap
            count += 1
            gap_info = process_agp_GAP(agp_chrom, direction, length, overlap, offset, pairedContig, agp_start, agp_end)
            agp_chrom,agp_start,agp_end,gap_type,gap_length,agp_prev_overlap = gap_info[0:7]
            agp.append([agp_chrom,agp_start,agp_end,count,'U',gap_length,gap_type,'no','na'])
            print(agp_chrom,agp_start,agp_end,count,'U',gap_length,gap_type,'no','na')
    #if count > 15:
    #    break

#Write AGP header lines
write_AGP_header(agpFile)

#Write out to agp File the information for contigs/gaps
for i in range(0,len(agp)):
    agpFile.write("\t".join(map(str,agp[i])) + '\n')
agpFile.close()
logFile.close()

agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction

# CTG-1462
['chr18', 2619, 26992, 'CTG-1462', 'rc', 24712, 4849, 0, 'CTG-0201']
....
chr18 0 1 24712 1 24712 -

# CTG-0201
['chr18', 22143, 3966426, 'CTG-0201', 'rc', 3968428, 1729, 0, 'CTG-1861']
....
chr18 4849 24713 3997990 1 3963579 -

# CTG-1861
['chr18', 3964697, 3978047, 'CTG-1861', 'fwd', 13154, 2637, 0, 'CTG-0743']
....
chr18 1729 3997991 4012874 1730 13154 +

# CTG-0743
['chr18', 3975410, 4660269, 'CTG-0743', 'fwd', 681475, 25890, 0, 'CTG-0770']
....
chr18 2637 4012875 4696987 2638 681475 +

# CTG-0770
['chr18', 4634379, 5281622, 'CTG-0770', 'rc', 639396, 21363, 0, 'CTG-0177']
....
chr18 25890 4696988 5362274 1 613506 -

# CTG-0177
['chr18', 5260259, 9645181, 'CTG-0177', 'rc', 4363384, 6495, 0, 'CTG-0767']
....
chr18 21363 5362275 9747022 1 4342021 -

# CTG-0767
['chr18', 9638686, 10286737, 'CTG-0767', 'rc', 647068, 2059, 0, 'CTG-0615']
....
chr18 6495 9747023 10400586 1 640573 -

# CTG-061

In [26]:
###################################################################################################
def extract_contig_fasta(fastaRoot,contigID,orient,extract_coord1,extract_coord2):
    #Contig 1
    fasta_path1 = fastaRoot + contigID + '/' + contigID + '.fa'
    cmd = 'samtools faidx %s %s:%i-%i  > %stemp/contig1.fa' % (fasta_path1,contigID,extract_coord1,extract_coord2,wkDir)
    #print(cmd)
    subprocess.call(cmd,shell=True)
###################################################################################################    
def reverse_comp_contig_fasta(wkDir):
    cmd = 'fastarevcomp %stemp/contig1.fa > %stemp/contig1.fa.rc' % (wkDir,wkDir)
    #print(cmd)
    subprocess.call(cmd,shell=True)
    
    cmd = 'mv %stemp/contig1.fa.rc %stemp/seq.fa'  % (wkDir,wkDir)
    #print(cmd)
    subprocess.call(cmd,shell=True) 
###################################################################################################    
def read_in_fasta(wkDir):    
    inFile = open(wkDir + 'temp/seq.fa', 'r')
    for line in inFile:
        line=line.rstrip()
        if '>' in line:
            continue
        else:
            seq.append(line)
    inFile.close()
    return seq

#Reformat the fasta file
agp_Fasta = open(wkDir + 'results/AGP_Zoey_Assembly_v1.fa', 'r')
tmpFile = open(wkDir + 'results/tmp.fa', 'w')

###################################################################################################    
def reformat_fasta_file(wkDir):
    agp_Fasta = open(wkDir + 'results/AGP_Zoey_Assembly_v1.fa', 'r')
    tmpFile = open(wkDir + 'results/tmp.fa', 'w')

    for line in agp_Fasta:
        if '>' in line:
            tmpFile.write(line)#Writes chrom ID
            continue
        seq=line.rstrip()

        start = 0
        fasta_length = float(len(seq))
        fasta_line_length = 80
        Max = int(fasta_length/fasta_line_length)

        for i in range(0,Max+1):
            Line = seq[start:start+fasta_line_length]
            start = start + fasta_line_length
            tmpFile.write(Line + '\n')
            """if i > 10:
                break"""    

    agp_Fasta.close()
    tmpFile.close()

    cmd = 'mv %sresults/tmp.fa %sresults/AGP_Zoey_Assembly_v1.fa'  % (wkDir,wkDir)
    #print(cmd)
    subprocess.call(cmd,shell=True) 

In [32]:
agpFile = open(wkDir + 'results/AGP_Zoey_Assembly_v1.txt', 'r')
fastaRoot = '/home/ampend/links/kidd-lab/jmkidd-projects/zoey/contig-assignment/kmer-matches/eval1/'
agp_Fasta = open(wkDir + 'results/AGP_Zoey_Assembly_v1.fa2', 'w')

contigID = ''
count, prev_chrom = 0, ''

for line in agpFile:
    if '#' in line:
        continue
    count+= 1
    
    line = line.rstrip().split('\t')
    contigID = line[5]
    
    seq = []
    
    if 'CTG' in contigID:
        chrom, contigID, extract_coord1, extract_coord2, orient = line[0],line[5],int(line[6]),int(line[7]), line[8]
        #Extract fasta
        extract_contig_fasta(fastaRoot,contigID,orient,extract_coord1,extract_coord2)
    
        #Reverse complement extracted FASTA if needed
        if '-' == orient:
            reverse_comp_contig_fasta(wkDir)
        else:
            cmd = 'mv %stemp/contig1.fa %stemp/seq.fa'  % (wkDir,wkDir)
            #print(cmd)
            subprocess.call(cmd,shell=True) 
        
        seq = read_in_fasta(wkDir)
    else: #These are gaps   
        gap_length = int(line[5])
        seq = 'N' * gap_length
    
    #Writes '>' chromosome to the fasta file
    if count == 1 or prev_chrom != chrom:
        if count > 1:
            agp_Fasta.write('\n')
        agp_Fasta.write('>' + chrom + '\n')
    
    for i in range(0,len(seq)):
        Seq = seq[i].upper()#.upper()print(seq[i])
        agp_Fasta.write(Seq)
        #agp_Fasta.write(seq[i])

    prev_chrom = chrom #reset prev_chrom identity


print('Finished writing out the sequence for %i contigs and gaps to Zoey fasta file' % count)
agpFile.close()
agp_Fasta.close()


#Reformat the fasta file to only have certain # of nucleotides per line
reformat_fasta_file(wkDir)

Finished writing out the sequence for 73 contigs and gaps to Zoey fasta file


0

Have generated the golden path file. Below, I'm calculating statistics on the AGP. 

In [74]:
agpFile = open(wkDir + 'results/AGP_Zoey_Assembly_v1.txt', 'r')

contig_count,gap_count,line_count,blat_gap_count = 0,0,0,0
prev_Type,start_stop_list,contig_start = '',[],1

for line in agpFile:
    if '#' in line: #skips header
        continue    
    line = line.rstrip().split('\t')
    line_count += 1
    chrom,start,end,num,Type = line[0], int(line[1]),int(line[2]),int(line[3]),line[4]
    if 'D' in Type:
        #this is a contig
        contig_count += 1
        if 'U' in prev_Type:
            contig_start = start
    if 'U' in Type:
        #this is a gap
        gap_count += 1
        if 'BLAT' in line[6]:
            blat_gap_count+=1
        #print(start,end)
        if 'D' in prev_Type:
            contig_end = start - 1
            start_stop_list.append([contig_start,contig_end])
    prev_Type = Type
    
print('##Counts\n%i contigs are in the Golden Path for Zoey' % contig_count)
print('%i gaps remain in the Golden Path for Zoey' % gap_count)
print('Of these %i gaps, %i gap(s) correspond(s) to BLAT gaps (unaligned sequences where gaps were introduced)' %(gap_count,blat_gap_count))

continuous_contig_array = []
for i in range(0,len(start_stop_list)):
    start,end = int(start_stop_list[i][0]),int(start_stop_list[i][1])
    length = end-start
    continuous_contig_array.append(length)

Average_continuity = format(np.mean(continuous_contig_array), '.2f') 
print('\n##Statistics\nAverage continuity (bp): ', Average_continuity)  



##Counts
49 contigs are in the Golden Path for Zoey
24 gaps remain in the Golden Path for Zoey
Of these 24 gaps, 1 gap(s) correspond(s) to BLAT gaps (unaligned sequences where gaps were introduced)

##Statistics
Average continuity (bp):  2261880.79


In [75]:
chrom_length_file = open(wkDir + 'input/cyto_band.bed','r')
chrom_lengths = {}

for line in chrom_length_file:
    line=line.rstrip().split('\t')
    chrom,length = line[0],int(line[2])
    if 'chrUn' in chrom:
        continue
    chrom_lengths[chrom] = length
    
print (chrom_lengths)

{'chr5': 88915250, 'chr6': 77573801, 'chr16': 59632846, 'chr34': 42124431, 'chr14': 60966679, 'chr33': 31377067, 'chr4': 88276631, 'chr32': 38810281, 'chr31': 39895921, 'chr13': 63241923, 'chr28': 41182112, 'chr38': 23914537, 'chr36': 30810995, 'chr12': 72498081, 'chr10': 69331447, 'chr21': 50858623, 'chr22': 61439934, 'chrM': 16727, 'chr24': 47698779, 'chr25': 51628933, 'chrX': 123869142, 'chr3': 91889043, 'chr27': 45876710, 'chr17': 64289059, 'chr2': 85426708, 'chr35': 26524999, 'chr8': 74330416, 'chr30': 40214260, 'chr18': 55844845, 'chr19': 53741614, 'chr29': 41845238, 'chr11': 74389097, 'chr23': 52294480, 'chr15': 64190966, 'chr37': 30902991, 'chr7': 80974532, 'chr9': 61074082, 'chr20': 58134056, 'chr26': 38964690, 'chr1': 122678785}


In [83]:
gapFile = open(wkDir + 'input/gaps.bed.sorted','r')
#to keep track of the previous call
prev_chrom, prev_start, prev_end = '', '', ''
#save continuous stretches to array
cont_pos_list, per_chrom, count =  [], {}, 0 


for line in gapFile:
    line=line.rstrip().split('\t')
    chrom,start,end=line[0],int(line[1]),int(line[2])
    if 'chrUn' in chrom:
        continue
    #FOR TESTING
    """if 'chr18' not in chrom:
        continue"""
    count+=1

    #If first line in file
    if count == 1:
        per_chrom[chrom] = []
        if start == 0: #gap is at the beginning of the chromosome
            #Save for next
            cont_start = end
            prev_chrom, prev_start, prev_end = chrom, start, end
            continue
        #first continuous stretch on chromosome
        cont_start = 0
        cont_end = start - 1
        cont_pos_list.append([count-1,cont_start,cont_end])
        per_chrom[chrom].append([count-1,cont_start,cont_end])
        
        #next continuous stretch, define start
        cont_start = end
        #Save for next
        prev_chrom, prev_start, prev_end = chrom, start, end
        continue
    #If the next call is on a different chromosome, must save the previous gap's info
    if chrom not in prev_chrom:
        per_chrom[chrom] = []
        #previous continuous stretch
        cont_end = chrom_lengths[prev_chrom]
        cont_pos_list.append([count-1,cont_start,cont_end])
        per_chrom[chrom].append([count-1,cont_start,cont_end])
        #first continuous stretch on new chromosome
        cont_start = 0
        cont_end = start - 1
        #next adjacent stretch, define start
        cont_start = end      
    else:
        cont_end = start - 1
        cont_pos_list.append([count-1,cont_start,cont_end])
        per_chrom[chrom].append([count-1,cont_start,cont_end])
        #next adjacent strech, define start
        cont_start = end
        
    prev_chrom, prev_start, prev_end = chrom, start, end
    

print('There are %i gaps in the canFam3 assembly (not including chrUn)' % count)
gapFile.close()


#Calculating the continuity statistics
continuous_contig_array = []
for i in range(0,len(cont_pos_list)):
    start,end = int(cont_pos_list[i][1]),int(cont_pos_list[i][2])
    length = end-start
    continuous_contig_array.append(length)    
Average_continuity = format(np.mean(continuous_contig_array), '.2f')
print('Average continuity (bp): ', Average_continuity)  


    


There are 19553 gaps in the canFam3 assembly (not including chrUn)
Average continuity (bp):  118532.19


In [116]:
#Continuity values per chromosome
for i in per_chrom:
    chrom = i
    print('\n#',chrom)
    print('%i gaps on %s' % (len(per_chrom[chrom]),chrom))
    continuous_contig_array = []
    for j in per_chrom[chrom]:
        start = int(j[1])
        end = int(j[2])
        length = end - start
        continuous_contig_array.append(length)
    Average_continuity = format(np.mean(continuous_contig_array), '.2f')
    print('Average continuity (bp): ', Average_continuity)


# chr32
190 gaps on chr32
Average continuity (bp):  202505.99

# chr26
391 gaps on chr26
Average continuity (bp):  99154.57

# chr16
442 gaps on chr16
Average continuity (bp):  133897.94

# chr34
297 gaps on chr34
Average continuity (bp):  140893.48

# chr14
345 gaps on chr14
Average continuity (bp):  175115.68

# chr33
264 gaps on chr33
Average continuity (bp):  118442.50

# chr1
1121 gaps on chr1
Average continuity (bp):  108669.12

# chrX
1032 gaps on chrX
Average continuity (bp):  119702.28

# chr13
488 gaps on chr13
Average continuity (bp):  129005.97

# chr9
764 gaps on chr9
Average continuity (bp):  79047.59

# chr28
395 gaps on chr28
Average continuity (bp):  103803.70

# chr7
607 gaps on chr7
Average continuity (bp):  132802.75

# chr12
482 gaps on chr12
Average continuity (bp):  150218.18

# chr10
744 gaps on chr10
Average continuity (bp):  92985.91

# chr21
374 gaps on chr21
Average continuity (bp):  135844.89

# chr22
371 gaps on chr22
Average continuity (bp):  165163.49

