In [1]:
#2017-05-02
#A. Pendleton
#This script is used to make an AGP file for Zoey. Can be built upon as new forms
#   of contigs become assembled and incorporated into the AGP.

#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import subprocess

#import genutils
import os
import sys
import numpy as np
import re
import scipy 
import matplotlib.patches as patches
import glob

#This script is used to make an AGP file for Zoey. Can be built upon as new contigs become assembled and incorporated into the available set for assembly.

In [10]:
wkDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/'
print('Current working directory is:\n', wkDir)

logFile = open(wkDir + 'temp/iPythonNotebook_LogFile.txt','w')

Current working directory is:
 /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/


In [3]:
#####READING IN PRIMARY CONTIG COORDINATES FROM MUMMER ALIGNMENT#######
#Primary contig alignments (no canu processing)
primaryContigCoord = open(wkDir + 'input/primary.2017-04-10.txt', 'r')
primaryContigCoordBed = open(wkDir + 'input/primary.2017-04-10.bed', 'w')

contigDict,index = {}, 0

for line in primaryContigCoord:
    if line.startswith("#") is True: #skips header line
        continue
    line = line.rstrip().split('\t')
    contigID, length, Dir, chrom, start, end = line[0],line[1],line[2],line[3],int(line[4]),int(line[5])    
    if 'chr18' not in chrom: #FILTERING ONLY FOR CHROM 18 FOR NOW!!! REMOVE LATER 
        continue
    index += 1
    contigDict[index] = [chrom,start,end,contigID,Dir,length]
    primaryContigCoordBed.write('%s\t%i\t%i\t%s\n' % (chrom,int(start-1),end,contigID))
    #if index > 15: #FOR TESTING--- REMOVE LATER
    #    break
    
primaryContigCoord.close()
primaryContigCoordBed.close()
print ('Identified coordinates for %i primary contigs' % len(contigDict))
logFile.write('Identified coordinates for %i primary contigs\n\n' % len(contigDict))

Identified coordinates for 77 primary contigs


Step 1. Read in coordinates from alignment file based on Mummer of primary contigs (per Jeff)
Step 2. BLAT each contig against the contig proximal to that proximal to it

In [12]:
def read_line(Dict,curr_index,i):
    #Contig 1
    chrom1,start1,end1,contigID1,Dir1,length1 = Dict[curr_index][0],int(Dict[curr_index][1]),int(Dict[curr_index][2]), Dict[curr_index][3], Dict[curr_index][4], int(Dict[curr_index][5])
    #Contig 2
    chrom2,start2,end2,contigID2,Dir2,length2 = Dict[i+1][0],int(Dict[i+1][1]),int(Dict[i+1][2]), Dict[i+1][3], Dict[i+1][4], int(Dict[i+1][5])
    return chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2
###################################################################################################
def read_last_line(Dict):
    #Contig 1
    chrom1,start1,end1,contigID1,Dir1,length1 = Dict[i-1][0],int(Dict[i-1][1]),int(Dict[i-1][2]), Dict[i-1][3], Dict[i-1][4], int(Dict[i-1][5])
    #Contig 2
    chrom2,start2,end2,contigID2,Dir2,length2 = Dict[i][0],int(Dict[i][1]),int(Dict[i][2]), Dict[i][3], Dict[i][4], int(Dict[i][5])
    return chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2
###################################################################################################
def process_last_contig(contigDict,curr_index,i):
    chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2 = read_last_line(contigDict)
    print ('\n#%s (%i) -- %s (%i)' % (contigID1,i-1,contigID2,i))
    logFile.write(('\n#%s (%i) -- %s (%i)\n' % (contigID1,i-1,contigID2,i)))
    if start1 == start2 or end1 == end2:
        return
    if end2 > end1:
        curr_index+=1 
        curated_contigDict[i] = [chrom2,start2,end2,contigID2,Dir2,length2]
        inList.append(contigID2)
        print('PASS - Last contig on chromosome overlap or is spaced correctly')
        logFile.write('PASS - Last contig on chromosome overlap or is spaced correctly\n')
        return
###################################################################################################
def process_first_contig(contigDict,curr_index,i):
    curr_index+=1
    #Always save first and contig in the dictionary/dataset
    curated_contigDict[i] = [chrom1,start1,end1,contigID1,Dir1,length1]
    curated_contigDict[i+1] = [chrom2,start2,end2,contigID2,Dir2,length2]
    inList.append(contigID2)
    print('PASS - First call in dataset')
    logFile.write('PASS - First call in dataset\n')
    return curr_index 
###################################################################################################
def process_same_starts(contigDict,curr_index,i):
    if start1 == start2 and end1 > end2:
        print('FAIL -- Share start, contig1 longer' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        logFile.write('FAIL -- Share start, contig1 longer' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]) + '\n')
        return
    if start1 == start2 and end1 < end2: #overwrite previous position if contig 2 is longer than contig 1
        curated_contigDict[i] = [chrom2,start2,end2,contigID2,Dir2,length2]
        print('FAIL -- Share start, contig2 longer' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        logFile.write('FAIL -- Share start, contig2 longer' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]) + '\n')
        return
###################################################################################################
def process_same_ends(contigDict,curr_index,i):    
    if start1 < start2 and end1 == end2:
        print('Ends same, kept contig 1 only' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        logFile.write('Ends same, kept contig 1 only' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]) + '\n')
        return 
    if start2 < start1 and end1 == end2: 
        print('ERROR - is this contig list sorted??!!')
        logFile.write('ERROR - is this contig list sorted??!!\n\n\n')
        sys.exit(1)

In [13]:
curated_contigDict, curr_index, inList = {}, 1, []
chrom1,chrom2 = '0', '0' #initializing values


for key, value in contigDict.items(): 
    i = int(key)
    #1. If last contig in dataset 
    if i == len(contigDict):
        continue
    
    #2. If contigs are on different chromosomes:
    if chrom1 != chrom2:
        process_last_contig(contigDict,curr_index,i)
        continue
        
    chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2 = read_line(contigDict,curr_index,i)
    print ('\n#%s (%i) -- %s (%i)' % (contigID1,curr_index,contigID2,i+1))
    logFile.write('\n#%s (%i) -- %s (%i)\n' % (contigID1,curr_index,contigID2,i+1))

    #3. Automatically saves first contig in dataset
    if curr_index == 1 and len(curated_contigDict) == 0:
        curr_index = process_first_contig(contigDict,curr_index,i)
        continue
    
    #4. If contigs have the same start site, choose the longest contig
    if start1 == start2:
        process_same_starts(contigDict,curr_index,i)
        continue

    #5. If contigs have same end coordinate, choose longest contig (i.e. contig #1 if sorted)
    if end1 == end2: 
        process_same_ends(contigDict,curr_index,i)
        continue
        
    #6. If contigs #1 and #2 have same start AND end, keep contig #1
    if start1 == start2 and end1 == end2:
        print('FAIL -- Same start and end coordinate' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        logFile.write('FAIL -- Same start and end coordinate' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]) + '\n')
        continue #because contig1 is already in the dictionary
        
    #7. If contig #2 is fully within contig #1 -- continue
    if start1 < start2 and end1 > end2: # contig2 is fully within contig1
        #curated_contigDict[curr_index] = [chrom1,start1,end1,contigID1,Dir1,length1]
        print('FAIL -- Contig2 fully within contig1' + '\nKeeping contig: ' + contigID1 +  '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        logFile.write('FAIL -- Contig2 fully within contig1' + '\nKeeping contig: ' + contigID1 +  '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]) +'\n')
        continue
    if start1 > start2 and end1 < end2:
        print('ERROR - is this contig list sorted??!!')
        logFile.write('ERROR - is this contig list sorted??!!\n\n\n')
        sys.exit(1)
    
    #8. If contig passes all these - then automatically saves
    curr_index=i+1
    #if contigID2 not in inList:
    curated_contigDict[i] = [chrom2,start2,end2,contigID2,Dir2,length2]
    print('PASS - contigs overlap or are spaced correctly')
    logFile.write('PASS - contigs overlap or are spaced correctly\n')

print ('Identified CURATED coordinates for %i primary contigs' % len(curated_contigDict))
logFile.write('\n##Identified CURATED coordinates for %i primary contigs\n\n########\n\n' % len(curated_contigDict))


#Re-naming dictionary keys
count=0
for keys in curated_contigDict:
    count+=1
    curated_contigDict[count]=curated_contigDict.pop(keys)



#CTG-1462 (1) -- CTG-0201 (2)
PASS - First call in dataset

#CTG-0201 (2) -- CTG-2401 (3)
FAIL -- Contig2 fully within contig1
Keeping contig: CTG-0201
['chr18', 22143, 3966426, 'CTG-0201', 'rc', '3968428']
['chr18', 28961, 33381, 'CTG-2401', 'fwd', '4387']

#CTG-0201 (2) -- CTG-1986 (4)
FAIL -- Contig2 fully within contig1
Keeping contig: CTG-0201
['chr18', 28961, 33381, 'CTG-2401', 'fwd', '4387']
['chr18', 1781533, 1791763, 'CTG-1986', 'rc', '10221']

#CTG-0201 (2) -- CTG-1861 (5)
PASS - contigs overlap or are spaced correctly

#CTG-1861 (5) -- CTG-0743 (6)
PASS - contigs overlap or are spaced correctly

#CTG-0743 (6) -- CTG-0770 (7)
PASS - contigs overlap or are spaced correctly

#CTG-0770 (7) -- CTG-0177 (8)
PASS - contigs overlap or are spaced correctly

#CTG-0177 (8) -- CTG-0767 (9)
PASS - contigs overlap or are spaced correctly

#CTG-0767 (9) -- CTG-0615 (10)
PASS - contigs overlap or are spaced correctly

#CTG-0615 (10) -- CTG-1278 (11)
PASS - contigs overlap or are spaced cor

In [7]:
"""print('Non-curated:')
for i in contigDict:
    print(i,contigDict[i])
print( '\nCurated:')
for i in curated_contigDict:
    print(i, curated_contigDict[i])"""

"print('Non-curated:')\nfor i in contigDict:\n    print(i,contigDict[i])\nprint( '\nCurated:')\nfor i in curated_contigDict:\n    print(i, curated_contigDict[i])"

In [32]:
def process_first_contig_on_chrom(chrom1,start1,end1,contigID1,Dir1,length1):
    chrom2 = chrom1
    start2 = start1 - 10000
    end2 = start1 + 10000
    contigID2 = 'canFam3'
    Dir2 = 'fwd'
    length1 = end2-start2
    
    if start2 < 0:
        start2 = 0
    
    #Determine overlap 
    determine_contig_overlap(end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2)
 
    #Find coordinates to extract
    find_overlapping_coordinates(overlap,end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2)
    
    #Extract FASTA
    fastaRoot = '/home/ampend/links/kidd-lab/genomes/canFam3.1/'
    extract_fasta(fastaRoot,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2)
###################################################################################################
def determine_contig_overlap(end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2):
    #How much do the contigs overlap?
    global overlap
    overlap = end1 - start2
    print('\nOverlap = ', overlap)
    #OVERLAPPING CONTIGS
    if overlap > 0:
        find_overlapping_coordinates(overlap,end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2)
    #DIRECTLY ADJACENT CONTIGS
    if overlap == 0:
        print('Contigs are directly adjacent to one another')
        logFile.write('Contigs are directly adjacent to one another\n')
        #WRITE FUNCTION FOR THESE
    #CONTIGS WITH GAP BETWEEN THEM
    if overlap < 0:
        print('Gap between contigs')
        logFile.write('Gap between contigs\n')

###################################################################################################
def find_overlapping_coordinates(overlap,end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2):
    ###Determine coordinates to extract for BLAT
    global extract_coord1
    global extract_coord2
    #Contig #1
    if 'fwd' in Dir1:
        extract_coord1 = [length1 - overlap - 10000, length1]
    else:
        extract_coord1 = [0, overlap + 10000]
    #Contig #2
    if 'fwd' in Dir2:
        extract_coord2 = [0, overlap + 10000] 
    else:
        extract_coord2 = [length2 - overlap - 10000, length2]
    
    print ('Coordinates to extract for BLAT: ',extract_coord1,extract_coord2,'\n')
    logFile.write('Coordinates to extract for BLAT: ' + str(extract_coord1) + str(extract_coord2) + '\n')

    #safety check
    for i in range(0,1): #CHECKS 
        #= if the region to extract extends beyond the length of the contig, if so then the coordinate changes to the length of contig
        if extract_coord1[i] > length1:
            extract_coord1[i] = length1
        if extract_coord2[i] > length2:
            extract_coord2[i] = length2
        # If the region extended too far, and the value is negative -- then make starting extraction coordinate = 0
        if extract_coord1[i] < 0: 
            extract_coord1[i] = 0
        if extract_coord2[i] < 0:
            extract_coord2[i] = 0
            
    if overlap < 0: #### SEND TO DIFFERENT FUNCTION LATER -- FOLLOW OVERLAPPING CONTIGS FOR NOW
        print('Contigs do not overlap')
        logFile.write('Contigs do not overlap\n')
    #Where to find fasta of contigs
    fastaRoot = '/home/ampend/links/kidd-lab/jmkidd-projects/zoey/contig-assignment/kmer-matches/eval1/'
    extract_fasta(fastaRoot,contigID1,Dir1,extract_coord1, contigID2,Dir2,extract_coord2)
    
###################################################################################################
def extract_fasta(fastaRoot,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2):
    #Contig 1
    fasta_path1 = fastaRoot + contigID1 + '/' + contigID1 + '.fa'
    cmd = 'samtools faidx %s %s:%i-%i  > %stemp/contig1.fa' % (fasta_path1,contigID1,extract_coord1[0],extract_coord1[1],wkDir)
    logFile.write(cmd + '\n')
    subprocess.call(cmd,shell=True)
    
    #Contig 2
    fasta_path2 = fastaRoot + contigID2 + '/' + contigID2 + '.fa'
    cmd = 'samtools faidx %s %s:%i-%i > %stemp/contig2.fa' % (fasta_path2,contigID2,extract_coord2[0],extract_coord2[1],wkDir)
    logFile.write(cmd + '\n')
    subprocess.call(cmd,shell=True)
    
###################################################################################################
def run_blat(wkDir):
    blatcmd = 'blat %stemp/contig1.fa %stemp/contig2.fa %stemp/temp.blat' % (wkDir,wkDir,wkDir)
    logFile.write(blatcmd + '\n')
    subprocess.call(blatcmd,shell=True)
    
    
###################################################################################################
def parse_blat(wkDir,overlap,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2):
    inFile = open('%stemp/temp.blat' % (wkDir),'r')
    lineCount, cleanAlignment, offset = 0, False, 0
    for line in inFile:
        line=line.rstrip().split('\t')
        lineCount+=1
        if lineCount > 5:#skips the BLAT results headers
            score,strand = int(line[0]),line[8]
            percent_of_overlap = float(score)/overlap
            if percent_of_overlap < float(0.75):
                continue
            print('\n#Parsing BLAT results','Percent of overlap matched in BLAT: ',format(percent_of_overlap, '.3f'))
            logFile.write('\n#Parsing BLAT results\n' + 'Percent of overlap matched in BLAT: ' + format(percent_of_overlap, '.3f') + ' \n')
            #CHECKS TO MAKE SURE WE DID THIS RIGHT - correct end vs correct end
            #if '+' not in strand:
            #    print('ERROR: Top hit not in proper orientation.... SKIPPING -- PLEASE CHECK')
            #    return
            #parse contig #1 (left contig)
            blat_length1, blat_start1, blat_end1 = int(line[14]),int(line[15]),int(line[16])
            print(blat_length1, blat_start1, blat_end1)
            logFile.write('length = %i, start = %i, end = %i' % (blat_length1, blat_start1, blat_end1))
            #parse contig #2 (right contig)
            blat_length2, blat_start2, blat_end2 = int(line[10]),int(line[11]),int(line[12])
            print(blat_length2, blat_start2, blat_end2)
            logFile.write('length = %i, start = %i, end = %i' % (blat_length2, blat_start2, blat_end2))
            #Calculate offset  - to compensate for non-clean alignments (extension of left contig that does not overlap with adjacent contig)
            offset = calculate_offset(overlap,blat_length1, blat_start1, blat_end1,Dir1,blat_length2, blat_start2, blat_end2,Dir2)
            return offset
###################################################################################################
def calculate_offset(overlap,blat_length1, blat_start1, blat_end1,Dir1,blat_length2, blat_start2, blat_end2,Dir2):
    global offset
    if 'rc' in Dir1:
        offset = blat_start1 - 0
    else:
        offset = blat_length1 - blat_end1
    if offset < 3:
        offset = 0
    print('\nBLAT offset = ', offset)
    logFile.write('\nBLAT offset = ' + str(offset) + '\n')
    return offset

###################################################################################################
    

In [33]:
#write_AGP_header(agpFile)
posDict, offset_BLAT = {}, []

for i in range(1,len(curated_contigDict)): 
    curr_index=i
    #1. Reading in coordinates from contigList
    chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2 = read_line(curated_contigDict, curr_index, i)
    
    #2. Checking contigs are on same chromosome
    if chrom1 != chrom2:# or i == 1: #This is first contig on chromosome, need to process it first
        #continue
        process_first_contig_on_chrom(chrom1,start1,end1,contigID1,Dir1,length1) 
        continue
    #print ('\n#',contigID1, contigID2,'\n',chrom1,start1,end1,contigID1,Dir1,length1,'\n',chrom2,start2,end2,contigID2,Dir2,length2)
    
    #3. Determines the overlap/orientation of the two contigs
    determine_contig_overlap(end1,contigID1,Dir1,length1,end2,contigID2,Dir2,length2)

    #4. BLAT the properly oriented contig ends against one another
    #     and parse the results
    run_blat(wkDir)
    parse_blat(wkDir,overlap,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2)
    if offset > 0:
        offset_BLAT.append([contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2,overlap])
    
    #save contig1 information to dictionary
    posDict[i] = [chrom1,start1,end1,contigID1,Dir1,length1,overlap,offset,contigID2]


    #print('###################\n')
    #if i > 2:
    #    break
        



Overlap =  4849
Coordinates to extract for BLAT:  [0, 14849] [3953579, 3968428] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.982
14849 0 4806
14850 10026 14849

BLAT offset =  0

Overlap =  1729
Coordinates to extract for BLAT:  [0, 11729] [0, 11729] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.995
11729 0 1720
11729 0 1725

BLAT offset =  0

Overlap =  2637
Coordinates to extract for BLAT:  [517, 13154] [0, 12637] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.997
12638 10004 12638
12637 0 2633

BLAT offset =  0

Overlap =  25890
Coordinates to extract for BLAT:  [645585, 681475] [603506, 639396] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.993
35891 10079 35891
35891 10087 35890

BLAT offset =  0

Overlap =  21363
Coordinates to extract for BLAT:  [0, 31363] [4332021, 4363384] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.994
31363 0 21380
31364 10063 31364

BLAT offset =  0

Overlap =  6495


In [35]:
"""for i in posDict:
    print (posDict[i][0:9])"""

'for i in posDict:\n    print (posDict[i][0:9])'

In [37]:
def process_first_agp_contig(agp_chrom, start, end, contigID, direction, length, overlap, offset, pairedContig):
    agp_prev_chrom, agp_prev_overlap = agp_chrom, 0
    
    if direction == 'fwd':
        direction = '+' #changes notation of the direction
    if direction == 'rc':
        direction = '-'  #changes notation of the direction
    agp_start, agp_end = 1, length
    contig_start, contig_end = 1, length
    
    info = [agp_prev_chrom,agp_prev_overlap,agp_start, agp_end,contig_start,contig_end,direction]
    return info
########################################################################
def process_next_agp_contig(agp_chrom, direction, length, overlap, offset, pairedContig, agp_start, agp_end):
    agp_prev_chrom, agp_prev_overlap = posDict[i-1][0], int(posDict[i-1][6])

    if agp_prev_overlap < 0:
        agp_prev_overlap = 0
    
    agp_start = agp_end + 1
    agp_end = agp_start + length + agp_prev_overlap
    
    #Determine contig coordinates that align
    if direction == 'fwd':
        direction = '+' #changes notation of the direction
        contig_start = 1 + agp_prev_overlap
        contig_end = length
    if direction == 'rc':
        direction = '-'  #changes notation of the direction
        contig_start = 1
        contig_end = length - agp_prev_overlap
        
    info = [agp_prev_chrom,agp_prev_overlap,agp_start, agp_end,contig_start,contig_end,direction]
    return info       
        
########################################################################  
def process_agp_GAP(agp_chrom, direction, length, overlap, offset, pairedContig, agp_start, agp_end):
    gap_length = 10000
    agp_start = agp_end + 1
    agp_end = agp_start + gap_length
    if 'CTG' in contigID:
        gap_type = 'contig'
    else:#To change once we add more types of contigs in here
        gap_type = 'OTHER'
    agp_prev_overlap = 0
    
    info = [agp_chrom,agp_start,agp_end,gap_type,gap_length,agp_prev_overlap]
    return info  

########################################################################  
      


Now putting the information in the curated contig set (contigs constituting the golden path) into the coordinates of Zoey, and adding gap positions, if present. 

In [38]:
def write_AGP_header(agpFile): #alter for each assembly
    agpFile.write('##agp-version 1.0\n# ORGANISM: Canis lupus familiaris\n# TAX_ID: 9615\n')
    agpFile.write('# ASSEMBLY NAME: Zoey_v1\n# ASSEMBLY DATE: 19-April-2017\n')
    agpFile.write('# GENOME CENTER: University of Michigan - J.M. Kidd Lab\n')
    agpFile.write('# DESCRIPTION: AGP specifying the assembly of chromosome 18 from primary PacBio contigs from FALCON assembly\n')

In [40]:
#Defining AGP outfile
agpFile = open(wkDir + 'results/AGP_Zoey_Assembly_v1.txt', 'w')

#Now putting them in the coordinates of the Zoey genome!! :) 
count, agp = 0, [] 
print('agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction')

for i in posDict:
    count += 1
    agp_chrom, start, end, contigID, direction, length, overlap, offset, pairedContig = posDict[i][0:9]
    print('\n#',contigID)
    print(posDict[i])
    print('....')
    #print(count,agp_chrom, start, end, contigID, direction, length, overlap, offset, pairedContig)    
    
    #Processing first contig in dataset -OR- on a new chromosome
    if count == 1 or agp_chrom != agp_prev_chrom: 
        count = 1
        info = process_first_agp_contig(agp_chrom, start, end, contigID, direction, length, overlap, offset, pairedContig)
        agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction = info[0:9]
        agp.append([chrom, agp_start, agp_end, count, 'D', contigID, contig_start, contig_end, direction ])
        print(agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction)
    #Process next contig (not first contig on chrom)
    else:
        if overlap > 0: #if contigs overlap --> NO GAP!
            info = process_next_agp_contig(agp_chrom, direction, length, overlap, offset, pairedContig, agp_start, agp_end)
            agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction = info[0:9]
            agp.append([agp_chrom, agp_start, agp_end, count, 'D', contigID, contig_start, contig_end, direction ])
            print(agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction)
        else: #if contigs do not overlap --> GAP!
            #process contig to the left of the gap
            info = process_next_agp_contig(agp_chrom, direction, length, overlap, offset, pairedContig, agp_start, agp_end)
            agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction = info[0:9]
            agp.append([agp_chrom, agp_start, agp_end, count, 'D', contigID, contig_start, contig_end, direction ])

            #processing gap
            count += 1
            gap_info = process_agp_GAP(agp_chrom, direction, length, overlap, offset, pairedContig, agp_start, agp_end)
            agp_chrom,agp_start,agp_end,gap_type,gap_length,agp_prev_overlap = gap_info[0:7]
            agp.append([agp_chrom,agp_start,agp_end,count,'U',gap_length,gap_type,'no','na'])
    #if count > 15:
    #    break

#Write AGP header lines
write_AGP_header(agpFile)

#Write out to agp File the information for contigs/gaps
for i in range(0,len(agp)):
    agpFile.write("\t".join(map(str,agp[i])) + '\n')
agpFile.close()
logFile.close()

agp_prev_chrom,agp_prev_overlap,agp_start, agp_end, contig_start,contig_end,direction

# CTG-1462
['chr18', 2619, 26992, 'CTG-1462', 'rc', 24712, 4849, 0, 'CTG-0201']
....
chr18 0 1 24712 1 24712 -

# CTG-0201
['chr18', 22143, 3966426, 'CTG-0201', 'rc', 3968428, 1729, 0, 'CTG-1861']
....
chr18 4849 24713 3997990 1 3963579 -

# CTG-1861
['chr18', 3964697, 3978047, 'CTG-1861', 'fwd', 13154, 2637, 0, 'CTG-0743']
....
chr18 1729 3997991 4012874 1730 13154 +

# CTG-0743
['chr18', 3975410, 4660269, 'CTG-0743', 'fwd', 681475, 25890, 0, 'CTG-0770']
....
chr18 2637 4012875 4696987 2638 681475 +

# CTG-0770
['chr18', 4634379, 5281622, 'CTG-0770', 'rc', 639396, 21363, 0, 'CTG-0177']
....
chr18 25890 4696988 5362274 1 613506 -

# CTG-0177
['chr18', 5260259, 9645181, 'CTG-0177', 'rc', 4363384, 6495, 0, 'CTG-0767']
....
chr18 21363 5362275 9747022 1 4342021 -

# CTG-0767
['chr18', 9638686, 10286737, 'CTG-0767', 'rc', 647068, 2059, 0, 'CTG-0615']
....
chr18 6495 9747023 10400586 1 640573 -

# CTG-061

Have generated the golden path file. Below, I'm calculating statistics on the AGP. 

In [76]:
agpFile = open(wkDir + 'results/AGP_Zoey_Assembly_v1.txt', 'r')

contig_count,gap_count,line_count = 0,0,0
prev_Type,start_stop_list,contig_start = '',[],1

for line in agpFile:
    if '#' in line: #skips header
        continue    
    line = line.rstrip().split('\t')
    line_count += 1
    chrom,start,end,num,Type = line[0], int(line[1]),int(line[2]),int(line[3]),line[4]
    if 'D' in Type:
        #this is a contig
        contig_count += 1
        if 'U' in prev_Type:
            contig_start = start
    if 'U' in Type:
        #this is a gap
        gap_count += 1
        #print(start,end)
        if 'D' in prev_Type:
            contig_end = start - 1
            start_stop_list.append([contig_start,contig_end])
    prev_Type = Type
    
print('%i contigs are in the Golden Path for Zoey' % contig_count)
print('%i gaps remain in the Golden Path for Zoey' % gap_count)


continuous_contig_array = []
for i in range(0,len(start_stop_list)):
    start,end = int(start_stop_list[i][0]),int(start_stop_list[i][1])
    length = end-start
    continuous_contig_array.append(length)

Average_continuity = format(np.mean(continuous_contig_array), '.2f') 
print('Average continuity (bp): ', Average_continuity)  



49 contigs are in the Golden Path for Zoey
23 gaps remain in the Golden Path for Zoey
Average continuity (bp):  2360223.48
