In [1]:
#2017-04-25
#A. Pendleton
#This script is used to make an AGP file for Zoey. Can be built upon as new forms
#   of contigs become assembled and incorporated into the AGP.

#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import subprocess

#import genutils
import os
import sys
import numpy as np
import re
import scipy 
import matplotlib.patches as patches
import glob

#This script is used to make an AGP file for Zoey. Can be built upon as new contigs become assembled and incorporated into the available set for assembly.

In [2]:
# Root of the AGP project; the input/, temp/ and results/ paths used by this notebook are built from it.
wkDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/'
print('Current working directory is:\n', wkDir)

# Handle the AGP is written through; opened in 'w' mode (truncating) and not closed in this cell.
agpFile = open('{}results/AGP_Zoey_Assembly_v1.txt'.format(wkDir), 'w')
print('Writing out AGP to file:\n', agpFile)

Current working directory is:
 /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/
Writing out AGP to file:
 <_io.TextIOWrapper name='/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/results/AGP_Zoey_Assembly_v1.txt' mode='w' encoding='UTF-8'>


In [3]:
#####READING IN PRIMARY CONTIG COORDINATES FROM MUMMER ALIGNMENT#######
#Primary contig alignments (no canu processing)
# Each data row is tab-separated: contigID, length, direction, chromosome, start, end.
# Kept rows are stored in contigDict under a running 1-based index as
#   [chrom, start, end, contigID, Dir, length]   (length is left as the string read from the file)
# and echoed to a BED file with the start shifted down by one.
contigDict,index = {}, 0

# Context managers close both handles even when a row fails to parse part-way through.
with open(wkDir + 'input/primary.2017-04-10.txt', 'r') as primaryContigCoord, \
     open(wkDir + 'input/primary.2017-04-10.bed', 'w') as primaryContigCoordBed:
    for line in primaryContigCoord:
        if line.startswith("#"): #skips header line
            continue
        if not line.strip(): #skips blank rows, which would otherwise raise IndexError below
            continue
        line = line.rstrip().split('\t')
        contigID, length, Dir, chrom, start, end = line[0],line[1],line[2],line[3],int(line[4]),int(line[5])
        if 'chr18' not in chrom: #FILTERING ONLY FOR CHROM 18 FOR NOW!!! REMOVE LATER
            continue
        index += 1
        contigDict[index] = [chrom,start,end,contigID,Dir,length]
        primaryContigCoordBed.write('%s\t%i\t%i\t%s\n' % (chrom,start-1,end,contigID))
        if index > 10: #FOR TESTING--- REMOVE LATER
            break

print ('Identified coordinates for %i primary contigs' % len(contigDict))

Identified coordinates for 11 primary contigs


Step 1. Read in coordinates from alignment file based on Mummer of primary contigs (per Jeff)
Step 2. BLAT each contig against the contig immediately adjacent (proximal) to it

In [4]:
def read_line(Dict,curr_index,i):
    """Return the coordinate fields of two contig records as one flat 12-tuple.

    Contig 1 is the record stored under ``curr_index``; contig 2 is the record
    stored under ``i + 1``.  Each record is laid out as
    [chrom, start, end, contigID, Dir, length]; start, end and length are
    converted with int(), the other fields are passed through untouched.

    Returns:
        (chrom1, start1, end1, contigID1, Dir1, length1,
         chrom2, start2, end2, contigID2, Dir2, length2)
    """
    def _as_fields(record):
        # Same field order as the stored record, with the numeric columns coerced to int
        return (record[0], int(record[1]), int(record[2]),
                record[3], record[4], int(record[5]))

    first = _as_fields(Dict[curr_index])   #Contig 1
    second = _as_fields(Dict[i+1])         #Contig 2
    return first + second
###################################################################################################
def read_last_line(Dict, i=None):
    """Return the fields of records ``i - 1`` (contig 1) and ``i`` (contig 2) as a flat 12-tuple.

    Args:
        Dict: mapping of index -> [chrom, start, end, contigID, Dir, length].
        i: index of contig 2.  Optional for backward compatibility: when it is
            omitted the function falls back to a module-level variable named
            ``i`` (the behaviour of the original one-argument version, which
            relied on the notebook's loop variable).  Pass it explicitly to
            avoid depending on kernel state.

    Returns:
        (chrom1, start1, end1, contigID1, Dir1, length1,
         chrom2, start2, end2, contigID2, Dir2, length2)
        with start, end and length converted to int.

    Raises:
        NameError: ``i`` was not given and no module-level ``i`` exists.
        KeyError: ``i - 1`` or ``i`` is not a key of ``Dict``.
    """
    if i is None:
        try:
            i = globals()['i']
        except KeyError:
            # Same exception type the implicit global lookup used to produce
            raise NameError("name 'i' is not defined")

    def _as_fields(record):
        return (record[0], int(record[1]), int(record[2]),
                record[3], record[4], int(record[5]))

    #Contig 1 is the record just before i, contig 2 is record i itself
    return _as_fields(Dict[i-1]) + _as_fields(Dict[i])
###################################################################################################
def process_last_contig(contigDict,curr_index,i):
    """Decide whether contig ``i`` is kept, judged against the contig before it.

    The pair is fetched through read_last_line(contigDict).  Nothing is stored
    when the two contigs share a start or an end coordinate.  When contig 2
    ends after contig 1 it is written to the module-level curated_contigDict
    under key ``i``.  Always returns None.
    """
    fields = read_last_line(contigDict)
    (chrom1, start1, end1, contigID1, Dir1, length1) = fields[:6]
    (chrom2, start2, end2, contigID2, Dir2, length2) = fields[6:]
    print ('\n#%s (%i) -- %s (%i)' % (contigID1,i-1,contigID2,i))

    shares_boundary = (start1 == start2) or (end1 == end2)
    if shares_boundary:
        return None
    if not end2 > end1:
        return None

    # Rebinds the local name only; the caller's curr_index is not changed by this
    curr_index = curr_index + 1
    curated_contigDict[i] = [chrom2,start2,end2,contigID2,Dir2,length2]
    print('PASS - Last contig on chromosome overlap or is spaced correctly')
    return None
###################################################################################################
def process_first_contig(contigDict,curr_index,i):
    """Store contig 1 unconditionally and hand back the advanced index.

    The contig-1 fields (chrom1, start1, end1, contigID1, Dir1, length1) are
    not parameters: they are read from module scope, as is curated_contigDict,
    which receives the record under key ``i``.

    Returns:
        curr_index + 1
    """
    next_index = curr_index + 1
    #The first contig of the dataset is always kept
    first_record = [chrom1,start1,end1,contigID1,Dir1,length1]
    curated_contigDict[i] = first_record
    print('PASS - First call in dataset')
    return next_index
###################################################################################################
def process_same_starts(contigDict,curr_index,i):
    """Handle a contig pair whose start coordinates are equal.

    start1/end1/start2/end2 and the contig-2 fields are read from module scope.
    If contig 1 is the longer one the pair is only reported.  If contig 2 is
    the longer one it overwrites key ``i`` of the module-level
    curated_contigDict and the pair is reported.  Equal ends (or unequal
    starts) do nothing.  Always returns None.
    """
    def _pair_text():
        # The two raw records, one per line, appended to the report message
        return '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1])

    if start1 != start2:
        return None
    if end1 > end2:
        print('FAIL -- Share start, contig1 longer' + _pair_text())
    elif end1 < end2: #overwrite previous position if contig 2 is longer than contig 1
        curated_contigDict[i] = [chrom2,start2,end2,contigID2,Dir2,length2]
        print('FAIL -- Share start, contig2 longer' + _pair_text())
    return None
###################################################################################################
def process_same_ends(contigDict,curr_index,i):
    """Handle a contig pair whose end coordinates are equal.

    start1/start2/end1/end2 are read from module scope.  When contig 1 starts
    first, the pair is reported and nothing else happens (contig 1 is the one
    kept).  When contig 2 starts first the input is taken to be unsorted and
    the process exits with status 1.  Any other case returns None silently.
    """
    if end1 != end2:
        return None
    if start1 < start2:
        pair_text = '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1])
        print('Ends same, kept contig 1 only' + pair_text)
        return None
    if start2 < start1:
        print('ERROR - is this contig list sorted??!!')
        sys.exit(1)

In [10]:
# Curation pass: walk the raw contig table pairwise (contig 1 = record curr_index,
# contig 2 = record i+1) and copy the contigs worth keeping into curated_contigDict.
# The process_* helpers read chrom1/start1/... and curated_contigDict from module scope,
# so the names assigned here must stay exactly as they are.
curated_contigDict, curr_index = {}, 1
chrom1,chrom2 = '0', '0' #initializing values


for key in contigDict:
    i = int(key)
    #1. If last contig in dataset
    if i == len(contigDict):
        continue

    #2. If contigs are on different chromosomes:
    # NOTE(review): chrom1/chrom2 are only refreshed by read_line below, which is skipped
    #   whenever this branch fires -- once they differ, every later iteration lands here. Confirm.
    if chrom1 != chrom2:
        process_last_contig(contigDict,curr_index,i)
        continue

    chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2 = read_line(contigDict,curr_index,i)
    print ('\n#%s (%i) -- %s (%i)' % (contigID1,curr_index,contigID2,i+1))

    #3. Automatically saves first contig in dataset
    if curr_index == 1 and len(curated_contigDict) == 0:
        curr_index = process_first_contig(contigDict,curr_index,i)
        continue

    #4. If contigs #1 and #2 have same start AND end, keep contig #1
    #   (must be tested before the same-start check below, which would otherwise
    #    swallow this case and make this branch unreachable)
    if start1 == start2 and end1 == end2:
        print('FAIL -- Same start and end coordinate' + '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        continue #because contig1 is already in the dictionary

    #5. If contigs have the same start site, choose the longest contig
    if start1 == start2:
        process_same_starts(contigDict,curr_index,i)
        continue

    #6. If contigs have same end coordinate, choose longest contig (i.e. contig #1 if sorted)
    if end1 == end2:
        process_same_ends(contigDict,curr_index,i)
        continue

    #7. If contig #2 is fully within contig #1 -- continue
    if start1 < start2 and end1 > end2: # contig2 is fully within contig1
        curated_contigDict[curr_index] = [chrom1,start1,end1,contigID1,Dir1,length1]
        print('FAIL -- Contig2 fully within contig1' + '\nKeeping contig: ' + contigID1 +  '\n' + str(contigDict[i]) + '\n' + str(contigDict[i+1]))
        continue
    if start1 > start2 and end1 < end2:
        print('ERROR - is this contig list sorted??!!')
        sys.exit(1)

    #8. If contig passes all these - then automatically saves
    # contig 2 becomes the reference contig (contig 1) for the following comparisons
    curr_index=i+1
    curated_contigDict[i] = [chrom2,start2,end2,contigID2,Dir2,length2]
    #curated_contigDict[i] = [chrom1,start1,end1,contigID1,Dir1,length1]
    print('PASS - contigs overlap or are spaced correctly')

print ('Identified CURATED coordinates for %i primary contigs' % len(curated_contigDict))
   
#Re-naming dictionary keys
# Renumber the curated contigs 1..N in order of their original key.
# A fresh dict is built instead of popping/inserting on curated_contigDict while
# looping over it: changing a dict's keys during iteration can skip entries or
# visit the re-inserted ones again.
count=0
renumbered_contigDict = {}
for keys in sorted(curated_contigDict):
    count+=1
    renumbered_contigDict[count]=curated_contigDict[keys]
curated_contigDict = renumbered_contigDict



#CTG-1462 (1) -- CTG-0201 (2)
PASS - First call in dataset

#CTG-0201 (2) -- CTG-2401 (3)
FAIL -- Contig2 fully within contig1
Keeping contig: CTG-0201
['chr18', 22143, 3966426, 'CTG-0201', 'rc', '3968428']
['chr18', 28961, 33381, 'CTG-2401', 'fwd', '4387']

#CTG-0201 (2) -- CTG-1986 (4)
FAIL -- Contig2 fully within contig1
Keeping contig: CTG-0201
['chr18', 28961, 33381, 'CTG-2401', 'fwd', '4387']
['chr18', 1781533, 1791763, 'CTG-1986', 'rc', '10221']

#CTG-0201 (2) -- CTG-1861 (5)
PASS - contigs overlap or are spaced correctly

#CTG-1861 (5) -- CTG-0743 (6)
PASS - contigs overlap or are spaced correctly

#CTG-0743 (6) -- CTG-0770 (7)
PASS - contigs overlap or are spaced correctly

#CTG-0770 (7) -- CTG-0177 (8)
PASS - contigs overlap or are spaced correctly

#CTG-0177 (8) -- CTG-0767 (9)
PASS - contigs overlap or are spaced correctly

#CTG-0767 (9) -- CTG-0615 (10)
PASS - contigs overlap or are spaced correctly

#CTG-0615 (10) -- CTG-1278 (11)
PASS - contigs overlap or are spaced cor

In [11]:
# Dump the raw and the curated contig tables one record per line so the two can be compared by eye.
# The loop variable i is bound at notebook (module) scope and keeps the last key after the loops finish;
# read_last_line can pick up a module-level i, so re-running cells out of order after this one changes what it sees.
print('Non-curated:')
for i in contigDict:
    print(i,contigDict[i])
print( '\nCurated:')
for i in curated_contigDict:
    print(i, curated_contigDict[i])

Non-curated:
1 ['chr18', 2619, 26992, 'CTG-1462', 'rc', '24712']
2 ['chr18', 22143, 3966426, 'CTG-0201', 'rc', '3968428']
3 ['chr18', 28961, 33381, 'CTG-2401', 'fwd', '4387']
4 ['chr18', 1781533, 1791763, 'CTG-1986', 'rc', '10221']
5 ['chr18', 3964697, 3978047, 'CTG-1861', 'fwd', '13154']
6 ['chr18', 3975410, 4660269, 'CTG-0743', 'fwd', '681475']
7 ['chr18', 4634379, 5281622, 'CTG-0770', 'rc', '639396']
8 ['chr18', 5260259, 9645181, 'CTG-0177', 'rc', '4363384']
9 ['chr18', 9638686, 10286737, 'CTG-0767', 'rc', '647068']
10 ['chr18', 10284678, 11357644, 'CTG-0615', 'rc', '1078170']
11 ['chr18', 11320457, 11367174, 'CTG-1278', 'fwd', '37673']

Curated:
1 ['chr18', 2619, 26992, 'CTG-1462', 'rc', 24712]
2 ['chr18', 22143, 3966426, 'CTG-0201', 'rc', 3968428]
3 ['chr18', 3964697, 3978047, 'CTG-1861', 'fwd', 13154]
4 ['chr18', 3975410, 4660269, 'CTG-0743', 'fwd', 681475]
5 ['chr18', 4634379, 5281622, 'CTG-0770', 'rc', 639396]
6 ['chr18', 5260259, 9645181, 'CTG-0177', 'rc', 4363384]
7 ['chr18',

In [118]:
def process_first_contig_on_chrom(chrom1,start1,end1,contigID1,Dir1,length1):
    """Pair the first contig on a chromosome with a canFam3 window around its start.

    A pseudo "contig 2" named 'canFam3' is defined as the forward-strand window
    from start1 - 10000 (floored at 0) to start1 + 10000 on the same chromosome.
    That pair is then passed through determine_contig_overlap,
    find_overlapping_coordinates and extract_fasta.

    The window size is stored in its own ``length2``; contig 1's ``length1`` is
    passed on unchanged, and the size is measured after the start has been
    floored at 0 so it matches the window that is actually described.

    Args:
        chrom1, start1, end1, contigID1, Dir1, length1: fields of the contig.

    Returns:
        None.  Results are left in the module-level ``overlap``,
        ``extract_coord1`` and ``extract_coord2`` set by the helpers.
    """
    chrom2 = chrom1
    start2 = max(start1 - 10000, 0)
    end2 = start1 + 10000
    contigID2 = 'canFam3'
    Dir2 = 'fwd'
    length2 = end2 - start2

    #Determine overlap
    # NOTE(review): as written in this notebook, determine_contig_overlap computes the overlap
    #   from a module-level start2, not from the start2 defined above -- confirm that is intended.
    determine_contig_overlap(end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2)

    #Find coordinates to extract
    # NOTE(review): determine_contig_overlap already calls this itself when overlap > 0,
    #   so the coordinates (and the FASTA extraction it triggers) may be computed twice -- confirm.
    find_overlapping_coordinates(overlap,end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2)

    #Extract FASTA
    fastaRoot = '/home/ampend/links/kidd-lab/genomes/canFam3.1/'
    extract_fasta(fastaRoot,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2)
###################################################################################################
def determine_contig_overlap(end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2):
    """Measure how far contig 1 reaches into contig 2 and dispatch on the result.

    The overlap is ``end1 - start2``.  ``start2`` is not a parameter: it is
    read from module scope.  The value is published through the module-level
    ``overlap`` and printed.

    * overlap > 0  -> find_overlapping_coordinates(...) is called for the pair
    * overlap == 0 -> only a message is printed (adjacent contigs)
    * overlap < 0  -> only a message is printed (gap between contigs)

    Returns None.
    """
    global overlap
    #How much do the contigs overlap?
    overlap = end1 - start2
    print('\nOverlap = ', overlap)

    if overlap > 0:
        #OVERLAPPING CONTIGS
        find_overlapping_coordinates(overlap,end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2)
    elif overlap == 0:
        #DIRECTLY ADJACENT CONTIGS
        print('Contigs are directly adjacent to one another')
        #WRITE FUNCTION FOR THESE
    elif overlap < 0:
        #CONTIGS WITH GAP BETWEEN THEM
        print('Gap between contigs')
        #WRITE FUNCTION FOR THESE
###################################################################################################
def find_overlapping_coordinates(overlap,end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2, flank=10000):
    """Pick the end of each contig to extract for BLAT and trigger the extraction.

    For contig 1 the region is the last ``overlap + flank`` bases when Dir1
    contains 'fwd', otherwise the first ``overlap + flank`` bases.  For contig 2
    it is the mirror image: the first ``overlap + flank`` bases when Dir2
    contains 'fwd', otherwise the last ones.

    Both the start AND the end of each region are then confined to
    [0, contig length]; previously only the start was checked, so an end
    coordinate past the end of a short contig was passed on unchanged.

    The regions are published through the module-level ``extract_coord1`` and
    ``extract_coord2`` ([start, end] lists), printed, and handed to
    extract_fasta together with the contig FASTA root directory.

    Args:
        overlap: overlap between the two contigs, as computed by the caller.
        end1, end2: accepted for signature compatibility; not used here.
        contigID1, Dir1, length1: ID, orientation and length of contig 1.
        contigID2, Dir2, length2: ID, orientation and length of contig 2.
        flank: extra bases taken beyond the overlap (default 10000, the
            previously hard-coded value).

    Returns:
        None.
    """
    global extract_coord1
    global extract_coord2
    ###Determine coordinates to extract for BLAT
    #Contig #1
    if 'fwd' in Dir1:
        extract_coord1 = [length1 - overlap - flank, length1]
    else:
        extract_coord1 = [0, overlap + flank]
    #Contig #2
    if 'fwd' in Dir2:
        extract_coord2 = [0, overlap + flank]
    else:
        extract_coord2 = [length2 - overlap - flank, length2]

    #safety check: a coordinate beyond the contig becomes the contig length,
    #   a negative coordinate becomes 0 (applied to start and end alike)
    extract_coord1 = [max(min(coord, length1), 0) for coord in extract_coord1]
    extract_coord2 = [max(min(coord, length2), 0) for coord in extract_coord2]

    # Reported after the safety check so the output shows what is really extracted
    print ('Coordinates to extract for BLAT: ',extract_coord1,extract_coord2,'\n')

    if overlap < 0: #### SEND TO DIFFERENT FUNCTION LATER -- FOLLOW OVERLAPPING CONTIGS FOR NOW
        print('Contigs do not overlap')

    #Where to find fasta of contigs
    fastaRoot = '/home/ampend/links/kidd-lab/jmkidd-projects/zoey/contig-assignment/kmer-matches/eval1/'
    extract_fasta(fastaRoot,contigID1,Dir1,extract_coord1, contigID2,Dir2,extract_coord2)
    
###################################################################################################
def extract_fasta(fastaRoot,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2):
    """Cut the requested region out of each contig's FASTA with ``samtools faidx``.

    Each contig is looked up at ``<fastaRoot><contigID>/<contigID>.fa`` and the
    region ``<contigID>:<coord[0]>-<coord[1]>`` is redirected into
    ``<wkDir>temp/contig1.fa`` and ``<wkDir>temp/contig2.fa`` respectively
    (``wkDir`` is read from module scope).  The commands run through the shell
    and their exit status is not inspected.  Dir1 and Dir2 are not used.
    """
    # (contig ID, [start, end], command template) -- the templates differ only in
    # spacing before the redirect and in the output file name
    requests = (
        (contigID1, extract_coord1, 'samtools faidx %s %s:%i-%i  > %stemp/contig1.fa'),
        (contigID2, extract_coord2, 'samtools faidx %s %s:%i-%i > %stemp/contig2.fa'),
    )
    for contig_id, region, template in requests:
        fasta_path = fastaRoot + contig_id + '/' + contig_id + '.fa'
        cmd = template % (fasta_path,contig_id,region[0],region[1],wkDir)
        #print (cmd)
        subprocess.call(cmd,shell=True)
    
###################################################################################################
def run_blat(wkDir):
    """Run ``blat`` on the two extracted contig ends.

    ``<wkDir>temp/contig1.fa`` and ``<wkDir>temp/contig2.fa`` are given to blat
    in that order and the result is written to ``<wkDir>temp/temp.blat``.  The
    command goes through the shell; its exit status is not inspected.
    """
    blatcmd = 'blat {0}temp/contig1.fa {0}temp/contig2.fa {0}temp/temp.blat'.format(wkDir)
    #print(blatcmd)
    subprocess.call(blatcmd,shell=True)
    
    
###################################################################################################
def parse_blat(wkDir,overlap,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2):
    """Scan ``<wkDir>temp/temp.blat`` for hits that cover the expected overlap.

    The first five lines of the file are skipped as headers.  For every
    following row the first column is taken as the score and the ninth as the
    strand.  Rows whose score is below 75% of ``overlap`` are ignored.  The
    first qualifying row that is not on the '+' strand aborts the scan with an
    error message.  For each qualifying '+' row the contig-1 values (columns
    15-17) and contig-2 values (columns 11-13) are printed and passed to
    calculate_offset.

    Changes from the earlier version: the file is closed on every exit path,
    a non-positive ``overlap`` returns immediately (0 used to raise
    ZeroDivisionError; a negative value can never reach the 75% threshold),
    and rows with fewer than 17 columns (e.g. a trailing blank line) are skipped.

    Args:
        wkDir: working directory prefix (must end with a path separator).
        overlap: expected overlap between the two contigs.
        contigID1, Dir1, extract_coord1, contigID2, Dir2, extract_coord2:
            accepted for signature compatibility; not used here.

    Returns:
        None.

    NOTE(review): calculate_offset is only reached for qualifying '+' rows, so
    whatever offset the caller reads afterwards may date from an earlier call.
    """
    if overlap <= 0:
        return

    with open('%stemp/temp.blat' % (wkDir),'r') as inFile:
        for lineCount, rawLine in enumerate(inFile, start=1):
            if lineCount <= 5: #skips the BLAT results headers
                continue
            line = rawLine.rstrip().split('\t')
            if len(line) < 17: #not a complete result row
                continue
            score,strand = int(line[0]),line[8]
            percent_of_overlap = float(score)/overlap
            if percent_of_overlap < float(0.75):
                continue
            print('\n#Parsing BLAT results','Percent of overlap matched in BLAT: ',format(percent_of_overlap, '.3f'))
            #CHECKS TO MAKE SURE WE DID THIS RIGHT - correct end vs correct end
            if '+' not in strand:
                print('ERROR: Top hit not in proper orientation.... SKIPPING -- PLEASE CHECK')
                return
            #parse contig #1 (left contig)
            blat_length1, blat_start1, blat_end1 = int(line[14]),int(line[15]),int(line[16])
            print(blat_length1, blat_start1, blat_end1)
            #parse contig #2 (right contig)
            blat_length2, blat_start2, blat_end2 = int(line[10]),int(line[11]),int(line[12])
            print(blat_length2, blat_start2, blat_end2)
            #Calculate offset  - to compensate for non-clean alignments (extension of left contig that does not overlap with adjacent contig)
            calculate_offset(overlap,blat_length1, blat_start1, blat_end1,blat_length2, blat_start2, blat_end2)

###################################################################################################
def calculate_offset(overlap,blat_length1, blat_start1, blat_end1,blat_length2, blat_start2, blat_end2):
    """Publish the BLAT offset through the module-level ``offset`` and print it.

    The offset is ``blat_start1 - 0``, i.e. how far the hit on contig 1 begins
    from position 0 of the extracted piece.  The remaining arguments are
    accepted but not used.  Returns None.
    """
    global offset
    shift = blat_start1 - 0
    offset = shift
    print('\nBLAT offset = ', shift)

###################################################################################################
    

In [120]:
# Driver loop: for each pair of neighbouring curated contigs, measure the overlap, BLAT the
# facing ends against each other and record the result for contig 1 in posDict.
# The helpers communicate through module-level names: determine_contig_overlap sets `overlap`,
# find_overlapping_coordinates sets `extract_coord1`/`extract_coord2`, calculate_offset sets `offset`.
#write_AGP_header(agpFile)
posDict = {}

# NOTE(review): range(1, len-1) stops at i = len-2, so the pair (len-1, len) is never visited -- confirm intended.
for i in range(1,len(curated_contigDict)-1): 
    curr_index=i
    #1. Reading in coordinates from contigList
    #   (contig 1 = record i, contig 2 = record i+1)
    chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2 = read_line(curated_contigDict, curr_index, i)
    #2. Checking contigs are on same chromosome
    if chrom1 != chrom2:# or i == 1: #This is first contig on chromosome, need to process it first
        #continue
        process_first_contig_on_chrom(chrom1,start1,end1,contigID1,Dir1,length1) 
        continue
    print ('\n#',contigID1, contigID2,'\n',chrom1,start1,end1,contigID1,Dir1,length1,'\n',chrom2,start2,end2,contigID2,Dir2,length2)
    
    #3. Determines the overlap/orientation of the two contigs
    #   (also extracts the contig ends to temp/ when the overlap is positive)
    determine_contig_overlap(end1,contigID1,Dir1,length1,end2,contigID2,Dir2,length2)
    
    #4. BLAT the properly oriented contig ends against one another
    #     and parse the results
    run_blat(wkDir)
    parse_blat(wkDir,overlap,contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2)
    
    #save contig1 information to dictionary
    # NOTE(review): `offset` is only reassigned when parse_blat reaches calculate_offset; if no hit
    #   qualifies, the value left by an earlier iteration is stored here -- confirm this is acceptable.
    posDict[i] = [chrom1,start1,end1,contigID1,Dir1,overlap,offset,contigID2]


    # Stops after the second pair -- looks like a testing limit; TODO confirm and remove for full runs
    if i > 1:
        break
        



# CTG-1462 CTG-0201 
 chr18 2619 26992 CTG-1462 rc 24712 
 chr18 22143 3966426 CTG-0201 rc 3968428

Overlap =  4849
Coordinates to extract for BLAT:  [0, 14849] [3953579, 3968428] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.982
14849 0 4806
14850 10026 14849

BLAT offset =  0

# CTG-0201 CTG-1861 
 chr18 22143 3966426 CTG-0201 rc 3968428 
 chr18 3964697 3978047 CTG-1861 fwd 13154

Overlap =  1729
Coordinates to extract for BLAT:  [0, 11729] [0, 11729] 


#Parsing BLAT results Percent of overlap matched in BLAT:  0.995
ERROR: Top hit not in proper orientation.... SKIPPING -- PLEASE CHECK


In [117]:
for i in posDict:
    print (posDict[i])

['chr18', 2619, 26992, 'CTG-1462', 'rc', 4849, 0, 'CTG-0201']
['chr18', 22143, 3966426, 'CTG-0201', 'rc', 1729, 0, 'CTG-1861']
['chr18', 3964697, 3978047, 'CTG-1861', 'fwd', 2637, 10004, 'CTG-0743']
['chr18', 3975410, 4660269, 'CTG-0743', 'fwd', 25890, 10004, 'CTG-0770']
['chr18', 4634379, 5281622, 'CTG-0770', 'rc', 21363, 0, 'CTG-0177']
['chr18', 5260259, 9645181, 'CTG-0177', 'rc', 6495, 0, 'CTG-0767']
['chr18', 9638686, 10286737, 'CTG-0767', 'rc', 2059, 0, 'CTG-0615']


In [None]:
def write_AGP_header(agpFile): #alter for each assembly
    """Write the fixed AGP comment header for this assembly to ``agpFile``.

    ``agpFile`` only needs a ``write`` method.  The header is emitted in four
    consecutive writes: version/organism/taxonomy, assembly name/date, genome
    center, and description.  Returns None.
    """
    header_chunks = (
        '##agp-version 1.0\n# ORGANISM: Canis lupus familiaris\n# TAX_ID: 9615\n',
        '# ASSEMBLY NAME: Zoey_v1\n# ASSEMBLY DATE: 19-April-2017\n',
        '# GENOME CENTER: U. Michigan - Kidd Lab\n',
        '# DESCRIPTION: AGP specifying the assembly of chromosome 18 from primary PacBio contigs from FALCON assembly\n',
    )
    for chunk in header_chunks:
        agpFile.write(chunk)