In [1]:
#2017-04-19
#A. Pendleton
#This script is used to make an AGP file for Zoey. Can be built upon as new forms
#   of contigs become assembled and incorporated into the AGP.

#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import subprocess

#import genutils
import os
import sys
import numpy as np
import re
import scipy 
import matplotlib.patches as patches
import glob

#This script is used to make an AGP file for Zoey. Can be built upon as new contigs become assembled and incorporated into the available set for assembly.

In [2]:
# Project root for the AGP build; the paths in this notebook are formed by
# appending to this string, so it must keep its trailing slash.
wkDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/'
print('Current working directory is:\n', wkDir)

# The AGP output handle is opened in write mode (truncating any previous file)
# and is left open on purpose: it is handed to write_AGP_header() later on.
agpPath = os.path.join(wkDir, 'results/AGP_Zoey_Assembly_v1.txt')
agpFile = open(agpPath, 'w')
print('Writing out AGP to file:\n', agpFile)

Current working directory is:
 /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/
Writing out AGP to file:
 <_io.TextIOWrapper name='/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/results/AGP_Zoey_Assembly_v1.txt' mode='w' encoding='UTF-8'>


In [3]:
#Primary contig alignments (no canu processing)
contigList, index = [], 0
# contigDict maps a 1-based running index (order of appearance in the input
# file) to [chrom, start, end, contigID, Dir, length]; all values are kept as
# the raw strings read from the file.
contigDict = {}

# The context manager closes the handle even if a malformed line raises
# part-way through the loop.
with open(wkDir + 'input/primary.2017-04-10.txt', 'r') as primaryContigCoord:
    for line in primaryContigCoord:
        if line.startswith("#"): #skips header line
            continue
        fields = line.rstrip().split('\t')
        if len(fields) < 6: #skips blank/truncated lines that cannot be unpacked
            continue
        contigID, length, Dir, chrom, start, end = fields[0:6]
        if 'chr18' not in chrom: #FILTERING ONLY FOR CHROM 18 FOR NOW!!! REMOVE LATER
            continue
        index += 1
        contigDict[index] = [chrom,start,end,contigID,Dir,length]
        if index>10: #development cap: stop once 11 contigs have been stored
            break

print ('Identified coordinates for %i primary contigs' % len(contigDict))


Identified coordinates for 11 primary contigs


Step 1. Read in coordinates from alignment file based on Mummer of primary contigs (per Jeff)
Step 2. BLAT each contig against the contig immediately adjacent to it (the next contig along the chromosome)

In [28]:
def write_AGP_header(agpFile): #alter for each assembly
    """Write the AGP comment header for the Zoey_v1 assembly.

    Parameters
    ----------
    agpFile : writable text file-like object
        Destination for the header; only its ``write`` method is used.

    The header text is hard-coded (organism, tax ID, assembly name/date,
    genome center, description) and must be edited by hand for each new
    assembly. Nothing is returned.
    """
    header_chunks = (
        '##agp-version 1.0\n# ORGANISM: Canis lupus familiaris\n# TAX_ID: 9615\n',
        '# ASSEMBLY NAME: Zoey_v1\n# ASSEMBLY DATE: 19-April-2017\n',
        '# GENOME CENTER: U. Michigan - Kidd Lab\n',
        '# DESCRIPTION: AGP specifying the assembly of chromosome 18 from primary PacBio contigs from FALCON assembly\n',
    )
    # Emit the chunks in order, one write per chunk.
    for chunk in header_chunks:
        agpFile.write(chunk)
def read_line(index=None, contigs=None):
    """Return the parsed records of one contig and of the contig after it.

    Parameters
    ----------
    index : int, optional
        Key of the first contig in ``contigs``; the second contig is taken
        from ``index + 1``. When omitted, the notebook-level loop variable
        ``i`` is used, so existing ``read_line()`` calls keep working.
    contigs : dict, optional
        Mapping of key -> [chrom, start, end, contigID, Dir, length].
        When omitted, the notebook-level ``contigDict`` is used.

    Returns
    -------
    tuple
        (chrom1, start1, end1, contigID1, Dir1, length1,
         chrom2, start2, end2, contigID2, Dir2, length2)
        with start, end and length converted to int.

    Raises
    ------
    KeyError
        If ``index`` or ``index + 1`` is not a key of ``contigs``.
    NameError
        If an argument is omitted and the matching notebook global is unset.
    """
    # Passing the inputs explicitly avoids depending on hidden notebook state;
    # the global fallback only exists for the legacy zero-argument call.
    if index is None:
        index = i
    if contigs is None:
        contigs = contigDict

    def _parse(record):
        # Convert the three numeric fields, leave the identifiers as strings.
        chrom, start, end, contigID, Dir, length = record[0:6]
        return chrom, int(start), int(end), contigID, Dir, int(length)

    return _parse(contigs[index]) + _parse(contigs[index + 1])

In [75]:
def determine_contig_overlap(end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2, start2=None):
    """Compute how far contig 1 runs past the start of contig 2.

    The overlap is ``end1 - start2``. When it is positive, the pair is passed
    on to find_overlapping_coordinates() for FASTA extraction and BLAT.

    Parameters
    ----------
    end1, end2 : int
        End coordinates of contig 1 and contig 2.
    contigID1, contigID2 : str
        Contig identifiers (forwarded unchanged).
    Dir1, Dir2 : str
        Contig orientations (forwarded unchanged).
    length1, length2 : int
        Contig lengths (forwarded unchanged).
    start2 : int, optional
        Start coordinate of contig 2. The original signature did not take
        this value and silently read a notebook-level ``start2``; that
        fallback is kept so existing 8-argument calls still work.

    Returns
    -------
    int
        The computed overlap (negative or zero when the contigs do not overlap).

    Raises
    ------
    NameError
        If ``start2`` is omitted and no notebook-level ``start2`` exists.
    """
    if start2 is None:
        # Legacy path: pick up the value left in the notebook namespace.
        if 'start2' not in globals():
            raise NameError("name 'start2' is not defined; pass start2=... explicitly")
        start2 = globals()['start2']

    #How much do the contigs overlap?
    overlap = end1 - start2
    print('Overlap = ', overlap)
    if overlap > 0:
        find_overlapping_coordinates(overlap,end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2)
    return overlap

In [83]:
def find_overlapping_coordinates(overlap,end1, contigID1, Dir1, length1, end2, contigID2, Dir2, length2):
    """Pick the contig-local windows to extract for BLAT and extract them.

    For each contig a window of ``overlap + 10000`` bases is taken from the
    contig end that faces its neighbour:

    * contig 1: its tail when ``'fwd'`` is in ``Dir1``, otherwise its head;
    * contig 2: its head when ``'fwd'`` is in ``Dir2``, otherwise its tail.

    Both coordinates of each window are then clamped into ``[1, length]``
    before being passed to extract_fasta(), which uses them as a
    ``start-end`` samtools region.

    Parameters
    ----------
    overlap : int
        Overlap between the two contigs, as computed by the caller.
    end1, end2 : int
        Accepted for interface compatibility; not used here.
    contigID1, contigID2 : str
        Contig identifiers, forwarded to extract_fasta().
    Dir1, Dir2 : str
        Orientation strings; only the presence of ``'fwd'`` is tested.
    length1, length2 : int
        Contig lengths, used to place and to clamp the windows.
    """
    flank = 10000  # extra bases extracted beyond the estimated overlap
    window = overlap + flank

    #determine coordinates to extract for BLAT
    if 'fwd' in Dir1:
        extract_coord1 = [length1 - window, length1]
    else:
        extract_coord1 = [1, 1 + window]
    if 'fwd' in Dir2:
        extract_coord2 = [1, 1 + window]
    else:
        extract_coord2 = [length2 - window, length2]

    #safety check
    # Clamp BOTH ends of each window: a contig shorter than the window would
    # otherwise yield a start below 1 (tail window) or an end past the contig
    # length (head window). The previous check only looked at the start
    # coordinate and only at the upper bound.
    extract_coord1 = [min(max(pos, 1), length1) for pos in extract_coord1]
    extract_coord2 = [min(max(pos, 1), length2) for pos in extract_coord2]
    print (extract_coord1,extract_coord2)

    if overlap < 0: #### SEND TO DIFFERENT FUNCTION LATER -- FOLLOW OVERLAPPING CONTIGS FOR NOW
        print('Contigs do not overlap')
    extract_fasta(contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2)
###################################################################################################
def extract_fasta(contigID1,Dir1,extract_coord1,contigID2,Dir2,extract_coord2):
    """Extract one region from each contig with ``samtools faidx``, then run BLAT.

    The regions are written to ``<wkDir>temp/contig1.fa`` and
    ``<wkDir>temp/contig2.fa`` (``wkDir`` is the notebook-level project root),
    after which run_blat(wkDir) is called.

    Parameters
    ----------
    contigID1, contigID2 : str
        Contig identifiers; each FASTA is expected at
        ``<fastaRoot>/<contigID>/<contigID>.fa``.
    Dir1, Dir2 : str
        Accepted for interface compatibility; not used here.
    extract_coord1, extract_coord2 : sequence of two ints
        ``[start, end]`` of the region to extract from each contig.
    """
    import shlex  # local import so this block does not depend on the imports cell

    fastaRoot = '/home/ampend/links/kidd-lab/jmkidd-projects/zoey/contig-assignment/kmer-matches/eval1/'

    def _faidx_to_file(contigID, coord, outName):
        # Extract contigID:coord[0]-coord[1] into <wkDir>temp/<outName>.
        fasta_path = fastaRoot + contigID + '/' + contigID + '.fa'
        region = '%s:%i-%i' % (contigID, coord[0], coord[1])
        out_path = '%stemp/%s' % (wkDir, outName)
        # Contig IDs come from an input file, so quote every shell word.
        cmd = 'samtools faidx %s %s > %s' % (
            shlex.quote(fasta_path), shlex.quote(region), shlex.quote(out_path))
        print (cmd)
        status = subprocess.call(cmd,shell=True)
        if status != 0:
            # Surface the failure instead of silently feeding BLAT a bad file.
            print('WARNING: samtools faidx exited with status %i for %s' % (status, region))

    ##Defining fasta paths
    #Contig 1
    _faidx_to_file(contigID1, extract_coord1, 'contig1.fa')
    #Contig 2
    _faidx_to_file(contigID2, extract_coord2, 'contig2.fa')

    run_blat(wkDir)
    
###################################################################################################
def run_blat(wkDir):
    """Run BLAT on the two extracted contig regions.

    Parameters
    ----------
    wkDir : str
        Project root; it is concatenated directly with ``temp/...`` so it
        must end with a path separator.

    The command passes ``temp/contig1.fa`` as BLAT's first argument,
    ``temp/contig2.fa`` as its second, and writes to ``temp/temp.blat``.
    The command line is printed before it is executed through the shell;
    the exit status is not inspected and nothing is returned.
    """
    command = 'blat %stemp/contig1.fa %stemp/contig2.fa %stemp/temp.blat' % ((wkDir,) * 3)
    print(command)
    subprocess.call(command, shell=True)

In [84]:
#write_AGP_header(agpFile)
# Walk adjacent contig pairs (i, i+1). contigDict keys run 1..N, so the last
# valid first index is N-1, i.e. range(1, N).
# NOTE: the loop variable must stay named `i`, and the unpacked names below
# must stay as they are: read_line() and determine_contig_overlap() pick up
# `i` and `start2` from the notebook namespace.
for i in range(1,len(contigDict)):
    chrom1,start1,end1,contigID1,Dir1,length1,chrom2,start2,end2,contigID2,Dir2,length2 = read_line()
    if chrom1 != chrom2: ##CHECKS = if chromID has switched
        continue
    print ('\n#',chrom1,start1,end1,contigID1,Dir1,length1)
    print (chrom2,start2,end2,contigID2,Dir2,length2)

    #Determines the overlap/orientation of the two contigs
    # (for overlapping pairs this call also extracts the FASTA regions and runs BLAT)
    determine_contig_overlap(end1,contigID1,Dir1,length1,end2,contigID2,Dir2,length2)
    # Development limit: stop after the first same-chromosome pair. Remove
    # this break to process every adjacent pair.
    break
        



# chr18 2619 26992 CTG-1462 rc 24712
chr18 22143 3966426 CTG-0201 rc 3968428
Overlap =  4849
[1, 14850] [3953579, 3968428]
samtools faidx /home/ampend/links/kidd-lab/jmkidd-projects/zoey/contig-assignment/kmer-matches/eval1/CTG-1462/CTG-1462.fa CTG-1462:1-14850  > /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/temp/contig1.fa
samtools faidx /home/ampend/links/kidd-lab/jmkidd-projects/zoey/contig-assignment/kmer-matches/eval1/CTG-0201/CTG-0201.fa CTG-0201:3953579-3968428 > /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/temp/contig2.fa
blat /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/temp/contig1.fa /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/temp/contig2.fa /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/AGP/temp/temp.blat
