In [1]:
# 2018-10-10
#
# Updated 2019-04-25
# A. Pendleton
# Generation of miropeats images for any bed file as input


In [2]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
#Remaining required 
import matplotlib.pyplot as plt
import subprocess
import sys
import numpy as np
import matplotlib.patches as patches
import gzip
import fileinput
import glob



def count_lines(f):
    """Return the number of lines in the text file located at path ``f``."""
    with open(f, 'r') as handle:
        # Stream the file so it never has to be held in memory at once.
        return sum(1 for _ in handle)
#Facilitates running commands on flux from within the ipython environment.
# Requires import of: 
#   - sys
#   - subprocess
def runCMD(cmd):
    """Run ``cmd`` through the shell and wait for it to finish.

    Returns None when the command exits with status 0. On any other status
    the failing command is printed and the interpreter is asked to exit with
    code 1 (``sys.exit`` raises ``SystemExit``).
    """
    exitStatus = subprocess.Popen(cmd, shell=True).wait()
    if exitStatus != 0:
        print ('command failed')
        print (cmd)
        sys.exit(1)
#Write PBS Script header and standard run-by-id perl script to run tasks 
def write_pbs_file(wkDir,cmdsFile,jobName, mem, jobCount):
    """Write a PBS array-job script for the commands listed in ``cmdsFile``.

    The script is written next to ``cmdsFile`` with the ``.cmds`` suffix
    replaced by ``.pbs``. It requests one task per command (``-t 1-jobCount``)
    and runs each task through the ``run-by-id-log.pl`` helper.

    Parameters
    ----------
    wkDir : str
        Working directory for the job, with a trailing '/'. A '/scripts/'
        component is removed so the job runs from (and logs under) its parent.
    cmdsFile : str
        Path to the commands file.
    jobName : str
        Name passed to ``#PBS -N``.
    mem : int
        Per-process memory in gigabytes (``#PBS -l pmem``).
    jobCount : int or str
        Number of array tasks.
    """
    if '/scripts/' in wkDir:
        # Keep a separator in place of the removed component; replacing with ''
        # would glue the parent directory onto 'logs/' (e.g. '/projlogs/').
        wkDir = wkDir.replace('/scripts/','/')

    pbsPath = cmdsFile.replace('.cmds','.pbs')
    if pbsPath == cmdsFile:
        # No '.cmds' in the name: never overwrite the commands file itself.
        pbsPath = cmdsFile + '.pbs'
    # Report the path, not the repr of a file object.
    print('Writing pbs file: %s'% pbsPath)

    logPrefix = cmdsFile.replace('/scripts/','/logs/')
    # 'with' closes the script even if one of the writes raises.
    with open(pbsPath,'w') as pbsFile:
        pbsFile.write('#!/bin/bash\n')
        pbsFile.write('#PBS -S /bin/bash\n')
        pbsFile.write('#PBS -V\n')
        pbsFile.write('#PBS -M ampend@med.umich.edu\n')
        pbsFile.write('#PBS -j oe\n')
        pbsFile.write('#PBS -N %s\n' % jobName)
        pbsFile.write('#PBS -o %s\n' % (wkDir + 'logs/'))
        pbsFile.write('#PBS -l pmem=%iG\n' % mem)
        pbsFile.write('#PBS -l nodes=1:ppn=1,qos=flux,walltime=100:00:00\n')
        pbsFile.write('##PBS -A medbsm_flux\n')
        pbsFile.write('##PBS -q flux\n')
        pbsFile.write('#PBS -A jmkidd_fluxod\n')
        pbsFile.write('#PBS -q fluxod\n')
        pbsFile.write('#PBS -t 1-%s\n' % jobCount)
        pbsFile.write('cd %s\n' % (wkDir))
        pbsFile.write('/home/ampend/links/kidd-lab/jmkidd-projects/scripts/perlUtils/run-by-id-log.pl %s %sBLAT_commands.logs $PBS_ARRAYID' % (cmdsFile,logPrefix))

______________


# Miropeats pipeline processing script
### Goal 
The goal of this notebook is to make it easier to process regions of interest in the Zoey PacBio genome and to generate miropeats images comparing them against the current dog reference assembly (CanFam3.1). 

__Warning__: Parts of this notebook are hard-coded; I have tried to flag each of them. 

### Required inputs
1. Input BED file
2. Root directory path that you wish all resulting directories and files to be created.
3. Path to the genome reference FASTA file(s). 

### Input files
The input file for this script is a BED file of the coordinates of interest for which you would like to generate a miropeats image. 

__BED file__: The input file should be a tab-delimited BED coordinate file that has chromosome, start (0-based), end, and sequence identifier. 

- The BED input file can contain only one sequence, for practice. This script writes a command file for EACH sequence that you wish to process.

__Genome fasta files__: The full path will be needed for the two genome fasta files that you want to extract the sequence from for alignment and visualization with miropeats. 

You will need a genome FASTA sequence for ___sequence 'A'___ (matches the sequence with coordinates of your input bed file). For example, genome A == Zoey. You will also need to have a genome B for ___sequence 'B'___, against which you are visualizing alignment. 

______________


In [3]:
## HARD CODED -- Change the below to fit your needs.

In [8]:
#Root directory that has input files, and to which all results will be written
# NOTE: keep the trailing '/'; paths below are built by plain string concatenation.
rootDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/example/'

#Example input file can be found here:
# (tab-delimited BED: chrom, start, end, ID -- read into roiCoords further down)
inputBedfile = rootDir + 'input/' + 'example.bed'


#Provide the name of the samples or genome names that you'd like to process.
# Each will need a fasta file (e.g. genome) associated with it.
# These IDs are used as prefixes in the generated BED/FASTA file names.
seqAID = 'Zoey'
seqBID = 'CanFam'

#Here, provide full path of the fasta file that will be used to extract sequences 
#   for alignment and visualization with miropeats
#Example below: Zoey and CanFam3 
# NOTE(review): extract_fasta() hard-codes these same two paths instead of
#   reading seqAGenome/seqBGenome, so changing them here does not affect it.
seqAGenome = '/home/ampend/links/kidd-lab/genomes/zoey/assemblies/2.3/ref/zoey.2.3.fa'
seqBGenome = '/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/canFam3.1.fa'


# Read in what coordinates you wish to plot. They will be stored in roiCoords array

In [9]:

roiCoords = [] #coordinates chrom,start,end,ID for regions of interest

# Parse the input BED file into [chrom, start, end, ID] records.
# 'with' closes the file once it has been read.
with open(inputBedfile,'r') as bedIn:
    for lineNum, line in enumerate(bedIn, 1):
        line = line.rstrip()
        # Blank lines and BED header lines carry no coordinates; skip them
        # instead of failing on a missing column.
        if not line or line.startswith(('#', 'track', 'browser')):
            continue
        line = line.split('\t')
        if len(line) < 4:
            raise ValueError('%s line %i: expected chrom, start, end, ID but found %i column(s)'
                             % (inputBedfile, lineNum, len(line)))
        chrom,start,end,ID = line[0],int(line[1]),int(line[2]),line[3]
        roiCoords.append([chrom,start,end,ID])

print('%i ROIs read into array' % len(roiCoords))


1 ROIs read into array


______________


# Generate FASTA files for the regions of interest based on the BED coordinates read in the step above (stored in the roiCoords array)

In [2]:
#Creates a directory named input within your defined input directory (== rootDir)
def create_directory(rootDir,ID):
    """Create ``<rootDir>input/<ID>/`` (and any missing parents) and return its path.

    The directory is made with ``mkdir -p`` through runCMD, so an existing
    directory is not an error. ``rootDir`` must already end with '/'.
    """
    inDir = ''.join([rootDir, 'input/', ID, '/'])
    runCMD('mkdir -p %s' % inDir)
    return inDir
############################################################################################################
#This generates a commands file named MiropeatsCommands_<ID>.cmds for each ROI
def create_commands_file(inDir,ID):
    """Create ``<inDir>MiropeatsCommands_<ID>.cmds`` and return it opened for writing.

    The file is truncated if it already exists and is made world
    readable/writable/executable (chmod 777) via runCMD. The caller owns the
    returned handle and is responsible for closing it.
    """
    cmdsPath = inDir + 'MiropeatsCommands_' + ID + '.cmds'
    handle = open(cmdsPath,'w')
    print('All commands for processing of ROI %s written to: %s\n' % (ID,cmdsPath))

    # make the commands file executable
    runCMD('chmod 777 %s' % cmdsPath)
    return handle

############################################################################################################
# To process each region of interest (ROI) through the miropeats pipeline, it will require a BED file 
# to be generated for the sequences to be compared: sequence A (e.g.,  Zoey) and sequence B (e.g.,  CanFam) 
def _parse_region(region):
    """Split a 'chrom:start-end' string into (chrom, start, end) with integer coordinates."""
    pieces = region.split(':')
    chrom, span = pieces[0], pieces[1].split('-')
    # Tolerate thousands separators such as 'chr6:22,071,627-22,078,595'.
    return chrom, int(span[0].replace(',', '')), int(span[1].replace(',', ''))

def make_bedfile(inDir,ROI_ID,seqACoords,seqBCoords,seqAID='Zoey',seqBID='CanFam'):
    """Write one single-line BED file per sequence for a region of interest.

    Files are named ``<inDir><seqID>_<ROI_ID>.bed`` -- the same
    ``<sample>_<ROI>`` order that read_bedfile() and do_exon_intersects()
    open -- and hold ``chrom, start, end, <seqID>_<ROI_ID>`` separated by tabs.

    Parameters
    ----------
    inDir : str
        Output directory, with trailing '/'.
    ROI_ID : str
        Identifier of the region of interest.
    seqACoords, seqBCoords : str
        Regions written as 'chrom:start-end' for sequence A (bottom of the
        miropeats image) and sequence B (top of the image).
    seqAID, seqBID : str
        Sample names used in the file names and the BED name column.
    """
    for seqID, coords in ((seqAID, seqACoords), (seqBID, seqBCoords)):
        chrom, start, end = _parse_region(coords)
        bedfile = '%s%s_%s.bed' % (inDir, seqID, ROI_ID)
        # 'with' flushes and closes each file so it is complete on disk
        # before any later step reads it.
        with open(bedfile,'w') as bedFile:
            bedFile.write('%s\t%i\t%i\t%s_%s\n' % (chrom,start,end,seqID,ROI_ID))
    
############################################################################################################
#This reads through the bedfile that was created in the function above, and extends the coordinates 
#  out for viewing in the miropeats image. This can be altered with the provided window size
def read_bedfile(inDir,ROI_ID,name,windowSize=1000):
    """Read the first record of ``<inDir><name>_<ROI_ID>.bed`` and pad it for viewing.

    Parameters
    ----------
    inDir : str
        Directory holding the BED file, with trailing '/'.
    ROI_ID : str
        Identifier of the region of interest.
    name : str
        Sample name: the seqA or seqB identifier used in the file name.
    windowSize : int or str
        Number of bases added on each side of the region (default 1000).

    Returns
    -------
    (chrom, start, end) with the padded coordinates; start is clamped at 0.

    Raises
    ------
    ValueError
        If the BED file contains no record.
    """
    windowSize = int(windowSize) #Just in case it wasn't read in as an integer.

    bedfile = '%s%s_%s.bed' % (inDir,name,ROI_ID)
    record = None
    with open(bedfile, 'r') as bedIn:
        for line in bedIn:
            if line.strip():
                # Only the first non-blank record is used.
                record = line.rstrip().split('\t')
                break
    if record is None:
        raise ValueError('No coordinates found in %s' % bedfile)

    chrom,start,end = record[0], int(record[1]), int(record[2])
    print('Coordinates for ROI %s: %s:%i-%i' % (ROI_ID,chrom,start,end))

    # Pad by the requested window rather than a fixed 1 kb.
    start,end = max(0, start-windowSize), end+windowSize
    print('Extended coordinates are %s:%i-%i' % (chrom,start,end))
    return chrom,start,end
############################################################################################################
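# Extract the region's sequence with samtools faidx from the genome chosen by sample name, then rename the FASTA header to <name>_<ID>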
def extract_fasta(inDir,name,ID,chrom,start,end,cmdsFile):
    """Extract ``chrom:start-end`` into ``<inDir><name>_<ID>.fa`` and rename its header.

    The Zoey assembly is used when ``name`` contains 'Zoey'; otherwise the
    CanFam3.1 assembly is used. Both genome paths are hard-coded here. Each
    shell command is first recorded in ``cmdsFile`` and then run immediately
    through runCMD. Returns the path of the FASTA file that was written.
    """
    zoeyReference = '/home/ampend/links/kidd-lab/genomes/zoey/assemblies/2.3/ref/zoey.2.3.fa'
    canfamReference = '/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/canFam3.1.fa'
    GenomeFastaFile = zoeyReference if 'Zoey' in name else canfamReference

    outFasta = '%s%s_%s.fa' % (inDir,name,ID)

    def _record_and_run(command):
        # Log the command for later re-use, then execute it now.
        cmdsFile.write('%s\n' % command)
        runCMD(command)

    # Pull the region out of the genome
    _record_and_run('samtools faidx %s %s:%i-%i > %s' % (GenomeFastaFile,chrom,start,end,outFasta))
    # Replace the 'chrom:start-end' header with '<name>_<ID>'
    _record_and_run("sed -i 's/%s:%i-%i/%s_%s/g' %s" % (chrom,start,end,name,ID,outFasta))

    return outFasta
############################################################################################################
def write_repeat_commands(outFasta,cmdsFile):
    """Append the RepeatMasker steps for ``outFasta`` to ``cmdsFile``.

    Two lines are written: a ``module load`` of RepeatMasker and the
    RepeatMasker call with the dog species library. Nothing is executed here.
    """
    emit = cmdsFile.write
    emit('%s\n' % 'module load RepeatMasker')
    emit('%s\n' % ('RepeatMasker --species dog %s' % outFasta))

############################################################################################################
def do_exon_intersects(inDir,ID,name,cmdsFile):
    """Intersect the ROI BED file with the exon annotation of the matching genome.

    Runs ``bedtools intersect -wo`` on ``<inDir><name>_<ID>.bed`` and writes
    the result to ``<inDir>exons_<name>.exons``. The Zoey exon set is used
    when ``name`` contains 'Zoey'; otherwise the CanFam3.1 Ensembl exon set
    is used. The command is run first, then printed and recorded in ``cmdsFile``.
    """
    zoeyExons = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/FINAL_GENES/Zoey_FinalGenes_exons.bed'
    canfamExons = ' ~/links/kidd-lab/genomes/canFam3.1/annotations/ensemble-79/ensGene.exons.bed'
    exonBedFile = zoeyExons if 'Zoey' in name else canfamExons

    intersectCmd = 'bedtools intersect -wo -a %s%s_%s.bed -b %s> %sexons_%s.exons' % (inDir,name,ID,exonBedFile,inDir,name)
    runCMD(intersectCmd)
    print(intersectCmd)
    cmdsFile.write('%s\n' % intersectCmd)
############################################################################################################    
def run_miropeats_script(cmdsFile):
    """Write the miropeats and BLAT commands for the current ROI to ``cmdsFile``.

    Reads the notebook-level globals ``inDir`` and ``ID`` (they are not
    parameters). For each threshold in 50/100/200/250 it writes three lines:
    remove the old PostScript output, change into ``inDir``, and run miropeats
    on the CanFam and Zoey FASTA files (given relative to ``inDir``). One
    final line runs BLAT on the same two FASTA files. Nothing is executed here.
    """
    # Inputs are the per-ROI FASTA files
    zoeyFasta = ''.join([inDir, 'Zoey_', '%s' % ID, '.fa'])
    canfamFasta = ''.join([inDir, 'CanFam_', '%s' % ID, '.fa'])
    # miropeats is run from inside inDir, so it is handed bare file names
    topSeq = canfamFasta.replace(inDir,'').replace('.masked','')
    bottomSeq = zoeyFasta.replace(inDir,'').replace('.masked','')

    for threshold in (50, 100, 200, 250):
        miropeatsoutput = inDir + 'miropeats_s' + str(threshold) + '_' + ID + '_onlyinter.ps'
        steps = (
            'rm %s' % miropeatsoutput,   # remove any previously generated *ps file
            'cd %s' % inDir,             # change into the ROI directory
            'miropeats -s %i -onlyinter -o %s -seq %s %s' % (threshold,miropeatsoutput,topSeq,bottomSeq),
        )
        for step in steps:
            cmdsFile.write('%s\n' % step)

    # BLAT command
    blatCmd = 'blat -fine -minMatch=1 -minScore=10 -out=blast9 %s %s %sBLAT_CanFam_vs_Zoey.blat' % (canfamFasta,zoeyFasta,inDir)
    cmdsFile.write('%s\n' % blatCmd)

############################################################################################################    
def call_gaps(cmdsFile):
    """Write one ``get_gaps.pl`` command per sample (Zoey, then CanFam) to ``cmdsFile``.

    Reads the notebook-level globals ``inDir`` and ``ID``. Each command reads
    ``<inDir><sample>_<ID>.fa`` and redirects to ``<inDir>gaps_<sample>.gaps``.
    Nothing is executed here.
    """
    for sample in ('Zoey', 'CanFam'):
        fastaFile = '%s%s_%s.fa' % (inDir,sample,ID)
        gapsCmd = 'perl ~/links/kidd-lab/jmkidd-projects/scripts/perlUtils/get_gaps.pl %s > %sgaps_%s.gaps' % (fastaFile, inDir, sample)
        cmdsFile.write('%s\n' % gapsCmd)

############################################################################################################    
def write_miropeats_annotate(miropeatsDir,inDir,ID,chrom,start,end):
    """Print and record the annotation command for each miropeats output.

    One command is built per threshold (50/100/200/250). It calls the
    annotate script under ``<miropeatsDir>scripts/`` with the miropeats
    PostScript file, the CanFam (top) and Zoey (bottom) RepeatMasker ``.out``
    files and names, the BLAT result, and the ``chrom:start-end`` coordinate.

    NOTE: commands are written to the notebook-level global ``cmdsFile``;
    it is not a parameter of this function.
    """
    # Annotation script
    scriptPath = miropeatsDir + 'scripts/annotate-miropeats-2seqs-ForZoeyGenomeComparisons.py'

    for threshold in (50, 100, 200, 250):
        pieces = [
            'python %s ' % (scriptPath),
            '--miroin %s ' % (inDir + 'miropeats_s' + str(threshold) + '_' + ID + '_onlyinter.ps'),
            '--topRM %s%s_%s.fa.out ' % (inDir,'CanFam',ID),
            '--bottomRM %s%s_%s.fa.out ' % (inDir,'Zoey',ID),
            '--topName CanFam_%s ' % (ID),
            '--bottomName Zoey_%s ' % (ID),
            '--blat %sBLAT_CanFam_vs_Zoey.blat ' % (inDir),
            '--coord %s:%i-%i ' % (chrom,start,end),
        ]
        annotateCmd = ''.join(pieces)
        print(annotateCmd)
        cmdsFile.write('%s\n' % annotateCmd)

### Get the coordinates of the Zoey ROIs
### Generate FASTA files for each ROI in its own directory

In [11]:
#Reading through each of the regions of interest (ROI)
# For each ROI this cell: pads the coordinates by 1 kb, creates the output
# directory and commands file, writes the seqA BED file, extracts the seqA
# FASTA, renames its header and repeat-masks it. Every shell command is both
# run immediately (runCMD) and recorded in the commands file.
#
# NOTE(review):
#  - The 'break' on the last line stops after the FIRST ROI, so any further
#    entries in roiCoords are not processed.
#  - roiCoords is updated in place with the padded coordinates, so running
#    this cell again pads the same ROI by another 1 kb.
#  - cmdsFile is left open on purpose: the following cell keeps writing to it
#    and also reuses newDir, ID, chrom, start and end from this loop.
for roi in range(0,len(roiCoords)):
    chrom,start,end,ID = roiCoords[roi][0],int(roiCoords[roi][1]),int(roiCoords[roi][2]),roiCoords[roi][3]
    print('\n#### %s' % ID)
    
    
    print('Coordinates for ROI %s: %s:%i-%i' % (ID,chrom,start,end))
    #EXTEND OUT 1KB FOR VIEWING
    start,end = start-1000, end+1000
    if start < 0:
        start = 0
        
    #Overwrite and change in the array
    roiCoords[roi][1],roiCoords[roi][2] = start,end
    print('Expanded new coord: ', (start,end))
    
    #MAKE DIRECTORY FOR OUTPUTS
    newDir = '%sinput/%s/' % (rootDir,ID)
    cmd = 'mkdir -p %s ' % (newDir)
    runCMD(cmd)
    
    
    #CREATE COMMANDS FILE 
    cmdsfile = '%sMiropeatsCommands_%s.cmds' % (newDir,ID)
    cmdsFile = open(cmdsfile,'w')
    print('All commands for processing of ROI %s written to: %s' % (ID,cmdsfile))
    
    cmd = 'chmod 777 %s' % cmdsfile #make the commands file executable
    runCMD(cmd) 
    
    
    ##WRITE BED FILE FOR ZOEY
    # File name is <seqAID>_<ID>.bed; the name column repeats that label.
    seqAbedfile = '%s%s_%s.bed' % (newDir,seqAID,ID)
    bedFile = open(seqAbedfile, 'w')
    bedFile.write('%s\t%i\t%i\t%s_%s\n' % (chrom,start,end,seqAID,ID))
    bedFile.close()
    
    ##EXTRACT FASTA 
    outFasta = '%s%s_%s.fa' % (newDir,seqAID,ID)
    #faidx extracts from genome in between extended roi coordinates
    cmd = 'samtools faidx %s %s:%i-%i > %s' % (seqAGenome,chrom,start,end,outFasta) 
    print(cmd) #print out command
    cmdsFile.write('%s\n' % cmd)
    runCMD(cmd)
    
    #RENAME FASTA ID
    # Replaces the 'chrom:start-end' header written by faidx with '<seqAID>_<ID>'.
    cmd = 'sed -i \'s/%s:%i-%i/%s_%s/g\' %s' % (chrom,start,end,seqAID,ID,outFasta)
    cmdsFile.write('%s\n' % cmd)
    runCMD(cmd)
    
    ##REPEAT MASK FASTA
    #************ HARD CODED PATH TO REPEATMASKER************
    cmd = '/home/jmkidd/links/kidd-lab/pkg/RepeatMasker/4.0.7/RepeatMasker/RepeatMasker --species dog %s' % outFasta
    print(cmd)
    runCMD(cmd)
    cmdsFile.write('%s\n' % cmd)

    # The string below is a disabled exon-intersect step; it is never executed.
    # It refers to 'zoeybedfile', which is not defined in this cell.
    """#INTERSECT WITH EXONS
    exonBedFile = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/FINAL_GENES/Zoey_FinalGenes_exons.bed'
    cmd = 'bedtools intersect -wo -a %s -b %s> %sexons_Zoey.exons' % (zoeybedfile,exonBedFile,newDir)
    runCMD(cmd)
    cmdsFile.write('%s\n' % cmd)"""
    break



#### CRABP1_Locus
Coordinates for ROI CRABP1_Locus: chr3:58404873-58416852
Expanded new coord:  (58403873, 58417852)
All commands for processing of ROI CRABP1_Locus written to: /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/example/input/CRABP1_Locus/MiropeatsCommands_CRABP1_Locus.cmds
samtools faidx /home/ampend/links/kidd-lab/genomes/zoey/assemblies/2.3/ref/zoey.2.3.fa chr3:58403873-58417852 > /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/example/input/CRABP1_Locus/Zoey_CRABP1_Locus.fa
/home/jmkidd/links/kidd-lab/pkg/RepeatMasker/4.0.7/RepeatMasker/RepeatMasker --species dog /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/example/input/CRABP1_Locus/Zoey_CRABP1_Locus.fa


In [None]:

#Reading through each of the regions of interest (ROI)
# For each ROI this cell BLATs the seqA FASTA against the seqB genome; the
# code after the 'break' would pick the best hit, build the seqB BED/FASTA
# and write the miropeats commands.
#
# NOTE(review):
#  - newDir and cmdsFile are NOT set in this cell; they are whatever the
#    previous cell left behind, so this cell only works when run right after it.
#  - The unconditional 'break' after the BLAT step makes everything below it
#    in the loop body unreachable; only the BLAT command is run and recorded.
#  - The unreachable part uses miropeatsDir, which is assigned in a later cell,
#    and calls run_miropeats_script()/call_gaps(), which read the global
#    'inDir' rather than 'newDir'.
for roi in range(0,len(roiCoords)):
    chrom,start,end,ID = roiCoords[roi][0],int(roiCoords[roi][1]),int(roiCoords[roi][2]),roiCoords[roi][3]
    print('\n#### %s' % ID)
    
    print('Coordinates for ROI %s: %s:%i-%i' % (ID,chrom,start,end))

    ############################################
    # Find the equivalent region in the sequence B fasta that matches sequence A
    # Example: ROI in zoey (seqA) but need to find corresponding region in the canfam (seqB) genome
    ############################################
    
    ## BLAT SEQA vs SEQB 
    # Hard-coded seqB (CanFam3.1) .2bit genome and over-occurring 11-mer file.
    seqB_Genome2bitFile = '/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/canFam3.1.2bit'
    seqB_oocFile = '/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/11.ooc'
    
    zoeyFasta = '%s%s_%s.fa' % (newDir,seqAID,ID)
    blatOutFile = newDir + 'BLAT_to_%s_' % (seqBID) + ID + '.psl' 
    cmd = 'blat -out=psl -ooc=%s -tileSize=11 -noHead %s %s %s' % (seqB_oocFile,seqB_Genome2bitFile,zoeyFasta,blatOutFile)
    print(cmd)
    cmdsFile.write('%s\n' % cmd)
    runCMD(cmd)
    
    break
    
    #PARSE BLAT FILE TO FIND BEST HIT
    # Keeps the row with the largest column0/column10 ratio.
    # NOTE(review): in UCSC PSL output column 0 is 'matches' and column 10 is
    # the query size, so this is presumably the matched fraction of the query -- confirm.
    bestHit,bestHitInfo = 0, [] 
    for line in open(blatOutFile,'r'):
        line=line.rstrip().split('\t')
        hitLength,zoeyLength = int(line[0]),int(line[10])
        propHit = hitLength/float(zoeyLength)
        if propHit > bestHit:
            bestHit = propHit
            bestHitInfo = line
    # Columns 13/15/16 are taken as the hit's chromosome/start/end.
    # If the PSL file has no usable row, bestHitInfo stays [] and this raises IndexError.
    CFchrom,CFstart,CFend = bestHitInfo[13],int(bestHitInfo[15]),int(bestHitInfo[16])
    # bestHit is a proportion (hitLength/zoeyLength), although it is printed with a '%' sign.
    print('Best Hit: %s:%i-%i (%f%%)' % (CFchrom,CFstart,CFend,bestHit))
    
    
    #EXTEND OUT FOR ALIGNMENTS
    CFstart,CFend = CFstart - 1000, CFend + 1000
    if CFstart < 0:
        CFstart = 0
    
    #WRITE BED FILE FOR CANFAM
    canfamBedfile = '%sCanFam_%s.bed' % (newDir,ID)
    bedFile = open(canfamBedfile, 'w')
    bedFile.write('%s\t%i\t%i\tCanFam_%s\n' % (CFchrom,CFstart,CFend,ID))
    bedFile.close()
    
    
    #GET FASTA
    # From here on commands are only recorded in the commands file, not run.
    CanFamGenomeFastaFile = '/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/canFam3.1.fa'
    outFasta = '%sCanFam_%s.fa' % (newDir,ID)
    cmd = 'samtools faidx %s %s:%i-%i > %s' % (CanFamGenomeFastaFile,CFchrom,CFstart,CFend,outFasta)
    cmdsFile.write('%s\n' % cmd)

    
    #RENAME FASTA ID
    cmd = 'sed -i \'s/%s:%i-%i/CanFam_%s/g\' %s' % (CFchrom,CFstart,CFend,ID,outFasta)
    cmdsFile.write('%s\n' % cmd)
    
    
    ##REPEAT MASK FASTA
    cmd = 'RepeatMasker --species dog %s' % outFasta
    cmdsFile.write('%s\n' % cmd)

    
    #INTERSECT WITH EXONS
    exonBedFile = ' ~/links/kidd-lab/genomes/canFam3.1/annotations/ensemble-79/ensGene.exons.bed'
    cmd = 'bedtools intersect -wo -a %s -b %s > %sexons_CanFam.exons' % (canfamBedfile,exonBedFile,newDir)
    cmdsFile.write('%s\n' % cmd)

    #WRITE MIROPEATS COMMAND
    ## INPUTS == MASKED FASTA FILES
    # NOTE: these two names are assigned but not passed to the helpers below.
    zoeyFasta = '%sZoey_%s.fa' % (newDir,ID)
    canfamFasta = '%sCanFam_%s.fa' % (newDir,ID)

    
    #################
    ### MIROPEATS ###
    #################
    print('\n#MIROPEATS')

    #Run first miropeats + blat
    run_miropeats_script(cmdsFile)

    #Call gaps
    call_gaps(cmdsFile)

    #Run miropeats annotate script
    write_miropeats_annotate(miropeatsDir,newDir,ID,chrom,start,end)

    
# Closes the commands file opened by the previous cell.
cmdsFile.close()
print('Done')


#### CRABP1_Locus
Coordinates for ROI CRABP1_Locus: chr3:58403873-58417852


In [None]:
"""
#Reading through each of the 
for roi in range(0,len(roiCoords)):
    chrom,start,end,ID = roiCoords[roi][0],int(roiCoords[roi][1]),int(roiCoords[roi][2]),roiCoords[roi][3]
    print('\n#### %s' % ID)
    
    
    print('Coordinates for ROI %s: %s:%i-%i' % (ID,chrom,start,end))
    #EXTEND OUT 1KB FOR VIEWING
    start,end = start-1000, end+1000
    if start < 0:
        start = 0
        
    #Overwrite and change in the array
    roiCoords[roi][1],roiCoords[roi][2] = start,end
    print('Expanded new coord: ', (start,end))
    
    #MAKE DIRECTORY FOR OUTPUTS
    newDir = '%sinput/%s/' % (miropeatsDir,ID)
    cmd = 'mkdir -p %s ' % (newDir)
    runCMD(cmd)
    
    
    #CREATE COMMANDS FILE 
    cmdsfile = '%sMiropeatsCommands_%s.cmds' % (newDir,ID)
    cmdsFile = open(cmdsfile,'w')
    print('All commands for processing of ROI %s written to: %s' % (ID,cmdsfile))
    
    cmd = 'chmod 777 %s' % cmdsfile #make the commands file executable
    runCMD(cmd) 
    
    
    ##WRITE BED FILE FOR ZOEY
    seqAbedfile = '%s%s_%s.bed' % (seqAID,newDir,ID)
    bedFile = open(seqAbedfile, 'w')
    bedFile.write('%s\t%i\t%i\tZoey_%s\n' % (chrom,start,end,ID))
    bedFile.close()
    
    ##EXTRACT FASTA 
    seqAGenome = '/home/ampend/links/kidd-lab/genomes/zoey/assemblies/2.3/ref/zoey.2.3.fa'
    outFasta = '%sZoey_%s.fa' % (newDir,ID)
    cmd = 'samtools faidx %s %s:%i-%i > %s' % (zoeyGenomeFastaFile,chrom,start,end,outFasta)
    cmdsFile.write('%s\n' % cmd)
    runCMD(cmd)
    
    #RENAME FASTA ID
    cmd = 'sed -i \'s/%s:%i-%i/Zoey_%s/g\' %s' % (chrom,start,end,ID,outFasta)
    cmdsFile.write('%s\n' % cmd)
    runCMD(cmd)
    
    ##REPEAT MASK FASTA
    cmd = 'module load RepeatMasker'
    cmdsFile.write('%s\n' % cmd)    
    
    cmd = 'RepeatMasker --species dog %s' % outFasta
    cmdsFile.write('%s\n' % cmd)

    #INTERSECT WITH EXONS
    exonBedFile = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/FINAL_GENES/Zoey_FinalGenes_exons.bed'
    cmd = 'bedtools intersect -wo -a %s -b %s> %sexons_Zoey.exons' % (zoeybedfile,exonBedFile,newDir)
    runCMD(cmd)
    cmdsFile.write('%s\n' % cmd)

    ############################################
    #GET CANFAM EQUIVALENT PROCESSED
    ############################################
    
    ## BLAT ZOEY SEQ AGAINST CANFAM3.1
    CanFamGenome2bitFile = '/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/canFam3.1.2bit'
    oocFile = '/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/11.ooc'
    
    zoeyFasta = '%sZoey_%s.fa' % (newDir,ID)
    blatOutFile = newDir + 'BLAT_to_Canfam_' + ID + '.psl' 
    cmd = 'blat -out=psl -ooc=%s -tileSize=11 -noHead %s %s %s' % (oocFile,CanFamGenome2bitFile,zoeyFasta,blatOutFile)
    cmdsFile.write('%s\n' % cmd)
    runCMD(cmd)
    
    
    #PARSE BLAT FILE TO FIND BEST HIT
    bestHit,bestHitInfo = 0, [] 
    for line in open(blatOutFile,'r'):
        line=line.rstrip().split('\t')
        hitLength,zoeyLength = int(line[0]),int(line[10])
        propHit = hitLength/float(zoeyLength)
        if propHit > bestHit:
            bestHit = propHit
            bestHitInfo = line
    CFchrom,CFstart,CFend = bestHitInfo[13],int(bestHitInfo[15]),int(bestHitInfo[16])
    print('Best Hit: %s:%i-%i (%f%%)' % (CFchrom,CFstart,CFend,bestHit))
    
    
    #EXTEND OUT FOR ALIGNMENTS
    CFstart,CFend = CFstart - 1000, CFend + 1000
    if CFstart < 0:
        CFstart = 0
    
    #WRITE BED FILE FOR CANFAM
    canfamBedfile = '%sCanFam_%s.bed' % (newDir,ID)
    bedFile = open(canfamBedfile, 'w')
    bedFile.write('%s\t%i\t%i\tCanFam_%s\n' % (CFchrom,CFstart,CFend,ID))
    bedFile.close()
    
    
    #GET FASTA
    CanFamGenomeFastaFile = '/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/canFam3.1.fa'
    outFasta = '%sCanFam_%s.fa' % (newDir,ID)
    cmd = 'samtools faidx %s %s:%i-%i > %s' % (CanFamGenomeFastaFile,CFchrom,CFstart,CFend,outFasta)
    cmdsFile.write('%s\n' % cmd)

    
    #RENAME FASTA ID
    cmd = 'sed -i \'s/%s:%i-%i/CanFam_%s/g\' %s' % (CFchrom,CFstart,CFend,ID,outFasta)
    cmdsFile.write('%s\n' % cmd)
    
    
    ##REPEAT MASK FASTA
    cmd = 'RepeatMasker --species dog %s' % outFasta
    cmdsFile.write('%s\n' % cmd)

    
    #INTERSECT WITH EXONS
    exonBedFile = ' ~/links/kidd-lab/genomes/canFam3.1/annotations/ensemble-79/ensGene.exons.bed'
    cmd = 'bedtools intersect -wo -a %s -b %s > %sexons_CanFam.exons' % (canfamBedfile,exonBedFile,newDir)
    cmdsFile.write('%s\n' % cmd)

    #WRITE MIROPEATS COMMAND
    ## INPUTS == MASKED FASTA FILES
    zoeyFasta = '%sZoey_%s.fa' % (newDir,ID)
    canfamFasta = '%sCanFam_%s.fa' % (newDir,ID)

    
    #################
    ### MIROPEATS ###
    #################
    print('\n#MIROPEATS')

    #Run first miropeats + blat
    run_miropeats_script(cmdsFile)

    #Call gaps
    call_gaps(cmdsFile)

    #Run miropeats annotate script
    write_miropeats_annotate(miropeatsDir,newDir,ID,chrom,start,end)

    
cmdsFile.close()
print('Done')"""

In [9]:
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################

# This section generates miropeats images from specified coordinates in Zoey AND CanFam. 
### Some regions may not have good BLAT alignments with corresponding CanFam so you may need to do a custom alignment of two Zoey/CanFam regions.

### Find all gene symbols for printing onto miropeats image for each exon/gene model

In [22]:
# Build symbolDict: long gene ID (column 1) -> gene symbol (column 14) from the
# tab-delimited final-genes annotation table.
finalGeneDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/FINAL_GENES/'
symbolDict = {}
# 'with' closes the table once it has been parsed.
with open(finalGeneDir + 'FinalDeNovoGenes_ZoeyCoord_CanfamCoord_BLAST2GO.txt', 'r') as geneTable:
    for line in geneTable:
        if 'ShortGene' in line: #skip header
            continue
        line=line.rstrip().split('\t')
        longGeneID = line[1]
        geneSymbol = line[14]
        # A repeated gene ID keeps the symbol from its last row.
        symbolDict[longGeneID] = geneSymbol

print('Added %i gene and gene symbol pairings' % len(symbolDict))

Added 42911 gene and gene symbol pairings


In [38]:
def get_gene_symbols(cmdsFile,inDir,ID,name):
    """Record sed commands that swap exon IDs for gene symbols in the Zoey exon file.

    Reads ``<inDir>exons_Zoey.exons`` and, for every row, takes the exon name
    from column 7 and drops its final '.suffix' to get the long gene ID. That
    ID is looked up in the global ``symbolDict``; when the entry lists several
    comma-separated symbols only the first is used, and when there is no entry
    the long gene ID itself is used. One ``sed -i`` line per row is written to
    ``cmdsFile``; nothing is executed here.

    ``ID`` and ``name`` are accepted for signature compatibility but are not
    used: the file name is fixed to the Zoey exon file.
    """
    exonsPath = '%sexons_Zoey.exons' % inDir
    # 'with' closes the exon file once every row has been handled.
    with open(exonsPath) as exonsIn:
        for line in exonsIn:
            if not line.strip():
                continue  # ignore blank lines instead of failing on a missing column
            fields = line.rstrip().split('\t')
            exon = fields[7]
            longGeneID = exon.rsplit('.',1)[0]
            if longGeneID in symbolDict:
                symbol = symbolDict[longGeneID]
                if ',' in symbol:
                    # Split on the comma itself so 'A,B' and 'A, B' both yield 'A'.
                    symbol = symbol.split(',')[0].strip()
            else:
                symbol = longGeneID
            cmdsFile.write('sed -i \'s/%s/%s/g\' %sexons_Zoey.exons\n' % (exon,symbol,inDir))
    

In [46]:
#############################################
# Candidate loci that have been plotted. Each group reassigns the same three
# variables (ID, zoeyCoords, canFamCoords), so when this cell is run only the
# LAST group (CAMK2N1_Locus) is left in effect; the earlier groups serve as a
# record of coordinates. Coordinates are written as 'chrom:start-end'.
ID = 'OCA2_Locus_5prime'
zoeyCoords = 'chr3:32147080-32222336'
canFamCoords = 'chr3:31657254-32343345'
#############################################
ID = 'IMPK_Locus'
zoeyCoords = 'chr4:10542321-10753952'
canFamCoords = 'chr4:10500414-10512850'
#############################################
ID = 'PAX6_Locus'
zoeyCoords = 'chr18:35804273-35862527'
canFamCoords = 'chr18:35615647-35648660'
#############################################
ID = 'CHP2_Locus'
# Thousands separators are stripped so the string parses as plain integers.
zoeyCoords = 'chr6:22,071,627-22,078,595'.replace(',','')
canFamCoords = 'chr6:22125564-22132356'
#############################################
ID = 'CD151_POLR2L_Locus'
zoeyCoords = 'chr18:45320491-45333588'
canFamCoords = 'chr18:45185291-45205702'
#############################################
ID = 'CAMK2N1_Locus'
zoeyCoords = 'chr2:75797821-75804153'
canFamCoords = 'chr2:78402536-78420364'

In [51]:
# Scratch arithmetic: the result (78402536) equals the CanFam start coordinate
# listed for CAMK2N1_Locus -- presumably how that start was derived (confirm).
78362536+40000

78402536

In [53]:
# Build the miropeats commands for ONE locus whose Zoey and CanFam coordinates
# are both given by hand (no BLAT search for the matching region).
miropeatsDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/'

#***** CHANGE *****
ID = 'CD151_POLR2L_Locus'
zoeyCoords = 'chr18:45315491-45333588'
canFamCoords = 'chr18:45185291-45205702'

# Alternative locus (disabled):
# ID = 'CAMK2N1_Locus'
# zoeyCoords = 'chr2:75797821-75804153'
# canFamCoords = 'chr2:78402536-78420364'

print('\n#### %s' % ID)

#Create directory and commands file
inDir = create_directory(miropeatsDir,ID)
cmdsFile = create_commands_file(inDir,ID)

# try/finally makes sure the commands file is closed even if a step aborts.
try:
    #MAKE BEDFILES FOR EACH
    # Sample names are passed explicitly: make_bedfile takes the two
    # coordinate strings AND the two sample IDs.
    make_bedfile(inDir,ID,zoeyCoords,canFamCoords,'Zoey','CanFam')

    ##############
    ###  ZOEY  ###
    ##############
    name = 'Zoey'
    print('\n#%s' % name)

    #Get coordinates from zoey bedfile (padded by 1000 bp on each side)
    chrom,start,end = read_bedfile(inDir,ID,name,1000)

    ##EXTRACT FASTA
    outFasta = extract_fasta(inDir,name,ID,chrom,start,end,cmdsFile)

    ##REPEAT MASK FASTA
    write_repeat_commands(outFasta,cmdsFile)

    ## INTERSECT WITH EXONS
    do_exon_intersects(inDir,ID,name,cmdsFile)
    get_gene_symbols(cmdsFile,inDir,ID,name)


    ##############
    ### CANFAM ###
    ##############
    name = 'CanFam'
    print('\n#%s' % name)

    #Get coordinates from CANFAM bedfile (padded by 1000 bp on each side)
    CFchrom,CFstart,CFend = read_bedfile(inDir,ID,name,1000)

    ##EXTRACT FASTA
    outFasta = extract_fasta(inDir,name,ID,CFchrom,CFstart,CFend,cmdsFile)

    ##REPEAT MASK FASTA
    write_repeat_commands(outFasta,cmdsFile)

    ## INTERSECT WITH EXONS
    do_exon_intersects(inDir,ID,name,cmdsFile)


    #################
    ### MIROPEATS ###
    #################
    print('\n#MIROPEATS')

    #Run first miropeats + blat (reads the globals inDir and ID set above)
    run_miropeats_script(cmdsFile)

    #Call gaps
    call_gaps(cmdsFile)

    #Run miropeats annotate script (uses the padded Zoey coordinates)
    write_miropeats_annotate(miropeatsDir,inDir,ID,chrom,start,end)
finally:
    cmdsFile.close()


#### CD151_POLR2L_Locus
All commands for processing of ROI CD151_POLR2L_Locus written to: /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/input/CD151_POLR2L_Locus/MiropeatsCommands_CD151_POLR2L_Locus.cmds


#Zoey
Coordinates for ROI Zoey_CD151_POLR2L_Locus: chr18:45315491-45333588
Extended coordinates are chr18:45314491-45334588
bedtools intersect -wo -a /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/input/CD151_POLR2L_Locus/Zoey_CD151_POLR2L_Locus.bed -b /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/FINAL_GENES/Zoey_FinalGenes_exons.bed> /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/input/CD151_POLR2L_Locus/exons_Zoey.exons

#CanFam
Coordinates for ROI CanFam_CD151_POLR2L_Locus: chr18:45185291-45205702
Extended coordinates are chr18:45184291-45206702
bedtools intersect -wo -a /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/input/CD151_POLR2L_Locus/CanFam_CD151_