In [1]:
# 2018-10-10
# A. Pendleton
# Generation of miropeats images for any bed file as input

In [2]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
import sys
import numpy as np
import matplotlib.patches as patches
import gzip
import fileinput
import glob
from scipy import stats
import re
from matplotlib_venn import venn3, venn3_circles
from collections import OrderedDict


def count_lines(f):
    lineCount = 0
    with open(f, 'r') as f:
        for line in f:
            lineCount += 1
        return lineCount
def runCMD(cmd):
    val = subprocess.Popen(cmd, shell=True).wait()
    if val == 0:
        pass
    else:
        print ('command failed')
        print (cmd)
        sys.exit(1)
# TO REMOVE TOP AND RIGHT AXIS OF PLOTS
def simpleaxis(ax):
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
#Write PBS Script
def write_pbs_file(wkDir,cmdsFile,jobName, mem, jobCount):
    #write PBS file
    if '/scripts/' in wkDir:
        wkDir = wkDir.replace('/scripts/','')
    pbsFile = open(cmdsFile.replace('.cmds','.pbs'),'w')
    print('Writing pbs file: %s'% pbsFile)
    pbsFile.write('#!/bin/bash\n')
    pbsFile.write('#PBS -S /bin/bash\n')
    pbsFile.write('#PBS -V\n')
    pbsFile.write('#PBS -M ampend@med.umich.edu\n')
    pbsFile.write('#PBS -j oe\n')
    pbsFile.write('#PBS -N %s\n' % jobName)
    pbsFile.write('#PBS -o %s\n' % (wkDir + 'logs/'))
    pbsFile.write('#PBS -l pmem=%iG\n' % mem)
    pbsFile.write('#PBS -l nodes=1:ppn=1,qos=flux,walltime=100:00:00\n')
    pbsFile.write('##PBS -A medbsm_flux\n')
    pbsFile.write('##PBS -q flux\n')
    pbsFile.write('#PBS -A jmkidd_fluxod\n')
    pbsFile.write('#PBS -q fluxod\n')
    pbsFile.write('#PBS -t 1-%s\n' % jobCount)
    pbsFile.write('cd %s\n' % (wkDir))
    pbsFile.write('/home/ampend/links/kidd-lab/jmkidd-projects/scripts/perlUtils/run-by-id-log.pl %s %sBLAT_commands.logs $PBS_ARRAYID' % (cmdsFile,cmdsFile.replace('/scripts/','/logs/')))
    pbsFile.close()

In [4]:
###############################################################################
#changes based on PS_template_April2010 value
def display_edit_setup(line):
    if '%%DocumentFonts: Helvetica-Bold Helvetica' in line:
        line = '%%DocumentFonts: Arial-Bold Arial\n'

    if '%%BoundingBox:' in line:
        line = '%%BoundingBox: 0 0 612 792\n'
    if '/repwidth' in line:
        line = '/repwidth 0.25 cm def\n'
    if '/linkwidth' in line:
        line = '/linkwidth 0 def\n'
    if '/gapwidth' in line:
        line = '/gapwidth 0 def\n'
    if '/leftmargin' in line:
        line = '/leftmargin 0.5 cm def\n'
    if '/bottommargin' in line:
        line = '/bottommargin 2 cm def\n'
    if '/pageheight' in line:
        line = '/pageheight 26 cm def\n'

    if '/Helvetica-Bold findfont' in line:
        line = '/Arial-Bold findfont 7 scalefont setfont\n\n'

    if '/Helvetica findfont' in line:
        line = '/Arial findfont 7 scalefont setfont\n'
    if '0 pageheight 2 cm sub moveto' in line:
        line = '0 pageheight 1.5 cm sub moveto\n'
    if '/graphicmargin 17.5' in line:
        line = '/graphicmargin 17.5 cm def\n'

    if '(Longest Sequence ' in line:
        line = '\n'    
    if '( Threshold Score ' in line:
        line = '\n'    

    if 'curveto' in line:
        line= line.replace('curveto','lineto')
    return line
###############################################################################
def read_rm_file(rmFileName):
    rmLines = []
    inFile = open(rmFileName,'r')
    for line in inFile:
        if line == '\n':
            continue
        line = line.rstrip()
        line = line.split()
        if line[0] == 'There':
            return []
        if line[0] == 'SW':
            continue
        if line[0] == 'score':
            continue
        rmLines.append(line)
    inFile.close()
    return rmLines
###############################################################################
def repeat_class_to_name(r):
    if 'SINE' in r:
        return 'SINE'
    if 'ARTEFACT' in r:
        return 'ARTEFACT'
    if 'DNA' in r:
        return 'DNA'
    if 'LINE' in r:
        return 'LINE'
    if 'Low_complexity' in r:
        return 'Low_complexity'
    if 'LTR' in r:
        return 'LTR'
    if 'Other' in r:
        return 'Other'
    if 'rRNA' in r:
        return 'rRNA'
    if 'scRNA' in r:
        return 'scRNA'
    if 'snRNA' in r:
        return 'snRNA'
    if 'srpRNA' in r:
        return 'srpRNA'
    if 'tRNA' in r:
        return 'tRNA'
    if 'RNA' in r:
        return 'RNA'
    if 'Satellite' in r:
        return 'Satellite'
    if 'Simple_repeat' in r:
        return 'Simple_repeat'
    if 'Unknown' in r:
        return 'Unknown'
    if 'Retroposon' in r:
        return 'SINE'    
    print ('repeat class unknown for',r)
    return 'Unknown'
###############################################################################
def sort_and_merge_repeats(repeats):
    repeats.sort()
    newRep = []
    for r in repeats:
        s = r[0]
        e = r[1]
        orientation = r[2]
        repClass = r[3]
        if len(newRep) == 0:
            newRep.append(r)
        else:
            lr = newRep[-1]
            ls = lr[0]
            le = lr[1]
            lorient = lr[2]
            lrepClass = lr[3]
            # overlap, need to extend
            if le > s and lrepClass == repClass and lorient == orientation:
                n = [ls,e,orientation,repClass]
                newRep[-1] = n
            else:
                newRep.append(r)
    return newRep    
###############################################################################
def process_repeat_file(fn):
    repeatLines = read_rm_file(fn)
    repeats = []
    for R in repeatLines:
        s = int(R[5])
        e = int(R[6])
        orientation = R[8]

        repClass = R[10]
        if orientation == 'C':
            orientation = '-'
        repClass = repeat_class_to_name(repClass)
        reps = [s,e,orientation,repClass]
        repeats.append(reps)
    repeats = sort_and_merge_repeats(repeats)
    return repeats
###############################################################################
def process_gap_file(fn):
    gapLines = []
    inFile = open(fn,'r')
    for line in inFile:
        line = line.rstrip()
        line = line.split()
        s = int(line[1])
        e = int(line[2])
        gapLines.append([s,e])
    inFile.close()
    return gapLines
###############################################################################
def process_exon_file(fn):
    exonLines = []
    inFile = open(fn,'r')
    for line in inFile:
        line = line.rstrip()
        line = line.split()
        s = int(line[1])
        e = int(line[2])
        name = line[3]
        exonLines.append([name,s,e])
        exonLines.sort()
    inFile.close()
    return exonLines  
###############################################################################
def process_primer_file(fn):
    primerLines = []
    inFile = open(fn,'r')
    for line in inFile:
        line = line.rstrip()
        line = line.split()
        s = int(line[1])
        e = int(line[2])
        dir = line[3]
        primerLines.append([s,e,dir])
    inFile.close()
    return primerLines
###############################################################################

###############################################################################
USAGE = """
python annotate-miropeats-3seqs-AP.py    
--miroin <in miropeat ps file>  
--topRM <RM of top> 
--bottomRM <RM of bottom>
--topName <name of top> 
--bottomName <name of bottom>
--blat <blat output of canFam vs fosmid, blast9 output format>
"""

parser = OptionParser(USAGE)
parser.add_option('--miroin',dest='miropeatsInput',help='input file of miropeats ps file')
parser.add_option('--topRM',dest='topRM',help='repeat mask out file for top sequence')
parser.add_option('--bottomRM',dest='bottomRM',help='repeat mask out file for bottom sequence')
parser.add_option('--topName',dest='topName',help='name of top seqence')
parser.add_option('--bottomName',dest='bottomName',help='name of bottom seqence')
parser.add_option('--blat',dest='blat',help='blat of canFam versus fosmid results, blast9 output format')

(options,args)=parser.parse_args()
if options.miropeatsInput is None:
    parser.error('miropeats input file name not given')
if options.topRM is None:
    parser.error('top RM input file name not given')
if options.bottomRM is None:
    parser.error('bottom RM input file name not given')
if options.topName is None:
    parser.error('top seq name not given')
if options.bottomName is None:
    parser.error('bottom seq name not given')
if options.blat is None:
    parser.error('blat output not given, blast9 output format')
    


In [8]:
##### Info files #####
PSTemplate = '/home/ampend/links/kidd-lab/jmkidd-projects/people-projects/jwilds-projects/RetroSeq-HGDP/align/scripts/PS_template_July2008_withlines.txt'

#### setup the repeat to color dictionary#####
color = {}
color['Other'] = 'Black'
color['Simple Repeat'] = 'DkGray'
color['Low Complexity'] = 'LtGray'
color['DNA'] = 'Pink'
color['LTR'] = 'Orange'
color['LINE'] = 'Green'
color['SINE'] = 'Purple'
color['GAP'] = 'Black' 
color['EXON'] = 'Blue'

#height of canfam and zoey sequence, in "Y" coordinates
#zoey is what is on the bottom
#ref_line=1.2
#fos_line=0.4
#####bottomLine = 0.3
bottomLine = .6 #0.3
#middleLine = 0.8 # ONLY FOR WHEN YOU HAVE A THREE-WAY ALIGNMENT
####topLine = 1.0
topLine = 1.3
# for direction...
#arrowEndBp = 1000
arrowEndBp = 100

In [None]:
# these are for coloring breakpoints... we will skip this for now...
chrm_breaks = {}
chrm_breaks['not do'] = 1
clone_breaks = {}
clone_breaks['not do'] = 1


In [9]:
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################

# Generate FASTA files for the regions of interest based off of bed coordinate file

### Get the coordinates of the Zoey ROIs

In [11]:
miropeatsDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/'

zoeyROIBedfile = miropeatsDir + 'input/' + 'Zoey_ROI.bed'

zoeyCoords = []

for line in open(zoeyROIBedfile,'r'):
    line=line.rstrip().split('\t')
    chrom,start,end,ID = line[0],int(line[1]),int(line[2]),line[3]
    zoeyCoords.append([chrom,start,end,ID])

print('%i Zoey ROIs read into array' % len(zoeyCoords))

1 Zoey ROIs read into array


### Generate FASTA files for each ROI in its own directory

In [44]:
#Conditionally create scripts directory
cmd = 'mkdir -p %s' % (miropeatsDir + 'scripts/')
runCMD(cmd)

#Cannot run repeatmasker within ipython notebooks so you'll have to run these yourself later
repeatMaskerCommandfile = miropeatsDir + 'scripts/' + 'RepeatMask_ZoeySeqs.cmds'
repeatMaskerCommandFile = open(repeatMaskerCommandfile,'w')


for roi in range(0,len(zoeyCoords)):
    chrom,start,end,ID = zoeyCoords[roi][0],int(zoeyCoords[roi][1]),int(zoeyCoords[roi][2]),zoeyCoords[roi][3]
    
    #EXTEND OUT 5KB FOR VIEWING
    start,end = start-5000, end+5000
    #Change in the array
    zoeyCoords[roi][1],zoeyCoords[roi][2] = start,end
    
    #conditionally make directory for each ROI, make it only if it doesn't already exist
    newDir = '%sinput/%s/' % (miropeatsDir,ID)
    cmd = 'mkdir -p %s ' % (newDir)
    runCMD(cmd)
    
    ##EXTRACT FASTA 
    zoeyGenomeFastaFile = '/home/ampend/links/kidd-lab/genomes/zoey/assemblies/2.3/ref/zoey.2.3.fa'
    print(zoeyGenomeFastaFile)
    
    outFasta = '%s%s.fa' % (newDir,ID)
    cmd = 'samtools faidx %s %s:%i-%i > %s' % (zoeyGenomeFastaFile,chrom,start,end,outFasta)
    #print(cmd)
    runCMD(cmd)
    
    ##REPEAT MASK FASTA
    cmd = 'RepeatMasker --species dog %s' % outFasta
    #print(cmd)
    repeatMaskerCommandFile.write('%s\n' % cmd)
    
    
    

/home/ampend/links/kidd-lab/genomes/zoey/assemblies/2.3/ref/zoey.2.3.fa


## ***** Run RepeatMasker Commands now *****

You'll need to do this on Flux


## BLAT Zoey Fasta against the CanFam3.1 genome

In [46]:
blatCommands = ''
for roi in range(0,len(zoeyCoords)):
    chrom,start,end,ID = zoeyCoords[roi][0],int(zoeyCoords[roi][1]),int(zoeyCoords[roi][2]),zoeyCoords[roi][3]
    print('\n#### %s' % ID)
    #conditionally make directory for each ROI, make it only if it doesn't already exist
    newDir = '%sinput/%s/' % (miropeatsDir,ID)
    
    ##Run BLAT against CanFam3.1 
    CanFamGenome2bitFile = '/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/canFam3.1.2bit'
    oocFile = '/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/11.ooc'
    print(CanFamGenome2bitFile)
    
    zoeyFasta = '%s%s.fa' % (newDir,ID)
    blatOutFile = newDir + 'BLAT_to_Canfam_' + ID + '.psl' 
    cmd = 'blat -out=psl -ooc=%s -tileSize=11 -noHead %s %s %s' % (oocFile,CanFamGenome2bitFile,zoeyFasta,blatOutFile)
    print(cmd)
    #runCMD(cmd)

    #PARSE BLAT HIT TO FIND BEST HIT
    bestHit,bestHitInfo = 0, [] 
    for line in open(blatOutFile,'r'):
        line=line.rstrip().split('\t')
        hitLength,zoeyLength = int(line[0]),int(line[10])
        propHit = hitLength/float(zoeyLength)
        if propHit > bestHit:
            bestHit = propHit
            bestHitInfo = line
    CFchrom,CFstart,CFend = line[13],int(line[15]),int(line[16])
    print('Best Hit: %s:%i-%i (%f%%)' % (CFchrom,CFstart,CFend,bestHit))
    
    
    #EXTEND OUT FOR ALIGNMENTS



#### CRABP1_Locus
/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/canFam3.1.2bit
blat -out=psl -ooc=/home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/11.ooc -tileSize=11 -noHead /home/ampend/links/kidd-lab/genomes/canFam3.1/canFam3.1-cat/canFam3.1.2bit /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/input/CRABP1_Locus/CRABP1_Locus.fa /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/miropeats/input/CRABP1_Locus/BLAT_to_Canfam_CRABP1_Locus.psl
Best Hit: chr1:87290321-87290504 (0.941241%)


In [6]:
bottomName = options.bottomName
miropeatsInFile = options.miropeatsInput
topRepeatMaskFile = options.topRM
bottomRepeatMaskFile = options.bottomRM
miropeatsOutFile = miropeatsInFile + '.annotated.ps'

inFile = open(miropeatsInFile,'r')
outFile = open(miropeatsOutFile,'w')
