# 2017-10-12
# A. Pendleton
# Patterns of the non-reference and reference cfERV insertions

In [1]:
#Standard-library modules plus NumPy used throughout this notebook (no IPython magic is actually invoked in this cell)
import os
import sys
import numpy as np
import subprocess

def count_lines(f):
    """Return the number of lines in the text file located at path ``f``."""
    with open(f, 'r') as handle:
        # Iterating a text file yields one item per line, so counting the items counts the lines.
        return sum(1 for _ in handle)
def runCMD(cmd):
    """Run ``cmd`` through the shell and wait for it to finish.

    Returns None when the command exits with status 0. On any other exit status the
    message 'command failed' and the command text are printed and the interpreter is
    terminated via ``sys.exit(1)``.
    """
    exitStatus = subprocess.Popen(cmd, shell=True).wait()
    if exitStatus != 0:
        print ('command failed')
        print (cmd)
        sys.exit(1)

In [2]:
# Project root directory, followed by the two insertion-site bed files
# (non-reference first, then reference) that the cells below iterate over.
ervDir = '/home/ampend/links/kidd-lab/ampend-projects/cfERVs/'
bedfiles = [
    ervDir + 'input/cfERV_NonReferenceInsertionSites_2017-06-29.bed',
    ervDir + 'input/cfERV_ReferenceInsertionSites_2017-06-29.bed',
]

In [3]:
# Load each insertion bed file, report how many lines it holds, and index its
# records by insertion ID (column 4 of the bed file).
for File in bedfiles:
    print('#Reading in coordinates from the following ERV insertion site bed file:\n', File)
    f = File
    lineCount = count_lines(f)
    print('#There are a total of %i cfERV insertions in this bed file\n' % (lineCount))

    # The insertion type depends only on the file name, so it is worked out once
    # per file instead of once per row.
    if 'NonReference' in File:
        insertionType = 'NonReference'
    else:
        insertionType = 'Reference'

    # NOTE(review): ervDict is re-created for every file, so after the loop it only
    # holds the records of the last bed file -- confirm whether it was meant to
    # accumulate across both files.
    ervDict = {}

    # The context manager closes the file even if a malformed row raises below
    # (previously the handle leaked on any int()/index error).
    with open(f, 'r') as bedFile:
        for line in bedFile:
            line = line.rstrip()
            line = line.split('\t')
            chrom = line[0]
            start = int(line[1])
            end = int(line[2])
            ID = line[3]
            # Slots 4-7 are the literal placeholder strings 'Genes', 'Olfactory', 'FST', 'VST'.
            ervDict[ID] = [chrom,start,end,ID,'Genes','Olfactory','FST','VST',insertionType]

#Reading in coordinates from the following ERV insertion site bed file:
 /home/ampend/links/kidd-lab/ampend-projects/cfERVs/input/cfERV_NonReferenceInsertionSites_2017-06-29.bed
#There are a total of 59 cfERV insertions in this bed file

#Reading in coordinates from the following ERV insertion site bed file:
 /home/ampend/links/kidd-lab/ampend-projects/cfERVs/input/cfERV_ReferenceInsertionSites_2017-06-29.bed
#There are a total of 107 cfERV insertions in this bed file



In [33]:
def do_intersects(a, b, outfile):
    """Run ``bedtools intersect -wo`` on bed files ``a`` and ``b`` and parse the result.

    The command's stdout is redirected into ``outfile``; that file is then handed to
    process_intersect() and the resulting dictionary is returned.
    """
    runCMD('bedtools intersect -wo -a %s -b %s > %s' % (a, b, outfile))
    return process_intersect(outfile)
########################################################################################
def process_intersect(outfile):
    """Group the hits of a bedtools output file by query ID.

    Each line is split on whitespace; field 4 (index 3) is taken as the query ID and
    field 8 (index 7) as the ID of the feature that was hit.

    Parameters:
        outfile: path of the bedtools result file to read.

    Returns:
        A plain dict mapping each query ID to the list of hit IDs, in file order
        (duplicates are kept).

    Raises:
        IndexError: if a non-blank line has fewer than eight fields.
    """
    Dict = {}
    # The context manager releases the file handle deterministically; the original
    # iterated over a bare open() call and never closed it.
    with open(outfile, 'r') as intersectFile:
        for line in intersectFile:
            fields = line.rstrip().split()
            if not fields:
                # Blank lines carry no record; skipping them avoids an unpacking error.
                continue
            ID = fields[3]
            hitID = fields[7]
            Dict.setdefault(ID, []).append(hitID)
    return Dict
########################################################################################
def do_window_intersects(a,b,windowSize,outfile):
    """Run ``bedtools window`` with ``-w windowSize`` on bed files ``a`` and ``b``.

    The command's stdout is redirected into ``outfile``. Nothing is returned; callers
    parse ``outfile`` themselves.
    """
    template = 'bedtools window -w %i -a %s -b %s > %s'
    runCMD(template % (windowSize, a, b, outfile))
##############################################################################################################
def make_ensG_dict(ervfile, numWindows=6):
    """Create an empty per-window hit table for every record of a bed file.

    Parameters:
        ervfile: path of the bed file; field 4 (index 3, whitespace-delimited) of each
            line is used as the key.
        numWindows: number of empty hit lists to create per key. Defaults to 6, the
            value that was previously hard-coded.

    Returns:
        A dict mapping each ID to a list of ``numWindows`` independent empty lists.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
    """
    ensGDict = {}
    # Context manager so the handle is closed promptly (the original never closed it).
    with open(ervfile, 'r') as bedFile:
        for line in bedFile:
            fields = line.rstrip().split()
            if not fields:
                # Ignore blank lines instead of failing on the missing ID column.
                continue
            winID = fields[3]
            # A comprehension gives each slot its own list object, so appending to one
            # window never leaks into another.
            ensGDict[winID] = [[] for _ in range(numWindows)]
    return ensGDict
##############################################################################################################
def parse_gene_intersect_file(Dict,intfile,windowLength,index):
    """Record the distinct hit IDs of one window-intersect file in ``Dict``.

    Each tab-delimited line supplies a window ID (field 4, index 3) and a hit name
    (field 8, index 7). When the hit name contains underscores, the third
    underscore-separated part is used as the ID; otherwise the whole name is used.
    The ID is appended to ``Dict[winID][index]`` unless it is already present there.

    Parameters:
        Dict: table as produced by make_ensG_dict(); it is modified in place.
        intfile: path of the intersect output file to read.
        windowLength: unused; kept so existing callers keep working.
        index: which per-window slot of ``Dict[winID]`` receives the hits.

    Returns:
        The same ``Dict`` object that was passed in.

    Raises:
        KeyError: if a window ID in the file is not a key of ``Dict``.
        IndexError: if a line has fewer than eight fields, or an underscore-containing
            hit name has fewer than three underscore-separated parts.
    """
    # Context manager closes the file even if a malformed line raises.
    with open(intfile, 'r') as intersectFile:
        for line in intersectFile:
            line = line.rstrip().split('\t')
            winID = line[3]
            hitName = line[7]
            if '_' in hitName:
                # NOTE(review): presumably names look like <protein>_<gene>_<ensembl gene ID>;
                # only the third part is kept -- confirm against the gene bed file.
                ensGID = hitName.split('_')[2]
            else:
                ensGID = hitName
            # Fix: the original tested `Dict not in ...`, which is always true, so every
            # hit was appended and duplicates were never filtered out.
            if ensGID not in Dict[winID][index]:
                Dict[winID][index].append(ensGID)

    return Dict


In [52]:
def _fill_window_columns(row, hitsByWindow, firstColumn, numWindows):
    """Write each non-empty per-window hit list into consecutive columns of ``row``."""
    for num in range(numWindows):
        hits = hitsByWindow[num]
        if len(hits) > 0:
            # Join the IDs of this window size into one comma-separated cell.
            row[firstColumn + num] = ", ".join(map(str, hits))


def making_summary_text_files(ervDir,Type,ervfile,ensGDict,otherDict,estDict,vcdrDict,xpclrDict,axDict,cdrDict=None):
    """Write the per-insertion summary table of gene, EST and selection-scan intersects.

    One row is written per record of ``ervfile`` (sorted by ID) to
    ``ervDir + 'results/' + Type + '_cfERV_WithFSTIntersects_GeneTable.txt'``, preceded
    by a header line. Row layout (0-based):
        0-3   chrom, start, end, ID (first four tab-delimited fields of the bed line)
        4     ``Type``
        5-10  Ensembl gene hits, one column per slot of ``ensGDict[ID]``
        11-16 dog EST hits, one column per slot of ``estDict[ID]``
        17    "Other" RefSeq hits (``otherDict``)
        18    FST CDR hits (``cdrDict``)
        19    XP-CLR CDR hits (``xpclrDict``)
        20    VST VCDR hits (``vcdrDict``)
        21    Axelsson CDR hits (``axDict``)
    A column with no hits keeps the value False, which is written as the text 'False'.

    Parameters:
        ervDir: directory prefix (string-concatenated, so it should end with '/').
        Type: label written in column 4 and used in the output file name.
        ervfile: path of the insertion bed file.
        ensGDict, estDict: dicts mapping ID -> list of six hit lists.
        otherDict, vcdrDict, xpclrDict, axDict: dicts mapping ID -> list of hit IDs.
        cdrDict: dict mapping ID -> list of FST CDR hit IDs. Optional for backward
            compatibility: when omitted, a module-level variable named ``cdrDict`` is
            used if one exists (the original body read that global implicitly).

    Returns:
        None. The table is written to disk.
    """
    if cdrDict is None:
        # Legacy path: fall back to the notebook-level global the original relied on.
        cdrDict = globals().get('cdrDict', {})

    numWindows = 6  # number of window sizes stored per ID in ensGDict / estDict
    # (column, lookup) pairs for the tracks that occupy a single column each.
    singleColumnTracks = (
        (17, otherDict),
        (18, cdrDict),
        (19, xpclrDict),
        (20, vcdrDict),
        (21, axDict),
    )

    allDict = {}  # ID -> output row
    with open(ervfile, 'r') as inFile:
        for line in inFile:
            line = line.rstrip()
            if not line:
                # Blank lines hold no insertion record.
                continue
            chrom, start_pos, end_pos, WinID = line.split('\t')[0:4]
            # Columns 5-21 default to False until a hit is found.
            row = [chrom, start_pos, end_pos, WinID, Type] + [False] * 17

            if WinID in ensGDict:
                _fill_window_columns(row, ensGDict[WinID], 5, numWindows)
            if WinID in estDict:
                # Fix: the original consulted ensGDict here (copy/paste slip) while
                # filling the EST columns; only estDict is relevant.
                _fill_window_columns(row, estDict[WinID], 11, numWindows)

            for column, hitsByID in singleColumnTracks:
                if WinID in hitsByID:
                    row[column] = ", ".join(map(str, hitsByID[WinID]))

            allDict[WinID] = row

    Outfile = ervDir + 'results/' + Type + '_cfERV_WithFSTIntersects_GeneTable.txt'
    with open(Outfile, 'w') as outFile:
        outFile.write('Chrom\tStart\tEnd\tWindow ID\tERV Type\t0kb Intersecting Ensembl Genes\t5kb Intersecting Ensembl Genes\t10kb Intersecting EnsemblGenes\t25kb Intersecting Ensembl Genes\t50kb  Intersecting Ensembl Genes\t100kb  Intersecting Ensembl Genes\t0kb Intersecting Dog ESTs\t5kb Intersecting Dog ESTs\t10kb Intersecting Dog ESTs\t25kb Intersecting Dog ESTs\t50kb Intersecting Dog ESTs\t100kb Intersecting Dog ESTs\t0kb Intersecting "Other" RefSeq Genes\t50kb Intersecting FST CDR\t50kb Intersecting XP-CLR CDR\t50kb Intersecting VST VCDR\t50kb Intersecting Axelsson CDR\n')
        # Rows are written in sorted ID order.
        for keys in sorted(allDict.keys()):
            outFile.write("\t".join(map(str,allDict[keys])))
            outFile.write("\n")

In [53]:
# Driver: for each insertion bed file, run every bedtools comparison, collect the hits
# into dictionaries, and write one summary table per insertion type.
# NOTE: making_summary_text_files() is called without the FST CDR hits as an argument;
# its body refers to a variable named cdrDict, which is the notebook-level variable
# assigned in this loop. Keep that name and assign it before the summary call.
for ervfile in bedfiles:
    # The insertion type is derived from the bed file name.
    if 'NonReference' in ervfile:
        Type = 'NonReference'
    else:
        Type = 'Reference'
    print(ervfile)
    #ervfile = '/home/ampend/links/kidd-lab/ampend-projects/cfERVs/input/cfERV_NonReferenceInsertionSites_2017-06-29.bed'
    # Rebinds the notebook-level ervDir to the Insertion_Assessment directory; all
    # result files below are written under its 'results/' subdirectory.
    ervDir = '/home/ampend/links/kidd-lab/ampend-projects/cfERVs/Insertion_Assessment/'

    ###Selective Sweep Intersects
    # All four selection-scan comparisons use a 50,000 bp bedtools window.
    windowSize = int(50000)
    #Get FST CDR intersects -- save to cdrDict
    cdrfile = '/home/ampend/links/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/FINAL_results/MergedOverlapping_Sliding_TotalSIGFstCalls_AutoXPar_54callset.bed'
    outfile = ervDir + 'results/' + '%ibp_Intersect_%sERVs_with_CDRs_SimulationThreshold.txt' % (windowSize, Type)
    do_window_intersects(ervfile,cdrfile,int(windowSize),outfile)
    cdrDict = process_intersect(outfile)
    #Get VCDR intersects -- save to vcdrDict
    vcdrfile = '/home/ampend/links/kidd-lab/ampend-projects/Angela/Feichen_VST/FINAL_FILTERED_VCDR_SET/Filtered_VCDR.bed'
    outfile = ervDir + 'results/' + '%ibp_Intersect_%s_with_VCDRs.txt'% (windowSize, Type)
    do_window_intersects(ervfile,vcdrfile,int(windowSize),outfile)
    vcdrDict = process_intersect(outfile)
    #Get XP-CLR intersects -- save to xpclrDict
    xpclrfile = '/home/ampend/links/kidd-lab/ampend-projects/Angela/XP-CLR/results/Campbell_No-MAF-Filter/50kbWindow_2kbGrid/Dogs_v_Wolves/SignificantWindows/Significant_3Windows_pct99_AverageXP-CLRScore.4.bed'
    outfile = ervDir + 'results/' + '%ibp_Intersect_%sERVs_with_XPCLR_SimulationThreshold.txt'% (windowSize, Type)
    do_window_intersects(ervfile,xpclrfile,int(windowSize),outfile)
    xpclrDict = process_intersect(outfile)
    #Get Axelsson loci intersects -- save to axDict
    axelssonBedFile = '/home/ampend/links/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/Axelsson_CaganBlass/input/Axelsson_canfam3.1.bed'
    outfile = ervDir + 'results/' + '%ibp_Intersect_%sERVs_with_AxelssonCDRs_SimulationThreshold.txt'% (windowSize, Type)
    do_window_intersects(ervfile,axelssonBedFile,int(windowSize),outfile)
    axDict = process_intersect(outfile)

    #Get ensembl gene intersects within window ranges -- save to ensGDict
    # Window sizes are kept as strings for the file names and converted with int() for
    # bedtools; index selects the matching slot (0-5) in each per-insertion list.
    windowSizes = ['0','5000','10000','25000','50000','100000']
    index = 0
    ensGDict = make_ensG_dict(ervfile)
    for windowSize in windowSizes:
        genebedfile = '/home/ampend/links/kidd-lab/ampend-projects/BLAST2GO/results/BLAST2GO_Ensembl81_GeneTables_WithEnscafIDsAndChrom.bed'
        outfile = ervDir + 'results/' + str(windowSize) + 'bpWindowIntersect_%sERVs_with_EnsemblGenes.txt'% (Type)
        do_window_intersects(ervfile,genebedfile,int(windowSize),outfile)
        ensGDict = parse_gene_intersect_file(ensGDict,outfile,windowSize,index)
        index+=1

    #Intersecting and Parsing results with TransMap EST UCSC Track
    # Same six window sizes as above, stored in a fresh table (estDict).
    index = 0
    estDict = make_ensG_dict(ervfile)
    for windowSize in windowSizes:
        dogESTBedFile = '/home/ampend/links/kidd-lab/ampend-projects/cfERVs/GeneIntersects/dog_allESTs_CF3.bed'
        outfile = ervDir + 'results/' + str(windowSize) + 'bpWindowIntersect_%sERVs_with_UCSCDogESTs_SimulationThreshold.txt'% (Type)
        do_window_intersects(ervfile,dogESTBedFile,int(windowSize),outfile)
        estDict = parse_gene_intersect_file(estDict,outfile,windowSize,index)
        index+=1
        
    #5. Intersecting and Parsing results with "Other" RefSeq UCSC Track
    # Uses bedtools intersect via do_intersects() rather than a flanking window.
    otherRefSeqBedFile = '/home/ampend/links/kidd-lab/ampend-projects/cfERVs/GeneIntersects/other_refseq.bed'
    outfile = ervDir + 'results/' + 'Intersect_%sERVs_with_OtherRefseqGenes_SimulationThreshold.txt'% (Type)
    otherDict = do_intersects(ervfile, otherRefSeqBedFile, outfile)
    
    #MAKING SUMMARY FILE 
    making_summary_text_files(ervDir,Type,ervfile,ensGDict,otherDict,estDict,vcdrDict,xpclrDict,axDict)
print('DONE!')
    

/home/ampend/links/kidd-lab/ampend-projects/cfERVs/input/cfERV_NonReferenceInsertionSites_2017-06-29.bed
/home/ampend/links/kidd-lab/ampend-projects/cfERVs/Insertion_Assessment/results/0bpWindowIntersect_NonReferenceERVs_with_UCSCDogESTs_SimulationThreshold.txt
/home/ampend/links/kidd-lab/ampend-projects/cfERVs/Insertion_Assessment/results/5000bpWindowIntersect_NonReferenceERVs_with_UCSCDogESTs_SimulationThreshold.txt
/home/ampend/links/kidd-lab/ampend-projects/cfERVs/Insertion_Assessment/results/10000bpWindowIntersect_NonReferenceERVs_with_UCSCDogESTs_SimulationThreshold.txt
/home/ampend/links/kidd-lab/ampend-projects/cfERVs/Insertion_Assessment/results/25000bpWindowIntersect_NonReferenceERVs_with_UCSCDogESTs_SimulationThreshold.txt
/home/ampend/links/kidd-lab/ampend-projects/cfERVs/Insertion_Assessment/results/50000bpWindowIntersect_NonReferenceERVs_with_UCSCDogESTs_SimulationThreshold.txt
/home/ampend/links/kidd-lab/ampend-projects/cfERVs/Insertion_Assessment/results/100000bpWindowI