# 2017-10-12
# A. Pendleton
# Patterns of the non-reference and reference cfERV insertions

In [58]:
#this uses iPython magic to make plots appear inline
import os
import sys
import numpy as np
import subprocess

def count_lines(f):
    lineCount = 0
    with open(f, 'r') as f:
        for line in f:
            lineCount += 1
        return lineCount
def runCMD(cmd):
    val = subprocess.Popen(cmd, shell=True).wait()
    if val == 0:
        pass
    else:
        print ('command failed')
        print (cmd)
        sys.exit(1)

In [2]:
#Root directory and the bedfiles for the non-reference and reference insertions
ervDir = '/home/ampend/links/kidd-lab/ampend-projects/cfERVs/'
bedfiles = [ervDir + 'input/cfERV_NonReferenceInsertionSites_2017-06-29.bed',ervDir + 'input/cfERV_ReferenceInsertionSites_2017-06-29.bed']

In [3]:
for File in bedfiles:
    print('#Reading in coordinates from the following ERV insertion site bed file:\n', File)
    f = File
    lineCount = count_lines(f)
    print('#There are a total of %i cfERV insertions in this bed file\n' % (lineCount))
    bedFile = open(f, 'r')

    ervDict = {}

    for line in bedFile:
        line = line.rstrip()
        line = line.split('\t')
        chrom = line[0]
        start = int(line[1])
        end = int(line[2])
        ID = line[3]
        if 'NonReference' in File:
            insertionType = 'NonReference'
        else:
            insertionType = 'Reference'
        ervDict[ID] = [chrom,start,end,ID,'Genes','Olfactory','FST','VST',insertionType]
    bedFile.close()

#Reading in coordinates from the following ERV insertion site bed file:
 /home/ampend/links/kidd-lab/ampend-projects/cfERVs/input/cfERV_NonReferenceInsertionSites_2017-06-29.bed
#There are a total of 59 cfERV insertions in this bed file

#Reading in coordinates from the following ERV insertion site bed file:
 /home/ampend/links/kidd-lab/ampend-projects/cfERVs/input/cfERV_ReferenceInsertionSites_2017-06-29.bed
#There are a total of 107 cfERV insertions in this bed file



In [4]:
def do_intersects(a, b, outfile):
    cmd = 'bedtools intersect -wo -a %s -b %s > %s' % (a, b, outfile)
    #print(cmd)
    runCMD(cmd)
    Dict = process_intersect(outfile)
    return Dict
########################################################################################
def process_intersect(outfile):
    Dict={}
    for line in open(outfile,'r'):
        line=line.rstrip().split()
        chrom,start,end,ID = line[0:4]
        hitID = line[7]
        if ID in Dict.keys():
            Dict[ID].append(hitID)
        else:
            Dict[ID] = []
            Dict[ID].append(hitID)
    return Dict
########################################################################################
def do_window_intersects(a,b,windowSize,outfile):
    cmd = 'bedtools window -w %i -a %s -b %s > %s' % (windowSize, a, b, outfile)
    #print(cmd)
    runCMD(cmd)
##############################################################################################################
def make_ensG_dict(ervfile):
    ensGDict = {}
    for line in open(ervfile,'r'):
        line = line.rstrip().split()
        winID = line[3]
        ensGDict[winID] = [[],[],[],[],[],[]]  
    return ensGDict
##############################################################################################################
def parse_gene_intersect_file(Dict,intfile,windowLength,index):
    intersectFile = open(intfile,'r') #outfile from the intersect step with ensembl 81 genes

    ensID = ''
    sigGenes = [] #To keep track of all gene IDs (ENSCAFG) that are in sig windows

    #Reading the intersect file results
    for line in intersectFile:
        line = line.rstrip() #removing extraneous whitespace characters
        line = line.split('\t') #delimiting "columns" in the file based on tabs		
        winID = line[3]
        if '_' in line[7]:
            hit = line[7].split('_')
            Protein = hit[0]
            Gene = hit[1]
            b2goID = Gene + '|' + Protein
            ensGID = hit[2]
        else:
            ensGID = line[7]
        if Dict not in Dict[winID][index]:
            Dict[winID][index].append(ensGID)
    intersectFile.close()
   
    return Dict


In [5]:
def making_summary_text_files(ervDir,Type,ervfile,ensGDict,otherDict,estDict,vcdrDict,xpclrDict,axDict):
    inFile = open(ervfile, 'r')

    ############################################################################
    # SUMMARIZING ERV INSERTIONS WITH ENSEMBL GENES AND FST REGIONS
    ##########################################################################
    Outfile = ervDir + 'results/' + Type + '_cfERV_WithFSTIntersects_GeneTable.txt' #Write out the VST Summary data table here
    outFile = open(Outfile, 'w')
    outFile.write('Chrom\tStart\tEnd\tWindow ID\tERV Type\t0kb Intersecting Ensembl Genes\t5kb Intersecting Ensembl Genes\t10kb Intersecting EnsemblGenes\t25kb Intersecting Ensembl Genes\t50kb  Intersecting Ensembl Genes\t100kb  Intersecting Ensembl Genes\t0kb Intersecting Dog ESTs\t5kb Intersecting Dog ESTs\t10kb Intersecting Dog ESTs\t25kb Intersecting Dog ESTs\t50kb Intersecting Dog ESTs\t100kb Intersecting Dog ESTs\t0kb Intersecting "Other" RefSeq Genes\t50kb Intersecting FST CDR\t50kb Intersecting XP-CLR CDR\t50kb Intersecting VST VCDR\t50kb Intersecting Axelsson CDR\n')

    allDict = {} # defining the dictionary that will have all the results of bedtool intersect compiled by windows

    for line in inFile:
        line = line.rstrip()
        line = line.split('\t')
        chrom, start_pos, end_pos, WinID = line[0:4]
        allDict[WinID] = [chrom,start_pos,end_pos,WinID,Type,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False] #setting up empty dictionary
        #Open up dictionary for ERVs
        #0=chr, 1=start_pos, 2=end_pos, 3=ERV ID 4=ERV Type (reference/nonreference)
        #5=Intersects with Ensembl Gene IDs (default = False) at 0kb
        #6=Intersects with Ensembl Gene IDs (default = False) at 5kb
        #7=Intersects with Ensembl Gene IDs (default = False) at 10kb
        #8=Intersects with Ensembl Gene IDs (default = False) at 25kb
        #9=Intersects with Ensembl Gene IDs (default = False) at 50kb
        #10=Intersects with Ensembl Gene IDs (default = False) at 100kb
        #11 = Intersects Dog EST (from UCSC) at 0kb
        #12 = Intersects Dog EST (from UCSC) at 5kb
        #13 = Intersects Dog EST (from UCSC) at 10kb
        #14 = Intersects Dog EST (from UCSC) at 25kb
        #15 = Intersects Dog EST (from UCSC) at 50kb
        #16 = Intersects Dog EST (from UCSC) at 100kb
        #17 = Intersects "Other" Refseq Genes (from UCSC) at 0kb
        #18=Intersects with an FST CDR (default = False)
        #19=Intersects with an XP-CLR CDR (default = False)
        #20=Intersects with an VST VCDR (default = False) 
        #21=Intersects with an Axelsson FST CDR (default = False)
        
        #Did this ERV intersect with a gene?
        index = 5 #len(ensGDict.keys())
        if WinID in ensGDict: #If this window is in the ensembl list you generated, then...:
            for num in range(0,index+1):
                added = []
                if len(ensGDict[WinID][num]) > 0:
                    if ensGDict[WinID][num] not in added:
                        y = ", ".join(map(str, ensGDict[WinID][num])) #This takes the list you generated from the Ensembl file and joins each entry in the list with a comma
                        allDict[WinID][5+num] = y #Now it adds 'y' you generated in the line above to your dictionary
                        added.append(ensGDict[WinID][num])
                        b2goGenes.append(ensGDict[WinID][num])
        #Did this ERV window intersect with an Dog ESTs?
        if WinID in estDict: #If this window is in the ensembl list you generated, then...:
            for num in range(0,index+1):
                added = []
                if len(estDict[WinID][num]) > 0:
                    if ensGDict[WinID][num] not in added:
                        y = ", ".join(map(str, estDict[WinID][num])) #This takes the list you generated from the Ensembl file and joins each entry in the list with a comma
                        allDict[WinID][11+num] = y #Now it adds 'y' you generated in the line above to your dictionary
                        added.append(ensGDict[WinID][num])
                        
        #Did this ERV window intersect with an "OTHER" RefSeq gene?
        if WinID in otherDict: #If this window is in the ensembl list you generated, then...:
            y = ", ".join(map(str, otherDict[WinID])) #This takes the list you generated from the Ensembl file and joins each entry in the list with a comma
            allDict[WinID][17] = y #Now it adds 'y' you generated in the line above to your dictionary
            
        #SELECTION SCAN RESULTS    
        #Did this ERV window intersect with a FST CDR?
        if WinID in cdrDict: #If this window is in the ensembl list you generated, then...:
            y = ", ".join(map(str, cdrDict[WinID])) #This takes the list you generated from the Ensembl file and joins each entry in the list with a comma
            allDict[WinID][18] = y #Now it adds 'y' you generated in the line above to your dictionary
        #Did this ERV window intersect with a XP-CLR CDR?
        if WinID in xpclrDict: #If this window is in the ensembl list you generated, then...:
            y = ", ".join(map(str, xpclrDict[WinID])) #This takes the list you generated from the Ensembl file and joins each entry in the list with a comma
            allDict[WinID][19] = y #Now it adds 'y' you generated in the line above to your dictionary
        #Did this ERV window intersect with a VST VCDR?
        if WinID in vcdrDict: #If this window is in the ensembl list you generated, then...:
            y = ", ".join(map(str, vcdrDict[WinID])) #This takes the list you generated from the Ensembl file and joins each entry in the list with a comma
            allDict[WinID][20] = y #Now it adds 'y' you generated in the line above to your dictionary
        #Did this ERV window intersect with a Axelsson FST Sweep?
        if WinID in axDict: #If this window is in the ensembl list you generated, then...:
            y = ", ".join(map(str, axDict[WinID])) #This takes the list you generated from the Ensembl file and joins each entry in the list with a comma
            allDict[WinID][21] = y #Now it adds 'y' you generated in the line above to your dictionary
   

    #Writing out results
    for keys in sorted(allDict.keys()): #Now loops through each 
        outFile.write("\t".join(map(str,allDict[keys])))
        outFile.write("\n")
    outFile.close()
    inFile.close()


In [6]:
for ervfile in bedfiles:
    if 'NonReference' in ervfile:
        Type = 'NonReference'
    else:
        Type = 'Reference'
    print(ervfile)
    #ervfile = '/home/ampend/links/kidd-lab/ampend-projects/cfERVs/input/cfERV_NonReferenceInsertionSites_2017-06-29.bed'
    ervDir = '/home/ampend/links/kidd-lab/ampend-projects/cfERVs/Insertion_Assessment/'

    ###Selective Sweep Intersects
    windowSize = int(50000)
    #Get CDR intersects -- save to Dict
    cdrfile = '/home/ampend/links/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/FINAL_results/MergedOverlapping_Sliding_TotalSIGFstCalls_AutoXPar_54callset.bed'
    outfile = ervDir + 'results/' + '%ibp_Intersect_%sERVs_with_CDRs_SimulationThreshold.txt' % (windowSize, Type)
    do_window_intersects(ervfile,cdrfile,int(windowSize),outfile)
    cdrDict = process_intersect(outfile)
    #Get VCDR intersects -- save to Dict
    vcdrfile = '/home/ampend/links/kidd-lab/ampend-projects/Angela/Feichen_VST/FINAL_FILTERED_VCDR_SET/Filtered_VCDR.bed'
    outfile = ervDir + 'results/' + '%ibp_Intersect_%s_with_VCDRs.txt'% (windowSize, Type)
    do_window_intersects(ervfile,vcdrfile,int(windowSize),outfile)
    vcdrDict = process_intersect(outfile)
    #Get VCDR intersects -- save to Dict
    xpclrfile = '/home/ampend/links/kidd-lab/ampend-projects/Angela/XP-CLR/results/Campbell_No-MAF-Filter/50kbWindow_2kbGrid/Dogs_v_Wolves/SignificantWindows/Significant_3Windows_pct99_AverageXP-CLRScore.4.bed'
    outfile = ervDir + 'results/' + '%ibp_Intersect_%sERVs_with_XPCLR_SimulationThreshold.txt'% (windowSize, Type)
    do_window_intersects(ervfile,xpclrfile,int(windowSize),outfile)
    xpclrDict = process_intersect(outfile)
    #Get Axelsson loci intersects -- save to Dict
    axelssonBedFile = '/home/ampend/links/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/Axelsson_CaganBlass/input/Axelsson_canfam3.1.bed'
    outfile = ervDir + 'results/' + '%ibp_Intersect_%sERVs_with_AxelssonCDRs_SimulationThreshold.txt'% (windowSize, Type)
    do_window_intersects(ervfile,axelssonBedFile,int(windowSize),outfile)
    axDict = process_intersect(outfile)

    #Get ensembl gene intersects within window ranges -- save to Dict
    windowSizes = ['0','5000','10000','25000','50000','100000']
    index = 0
    ensGDict = make_ensG_dict(ervfile)
    for windowSize in windowSizes:
        genebedfile = '/home/ampend/links/kidd-lab/ampend-projects/BLAST2GO/results/BLAST2GO_Ensembl81_GeneTables_WithEnscafIDsAndChrom.bed'
        outfile = ervDir + 'results/' + str(windowSize) + 'bpWindowIntersect_%sERVs_with_EnsemblGenes.txt'% (Type)
        do_window_intersects(ervfile,genebedfile,int(windowSize),outfile)
        ensGDict = parse_gene_intersect_file(ensGDict,outfile,windowSize,index)
        index+=1

    #Intersecting and Parsing results with TransMap EST UCSC Track
    index = 0
    estDict = make_ensG_dict(ervfile)
    for windowSize in windowSizes:
        dogESTBedFile = '/home/ampend/links/kidd-lab/ampend-projects/cfERVs/GeneIntersects/dog_allESTs_CF3.bed'
        outfile = ervDir + 'results/' + str(windowSize) + 'bpWindowIntersect_%sERVs_with_UCSCDogESTs_SimulationThreshold.txt'% (Type)
        do_window_intersects(ervfile,dogESTBedFile,int(windowSize),outfile)
        estDict = parse_gene_intersect_file(estDict,outfile,windowSize,index)
        index+=1
        
    #5. Intersecting and Parsing results with "Other" RefSeq UCSC Track
    otherRefSeqBedFile = '/home/ampend/links/kidd-lab/ampend-projects/cfERVs/GeneIntersects/other_refseq.bed'
    outfile = ervDir + 'results/' + 'Intersect_%sERVs_with_OtherRefseqGenes_SimulationThreshold.txt'% (Type)
    otherDict = do_intersects(ervfile, otherRefSeqBedFile, outfile)
    
    #MAKING SUMMARY FILE 
    making_summary_text_files(ervDir,Type,ervfile,ensGDict,otherDict,estDict,vcdrDict,xpclrDict,axDict)
print('DONE!')
    

/home/ampend/links/kidd-lab/ampend-projects/cfERVs/input/cfERV_NonReferenceInsertionSites_2017-06-29.bed


NameError: name 'b2goGenes' is not defined

# BLAST2GO - TOPGO PROCESSING

In [None]:
b2goGenes = [] #for blast2go enrichment, kept at 50kb insertion distance
for ervfile in bedfiles:
    if 'NonReference' in ervfile:
        Type = 'NonReference'
    else:
        Type = 'Reference'
    print(Type)
    outfile = ervDir + 'results/' + str(50000) + 'bpWindowIntersect_%sERVs_with_EnsemblGenes.txt'% (Type)
    for line in open(outfile,'r'):
        line = line.rstrip().split('\t')
        winID = line[3]
        if '_' in line[7]:
            hit = line[7].split('_')
            Protein = hit[0]
            Gene = hit[1]
            b2goID = Gene + '|' + Protein
            ensGID = hit[2]
        b2goGenes.append(b2goID)
b2goGeneFile = open(ervDir + 'results/' + 'ForTopGO_GenesWithin50kb_cfERVInsertion.txt','w')
for i in range(0,len(b2goGenes)):
    b2goGeneFile.write(b2goGenes[i] + '\n')
b2goGeneFile.close()
print('Genes for TopGO enrichment saved here: ', b2goGeneFile)

In [99]:
#TopGO prints out a list of genes per GO category that are enriched. 

In [9]:
geneTable = '/home/ampend/links/kidd-lab/ampend-projects/BLAST2GO/results/BLAST2GO_GeneTables_WithEnscafIDsAndChrom.txt'
#Saving gene-protein links that were processed with BLAST2GO to array
TotEnsGene = {}

for line in open(geneTable,'r'):
    line = line.rstrip().split()
    ensG,ensP,chrom,start,end,geneName = line[1],line[0],line[5],line[6],line[7],line[8]
    TotEnsGene[ensG] = [ensG,ensP,chrom,start,end,geneName]
print ('\n%i genes read into gene-protein ID array' % len(TotEnsGene))


19856 genes read into gene-protein ID array


In [8]:
topGOPvalueFile=ervDir + 'results/' + 'Total_cfERV_50kbWindowIntersect_EnrichmentTable_Pvalue0.05_ParentChild_BP_MF_CC.txt'
topGODict={}
for line in open(topGOPvalueFile,'r'):
    line=line.rstrip().split('\t')
    if 'GO:' not in line[1]:
        continue
    GOID = line[1]
    topGODict[GOID] = line
print('%i enriched gene categories added to topGoDict' % len(topGODict))

218 enriched gene categories added to topGoDict


In [7]:
types = ['Reference','NonReference']
insDict = {}
for Type in types:
    intFile = ervDir + 'results/'  + '50000bpWindowIntersect_%sERVs_with_EnsemblGenes.txt' % (Type)
    for line in open(intFile,'r'):
        line=line.rstrip().split('\t')
        insertion = line[3]
        longgene = line[7].split('_')
        ensG = longgene[1]
        insDict[ensG] = insertion
print('%i insertions added to dictionary' % len(insDict))

196 insertions added to dictionary


In [125]:
def get_full_BLASTGO_category_names(ervDir):
    fullgonames = ervDir + 'results/' + 'full_go_cat_names.txt'
    nameDict = {}
    for line in open(fullgonames,'r'):
        line=line.rstrip().split('\t')
        if 'GO:' not in line[0]:
            continue
        nameDict[line[0]] = line[1]
    return nameDict
################################################################################################################
def find_perfect_insertions(ervDir):
    #list of genes that are 0kb insertions
    perfInsFile = ervDir + 'results/' + 'perfect_gene_insertion_gene_list.txt'
    perfIns = []
    for line in open(perfInsFile,'r'):
        line=line.rstrip().split('\t')
        perfIns.append(line[0])
    print('%i genes with perfect cfERV insertions (0kb window) / intronic/exonic' % len(perfIns))
    return perfIns
################################################################################################################
def get_humanphenotypes():
    hpoFile = '/home/ampend/links/kidd-lab/ampend-projects/BLAST2GO/Human-Phenotype-Ontology/HPO_Oct2017Release_genes-to-phenotypes.txt'
    hpoDict = {}
    for line in open(hpoFile,'r'):
        line=line.rstrip().split('\t')
        if '#' in line[0]: #skips header
            continue
        geneName, phenotype, HP = line[1], line[2], line[3]
        if geneName not in hpoDict.keys():
            hpoDict[geneName] = []
        hpoDict[geneName].append([phenotype, HP])    
    print('%i human phenotyped genes added to dictionary' % len(hpoDict))
    return hpoDict
################################################################################################################
def make_ens_transcript_map_dict():
    ensembl_MapFile = '/home/ampend/links/kidd-lab/ampend-projects/cfERVs/Insertion_Assessment/dog_Ensembl_EnsG_EnsP_EnsT_Map.txt'
    enDict = {}
    for line in open(ensembl_MapFile,'r'):
        line=line.rstrip().split()
        if '#' in line[0]: #skips header
            continue
        gene, enG, enT = line[1], line[2], line[0]
        enG = enG.split('.')[0]
        if enT not in enDict.keys():
            enDict[enG]=[gene, enT]
    print('%i genes added to enTDict' % len(enDict))
    return enDict

In [173]:
outputs = ['cfERV_50KbWindowIntersect_EnrichmentTable_Pvalue=0.05_MF_TopGOOutput_ParentChild_Genes.txt','cfERV_50KbWindowIntersect_EnrichmentTable_Pvalue=0.05_CC_TopGOOutput_ParentChild_Genes.txt', 'cfERV_50KbWindowIntersect_EnrichmentTable_Pvalue=0.05_BP_TopGOOutput_ParentChild_Genes.txt']
mergedOutFile = open(ervDir + 'results/' + 'AnnotatedTopGO_EnrichedGenes_50kbCfERVInsertions.txt','w')
insGeneGODict = {}
#Set-up inputs
perfIns = find_perfect_insertions(ervDir)
nameDict = get_full_BLASTGO_category_names(ervDir)
enDict = make_ens_transcript_map_dict()
hpoDict = get_humanphenotypes()

#Now begin processing
for topGoGeneFile in outputs:
    topGoGeneFile = open(ervDir + 'results/' + topGoGeneFile,'r')
    for line in topGoGeneFile:
        line=line.replace('"','')
        line=line.rstrip().split(' ')
        if 'GO:' not in line[0]: #skips headers
            continue
        GOID = line[0]
        geneLists = line[1]
        genes = geneLists.split(',')
        topGOInfo = topGODict[GOID]
        topGOInfo[2] = nameDict[GOID]
        genomeCount, setCount, pvalue1,pvalue2 =  topGOInfo[3],topGOInfo[4],float(topGOInfo[8]),float(topGOInfo[9])
        for i in range(0,len(topGOInfo)):
            mergedOutFile.write('%s\t' % topGOInfo[i])
        #What genes are annotated for this GO category
        ensGList, insList, geneNames, count = [], [], [], 0
        uniqueInsertionList = [] #Clear to keep track of how many unique insertions are found for this category
        for gene in genes:
            ensG,engP = gene.split('|')[0],gene.split('|')[1]
            ensGList.append(ensG)
            #info = [TotEnsGene[ensG][2],TotEnsGene[ensG][3],TotEnsGene[ensG][4],TotEnsGene[ensG][5]]
            #What insertions do these correspond to
            insertion = insDict[ensG]
            insList.append(insertion)
            geneName = TotEnsGene[ensG][5]
            geneNames.append(geneName)
            if insertion not in uniqueInsertionList:
                uniqueInsertionList.append(insertion)
            if geneName in perfIns:
                if GOID not in insGeneGODict.keys():
                    #0 = go description, #1 total gene count enriched #2 perf insertion count
                    #3 = gene names for perf ins, #4 = insertion IDs for perfect insertions
                    insGeneGODict[GOID] = [nameDict[GOID],genomeCount,setCount,'geneInsertionCount',pvalue1,pvalue2,[],[],0]
                insGeneGODict[GOID][6].append(geneName)
                insGeneGODict[GOID][7].append(insertion)
                count+=1
        y = ", ".join(map(str, ensGList))
        mergedOutFile.write('%s\t' % (y))
        y = ", ".join(map(str, geneNames))
        mergedOutFile.write('%s\t' % (y))
        y = ", ".join(map(str, insList))
        mergedOutFile.write('%s\t' % (y))
        #Number of Unique insertions
        mergedOutFile.write('%i\n' % (len(uniqueInsertionList)))
        if GOID in insGeneGODict.keys():        
            #Keeping track of information for the perfect insertions
            #insGeneGODict[GOID][5] = int(len(ensGList))
            insGeneGODict[GOID][3] = count
            insGeneGODict[GOID][8] = len(uniqueInsertionList)
            if '12505' in GOID:
                print(GOID,len(uniqueInsertionList))

mergedOutFile.close()

31 genes with perfect cfERV insertions (0kb window) / intronic/exonic
15548 genes added to enTDict
3682 human phenotyped genes added to dictionary
GO:0012505 29


In [174]:
perfInsertionGOFile = open(ervDir + 'results/' + 'GoAnnotations_PerfectInsertions_CountsTable.txt','w')
header = 'GO ID\tGO Description\tGenes in GO Category in Genome\tGenes in GO Category in cfERV set\tGenes in GO Category in Perfect Gene Insertion Set\tTopGO Fisher P-Value\tParent-Child Fisher P-Value\tGenes\tInsertions\tUnique Insertions Represented\n'
perfInsertionGOFile.write('%s\n' % header)
for GOID in insGeneGODict.keys():
    info = insGeneGODict[GOID]
    name, genomeCount, setCount, geneInsertionCount = info[0:4] 
    topgoP, parentP,uniqueInsertions = float(info[4]),float(info[5]),info[8]
    genes = info[6]
    for i in genes:
        y = ", ".join(map(str, genes))
    insertions = info[7]
    for j in insertions:
        z = ", ".join(map(str, insertions))
    perfInsertionGOFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (GOID,name, genomeCount, setCount, geneInsertionCount, topgoP, parentP, y, z, uniqueInsertions))
perfInsertionGOFile.close()

# Human-Phenotype Ontology Map

In [142]:
def get_phenotype_for_gene(geneName,intHPODict,hpoDict,humanPhenotypeLinkFile):
    for i in range(0,len(hpoDict[geneName])):
        phenotype, HPID = hpoDict[geneName][i][0],hpoDict[geneName][i][1]
        #Write total results to an out file
        humanPhenotypeLinkFile.write('%s\t%s\t%s\t%s\n' % (geneName, insertion,phenotype, HPID ))
        #but keep track of the number of instances you see a certain phenotype
        if HPID not in intHPODict.keys():
            intHPODict[HPID] = []
        intHPODict[HPID].append([phenotype,insertion,geneName])
    return intHPODict

In [172]:
intHPODict = {}
humanPhenotypeLinkFile = open(ervDir + 'results/' + 'HumanPhenotypeOntology_cfERV50kbWindowInsertions_AnnotationTable.txt', 'w')
processed = []

#Now begin processing
for topGoGeneFile in outputs:
    topGoGeneFile = open(ervDir + 'results/' + topGoGeneFile,'r')
    for line in topGoGeneFile:
        line=line.replace('"','')
        line=line.rstrip().split(' ')
        if 'GO:' not in line[0]: #skips headers
            continue
        GOID = line[0]
        geneLists = line[1]
        genes = geneLists.split(',')
        #What genes are annotated for this GO category
        for gene in genes:
            ensG,ensP = gene.split('|')[0],gene.split('|')[1]
            #What insertions do these correspond to
            insertion = insDict[ensG]
            #What is the full gene name?
            geneName = TotEnsGene[ensG][5]
            if geneName not in processed:
                processed.append(geneName)
            else:
                continue
            if geneName in hpoDict.keys():
                intHPODict = get_phenotype_for_gene(geneName,intHPODict,hpoDict,humanPhenotypeLinkFile)
            else:
                if ensG in enDict.keys():
                    geneName = enDict[ensG][0]
                    if geneName in hpoDict.keys():
                        intHPODict = get_phenotype_for_gene(geneName,intHPODict,hpoDict,humanPhenotypeLinkFile)
                else:
                    continue
humanPhenotypeLinkFile.close()      

summaryFile = open(ervDir + 'results/' + 'summary_morethan2_hpo_table.txt','w')
for HPID in intHPODict.keys():
    if len(intHPODict[HPID]) >= 2:
        summaryFile.write('%s\t%s\t%s\t' % (HPID,str(len(intHPODict[HPID])),str(intHPODict[HPID][0][0])))
        for i in range(0,len(intHPODict[HPID])):
            insert, gene = intHPODict[HPID][i][1], intHPODict[HPID][i][2]
            summaryFile.write('%s (%s)\t' % (insert, gene))
        summaryFile.write('\n')
summaryFile.close()