In [1]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd

import genutils
import os
import sys
import numpy as np
import re
import operator

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [2]:
def count_lines(f):
    """Return how many lines the text file at path ``f`` contains.

    The file is opened read-only and iterated once; an empty file gives 0.
    """
    with open(f, 'r') as handle:
        # One tick per line yielded by the file iterator.
        return sum(1 for _ in handle)

In [3]:
# Root of the cfERV project tree; the input/ and results/ paths used below hang off it.
ervDir = '/home/ampend/kidd-lab/ampend-projects/cfERVs/'

# Tab-delimited insertion-site file; the first four columns are read as
# chrom, start, end, ID.
bedfile = ervDir + 'input/' + 'cfERV_InsertionSites_2016-12-06.txt'
# Single-argument print(...) emits the same text as the Python 2 print statement.
print('#Reading in coordinates from the following ERV insertion site bed file:\n' + bedfile)
lineCount = count_lines(bedfile)
print('#There are a total of %i cfERV insertions in this bed file\n' % (lineCount))

# ervDict[ID] = [chrom, start, end, ID, 'Genes', 'Olfactory', 'FST', 'VST'];
# the trailing four strings are literal placeholders stored with every record.
ervDict = {}

# The with-block closes the handle even if a malformed line raises below.
with open(bedfile, 'r') as bedFile:
    for line in bedFile:
        line = line.rstrip()
        if not line:
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            continue
        line = line.split('\t')
        chrom = line[0]
        start = int(line[1])
        end = int(line[2])
        ID = line[3]

        ervDict[ID] = [chrom, start, end, ID, 'Genes', 'Olfactory', 'FST', 'VST']

#Reading in coordinates from the following ERV insertion site bed file:
/home/ampend/kidd-lab/ampend-projects/cfERVs/input/cfERV_InsertionSites_2016-12-06.txt
#There are a total of 59 cfERV insertions in this bed file



In [4]:
def intersect_with_FST_CDRs(ervDir):
    """Intersect the cfERV insertion sites with the final FST CDR coordinates.

    Three bedtools runs are performed, each redirected into its own file
    under ``ervDir + 'results/'``:

    * ``bedtools intersect -wo``       (direct overlap)
    * ``bedtools window -w 50000``     (50kb flanking window)
    * ``bedtools window -w 200000``    (200kb flanking window)

    After each run the number of lines in the output file is printed.

    Parameters
    ----------
    ervDir : str
        Project directory ending in a path separator; must contain the
        ``input/`` bed file and a writable ``results/`` directory.

    Returns
    -------
    None
    """
    # cfERV insertion sites (the -a file)
    bedfile = ervDir + 'input/' + 'cfERV_InsertionSites_2016-12-06.txt'
    # bedfile with the final FST CDR coordinates (the -b file)
    fstfile = '/home/ampend/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/MergedWindows/FinalMergedCDRs/Final_CDRCoordinates_AllSNPSets_Table.bed'
    resultsDir = ervDir + 'results/'

    # (command template, output file name, banner, summary line)
    steps = [
        ('bedtools intersect -wo -a %s -b %s > %s',
         'Intersect_cfERVs_with_FstCDRs.txt',
         '#Intersecting the FST and VST windows directly...',
         '-->%i cfERV insertions found to intersect FST regions'),
        ('bedtools window -w 50000 -a %s -b %s > %s',
         '50kbWindowIntersect_cfERVs_with_FstCDRs.txt',
         '\n#Intersecting the FST and VST windows with 50kb buffering windows...',
         '-->%i cfERV insertions found to intersect FST regions with window size = 50kb'),
        # This step is labelled 200kb everywhere (file name, messages), so the
        # window must be 200000; it previously reused the 50kb value.
        ('bedtools window -w 200000 -a %s -b %s > %s',
         '200kbWindowIntersect_cfERVs_with_FstCDRs.txt',
         '\n#Intersecting the FST and VST windows with 200kb buffering windows...',
         '-->%i cfERV insertions found to intersect FST regions with window size = 200kb'),
    ]

    for template, outname, banner, summary in steps:
        outfile = resultsDir + outname
        print(banner)
        cmd = template % (bedfile, fstfile, outfile)
        print(cmd)
        # NOTE(review): presumably genutils.runCMD runs the string through a
        # shell; the '>' redirection depends on that. Confirm in genutils.
        genutils.runCMD(cmd)
        # The count is the number of output lines, i.e. overlapping record pairs.
        print(summary % count_lines(outfile))



In [26]:
def intersect_with_Ensembl_Genes(bedfile):
    """Intersect a bed file with the Ensembl 81 gene table via bedtools.

    Runs an exact ``bedtools intersect -wo`` and then ``bedtools window``
    with 5kb and 50kb flanks, redirecting each result into its own file
    under ``ervDir + 'results/'``. After each run the number of lines in
    the output file is printed.

    Parameters
    ----------
    bedfile : str
        Path of the bed file passed to bedtools as ``-a``.

    Returns
    -------
    None

    Notes
    -----
    ``ervDir`` is read from the enclosing (notebook-global) scope.
    """
    print('Now intersecting bedfiles...\n')
    genebedfile = '/home/ampend/kidd-lab/ampend-projects/BLAST2GO/results/BLAST2GO_Ensembl81_GeneTables_WithEnscafIDsAndChrom.bed'
    print('#Reading in the Ensembl 81 genes from the following file:\n' + genebedfile)
    # Open and immediately close the gene table: an unreadable path still
    # fails here, before bedtools is launched, but no handle is left open.
    with open(genebedfile, 'r'):
        pass

    resultsDir = ervDir + 'results/'

    # (command template, output file name, summary line)
    steps = [
        # Exact Ensembl 81 intersect of cfERV insertion
        ('bedtools intersect -wo -a %s -b %s > %s',
         'Intersect_cfERVs_with_Ensembl81Genes.txt',
         '-->%i cfERV insertions found to intersect Ensembl 81 genes exactly'),
        # Ensembl 81 genes within 5kb of cfERV insertion
        ('bedtools window -w 5000 -a %s -b %s > %s',
         '5kb_Intersect_cfERVs_with_Ensembl81Genes.txt',
         '-->%i cfERV insertions found to intersect Ensembl 81 genes within 5kb'),
        # Ensembl 81 genes within 50kb of cfERV insertion
        ('bedtools window -w 50000 -a %s -b %s > %s',
         '50kb_Intersect_cfERVs_with_Ensembl81Genes.txt',
         '-->%i cfERV insertions found to intersect Ensembl 81 genes within 50kb'),
    ]

    for template, outname, summary in steps:
        outfile = resultsDir + outname
        cmd = template % (bedfile, genebedfile, outfile)
        print(cmd)
        # NOTE(review): presumably genutils.runCMD runs the string through a
        # shell; the '>' redirection depends on that. Confirm in genutils.
        genutils.runCMD(cmd)
        # The count is the number of output lines, i.e. overlapping record pairs.
        print(summary % count_lines(outfile))


In [6]:
def parse_gene_intersect_file(intfile):
    """Parse a bedtools gene-intersect file and write a gene list.

    Each non-blank line is split on whitespace. Field 3 is used as the
    window/insertion ID and field 7 is split on ``'_'`` into
    ``Protein``, ``Gene`` and the Ensembl gene ID (in that order).

    Side effect: writes ``ervDir + 'cfERV_GeneList.txt'`` with one
    ``Gene|Protein`` label per line (``ervDir`` comes from the enclosing
    notebook scope).

    Parameters
    ----------
    intfile : str
        Path to the bedtools output produced by the Ensembl intersect step.

    Returns
    -------
    dict
        ``ensGDict[windowID] = [ensGID, ensGID, ...]`` in file order.
    """
    ensGDict = {}  # ensGDict[windowID] = [gene1, gene2, gene3...]
    sigGenes = []  # 'Gene|Protein' labels destined for the gene list file

    # with-blocks guarantee both handles are closed (and the output flushed)
    # even when a malformed line raises.
    with open(intfile, 'r') as intersectFile:
        for line in intersectFile:
            fields = line.split()  # whitespace-delimited columns
            if not fields:
                continue  # ignore blank lines
            winID = fields[3]
            hit = fields[7].split('_')
            Protein = hit[0]
            Gene = hit[1]
            ensGID = hit[2]

            if winID not in ensGDict:
                ensGDict[winID] = []
                # NOTE(review): only the FIRST gene seen for each window is
                # recorded in the gene list; later genes of the same window
                # are added to ensGDict but not to sigGenes. Existing
                # behaviour is kept; confirm whether every gene was intended.
                sigGenes.append(Gene + '|' + Protein)
            ensGDict[winID].append(ensGID)

    with open(ervDir + 'cfERV' + '_GeneList.txt', 'w') as b2goList:
        for gene in sigGenes:
            b2goList.write('%s\n' % gene)

    return ensGDict

In [7]:
def parse_FST_CDR_intersect_file():
    """Parse the cfERV-vs-FST-CDR bedtools intersect output.

    Reads ``ervDir + 'results/Intersect_cfERVs_with_FstCDRs.txt'``
    (``ervDir`` comes from the enclosing notebook scope). Each non-blank
    line is split on whitespace; field 3 is the key and field 7 the value.

    Returns
    -------
    dict
        ``fstDict[field3] = [field7, field7, ...]`` in file order.
        NOTE(review): presumably field 3 is the cfERV insertion ID and
        field 7 the FST CDR ID; this assumes a 4-column ``-a`` bed file.
        Confirm against the intersect inputs.
    """
    outfile = ervDir + 'results/' + 'Intersect_cfERVs_with_FstCDRs.txt'
    fstDict = {}  # fstDict[insertionID] = [fstID1, fstID2, ...]

    # The with-block closes the handle even if a short line raises IndexError.
    with open(outfile, 'r') as intersectFile:
        for line in intersectFile:
            fields = line.split()  # whitespace-delimited columns
            if not fields:
                continue  # ignore blank lines
            fstDict.setdefault(fields[3], []).append(fields[7])

    return fstDict

In [16]:
# Merged CDR + VCDR intervals are the -a file; cfERV insertion sites are -b.
totalCDR_file = '/home/ampend/kidd-lab/ampend-projects/Angela/Merged_Final_CDRs_VCDRs/results/Merged_Final_CDRs_VCDRs.bed'
erv_file = ervDir + 'input/' + 'cfERV_InsertionSites_2016-12-06.txt'
# The same three values fill every command template in this cell.
bed_args = (totalCDR_file, erv_file, ervDir)

# --- direct overlap ---
cmd = 'bedtools intersect -wo -a %s -b %s > %sresults/Intersect_cfERV_with_MergedVCDRandCDRSet.txt' % bed_args
print(cmd)
genutils.runCMD(cmd)

# Number of output lines = number of direct overlaps reported.
f = ervDir + 'results/Intersect_cfERV_with_MergedVCDRandCDRSet.txt'
count = count_lines(f)
print('-->%i cfERV insertions found to intersect merged CDR and VCDRs exactly' % count)

# --- 50kb flanking window ---
cmd = 'bedtools window -w 50000 -a %s -b %s > %sresults/50kb_Intersect_cfERV_with_MergedVCDRandCDRSet.txt' % bed_args
print(cmd)
genutils.runCMD(cmd)

# Number of output lines = number of hits within 50kb.
f = ervDir + 'results/50kb_Intersect_cfERV_with_MergedVCDRandCDRSet.txt'
count = count_lines(f)
print('-->%i cfERV insertions found to intersect merged CDR and VCDRs within 50kb' % count)



bedtools intersect -wo -a /home/ampend/kidd-lab/ampend-projects/Angela/Merged_Final_CDRs_VCDRs/results/Merged_Final_CDRs_VCDRs.bed -b /home/ampend/kidd-lab/ampend-projects/cfERVs/input/cfERV_InsertionSites_2016-12-06.txt > /home/ampend/kidd-lab/ampend-projects/cfERVs/results/Intersect_cfERV_with_MergedVCDRandCDRSet.txt
-->2 cfERV insertions found to intersect merged CDR and VCDRs exactly
bedtools window -w 50000 -a /home/ampend/kidd-lab/ampend-projects/Angela/Merged_Final_CDRs_VCDRs/results/Merged_Final_CDRs_VCDRs.bed -b /home/ampend/kidd-lab/ampend-projects/cfERVs/input/cfERV_InsertionSites_2016-12-06.txt > /home/ampend/kidd-lab/ampend-projects/cfERVs/results/50kb_Intersect_cfERV_with_MergedVCDRandCDRSet.txt
-->3 cfERV insertions found to intersect merged CDR and VCDRs within 50kb


In [24]:
# Olfactory receptor intervals are the -a file; cfERV insertion sites are -b.
olfactory_file = '/home/ampend/kidd-lab-scratch/feichens-projects/dogs/olfactory/OR_all_sort_noU.bed'
erv_file = ervDir + 'input/' + 'cfERV_InsertionSites_2016-12-06.txt'
# The same three values fill every command template in this cell.
bed_args = (olfactory_file, erv_file, ervDir)

# --- direct overlap ---
cmd = 'bedtools intersect -wo -a %s -b %s > %sresults/Intersect_cfERV_with_OlfactoryReceptors.txt' % bed_args
print(cmd)
genutils.runCMD(cmd)

# Number of output lines = number of direct overlaps reported.
f = ervDir + 'results/Intersect_cfERV_with_OlfactoryReceptors.txt'
count = count_lines(f)
print('-->%i cfERV insertions found to intersect olfactory receptors exactly' % count)

# --- 5kb flanking window ---
cmd = 'bedtools window -w 5000 -a %s -b %s > %sresults/5kb_Intersect_cfERV_with_OlfactoryReceptors.txt' % bed_args
print(cmd)
genutils.runCMD(cmd)

# Number of output lines = number of hits within 5kb.
f = ervDir + 'results/5kb_Intersect_cfERV_with_OlfactoryReceptors.txt'
count = count_lines(f)
print('-->%i cfERV insertions found to intersect olfactory receptors within 5kb' % count)

# --- 50kb flanking window ---
cmd = 'bedtools window -w 50000 -a %s -b %s > %sresults/50kb_Intersect_cfERV_with_OlfactoryReceptors.txt' % bed_args
print(cmd)
genutils.runCMD(cmd)

# Number of output lines = number of hits within 50kb.
f = ervDir + 'results/50kb_Intersect_cfERV_with_OlfactoryReceptors.txt'
count = count_lines(f)
print('-->%i cfERV insertions found to intersect olfactory receptors within 50kb' % count)



bedtools intersect -wo -a /home/ampend/kidd-lab-scratch/feichens-projects/dogs/olfactory/OR_all_sort_noU.bed -b /home/ampend/kidd-lab/ampend-projects/cfERVs/input/cfERV_InsertionSites_2016-12-06.txt > /home/ampend/kidd-lab/ampend-projects/cfERVs/results/Intersect_cfERV_with_OlfactoryReceptors.txt
-->0 cfERV insertions found to intersect merged CDR and VCDRs exactly
bedtools window -w 5000 -a /home/ampend/kidd-lab-scratch/feichens-projects/dogs/olfactory/OR_all_sort_noU.bed -b /home/ampend/kidd-lab/ampend-projects/cfERVs/input/cfERV_InsertionSites_2016-12-06.txt > /home/ampend/kidd-lab/ampend-projects/cfERVs/results/5kb_Intersect_cfERV_with_OlfactoryReceptors.txt
-->1 cfERV insertions found to intersect merged CDR and VCDRs within 5kb
bedtools window -w 50000 -a /home/ampend/kidd-lab-scratch/feichens-projects/dogs/olfactory/OR_all_sort_noU.bed -b /home/ampend/kidd-lab/ampend-projects/cfERVs/input/cfERV_InsertionSites_2016-12-06.txt > /home/ampend/kidd-lab/ampend-projects/cfERVs/results/

In [27]:
# ---------------------------------------------------------------
# Step 1: intersect the cfERV insertions with the FST CDRs
# ---------------------------------------------------------------
print('Now intersecting cfERV insertions with FST CDRs')
intersect_with_FST_CDRs(ervDir)

# ---------------------------------------------------------------
# Step 2: intersect the insertion-site bed file with Ensembl genes
# ---------------------------------------------------------------
print('Now intersecting VST regions with Ensembl 81 genes')
intersect_with_Ensembl_Genes(bedfile)

# ---------------------------------------------------------------
# Step 3: parse the 50kb Ensembl gene intersect output
# ---------------------------------------------------------------
intfile = ervDir + 'results/' + '50kb_Intersect_cfERVs_with_Ensembl81Genes.txt'
print('Now parsing Ensembl gene intersect file:\n' + intfile)
ensGDict = parse_gene_intersect_file(intfile)
print(ensGDict)

# ---------------------------------------------------------------
# Step 4: parse the FST CDR intersect output
# ---------------------------------------------------------------
print('\n#Now parsing the FST CDR intersect file...')
fstDict = parse_FST_CDR_intersect_file()

Now intersecting cfERV insertions with FST CDRs
#Intersecting the FST and VST windows directly...
bedtools intersect -wo -a /home/ampend/kidd-lab/ampend-projects/cfERVs/input/cfERV_InsertionSites_2016-12-06.txt -b /home/ampend/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/MergedWindows/FinalMergedCDRs/Final_CDRCoordinates_AllSNPSets_Table.bed > /home/ampend/kidd-lab/ampend-projects/cfERVs/results/Intersect_cfERVs_with_FstCDRs.txt
-->1 cfERV insertions found to intersect FST regions

#Intersecting the FST and VST windows with 50kb buffering windows...
bedtools window -w 50000 -a /home/ampend/kidd-lab/ampend-projects/cfERVs/input/cfERV_InsertionSites_2016-12-06.txt -b /home/ampend/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/MergedWindows/FinalMergedCDRs/Final_CDRCoordinates_AllSNPSets_Table.bed > /home/ampend/kidd-lab/ampend-projects/cfERVs/results/50kbWindowIntersect_cfERVs_with_FstCDRs.txt
-->1 cfERV insertions f

In [None]:



    
    # NOTE(review): this cell is an indented fragment with no enclosing
    # definition visible, and it has no execution count. It refers to
    # `vstType`, `making_summary_text_files` and `gene_fst_intersect_summary`,
    # none of which are defined in the visible part of this notebook.
    # Presumably left over from an earlier VST analysis; confirm before
    # running or deleting it.
    ###########################################################
    #6. Summarizing VST and Ensembl hits to output text files
    ###########################################################
    print 'Now generating summary text files for %s results...' % (vstType)
    allDict = making_summary_text_files(vstType,fstDict,ensGDict)    
    
    ##################################################
    #7. How many genes and FST CDRs are intersecting?
    ##################################################
    print 'Now finding how many genes and CDR regions intersect with the %s results...' % (vstType)
    gene_fst_intersect_summary(allDict)
    
    ##################################################
    #8. Make gene list for GO gene enrichment steps
    ##################################################
    #print 'Now writing outfile with the gene IDs in these windows to search for enriched GO categories...'
    #make_gene_enrichment_list()
    
    
    