In [1]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd

import genutils
import os
import sys
import numpy as np
import re

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [2]:
# Analysis configuration shared by the cells below:
#   vstTypes - the Vst call sets that are processed one after the other
#   vstDir   - project directory; the functions below append 'input/' and
#              'results/' to it, so the trailing slash is required
vstTypes = ['FastCN', 'QuicKmer']
vstDir = '/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/'
# Single-argument print(...) writes the same text as the old print statement
# on Python 2 and also runs on Python 3.
print('#Working directory for the Vst analyses is the following:\n' + vstDir)


#Working directory for the Vst analyses is the following:
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/


In [3]:
##############
# INPUT FST FILE
##############
# BED file with the final FST CDR regions. In the cells shown here only the
# path is used (it is handed to bedtools on the command line).
fstfile = '/home/ampend/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/MergedWindows/FinalMergedCDRs/Final_CDRCoordinates_AllSNPSets_Table.bed'
print('\n#Reading in final FST CDR regions from:\n' + fstfile)
# Fail fast if the file is missing or unreadable, but do not keep a handle
# open: the previous module-level open() was never read from or closed.
with open(fstfile, 'r'):
    pass


#Reading in final FST CDR regions from:
/home/ampend/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/MergedWindows/FinalMergedCDRs/Final_CDRCoordinates_AllSNPSets_Table.bed


In [4]:
def count_lines(f):
    """Return the number of lines in the text file at path ``f``.

    An empty file yields 0; a final line without a trailing newline still
    counts as a line.
    """
    with open(f, 'r') as handle:
        return sum(1 for _ in handle)

In [5]:
def intersect_with_FST_CDRs(vstType):
    """Intersect one Vst candidate BED file with the FST CDR regions.

    Three bedtools runs are made against the module-level ``fstfile``:
    a direct ``bedtools intersect -wo`` and two ``bedtools window`` runs with
    50kb and 200kb of flanking sequence. Each result is written under
    ``vstDir + 'results/'`` and the number of lines in it is printed.

    vstType -- name of the Vst call set (e.g. 'FastCN'); selects the
               ``Mod_<vstType>_candidates_5.bed`` input file.

    Uses the module-level names ``vstDir``, ``fstfile``, ``genutils`` and
    ``count_lines``. Returns nothing.
    """
    #New VST bedfile for intersecting
    newVstbedfile = vstDir + 'input/' + 'Mod_' + vstType + '_candidates_5.bed'
    resultsDir = vstDir + 'results/'

    #Intersecting the vst BED file with the FST results
    outfile = resultsDir + 'Intersect_' + vstType + '_Vst_with_FstCDRs.txt'
    print('#Intersecting the FST and VST windows directly...')
    cmd = 'bedtools intersect -wo -a %s -b %s > %s' % (newVstbedfile, fstfile, outfile)
    print(cmd)
    genutils.runCMD(cmd)
    #checking window counts
    # NOTE(review): this is the number of lines in the bedtools output; a Vst
    # window that overlaps several FST regions is presumably counted once per
    # overlap - confirm before quoting it as a window count.
    print('-->%i Vst windows found to intersect FST regions' % count_lines(outfile))

    #Intersecting the vst BED file with the FST results using window approach.
    #The -w value is derived from the same table as the label, so the 200kb
    #run really uses 200000 bp (it previously re-used 50000).
    for label, windowBp in (('50kb', 50000), ('200kb', 200000)):
        outfile = resultsDir + label + 'WindowIntersect_' + vstType + '_Vst_with_FstCDRs.txt'
        print('\n#Intersecting the FST and VST windows with %s buffering windows...' % label)
        cmd = 'bedtools window -w %i -a %s -b %s > %s' % (windowBp, newVstbedfile, fstfile, outfile)
        print(cmd)
        genutils.runCMD(cmd)
        #checking window counts
        print('-->%i Vst windows found to intersect FST regions with window size = %s' % (count_lines(outfile), label))



In [6]:
def make_new_Vst_files(vstType, vstDir):
    """Rewrite a Vst candidate BED file with unique window IDs.

    Reads ``<vstDir>input/<vstType>_candidates_5.bed`` (tab-separated; the
    code takes column 4 as the z-score and column 5 as the window name) and
    writes two files next to it:

    * ``Mod_<vstType>_candidates_5.bed`` - chrom, start, end, windowID
    * ``Mod_<vstType>_candidates_5.txt`` - chrom, start+1, end, windowID, z-score

    The window ID is the original name plus ``_<n>``, where n is the 1-based
    position of the record in the input. Blank lines are skipped and do not
    consume a number.

    vstType -- name of the Vst call set, used to build the file names.
    vstDir  -- project directory; must end with a path separator.

    Returns nothing. Raises IOError if a file cannot be opened and IndexError
    if a record has fewer than five columns.
    """
    inputDir = vstDir + 'input/'
    vstfile = inputDir + vstType + '_candidates_5.bed'
    print('#Reading in candidate VST regions from:\n' + vstfile)
    #new bed file to store information and rename the windows
    newVstbedfile = inputDir + 'Mod_' + vstType + '_candidates_5.bed'
    print('Writing out new window IDs and modifying file structure for the following BED file:\n' + newVstbedfile)
    #new text file to store information and rename the windows that matches the bed file
    newVstoutfile = inputDir + 'Mod_' + vstType + '_candidates_5.txt'
    print('Writing out new window IDs and modifying file structure for the following TEXT file:\n' + newVstoutfile)

    vstCount = 0
    # The input is opened first so a missing input never creates empty
    # outputs; all three handles are closed even if a record is malformed.
    with open(vstfile, 'r') as vstFile, \
            open(newVstbedfile, 'w') as newVstbedFile, \
            open(newVstoutfile, 'w') as newVstoutFile:
        for line in vstFile:
            line = line.rstrip()
            if not line:
                continue
            fields = line.split('\t')
            vstCount += 1

            chrom = fields[0]
            start = fields[1]
            end = fields[2]
            zScore = fields[3]
            winID = fields[4] + '_' + str(vstCount)

            newVstbedFile.write('%s\t%s\t%s\t%s\n' % (chrom, start, end, winID))
            #text file needs a 1-based start coordinate
            newStart = int(start) + 1
            newVstoutFile.write('%s\t%s\t%s\t%s\t%s\n' % (chrom, newStart, end, winID, zScore))

    print('Wrote new information for %i VST windows to outfile' % (vstCount))

In [7]:
def intersect_with_Ensembl_Genes(newVstbedfile):
    """Intersect a Vst window BED file with the Ensembl 81 gene BED file.

    Runs ``bedtools intersect -wo`` and writes the result to
    ``<vstDir>results/Intersect_<vstType>_VSTWindows_Ensembl81Genes.txt``.

    newVstbedfile -- path of the Vst window BED file (the -a input).

    NOTE(review): the output name is built from the module-level ``vstType``
    and ``vstDir``, not from an argument, so the caller must set ``vstType``
    before calling. Returns nothing.
    """
    print('Now intersecting bedfiles...\n')
    genebedfile = '/home/ampend/kidd-lab/ampend-projects/BLAST2GO/results/BLAST2GO_Ensembl81_GeneTables_WithEnscafIDsAndChrom.bed'
    print('Reading in the Ensembl 81 genes from the following file:\n' + genebedfile)
    # Fail fast if the gene table is missing, without keeping a handle open:
    # bedtools reads the file itself, and the old handle was never closed.
    with open(genebedfile, 'r'):
        pass
    outfile = vstDir + 'results/' + 'Intersect_' + vstType + '_VSTWindows_Ensembl81Genes.txt'

    cmd = 'bedtools intersect -wo -a %s -b %s > %s' % (newVstbedfile, genebedfile, outfile)
    print(cmd)
    genutils.runCMD(cmd)

In [34]:
def parse_gene_intersect_file(intfile, vstType, b2goDir='/home/jmkidd/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/BLAST2GO/'):
    """Parse the Vst-window / Ensembl-gene intersect file.

    intfile -- path of the bedtools intersect output. Column 4 is read as the
               Vst window ID and column 8 as a '_'-joined gene name whose
               first three parts are taken as protein, gene and gene ID
               (assumed from how the fields are used - confirm against the
               gene BED file).
    vstType -- name of the Vst call set; used for the gene-list file name.
    b2goDir -- directory (with trailing separator) that receives
               ``<vstType>_GeneList.txt``. Defaults to the previously
               hard-coded location.

    Side effect: writes one ``Gene|Protein`` line per intersect record to the
    gene-list file, in input order (duplicates are not removed).

    Returns a dict mapping window ID -> list of gene IDs in input order.
    """
    ensGDict = {} #ensGDict[windowID] = [gene1,gene2,gene3...]
    sigGenes = [] #Gene|Protein label of every gene found in a window

    # The input is parsed before the output is opened so that a missing or
    # malformed intersect file cannot leave a truncated gene list behind.
    with open(intfile, 'r') as intersectFile:
        for line in intersectFile:
            fields = line.split() #split() also drops surrounding whitespace
            if not fields:
                continue
            winID = fields[3]
            hit = fields[7].split('_')
            Protein = hit[0]
            Gene = hit[1]
            ensGID = hit[2]

            ensGDict.setdefault(winID, []).append(ensGID)
            # Recorded for every record. Previously this only ran when a
            # window was seen for the first time, so every gene after the
            # first one in a window was missing from the gene list.
            sigGenes.append(Gene + '|' + Protein)

    with open(b2goDir + vstType + '_GeneList.txt', 'w') as b2goList:
        for b2goID in sigGenes:
            b2goList.write('%s\n' % b2goID)

    return ensGDict

In [17]:
def parse_FST_CDR_intersect_file():
    """Parse the Vst-window / FST-CDR intersect file for the current Vst set.

    Reads ``<vstDir>results/Intersect_<vstType>_Vst_with_FstCDRs.txt``, where
    ``vstDir`` and ``vstType`` are module-level names that the caller must
    have set. Column 4 is read as the Vst window ID and column 8 as the
    identifier of the intersecting FST region (assumes the FST BED file has
    a name in its fourth column - confirm).

    Returns a dict mapping Vst window ID -> list of FST identifiers in file
    order.
    """
    #Intersect results against the FST CDRs (written by intersect_with_FST_CDRs)
    outfile = vstDir + 'results/' + 'Intersect_' + vstType + '_Vst_with_FstCDRs.txt'
    fstDict = {} #fstDict[vstWindowID] = [fstRegion1,fstRegion2,...]

    with open(outfile, 'r') as intersectFile:
        for line in intersectFile:
            fields = line.split() #split() also drops surrounding whitespace
            if not fields:
                continue
            vstID = fields[3]
            fstID = fields[7]
            fstDict.setdefault(vstID, []).append(fstID)
    return fstDict

In [19]:
def making_summary_text_files(vstType,fstDict,ensGDict):
    """Build and write the per-window summary table for one Vst call set.

    Reads ``<vstDir>input/Mod_<vstType>_candidates_5.txt`` (chrom, 1-based
    start, end, windowID, z-score) and writes
    ``<vstDir>results/<vstType>_CDRs_WithFSTIntersects_GeneTable.txt``.

    vstType  -- name of the Vst call set, used to build both file names.
    fstDict  -- window ID -> list of intersecting FST regions.
    ensGDict -- window ID -> list of intersecting gene IDs.

    Returns a dict mapping window ID to a 9-item list:
      0=chrom, 1=start, 2=end, 3=window ID, 4=window length (bp), 5=z-score,
      6=', '-joined FST regions or False, 7=', '-joined gene IDs or False,
      8='chrom:start-end'.
    Items 6 and 7 stay the literal ``False`` when nothing intersects;
    gene_fst_intersect_summary() tests for exactly that value.

    Uses the module-level ``vstDir``.
    """
    newVstoutfile = vstDir + 'input/' + 'Mod_' + vstType + '_candidates_5.txt'

    ############################################################################
    # SUMMARIZING VST HITS WITH ENSEMBL GENES AND FST REGIONS
    ##########################################################################
    Outfile = vstDir + 'results/' + vstType + '_CDRs_WithFSTIntersects_GeneTable.txt' #Write out the VST Summary data table here
    print('\nWriting VST summary data table with FST intersects and gene IDs here: \n' + Outfile)
    #One header name per item of the row list below. The tab between
    #WindowLength(bp) and AverageZ-score was missing, and the coordinate
    #column had no name, so the header did not line up with the data.
    headerLine = 'Chrom\tStart\tEnd\tWindowID\tWindowLength(bp)\tAverageZ-score\tIntersecting_FST_CDR\tIntersectingGenes\tWindowCoordinates\n'

    allDict = {} #allDict[windowID] = row list described in the docstring

    with open(newVstoutfile, 'r') as vstFile:
        for line in vstFile:
            line = line.rstrip()
            if not line:
                continue
            line = line.split('\t')

            chrom = line[0]
            start_pos = int(line[1])
            end_pos = int(line[2])
            WinID = line[3]
            #The text file holds a 1-based start (make_new_Vst_files adds 1
            #to the BED start), so the interval is inclusive on both ends.
            winLength = end_pos - start_pos + 1
            zscore = line[4]
            winCoord = chrom + ":" + str(start_pos) + "-" + str(end_pos)

            row = [chrom,start_pos,end_pos,WinID,winLength,zscore,False,False,winCoord]

            #Did this VST window intersect with a FST CDR?
            if WinID in fstDict:
                row[6] = ", ".join(map(str, fstDict[WinID]))

            #Did this VST window intersect with a gene?
            if WinID in ensGDict:
                row[7] = ", ".join(map(str, ensGDict[WinID]))

            allDict[WinID] = row

    print('There are now %i VST regions saved to the allDict dictionary for further processing' % (len(allDict)))

    #Writing out results, ordered by window ID (plain string sort)
    with open(Outfile, 'w') as outFile:
        outFile.write(headerLine)
        for key in sorted(allDict.keys()):
            outFile.write("\t".join(map(str,allDict[key])))
            outFile.write("\n")
    return allDict

In [11]:
def gene_fst_intersect_summary(allDict):
    """Count and report the windows that intersect FST CDRs and genes.

    allDict -- window ID -> row list as built by making_summary_text_files();
               item 6 (FST CDRs) and item 7 (genes) are the literal ``False``
               when the window has no intersect.

    Prints both counts and returns them as ``(fstCount, geneCount)`` so they
    can be reused; earlier versions returned nothing.
    """
    rows = allDict.values()
    # Identity test on purpose: any value other than the False placeholder,
    # including an empty string, counts as an intersect.
    fstCount = sum(1 for row in rows if row[6] is not False)
    geneCount = sum(1 for row in rows if row[7] is not False)

    print('%i windows intersect with FST CDRs' % (fstCount))
    print('%i windows intersect with Ensembl genes' % (geneCount))
    return fstCount, geneCount

In [None]:
def make_gene_enrichment_list():
    """Unfinished placeholder for the GO-enrichment gene list step.

    NOTE(review): this cell has no execution count and its only call in the
    driver loop is commented out. As written it reads the module-level
    ``allDict`` and does nothing useful per window: the bare ``print`` emits
    an empty line on Python 2 and is a no-op expression on Python 3.
    TODO: implement or delete.
    """
    for winID in allDict.keys():
        print 

In [36]:
# Run the whole pipeline once per Vst call set.
# vstType must stay a module-level name: intersect_with_Ensembl_Genes() and
# parse_FST_CDR_intersect_file() read it as a global rather than an argument.
for vstType in vstTypes:
    print('\n############\n##%s\n############\n' % (vstType))
    intfile = vstDir + 'results/' + 'Intersect_' + vstType + '_VSTWindows_Ensembl81Genes.txt'
    
    ####################################################
    #1. Generating new VST files with proper unique IDs 
    ####################################################
    # This message used to repeat the step-2 text ("intersecting ... FST CDRs")
    print('Now generating new VST files with unique window IDs')
    make_new_Vst_files(vstType, vstDir)
    
    ###############################
    #2. Intersecting with FST CDRs
    ###############################
    print('Now intersecting VST regions with FST CDRs')
    intersect_with_FST_CDRs(vstType)
    
    ####################################
    #3. Intersecting with Ensembl Genes
    ####################################
    print('Now intersecting VST regions with Ensembl 81 genes')
    newVstbedfile = vstDir + 'input/' + 'Mod_' + vstType + '_candidates_5.bed'
    intersect_with_Ensembl_Genes(newVstbedfile)

    ########################################################
    #4. Parse bedtools intersect file against Ensembl genes
    ########################################################
    print('Now parsing Ensembl gene intersect file:\n' + intfile)
    ensGDict = parse_gene_intersect_file(intfile, vstType)
    print(ensGDict)

    ###################################################
    #5. Parse bedtools intersect file against FST CDRs
    ###################################################
    print('Now parsing the FST CDR intersect file...')
    fstDict = parse_FST_CDR_intersect_file()
    
    ###########################################################
    #6. Summarizing VST and Ensembl hits to output text files
    ###########################################################
    print('Now generating summary text files for %s results...' % (vstType))
    allDict = making_summary_text_files(vstType,fstDict,ensGDict)    
    
    ##################################################
    #7. How many genes and FST CDRs are intersecting?
    ##################################################
    print('Now finding how many genes and CDR regions intersect with the %s results...' % (vstType))
    gene_fst_intersect_summary(allDict)
    
    ##################################################
    #8. Make gene list for GO gene enrichment steps
    ##################################################
    #print 'Now writing outfile with the gene IDs in these windows to search for enriched GO categories...'
    #make_gene_enrichment_list()
    
    
    
    
    


############
##FastCN
############

Now intersecting VST regions with FST CDRs
#Reading in candidate VST regions from:
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/FastCN_candidates_5.bed
Writing out new window IDs and modifying file structure for the following BED file:
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_FastCN_candidates_5.bed
Writing out new window IDs and modifying file structure for the following TEXT file:
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_FastCN_candidates_5.txt
Wrote new information for 120 VST windows to outfile
Now intersecting VST regions with FST CDRs
#Intersecting the FST and VST windows directly...
bedtools intersect -wo -a /home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_FastCN_candidates_5.bed -b /home/ampend/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/MergedWindows/FinalMergedCDRs/Final_CDRCoordinates_AllSNPSets_Table.bed > /home/a

In [28]:
# Print every gene ID recorded for the windows in allDict, one per line.
# NOTE(review): allDict is whatever the driver loop assigned last, i.e. the
# final entry of vstTypes; this cell depends on that leftover state.
for winID in allDict.keys():
    genes = allDict[winID][7]
    if genes is False:
        # False is the "no intersecting gene" placeholder
        continue
    for geneID in genes.split(', '):
        print(geneID)

ENSCAFG00000032113
ENSCAFG00000028745
ENSCAFG00000031570
ENSCAFG00000016517
VPREB1
ENSCAFG00000014142
ENSCAFG00000014144
ENSCAFG00000028972
ENSCAFG00000028561
MAGI2
ENSCAFG00000032099
DACH2
ENSCAFG00000019588


In [161]:
#Now intersect VST candidates from QuicKmer with FASTCN
fastcnfile = vstDir + 'input/' + 'Mod_FastCN_candidates_5.bed'
quickmerfile = vstDir + 'input/' + 'Mod_QuicKmer_candidates_5.bed'

#Intersecting command:
outfile = vstDir + 'results/Intersect_FastCN_with_QuicKmer_Regions.txt'
cmd = 'bedtools intersect -wo -a %s -b %s > %s' % (fastcnfile, quickmerfile, outfile)
print(cmd)
genutils.runCMD(cmd)

#checking window counts
# count_lines() opens the result itself; the extra open() handle that used to
# sit here was never read from or closed.
# NOTE(review): this is the number of lines in the bedtools output, so a
# FastCN window overlapping several QuicKmer windows is presumably counted
# more than once - confirm.
count = count_lines(outfile)
# Both inputs are Vst candidate files, so the message says Vst (it said FST).
print('-->%i FastCN Vst windows were found to precisely intersect QuicKmer Vst regions (i.e. no buffering window of overlap)' % count)


bedtools intersect -wo -a /home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_FastCN_candidates_5.bed -b /home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_QuicKmer_candidates_5.bed > /home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/results/Intersect_FastCN_with_QuicKmer_Regions.txt
-->29 FastCN Vst windows were found to precisely intersect QuicKmer FST regions (i.e. no buffering window of overlap)
