In [2]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd

import genutils
import os
import sys
import numpy as np
import re


In [35]:
vstTypes = ['FastCN', 'QuicKmer']
vstDir = '/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/'
print '#Working directory for the Vst analyses is the following:\n', vstDir


#Working directory for the Vst analyses is the following:
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/


In [4]:
##############
# INPUT FST FILE
##############
fstfile = '/home/ampend/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/MergedWindows/FinalMergedCDRs/Final_CDRCoordinates_AllSNPSets_Table.bed'
print '\n#Reading in final FST CDR regions from:\n', fstfile
fstFile = open(fstfile, 'r')


#Reading in final FST CDR regions from:
/home/ampend/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/MergedWindows/FinalMergedCDRs/Final_CDRCoordinates_AllSNPSets_Table.bed


In [5]:
def count_lines(f):
    lineCount = 0
    with open(f, 'r') as f:
        for line in f:
            lineCount += 1
        return lineCount

In [7]:
def make_new_Vst_files(vstType, vstDir):
    vstfile = vstDir + 'input/' + vstType + '_candidates_5.bed'
    print '#Reading in candidate VST regions from:\n', vstfile
    vstFile = open(vstfile, 'r')
    #open new bed file to store information and rename the windows
    newVstbedfile = vstDir + 'input/' + 'Mod_' + vstType + '_candidates_5.bed'
    print 'Writing out new window IDs and modifying file structure for the following BED file:\n', newVstbedfile
    newVstbedFile = open(newVstbedfile, 'w')
    #open new text file to store information and rename the windows that matches the bed file
    newVstoutfile = vstDir + 'input/' + 'Mod_' + vstType + '_candidates_5.txt'
    print 'Writing out new window IDs and modifying file structure for the following TEXT file:\n', newVstoutfile
    newVstoutFile = open(newVstoutfile, 'w')

    vstCount = 0
    vstDict = {}

    for line in vstFile:
        line = line.rstrip()
        line = line.split('\t')
        vstCount += 1

        chrom = line[0]
        start = line[1]
        end = line[2]
        zScore = line[3]
        winID = vstType + '_' + str(vstCount)

        newVstbedFile.write('%s\t%s\t%s\t%s\n' % (chrom, start,end, winID))
        #text file needs a 1-based start coordinate
        newStart = int(start) + 1
        newVstoutFile.write('%s\t%s\t%s\t%s\t%s\n' % (chrom, newStart,end, winID, zScore))

    print 'Wrote new information for %i VST windows to outfile' % (vstCount)
    vstFile.close()
    newVstbedFile.close()
    newVstoutFile.close()

In [39]:
def chrom_loc_compare(item1, item2):
    if item1[0] == item2[0]:
        if int(item1[1]) > int(item2[1]):
            return 1
        else:
            return -1
    else:
        try:
            chrom_num1 = int(item1[0][3:])
        except Exception:
            return 1
        try:
            chrom_num2 = int(item2[0][3:])
        except Exception:
            return -1
        if chrom_num1 > chrom_num2:
            return 1
        else:
            return -1
#Sorting the array based on the subroutine outlined above
cdrarr = sorted(cdrarr, cmp=chrom_loc_compare)

#Now re-naming the windows to CDR + integer based on the ordered array
for i,item in enumerate(cdrarr):
    cdrarr[i][3]='CDR_'+str(i+1)
    
print 'Created final table of the %i CDRs based on QuicKmer and fastCN VST analysis' % len(cdrarr)

Created final table of the 139 CDRs based on QuicKmer and fastCN VST analysis


In [40]:
def intersect_with_FST_CDRs(vstType):
    for i in vstTypes:
        vstType = i
        print '###Vst Type: ', vstType
        #New VST bedfile for intersecting
        newVstbedfile = vstDir + 'input/' + 'Mod_' + vstType + '_candidates_5.bed'

        #Intersecting the vst BED file with the FST results
        outfile = vstDir + 'results/' + 'Intersect_' + vstType + '_Vst_with_FstCDRs.txt'
        print '#Intersecting the FST and VST windows directly...'
        cmd = 'bedtools intersect -wo -a %s -b %s > %s' % (newVstbedfile, fstfile, outfile)
        print cmd
        genutils.runCMD(cmd)
        #checking window counts
        f = outfile
        count = count_lines(f)
        print '-->%i Vst windows found to intersect FST regions' % count  

        #Intersecting the vst BED file with the FST results using window approach
        #50kb windows
        outfile = vstDir + 'results/' + '50kbWindowIntersect_'+ vstType + '_Vst_with_FstCDRs.txt'
        print '\n#Intersecting the FST and VST windows with 50kb buffering windows...'
        cmd = 'bedtools window -w 50000 -a %s -b %s > %s' % (newVstbedfile, fstfile, outfile)
        print cmd
        genutils.runCMD(cmd)
        #checking window counts
        f = outfile
        count = count_lines(f)
        print '-->%i Vst windows found to intersect FST regions with window size = 50kb' % count

        #200kb windows
        outfile = vstDir + 'results/' + '200kbWindowIntersect_' + vstType + '_Vst_with_FstCDRs.txt'
        print '\n#Intersecting the FST and VST windows with 200kb buffering windows...'
        cmd = 'bedtools window -w 50000 -a %s -b %s > %s' % (newVstbedfile, fstfile, outfile)
        print cmd
        genutils.runCMD(cmd)
        #checking window counts
        f = outfile
        count = count_lines(f)
        print '-->%i Vst windows found to intersect FST regions with window size = 200kb' % count



In [41]:
def intersect_fastCN_with_QuicKmer(vstDir):
    #Now intersect VST candidates from QuicKmer with FASTCN
    fastcnfile = vstDir + 'input/' + 'Mod_FastCN_candidates_5.bed'
    quickmerfile = vstDir + 'input/' + 'Mod_QuicKmer_candidates_5.bed'

    #Intersecting command:
    outfile = vstDir + 'results/Intersect_FastCN_with_QuicKmer_Regions.txt'
    cmd = 'bedtools intersect -wo -a %s -b %s > %s' % (fastcnfile, quickmerfile, outfile)
    print cmd
    genutils.runCMD(cmd) 

    inFile = open(outfile, 'r')
    #checking window counts
    f = outfile
    count = count_lines(f)
    print '-->%i FastCN Vst windows were found to precisely intersect QuicKmer FST regions (i.e. no buffering window of overlap)' % count


In [47]:
def find_CDRs(vstDir):
    ###Read FastCN results into dictionary
    fastcnfile = vstDir + 'input/' + 'Mod_FastCN_candidates_5.txt'
    fastcnFile = open(fastcnfile,'r')

    fastDict = {}

    for line in fastcnFile:
        line = line.rstrip()
        line = line.split('\t')
        chrom = line[0]
        start = int(line[1]) #1-based coordinate
        end = int(line[2])
        ID = line[3]
        Z = float(line[4])
        fastDict[ID] = [chrom,start,end,ID,'NA', Z,'fastCN']
    print '#Finished reading fastCN window coordinates for %i windows' % (len(fastDict))

    ###Read QuicKmer results into dictionary
    quickmerfile = vstDir + 'input/' + 'Mod_QuicKmer_candidates_5.txt'
    quickmerFile = open(quickmerfile,'r')
    quickmerDict = {}

    for line in quickmerFile:
        line = line.rstrip()
        line = line.split('\t')
        chrom = line[0]
        start = int(line[1]) #1-based coordinate
        end = int(line[2])
        ID = line[3]
        Z = float(line[4])
        quickmerDict[ID] = [chrom,start,end,ID,'NA', Z,'Quic-Kmer']
    print '\n#Finished reading quickmer window coordinates for %i windows' % (len(quickmerDict))

    ###Processing the intersection file to create list of unique copy-number CDRs
    intfile = vstDir + 'results/Intersect_FastCN_with_QuicKmer_Regions.txt'
    print '\n#Reading in intersection results from:\n', intfile
    intFile = open(intfile, 'r')

    intDict = {}
    noList = []

    winCount = 0

    for line in intFile:
        line = line.rstrip()
        line = line.split('\t')
        winCount += 1 

        chrom = line[0]
        start1 = int(line[1])
        end1 = int(line[2])
        ID1 = line[3]
        start2 = int(line[5])
        end2 = int(line[6])
        ID2 = line[7]

        minStart = min([start1,start2])
        maxEnd = max([end1, end2])

        window = chrom + ':' + str(minStart) + '-' + str(maxEnd)
        if window in intDict:
            continue
        else:
            intDict[window] = [chrom, minStart, maxEnd, 'NA', 'NA', 'Both']
        #Add the IDs of the intersecting windows to the "no list", meaning that we want to use the coordinates
        #   we just determined based on the intersecting coordinates (minStart and maxStart)
        #Note: these MAY be the same coordinates as the original fastCN or QuicKmer results, but we just want to use
        #   what we just calculated instead.
        noList.append(ID1) #Will not be using this window ID in our final set, will be using the minStarts and maxEnds
        noList.append(ID2) #Will not be using this window ID in our final set, will be using the minStarts and maxEnds

    print '\n#Finished reading in the %i intersecting windows between QuicKmer and fastCN' % (winCount)
    print 'There are now %i entries in the intersect dictionary' % (len(intDict))

    #Now we need to determine the final coordinates of the CDRs
    cdrDict = {}
    cdrarr = []
    #Add the fastCN window information for all windows NOT in the "No list" AKA those windows that did not intersect
    #    with QuicKmer windows
    for i in fastDict:
        ID = i
        if ID not in noList:
            cdrDict[ID] = fastDict[ID]
            cdrarr.append(fastDict[ID])
    #Add the QuicKmer window information for all windows NOT in the "No list" AKA those windows that did not intersect
    #    with fastCN windows
    for i in quickmerDict:
        ID = i
        if ID not in noList:
            cdrDict[ID] = quickmerDict[ID]
            cdrarr.append(quickmerDict[ID])
    #Now add the remaining windows which are the windows that resulted from the intersect analysis:
    for i in intDict:
        ID = i
        cdrDict[ID] = intDict[ID]
        cdrarr.append(intDict[ID])

    print '\n#Renaming the CDR regions from sorted CDR array'
    
    for i in cdrarr:

In [48]:
print cdrarr

[['chr1', 98731163, 98734162, 'CDR_1', 'NA', 'NA', 'fastCN'], ['chr2', 4744801, 4749424, 'CDR_2', 2.7085449528200001, 'NA', 'fastCN'], ['chr2', 4753379, 4757311, 'CDR_3', 1.11281065981, 'NA', 'fastCN'], ['chr2', 5893665, 5897044, 'CDR_4', 'NA', 'NA', 'fastCN'], ['chr2', 5901157, 5905516, 'CDR_5', 'NA', 'NA', 'fastCN'], ['chr2', 69825432, 69838692, 'CDR_6', 'NA', 'NA'], ['chr2', 82411420, 82416607, 'CDR_7', 'NA', 'NA', 'fastCN'], ['chr3', 58219595, 58223305, 'CDR_8', 'NA', 'NA', 'fastCN'], ['chr5', 3352778, 3357464, 'CDR_9', 'NA', 'NA'], ['chr5', 28078011, 28081577, 'CDR_10', 'NA', 'NA', 'fastCN'], ['chr5', 41925909, 41930008, 'CDR_11', 2.7845478076200001, 'NA', 'fastCN'], ['chr5', 42899318, 42903237, 'CDR_12', 'NA', 'NA', 'fastCN'], ['chr5', 43169600, 43172693, 'CDR_13', 'NA', 'NA', 'fastCN'], ['chr5', 78144490, 78166450, 'CDR_14', 'NA', 'NA', 'fastCN'], ['chr5', 78182013, 78360039, 'CDR_15', 'NA', 'NA', 'fastCN'], ['chr5', 78374146, 78390852, 'CDR_16', 'NA', 'NA', 'fastCN'], ['chr6', 

In [46]:
####################################################
#1. Generating new VST files with proper unique IDs 
####################################################
print '\n#1. Changing the input file types for fastCN and QuicKmer'
make_new_Vst_files(vstType, vstDir)

####################################################
#2. Intersecting fastCN and QuicKmer regions to help identify VST CDRs
####################################################
print '\n#2. Now intersecting fastCN and QuicKmer regions to help in finding CDRs'
intersect_fastCN_with_QuicKmer(vstDir)

####################################################
#3. Identify VST CDRs
####################################################
print '\n#3. Parsing fastCN/QuicKmer intersect to find CDRs'
find_CDRs(vstDir)


#1. Changing the input file types for fastCN and QuicKmer
#Reading in candidate VST regions from:
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/QuicKmer_candidates_5.bed
Writing out new window IDs and modifying file structure for the following BED file:
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_QuicKmer_candidates_5.bed
Writing out new window IDs and modifying file structure for the following TEXT file:
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_QuicKmer_candidates_5.txt
Wrote new information for 37 VST windows to outfile

#2. Now intersecting fastCN and QuicKmer regions to help in finding CDRs
bedtools intersect -wo -a /home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_FastCN_candidates_5.bed -b /home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_QuicKmer_candidates_5.bed > /home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/results/Intersect_FastCN_with_QuicKmer_Regions.txt
-->29 Fa

In [8]:
def intersect_with_Ensembl_Genes(newVstbedfile):
    print 'Now intersecting bedfiles...\n'
    #Determining variables
    inBedFile = newVstbedfile
    genebedfile = '/home/ampend/kidd-lab/ampend-projects/BLAST2GO/results/BLAST2GO_Ensembl81_GeneTables_WithEnscafIDsAndChrom.bed'
    print 'Reading in the Ensembl 81 genes from the following file:\n', genebedfile
    geneBedFile = open(genebedfile, 'r')
    outfile = vstDir + 'results/' + 'Intersect_' + vstType + '_VSTWindows_Ensembl81Genes.txt'

    cmd = 'bedtools intersect -wo -a %s -b %s > %s' % (inBedFile, genebedfile, outfile)
    print cmd
    genutils.runCMD(cmd) 

In [9]:
def parse_gene_intersect_file(intfile, vstType):
    b2goDir = '/home/ampend/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/BLAST2GO/'
    b2goList = open(b2goDir + vstType + '_GeneList.txt', 'w')
    #Reading the intersect file results
    intersectFile = open(intfile,'r') #outfile from the intersect step with ensembl 81 genes

    # going through Ensembl bedtool intersect results file
    ensGDict = {} #ensGDict[windowID] = [gene1,gene2,gene3...]
    ensID = ''
    sigGenes = [] #To keep track of all gene IDs (ENSCAFG) that are in sig windows

    for line in intersectFile:
        line = line.rstrip() #removing extraneous whitespace characters
        line = line.split() #delimiting "columns" in the file based on tabs		
        winID = line[3]
        hit = line[7].split('_')
        Protein = hit[0]
        Gene = hit[1]
        b2goID = Gene + '|' + Protein
        ensGID = hit[2]
        
        if winID in ensGDict:
            ensGDict[winID].append(ensGID)
        else:
            ensGDict[winID] = []
            ensGDict[winID].append(ensGID) 
            sigGenes.append(b2goID)
    intersectFile.close()
    for i in sigGenes:
        gene = i
        b2goList.write('%s\n' % gene)
    
    return ensGDict

In [10]:
def parse_FST_CDR_intersect_file():
    #Reading the intersect file results to FST CDRs
    outfile = vstDir + 'results/' + 'Intersect_' + vstType + '_Vst_with_FstCDRs.txt'
    intersectFile = open(outfile,'r') #outfile from the intersect step above
    # going through Ensembl bedtool intersect results file
    fstDict = {} #ensGDict[windowID] = [gene1,gene2,gene3...]
    fstID = ''
    fstWindows = [] #To keep track of all gene IDs (ENSCAFG) that are in sig windows

    for line in intersectFile:
        line = line.rstrip() #removing extraneous whitespace characters
        line = line.split() #delimiting "columns" in the file based on tabs		
        vstID = line[3]
        fstID = line[7]

        if vstID in fstDict:
            fstDict[vstID].append(fstID)
        else:
            fstDict[vstID] = []
            fstDict[vstID].append(fstID) 
    intersectFile.close()
    return fstDict

In [11]:
def making_summary_text_files(vstType,fstDict,ensGDict):
    newVstoutfile = vstDir + 'input/' + 'Mod_' + vstType + '_candidates_5.txt'
    vstFile = open(newVstoutfile, 'r')

    ############################################################################
    # SUMMARIZING VST HITS WITH ENSEMBL GENES AND FST REGIONS
    ##########################################################################
    Outfile = vstDir + 'results/' + vstType + '_CDRs_WithFSTIntersects_GeneTable.txt' #Write out the VST Summary data table here
    print '\nWriting VST summary data table with FST intersects and gene IDs here: \n', Outfile
    outFile = open(Outfile, 'w')
    #Defining header based on the structure of the dictionary you created:
    #Add to this if you're adding more to your dictionary 
    headerLine = 'Chrom\tStart\tEnd\tWindowID\tWindowLength(bp)AverageZ-score\tIntersecting_FST_CDR\tIntersectingGenes\n'
    outFile.write(headerLine)

    vstCount = 0
    #Change this if you're working with the QuicKmer VST calls or the FastCN VST calls

    WinID = '' #Setting equal to nothing
    allDict = {} # defining the dictionary that will have all the results of bedtool intersect compiled by windows

    for line in vstFile:
        line = line.rstrip()
        line = line.split('\t')
        vstCount += 1 #To use when naming the VST regions with unique identifiers

        chrom = line[0]
        start_pos = int(line[1])
        end_pos = int(line[2])
        WinID = line[3]
        winLength = end_pos - start_pos
        zscore = line[4] #TO ADD ONCE YOU GET FILE BACK FROM FEICHEN
        winCoord = chrom + ":" + str(start_pos) + "-" + str(end_pos)

        #Open up dictionary for WinID
        #0=WinID, 1=chr, 2=start_pos, 3=end_pos, 4=Window length, #5=Zscore
        #6=Intersects with an FST CDR (default = False), 
        #7=Intersects with Ensembl Gene IDs (default = False)
        #8=window coordinates
        allDict[WinID] = [chrom,start_pos,end_pos,WinID,winLength,zscore,False,False,winCoord] #setting up empty dictionary

        #Did this VST window intersect with a FST CDR?
        if WinID in fstDict: #If this window is in the ensembl list you generated, then...:
            y = ", ".join(map(str, fstDict[WinID])) #This takes the list you generated from the Ensembl file and joins each entry in the list with a comma
            allDict[WinID][6] = y #Now it adds 'y' you generated in the line above to your dictionary

        #Did this VST window intersect with a gene?
        if WinID in ensGDict: #If this window is in the ensembl list you generated, then...:
            y = ", ".join(map(str, ensGDict[WinID])) #This takes the list you generated from the Ensembl file and joins each entry in the list with a comma
            allDict[WinID][7] = y #Now it adds 'y' you generated in the line above to your dictionary

    print 'There are now %i VST regions saved to the allDict dictionary for further processing' % (len(allDict))

    #Writing out results
    for keys in sorted(allDict.keys()): #Now loops through each 
        #print "\t".join(map(str,allDict[keys]))
        outFile.write("\t".join(map(str,allDict[keys])))
        outFile.write("\n")
    outFile.close()
    vstFile.close()
    return allDict

In [12]:
def gene_fst_intersect_summary(allDict):
    geneCount = 0
    fstCount = 0
    for keys in sorted(allDict.keys()):
        if allDict[keys][6] is not False:
            fstCount += 1
        if allDict[keys][7] is not False:
            geneCount += 1

    print '%i windows intersect with FST CDRs' % (fstCount)
    print '%i windows intersect with Ensembl genes' % (geneCount)

In [13]:
def make_gene_enrichment_list():
    for winID in allDict.keys():
        print 

In [14]:
for i in vstTypes:
    vstType = i
    print '\n############\n##%s\n############\n' % (vstType)
    intfile = vstDir + 'results/' + 'Intersect_' + vstType + '_VSTWindows_Ensembl81Genes.txt'
    
    ###############################
    #2. Intersecting with FST CDRs
    ###############################
    print 'Now intersecting VST regions with FST CDRs'
    intersect_with_FST_CDRs(vstType)
    
    ####################################
    #3. Intersecting with Ensembl Genes
    ####################################
    print 'Now intersecting VST regions with Ensembl 81 genes'
    newVstbedfile = vstDir + 'input/' + 'Mod_' + vstType + '_candidates_5.bed'
    intersect_with_Ensembl_Genes(newVstbedfile)

    ########################################################
    #4. Parse bedtools intersect file against Ensembl genes
    ########################################################
    print 'Now parsing Ensembl gene intersect file:\n', intfile
    ensGDict = parse_gene_intersect_file(intfile, vstType)
    print ensGDict

    ###################################################
    #5. Parse bedtools intersect file against FST CDRs
    ###################################################
    print 'Now parsing the FST CDR intersect file...'
    fstDict = parse_FST_CDR_intersect_file()
    
    ###########################################################
    #6. Summarizing VST and Ensembl hits to output text files
    ###########################################################
    print 'Now generating summary text files for %s results...' % (vstType)
    allDict = making_summary_text_files(vstType,fstDict,ensGDict)    
    
    ##################################################
    #7. How many genes and FST CDRs are intersecting?
    ##################################################
    print 'Now finding how many genes and CDR regions intersect with the %s results...' % (vstType)
    gene_fst_intersect_summary(allDict)
    
    ##################################################
    #8. Make gene list for GO gene enrichment steps
    ##################################################
    #print 'Now writing outfile with the gene IDs in these windows to search for enriched GO categories...'
    #make_gene_enrichment_list()
    
    
    
    
    


############
##FastCN
############

Now intersecting VST regions with FST CDRs
#Reading in candidate VST regions from:
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/FastCN_candidates_5.bed
Writing out new window IDs and modifying file structure for the following BED file:
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_FastCN_candidates_5.bed
Writing out new window IDs and modifying file structure for the following TEXT file:
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_FastCN_candidates_5.txt
Wrote new information for 120 VST windows to outfile
Now intersecting VST regions with FST CDRs
#Intersecting the FST and VST windows directly...
bedtools intersect -wo -a /home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/Mod_FastCN_candidates_5.bed -b /home/ampend/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/MergedWindows/FinalMergedCDRs/Final_CDRCoordinates_AllSNPSets_Table.bed > /home/a

In [38]:
#################
#Finding Zscores for CDRs
#################

#VST bedfiles that have Z-scores for all windows with at least 1.5 CN range (filtered VST window set)
fastcnfile = vstDir + 'input/' + 'FastCN_zscore_15CN_regions.bed' #'Mod_FastCN_candidates_5.txt'
quickmerfile = vstDir + 'input/' + 'QuicKmer_zscore_15CN_regions.bed' #'Mod_QuicKmer_candidates_5.txt'

#Saving these bedfiles to an array
fileArray = [fastcnfile, quickmerfile]

#Now we need to assign the Z-scores to the windows
for i,item in enumerate(cdrarr):
    chrom = cdrarr[i][0]
    start = int(cdrarr[i][1])
    end = int(cdrarr[i][2])
    WinID = cdrarr[i][3]
    #print 'Analyzing this window now...:\n', WinID
    #Reads through each file
    index = 0
    zscore1 = []
    zscore2 = []
    print WinID, start, end
    break
    for file in fileArray:
        #SET 1 SNPS ARE INDICES 1 & 2
        #SET 2 SNPS ARE INDICES 3 & 4
        index += 1 
        #open the file
        inFile = open(file, 'r')
        #reading through the Zscore file
        for line in inFile:
            line=line.rstrip()
            line=line.split('\t')
            if 'CHROM' in line:
                continue
            CHROM = line[0]
            START = int(line[1])
            END = int(line[2])
            ROA_Z = float(line[3])
            #Check: Does this chromosome match the CDR window chrom?
            if CHROM != chrom:
                continue #skip if the chromosomes dont match to the chr in CDR regions
            #Else, it's skipped
            #Now check start and end coordinates
            if start >= START and end <= END:
                if index == 1 or index == 2:
                    zscore1.append(ROA_Z)
                if index == 3 or index == 4:
                    zscore2.append(ROA_Z)
                print WinID,start,START,end,END,ROA_Z


CDR_1 98731163 98734162


In [None]:
    #If no SNPs were found in these windows then the lengths will 
    #	still be equal to zero. Just set the values in the dictionary to 'NA'
    #	and continue through the loop without calculating their means
    if len(zscore1) == 0: #SNP set 1
        cdrarr[i][4] = 'NA'
        #allDict[WinID][] = 'NA'
    if len(zscore2) == 0: #SNP set 2less 
        cdrarr[i][5] = 'NA'
        #allDict[WinID][11] = 'NA'
    #PROCESS SNP SET 1
    if len(zscore1) > 0:
        #Calculate the MEAN Zscores for the SNP1 and SNP2 sets
        #mean_Z1 = np.mean(zscore1) 
        #Now save these means to the dictionary in their respective indices 
        #cdrarr[i][4] = mean_Z1
        #Calculate the MAXIMUM Zscores for the SNP1 and SNP2 sets
        max_Z1 = np.max(zscore1) 
        #Now save these means to the dictionary in their respective indices 
        cdrarr[i][4] = max_Z1
    #DO THE SAME AS ABOVE FOR SNP SET 2
    if len(zscore2) > 0:
        #mean_Z2 = np.mean(zscore2)
        #cdrarr[i][4] = mean_Z2
        max_Z2 = np.max(zscore2)
        cdrarr[i][5] = max_Z2
    break
print cdrarr

In [None]:
###########################
#Creating bedfile
##########################
#INFILE
infile = options.input
inFile = open(infile, 'r')
print '\nInfile of significant windows is:\n ', options.input
#OUTFILE
newName = infile.replace('.txt','')
outfile = newName + '.bed'
print '\nBedfile is being written here:\n', outfile
outFile = open(outfile, 'w')

for line in inFile:
    #Skip header line	
    if 'Chrom' in line:
        continue

    line = line.rstrip()
    line = line.split()

    chrom = line[0]
    start = int(line[1]) - 1
    end = int(line[2])
    windowID = line[3]

    outFile.write('%s\t%i\t%i\t%s\n'%(chrom,start,end,windowID))
outFile.close()
inFile.close()



###########################
#Intersecting 
###########################
print '\nNow intersecting bedfiles...\n'
#Determining variables
inBedFile = outfile
geneBedFile = open(options.geneBed, 'r')
outfile = options.outdir + 'Intersect_' + 'SIGNonOverlappingWindows_' + 'FST_Ensembl81Genes.txt'

cmd = 'bedtools intersect -wo -a %s -b %s > %s' % (inBedFile, options.geneBed, outfile)
print cmd
genutils.runCMD(cmd)       

###############################
#Parse bedtools intersect file 
###############################
#Reading the intersect file results
intersectFile = open(outfile,'r') #outfile from the intersect step above
# going through Ensembl bedtool intersect results file
ensGDict = {} #ensGDict[windowID] = [gene1,gene2,gene3...]
ensID = ''
sigGenes = [] #To keep track of all gene IDs (ENSCAFG) that are in sig windows

for line in intersectFile:
    line = line.rstrip() #removing extraneous whitespace characters
    line = line.split() #delimiting "columns" in the file based on tabs		
    winID = line[3]
    hit = line[7].split('_')
    Gene = hit[1]
    ensGID = hit[2]
    sigGenes.append(Gene)
    if winID in ensGDict:
        ensGDict[winID].append(ensGID)
    else:
        ensGDict[winID] = []
        ensGDict[winID].append(ensGID) 
intersectFile.close()

############################################################################
# Input the bed file to make the dictionary
# The dictionary will have all outlier Fst windows, regardless if there are genes in those
#	windows
newName = options.input.replace('.txt', '')
Outfile = newName + '_GeneTable.txt' #Write out the new FST data here
print '\nWriting new FST data table with gene IDs here: \n', Outfile
outFile = open(Outfile, 'w')

#Defining header based on the structure of the dictionary you created:
headerLine = 'Chrom\tCDR_Start\tCDR_End\tCDR_ID\tCDR_Length\tSet\tGenes\tCDR_Coordinates\tSNP_Set1_OverlappingWindows_MeanZscore\t\tSNP_Set1_OverlappingWindows_MaxZscore\tSNP_Set2_OverlappingWindows_MeanZscore\tSNP_Set2_OverlappingWindows_MaxZscore\n'
outFile.write(headerLine)

WinID = '' #Setting equal to nothing
allDict = {} # defining the dictionary that will have all the results of bedtool intersect compiled by windows

windowCount = 0
inBedFile = open(options.input, 'r')

for line in inBedFile:
    if 'Chrom' in line: #skips header, if present
        continue
    line = line.rstrip() #removing extraneous whitespace characters
    line = line.split('\t') #delimiting "columns" in the file based on tabs		
    windowCount += 1 

    chr = line[0]
    start_pos = line[1]
    end_pos = line[2]
    WinID = line[3]
    winLength = line[4]
    callset = line[5]
    winCoord = chr + ":" + start_pos + "-" + end_pos
    #Open up dictionary for WinID
    #0=WinID, 1=chr, 2=start_pos, 3=end_pos, 4=zscore,
    #5 = CDR length
    #6=Ensembl Gene IDs (default = False)
    #7 = Window coordinates in one string (for ease of display in tables for manuscript)
    #8 = TOTAL SNPs (SET 1) - Mean ROA Zscore - Overlapping(Sliding) Window
    #9 = TOTAL SNPs (SET 1) - Maximum ROA Zscore - Overlapping(Sliding) Window
    #10 = CALLSET 3 SNPs (SET 2) - Mean ROA Zscore - Overlapping(Sliding) Window
    #11 = CALLSET 3 SNPs (SET 2) - Maximum ROA Zscore - Overlapping(Sliding) Window
    #once I get the script working I will add more to the dictionary. For example,
    # Broad bedtools intersect results
    allDict[WinID] = [chr,start_pos,end_pos,WinID,winLength,callset,False,winCoord,'NA', 'NA', 'NA', 'NA'] #setting up empty dictionary

    if WinID in ensGDict: #If this window is in the ensembl list you generated, then...:
        y = ", ".join(map(str, ensGDict[WinID])) #This takes the list you generated from the Ensembl file and joins each entry in the list with a comma
        allDict[WinID][6] = y
    #For when you add Broad for instance
    #if WinID in broadGDict:
    #	same thing as above
    #	now you'd add to allDict[WindID][6] or [7]

print allDict
# Sort dictionary
#sorted(allDict.keys())	

############################################################################
#Writing the geneID + proteinID of all ensembl 81 genes that intersect
#	with significant windows
####Input files:
##GeneList that went into BLAST2GO pipeline
#Format: [0] = ENSCAFP ID and [1] = ENSCAFG ID
inFile = open('/home/jmkidd/kidd-lab/ampend-projects/BLAST2GO/results/BLAST2GO_GeneTables_WithEnscafIDsAndChrom.txt', 'r')

#Saving gene-protein links that were processed with BLAST2GO to array
protGene = {}

for line in inFile:
    line = line.rstrip()
    line = line.split()
    geneID = line[1]
    protID = line[0]
    protGene[geneID] = []
    protGene[geneID] = protID
print '\n%i genes read into gene-protein ID array' % len(protGene)

#FOR BLAST2GO ENRICHMENT FILES
blast2goDir = '/home/jmkidd/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/BLAST2GO/'
enrFile = open(blast2goDir + options.root + 'EnrichmentGeneSet.txt', 'w')

#Goes through sig gene array from above and links the proteins and gene IDs together
#	to then write to the enrichment file that can be inputted into BLAST2GO
print 'Writing significant genes to test for enrichment in BLAST2GO to this file: ', enrFile
for i in sigGenes:
    geneID = i
    protID = protGene[i]
    enrFile.write('%s|%s\n' % (geneID, protID))


In [18]:
#MAKING UCSC TRACKS
zfile = vstDir + 'input/' + vstType + '_zscore_15CN_regions.bed'
print 'Reading in Z-scores from the following file\n', zfile
zFile = open(zfile, 'r')

for line in zFile:
    line = line.rstrip()
    line = line.split('\t')
    chrom = line[0]
    start = line[1]
    end = line[2]
    zScore = 
    print line
    break

Reading in Z-scores from the following file
/home/ampend/kidd-lab/ampend-projects/Angela/Feichen_VST/input/QuicKmer_zscore_15CN_regions.bed
chr1	81318	183299	-0.655461080375

