# 2017-10-18
# A. Pendleton
# Re-processing the first round of FST CDRs using the Total SNP Set. 

In [1]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
import sys
import numpy as np
import matplotlib.patches as patches
import gzip
import fileinput
import glob
from scipy import stats
import re
from matplotlib_venn import venn3, venn3_circles

def count_lines(f):
    """Return the number of lines in the text file at path ``f``.

    An empty file yields 0.
    """
    with open(f, 'r') as handle:
        return sum(1 for _ in handle)
def runCMD(cmd):
    """Run ``cmd`` through the shell and wait for it to finish.

    Returns None when the command exits with status 0. On any other exit
    status the failure notice and the command are printed and the
    interpreter is asked to exit with status 1 (raises SystemExit).
    """
    exit_status = subprocess.Popen(cmd, shell=True).wait()
    if exit_status != 0:
        print ('command failed')
        print (cmd)
        sys.exit(1)

In [3]:
# Chromosome names to process: 'chr1' through 'chr38', then 'chrX'.
# Built with a comprehension so that no global named `chr` is left behind;
# the previous loop variable shadowed Python's built-in chr() for every
# later cell in the notebook.
chromToDo = ['chr' + str(num) for num in range(1, 39)]
chromToDo.append('chrX')

In [4]:
# Base directory for this analysis; other cells append 'input/' and 'results/' to it.
# The trailing slash is required because those paths are built by plain string concatenation.
inDir = '/home/ampend/links/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/ReProcessing_PrimaryCDRs_AxelssonMethods/'

In [13]:
#Write bedfile for subsequent intersections
fstFile = inDir + 'input/' + 'AutoXParXNonPar_Sliding_TotalSIGFstCalls_54callset_mxbRemoved_GeneTable.txt'
bedPath = inDir + 'input/' + 'AutoXParXNonPar_Sliding_TotalSIGFstCalls_54callset_mxbRemoved.bed'

# Both handles are managed by `with` so they are closed even if a row fails to parse
# (previously the input handle was never closed and the output handle leaked on error).
with open(fstFile, 'r') as fstTable, open(bedPath, 'w') as bedFile:
    for line in fstTable:
        if 'Chrom' in line: #skips header
            continue
        line=line.rstrip().split('\t')
        # Input columns used (0-based): 0=chrom, 1=start, 2=end, 3=ID, 7 -> AverageROA, 9 -> AverageZ.
        # The start is decremented by 1 -- presumably converting a 1-based start to
        # BED's 0-based start; confirm against the coordinate system of the input table.
        chrom,start,end,ID,AverageROA, AverageZ = line[0], int(line[1])-1, int(line[2]), line[3], float(line[7]), float(line[9])
        bedFile.write('%s\t%i\t%i\t%s\t%f\t%f\n' % (chrom,start,end,ID, AverageROA, AverageZ))

########################################################################################

In [17]:
def do_intersects(a, b, outfile):
    """Intersect two BED files with bedtools and return the parsed hits.

    Runs ``bedtools intersect -wo -a a -b b`` through the shell, redirecting
    its output to ``outfile``, then hands that file to ``process_intersect``
    and returns the dictionary it builds.
    """
    runCMD('bedtools intersect -wo -a %s -b %s > %s' % (a, b, outfile))
    return process_intersect(outfile)
def process_intersect(outfile):
    """Parse a bedtools intersect output file into a dict of hits per window.

    Each non-blank line is split on whitespace; field 4 (index 3) is used as
    the window ID and field 10 (index 9) as the ID of the feature it hit.

    Returns:
        dict mapping window ID -> list of hit IDs, in file order.
    """
    Dict = {}
    # `with` closes the handle; the previous inline open() leaked it.
    with open(outfile, 'r') as hits:
        for line in hits:
            fields = line.rstrip().split()
            if not fields:
                # Tolerate blank lines instead of crashing on them.
                continue
            ID = fields[3]
            hitID = fields[9]
            Dict.setdefault(ID, []).append(hitID)
    return Dict
def gene_intersects(a,b,intersectfile):
    """Find features of ``b`` near each window of ``a`` and group them by window.

    Runs ``bedtools window -w 50000 -a a -b b`` through the shell, writing the
    result to ``intersectfile``, then parses that file.

    For every non-blank, tab-separated output line, field 4 (index 3) is the
    window ID and field 10 (index 9) is the hit name. The hit name is split
    on '_' and its third piece is stored; the name must therefore contain at
    least three '_'-separated pieces or an IndexError is raised.

    Returns:
        dict mapping window ID -> list of the stored third name pieces,
        in file order.
    """
    # The output path must be the `intersectfile` argument. The earlier version
    # interpolated a global named `outfile`, so it only worked when a caller
    # happened to have set that global to the same path beforehand.
    cmd = 'bedtools window -w 50000 -a %s -b %s > %s' % (a, b, intersectfile)
    runCMD(cmd)

    ensGDict = {} #ensGDict[windowID] = [gene1,gene2,gene3...]
    with open(intersectfile,'r') as hits:
        for line in hits:
            if not line.strip():
                continue
            line = line.rstrip().split('\t')
            winID = line[3]
            ensGID = line[9].split('_')[2]
            ensGDict.setdefault(winID, []).append(ensGID)
    return ensGDict

In [19]:
resultsDir = inDir + 'results/'
#Our merged CDRs
cdrfile = inDir + 'input/' + 'AutoXParXNonPar_Sliding_TotalSIGFstCalls_54callset_mxbRemoved.bed'

# NOTE: `outfile` is deliberately reassigned before each call below; keep each
# assignment directly above the call that consumes it.

#Get ensembl gene intersects -- save to Dict
genebedfile = '/home/ampend/links/kidd-lab/ampend-projects/BLAST2GO/results/BLAST2GO_Ensembl81_GeneTables_WithEnscafIDsAndChrom.bed'
outfile = resultsDir + 'Intersect_PrimaryCDRs_with_EnsemblGenes_50kbIntersectWindow.txt'
ensGDict = gene_intersects(cdrfile, genebedfile, outfile)
#Get Axelsson loci intersects -- save to Dict
axelssonBedFile = '/home/ampend/links/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/Axelsson_CaganBlass/input/Axelsson_canfam3.1.bed'
outfile = resultsDir + 'Intersect_PrimaryCDRs_with_AxelssonLoci.txt'
axDict = do_intersects(cdrfile, axelssonBedFile, outfile)
#Get CaganBlass loci intersects -- save to Dict
# (a stray trailing space was removed from this path; it only worked before
# because the shell splits the command on whitespace)
caganBlassBedFile = '/home/ampend/links/kidd-lab/ampend-projects/Angela/Re-RunningAnalysis_NewScripts/new_results/results/Axelsson_CaganBlass/input/CaganBlass_canfam3.1.bed'
outfile = resultsDir + 'Intersect_PrimaryCDRs_with_CaganBlassLoci.txt'
cbDict = do_intersects(cdrfile, caganBlassBedFile, outfile)

In [20]:
cdrCount = 0
allDict = {}

# Layout of each allDict[WinID] record (list indices):
#0=chr, 1=start_pos, 2=end_pos, 3=WinID, 4=Window length, 5=WindowCoordinates(forUCSC)
#6=RofA_WindowAverage_Fst #7 = Average_Z_Score_For_MergedWindow
#8=Intersects with Ensembl Gene IDs (default = empty string)
#9=Intersects with Axelsson Loci (default = empty string)
#10=Intersects with Cagan/Blass Loci (default = empty string)
# Each annotation dictionary is paired with the record index it fills.
annotationColumns = [(ensGDict, 8), (axDict, 9), (cbDict, 10)]

# `with` guarantees the BED file handle is closed (it was previously left open).
with open(cdrfile, 'r') as cdrFile:
    for line in cdrFile:
        if 'Chrom' in line: #skips a header line, if one is present
            continue
        if not line.strip(): #skips blank lines
            continue
        line=line.rstrip().split('\t')
        cdrCount += 1
        chrom, start_pos, end_pos, WinID, Fst,Z = line[0:6]
        # NOTE(review): if start_pos is a 0-based BED start (the BED-writing cell
        # subtracts 1 from the start), then end - start + 1 overcounts the length
        # by one and the UCSC-style coordinate should use start + 1. Left
        # unchanged so existing results stay comparable -- confirm and fix.
        winLength = int(end_pos) - int(start_pos) + 1
        winCoord = chrom + ":" + str(start_pos) + "-" + str(end_pos)
        record = [chrom,start_pos,end_pos,WinID,winLength,winCoord,Fst,Z,'','','']
        # For each annotation source that has hits for this window, store them
        # as a single comma-separated string in that source's column.
        for hitDict, column in annotationColumns:
            if WinID in hitDict:
                record[column] = ", ".join(map(str, hitDict[WinID]))
        record[3] = WinID.replace('FST','CDR')  #now we can call them CDRs instead of FST
        # Keyed by the original (FST) window ID; only the displayed ID is renamed.
        allDict[WinID] = record
    
#Sorts a dictionary's alphanumeric keys
def key_func(s):
    """Build a natural-sort key for the string ``s``.

    The string is cut into alternating runs of non-digits and digits; digit
    runs become ints and everything else stays a string, so that e.g.
    'CDR_2' sorts before 'CDR_10'.
    """
    pieces = []
    for token in re.findall(r'\D+|\d+', s):
        pieces.append(int(token) if token.isdigit() else token)
    return pieces
sorted_keys = sorted(allDict, key=key_func)

# Write one tab-separated row per window, in the order given by key_func, and
# echo each row to the cell output. `with` closes the file even if a row fails.
with open(resultsDir + 'SummaryTable_PrimaryCDRs_Genes_Axelsson_CaganBlass.txt','w') as outFile:
    for key in sorted_keys:
        row = "\t".join(map(str,allDict[key]))
        print (row)
        outFile.write(row)
        outFile.write("\n")

chr1	2350000	3250000	CDR_1	900001	chr1:2350000-3250000	0.564374	5.992389	GALR1, MBP, ZNF236	AX_1	CB_1, CB_2
chr1	79800000	80150000	CDR_2	350001	chr1:79800000-80150000	0.557803	5.897154	ENSCAFG00000030357	AX_4	
chr3	18800000	19050000	CDR_3	250001	chr3:18800000-19050000	0.514167	5.264731		AX_7	
chr4	40800000	41000000	CDR_4	200001	chr4:40800000-41000000	0.512679	5.243158	TLX3, ENSCAFG00000016912, RANBP17	AX_10	
chr5	3850000	4250000	CDR_5	400001	chr5:3850000-4250000	0.539992	5.639010	SNX19		
chr6	39900000	40100000	CDR_6	200001	chr6:39900000-40100000	0.523042	5.393353	HAGHL, CCDC78, NARFL, METRN, WDR24, FBXL16, RHBDL1, RHOT2, FAM195A, WFIKKN1, PRR35, NHLRC4, CAPN15, ENSCAFG00000032180, ENSCAFG00000019677, DECR2, MRPL28, C16orf13, WDR90, JMJD8, STUB1, RAB11FIP3, TMEM8A, PIGQ		
chr6	40350000	40600000	CDR_7	250001	chr6:40350000-40600000	0.515529	5.284464	ENSCAFG00000032615, ENSCAFG00000031055, HBM, ENSCAFG00000029904, ENSCAFG00000028569, ENSCAFG00000024468, ENSCAFG00000028465, MPG, SNRNP25, EN