In [1]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
import sys
import numpy as np
import matplotlib.patches as patches
import glob
import gzip
import fileinput
import time
import datetime

def count_lines(f):
    """Return the number of lines in the text file located at path ``f``."""
    with open(f, 'r') as handle:
        # Iterating a text file yields one item per line, so counting items counts lines.
        return sum(1 for _ in handle)
def runCMD(cmd):
    """Run ``cmd`` through the shell and stop the program if it fails.

    cmd -- command line passed as a single string to the shell (shell=True),
           so redirections, pipes and globs inside it are interpreted by the shell.

    Returns None when the command exits with status 0. On any other exit
    status, prints 'command failed' followed by the command and calls
    sys.exit(1) (i.e. raises SystemExit).
    """
    process = subprocess.Popen(cmd, shell=True)
    exit_status = process.wait()
    if exit_status != 0:
        print ('command failed')
        print (cmd)
        sys.exit(1)

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 

In [2]:
def get_chrom_sizes(chromSizefile='/home/ampend/links/kidd-lab/www-mirror/www/track-hub/canFam3/canFam3.1-browser-chrom-sizes.fai'):
    """Read chromosome lengths from a whitespace-delimited chromosome-size file.

    chromSizefile -- path of the file to read; column 1 is the chromosome ID and
                     column 2 its length. Defaults to the canFam3.1 file this
                     notebook was written against, so existing no-argument calls
                     behave as before.

    Lines containing "chrUn" (unplaced contigs) or "chrM" (mitochondrion) are
    ignored, as are blank lines.

    Returns (chromSizeList, chromlist):
        chromSizeList -- dict mapping chromosome ID -> length (int)
        chromlist     -- list of the retained chromosome IDs in file order
    """
    print ('Reading in chromosome lengths from: ', chromSizefile)

    chromSizeList,chromlist = {},[]
    # The context manager guarantees the handle is closed even if a line fails to parse.
    with open(chromSizefile, 'r') as chromSizeFile:
        for line in chromSizeFile:
            if "chrUn" in line or "chrM" in line: #ignore chromosome unknowns and mito
                continue

            fields = line.split()
            if not fields: #blank line: nothing to parse
                continue

            chromID = fields[0] #chromosome ID
            chromlist.append(chromID)
            chromSizeList[chromID] = int(fields[1]) #length of chromosome

    return chromSizeList, chromlist
#####################################################################################################

In [3]:
# Root directory of the simulation project; the input/, results/ and scripts/ paths
# used by the later cells are all built by prefixing this string (note the trailing slash).
wkDir = '/home/ampend/links/kidd-lab/ampend-projects/Angela/Simulations/'

# Getting chromosome sizes: a dict of chromosome ID -> length, plus the list of chromosome IDs.
# Both names are module-level globals that functions defined in later cells read directly.
chromSizeList,chromlist = get_chrom_sizes()


Reading in chromosome lengths from:  /home/ampend/links/kidd-lab/www-mirror/www/track-hub/canFam3/canFam3.1-browser-chrom-sizes.fai


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 

In [8]:
# Build one whole-genome frq.count file per population for each of simulations 0-9
# by concatenating that simulation's per-chromosome files with `cat`.
# The per-chromosome files are picked up by a shell glob, so they are joined in the
# shell's glob ordering. Wolves are merged before village dogs within each simulation.
for sim in range(10):
    for population in ('wolf', 'dog'):
        cmd = ('cat {wk}input/{pop}_chr*.{sim:d}.simulation.recode.frq.count'
               ' > {wk}input/{pop}_{sim:d}.simulation.recode.frq.count').format(wk=wkDir, pop=population, sim=sim)
        runCMD(cmd)
print('DONE merging the dog and wolf frq.count files within simulation')

cat /home/ampend/links/kidd-lab/ampend-projects/Angela/Simulations/input/wolf_chr*.0.simulation.recode.frq.count > /home/ampend/links/kidd-lab/ampend-projects/Angela/Simulations/input/wolf_0.simulation.recode.frq.count
cat /home/ampend/links/kidd-lab/ampend-projects/Angela/Simulations/input/dog_chr*.0.simulation.recode.frq.count > /home/ampend/links/kidd-lab/ampend-projects/Angela/Simulations/input/dog_0.simulation.recode.frq.count
cat /home/ampend/links/kidd-lab/ampend-projects/Angela/Simulations/input/wolf_chr*.1.simulation.recode.frq.count > /home/ampend/links/kidd-lab/ampend-projects/Angela/Simulations/input/wolf_1.simulation.recode.frq.count
cat /home/ampend/links/kidd-lab/ampend-projects/Angela/Simulations/input/dog_chr*.1.simulation.recode.frq.count > /home/ampend/links/kidd-lab/ampend-projects/Angela/Simulations/input/dog_1.simulation.recode.frq.count
cat /home/ampend/links/kidd-lab/ampend-projects/Angela/Simulations/input/wolf_chr*.2.simulation.recode.frq.count > /home/ampend/

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 

In [24]:
def get_chr_pos(infile):
    """Read the data rows of a frq.count file as lists of column strings.

    infile -- path of a whitespace-delimited file whose first column is the
              chromosome and whose second column is the position.

    Lines whose first character is 'C' (the header) and blank lines are skipped.

    Returns a list with one entry per remaining line; each entry is the full
    list of that line's columns (callers in this notebook only use columns 0 and 1).

    Side effect: appends a row's chromosome to the module-level ``chromlist``
    whenever it differs from the last element currently in that list.
    """
    chr_pos = []
    # Context manager so the handle is released even when a row fails to parse.
    with open(infile, 'r') as handle:
        for line in handle:
            if line[0] == 'C':
                continue
            fields = line.split() #split() with no argument also discards the trailing newline
            if not fields: #blank line: nothing to record
                continue
            # NOTE(review): this mutates the global chromlist returned by get_chrom_sizes();
            # kept for backward compatibility -- confirm whether any caller still relies on it.
            if len(chromlist) == 0 or fields[0] != chromlist[-1]:
                chromlist.append(fields[0])
            chr_pos.append(fields)
    return chr_pos
#####################################################################################################
def process_inputlines(line):
    """Extract the two allele counts from one data line of a frq.count file.

    line -- one whitespace-delimited text line; columns 5 and 6 (indices 4 and 5)
            hold the allele entries, and the count is whatever follows the last
            ':' in each entry (the whole entry when it contains no ':').

    Returns a tuple (count_of_column_5, count_of_column_6) as floats.
    """
    fields = line.split() #split() with no argument ignores leading/trailing whitespace
    counts = [float(fields[column].split(':')[-1]) for column in (4, 5)]
    return counts[0], counts[1]
#####################################################################################################
#####################################################################
# PER JEFF:
# Calculates estimate of 2 pop Fst using Hudson's estimator
# as described in Bhatia et al 2013
# input is [a1_count,a2_count][a1_counts,a2_count]
# returns (fst,numerator,denominator) for use in taking ratio of averages for multiple markers
def fst_hudson_twopop(pop1,pop2):
    """Single-site, two-population Fst computed from allele counts.

    pop1, pop2 -- indexable pairs [allele1_count, allele2_count], one per population.

    Returns (fst, numerator, denominator) so that callers can either average the
    per-site ratios or take the ratio of averaged numerators and denominators.

    Raises ZeroDivisionError when a population has a total count of 0 or 1, or
    when the denominator is 0 (both populations carry only the same allele).
    """
    size1 = pop1[0] + pop1[1]  # total allele count sampled in population 1
    size2 = pop2[0] + pop2[1]  # total allele count sampled in population 2
    freq1 = float(pop1[0]) / float(size1)  # allele-1 frequency in population 1
    freq2 = float(pop2[0]) / float(size2)  # allele-1 frequency in population 2

    freq_gap = freq1 - freq2
    # Per-population correction terms subtracted from the squared frequency difference
    correction1 = (freq1*(1.0-freq1))/(size1-1)
    correction2 = (freq2*(1.0-freq2))/(size2-1.0)

    numerator = freq_gap*freq_gap - correction1 - correction2
    denominator = freq1*(1.0-freq2) + freq2*(1.0-freq1)

    return (float(numerator)/float(denominator), numerator, denominator)
#####################################################################################################
def calc_windows_fst_data(fstOutFile,chrom):
    """Write sliding-window Fst summaries for one chromosome to ``fstOutFile``.

    Windows start at position 1 and advance 50,000 bp at a time; each spans
    200,000 bp and is clipped to the chromosome length for the reported end.
    For every window that is kept, one tab-separated row is written:
        chromosome, window start, window end, "<chromosome>_<start>",
        sites in window, sites used for Fst, sites skipped as fixed,
        ratio-of-averages Fst (RofA), average-of-ratios Fst (AofR)

    Parameters
        fstOutFile -- open, writable file object. It is always closed before this
                      function returns or raises.
        chrom      -- chromosome identifier without the 'chr' prefix (e.g. 1).

    Reads these module-level globals, which the calling cell must have set:
        chr_pos        -- list of rows; row[0] is the chromosome, row[1] the position
        pop1, pop2     -- per-site [allele1_count, allele2_count], index-aligned with chr_pos
        chromSizeList  -- dict mapping 'chrN' -> chromosome length

    Raises KeyError if 'chr' + str(chrom) is not a key of chromSizeList.
    """
    WINDOW_SIZE = 200000      # bp spanned by each window
    WINDOW_STEP = 50000       # bp between consecutive window starts
    MAX_SCAN_POS = 130000000  # hard upper bound on window starts

    chrom = 'chr' + str(chrom)
    n_sites = len(chr_pos)
    head_idx, tail_idx, last_line = 0, 0, 0
    try:
        chrom_length = int(chromSizeList[chrom])
        for i in range(1, MAX_SCAN_POS, WINDOW_STEP):
            window_start = i
            window_end = i + WINDOW_SIZE - 1
            if window_start > chrom_length:
                break
            if window_end > chrom_length: #clip the reported end to the chromosome length
                window_end = chrom_length

            # Move the head to the first site at or after the window start.
            # The bounds test comes first: without it, an empty chr_pos, or a head that
            # walks past the last site after a skipped window, raised IndexError.
            while (head_idx < n_sites
                   and int(chr_pos[head_idx][1]) < i
                   and chr_pos[head_idx][0] == chrom):
                head_idx += 1
            if head_idx >= n_sites:
                break
            if tail_idx >= n_sites:
                break

            # Move the tail to the first site beyond the window.
            # NOTE(review): the bound is i + WINDOW_SIZE, one base past the reported
            # window_end (i + WINDOW_SIZE - 1); left unchanged to keep results identical.
            while (int(chr_pos[tail_idx][1]) <= (i + WINDOW_SIZE) and chr_pos[tail_idx][0] == chrom):
                tail_idx += 1
                if tail_idx == n_sites:
                    last_line = 1
                    break
            tail_ind_new = tail_idx - 1 #index of the last site inside the window

            # Columns for output
            window_chr = chr_pos[head_idx][0]
            # N_varients is (number of sites in the window - 1), so this test skips
            # windows holding zero sites and windows holding exactly one site.
            N_varients = tail_ind_new - head_idx
            if N_varients < 1:
                continue

            N_varients_fixed,N_varients_tot,N_varients_forFst = 0,0,0
            fList, numList, denomList = [], [], []
            for k in range(head_idx, tail_ind_new+1): #per-site f, numerator, denominator
                N_varients_tot += 1
                # A site is skipped when both populations have a zero count for the same allele.
                if pop1[k][0] == 0 and pop2[k][0] == 0 or pop1[k][1] == 0 and pop2[k][1] == 0:
                    print ('pop1[k][0] and pop2[k][0]', pop1[k][0], pop2[k][0])
                    print ('pop1[k][1] and pop2[k][1]', pop1[k][1], pop2[k][1])
                    N_varients_fixed+=1
                    continue
                f, num, denom = fst_hudson_twopop(pop1[k],pop2[k])
                N_varients_forFst += 1
                fList.append(f)
                numList.append(num)
                denomList.append(denom)

            if len(fList) == 0: #every site in the window was skipped
                continue

            #AofR = average of the per-site Fst ratios
            AofR = float(sum(fList))/len(fList)
            #RofA = average numerator divided by average denominator
            meanNum = float(sum(numList))/len(numList)
            meanDenom = float(sum(denomList))/len(denomList)
            RofA = float(meanNum)/meanDenom

            fstOutFile.write('%s\t%s\t%s\t%s_%s\t' % (window_chr, window_start, window_end, window_chr, window_start))
            fstOutFile.write('%i\t%i\t%i\t%f\t%f\n' % (N_varients_tot, N_varients_forFst, N_varients_fixed, RofA, AofR))

            # Once the tail has consumed the last site the scan stops after this window,
            # so later overlapping windows are not emitted.
            if last_line == 1:
                break
            if chr_pos[tail_idx][0] != chrom: #next site belongs to another chromosome
                head_idx = tail_idx
                break
    finally:
        # Close on every path, including errors, so the handle is never leaked.
        fstOutFile.close()
#####################################################################################################
def filter_windows(fstOutfile):
    """Copy the windows that contain at least 10 variants to a sibling file.

    fstOutfile -- path of a tab-separated window file whose fifth column
                  (index 4) holds the window's variant count.

    Writes ``fstOutfile + '_filtered_windows'``. The header line (first column
    contains 'CHROM') is copied through unchanged; data lines are copied only
    when their variant count is >= 10; blank lines are ignored. Prints the
    output path and the number of windows kept. Returns None.
    """
    filteredPath = fstOutfile + '_filtered_windows'
    print('Writing filtered windows (at least 10 SNPs) to output file: ', filteredPath )
    count = 0 #number of windows with at least 10 variants
    # Both handles are closed by the context manager even if a line fails to parse.
    with open(fstOutfile, 'r') as inFile, open(filteredPath, 'w') as outFile:
        for line in inFile:
            if not line.strip(): #blank line: no columns to inspect
                continue
            LINE = line.rstrip().split('\t')
            if 'CHROM' in LINE[0]: #header: pass through as-is
                outFile.write(line)
                continue
            var_count = int(LINE[4]) #variant count for this window
            if var_count >= 10:
                count += 1
                outFile.write(line)
    print('%i windows with at least ten variants' % count)
#####################################################################################################


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 

In [38]:
# Driver cell: for each simulation, write one windowed-Fst file per chromosome (1-38),
# merge those files, keep windows with at least ten variants, then run the Z-transform R script.
# Record and print the start time.
ts1 = time.time()
st = datetime.datetime.fromtimestamp(ts1).strftime('%Y-%m-%d %H:%M:%S')
print('START= ',st)

for i in range(0,10):
    print('Processing Simulation %i' % i)
    # NOTE(review): only simulation 3 is actually processed; every other simulation is
    # announced by the print above and then skipped. Looks like a leftover re-run filter -- confirm.
    if i != 3:
        continue
    for chrom in range(1,39):
        print('Chr%i' % chrom)
        # The bare string below is a disabled earlier version of the input paths (it has no effect).
        """dogCountFile = wkDir + 'input/' + 'dog_chr1.0.simulation.recode.frq.count'
        wolfCountFile = wkDir + 'input/' + 'wolf_chr1.0.simulation.recode.frq.count'"""
        #IN FILES
        dogCountFile = wkDir + 'input/' + 'dog_chr%i.%i.simulation.recode.frq.count' % (chrom,i)
        wolfCountFile = wkDir + 'input/' + 'wolf_chr%i.%i.simulation.recode.frq.count' % (chrom,i)
        #OUT FILES
        fstOutfile = wkDir + 'results/' + 'Simulation.chr%i.%i_Hudson_Fst_200kbWindow_50kbSlide.txt' % (chrom,i)#Write out the new FST data here
        fstOutFile = open(fstOutfile, 'w')
        #write header to outfile:
        fstOutFile.write('CHROM\tSTART_POS\tEND_POS\tWINDOW_ID\tN_variants_tot\tN_variants_forFst\tN_variants_fixed\tRofA_Fst\tAofR_Fst\n')

        #1. Get SNP positions
        # chr_pos, pop1 and pop2 are module-level names that calc_windows_fst_data reads as globals,
        # so they must all be rebound for the current chromosome before step 4.
        chr_pos = get_chr_pos(dogCountFile)

        #2. Process dog SNP frequency file
        pop1 = [] #dog allele counts
        for line in fileinput.input([dogCountFile]):  #change count file for dog and count file for wolf here
            if line[0] == 'C': #skip the header line
                continue 
            temp = process_inputlines(line) #temp's structure is allele1, allele2
            pop1.append(temp) #this is adding the two alleles to pop1

        #3. Process WOLF SNP frequency file
        # NOTE(review): pop2 is indexed with the same site index as chr_pos/pop1 downstream,
        # which assumes the wolf file lists the same sites in the same order -- confirm.
        pop2 = [] #wolf allele counts
        for line in fileinput.input([wolfCountFile]):	
            if line[0] == 'C': #skip the header line
                continue 
            temp = process_inputlines(line)
            pop2.append(temp)

        #4. Make sliding windows and calculate Fst within them
        # calc_windows_fst_data closes fstOutFile itself once it has finished writing.
        calc_windows_fst_data(fstOutFile,chrom)
        
    #5. Merge the individual chromosome FST data together per simulation
    # Per-chromosome headers are stripped with grep -v "CHROM"; a single header is re-added below.
    simulation = i
    mergedOutfile = wkDir + 'results/' + 'Simulation.%i_Hudson_Fst_200kbWindow_50kbSlide.txt' % simulation
    cmd = 'cat %sresults/Simulation.chr*.%i_Hudson_Fst_200kbWindow_50kbSlide.txt | grep -v "CHROM" > %s' % (wkDir,simulation,mergedOutfile)
    runCMD(cmd)

    # Prepend the header by writing to a scratch file named 'temp' in the current working
    # directory and then moving it over the merged file.
    header = 'CHROM\tSTART_POS\tEND_POS\tWINDOW_ID\tN_variants_tot\tN_variants_forFst\tN_variants_fixed\tRofA_Fst\tAofR_Fst'
    cmd = 'echo \'%s\' | cat - %s >  temp && mv temp %s' % (header, mergedOutfile,mergedOutfile)
    runCMD(cmd)    
    
    #6. Filter windows == must have at least ten SNPs within it
    filter_windows(mergedOutfile)
    
    #7. Z-transform the data (external R script run on the filtered file)
    cmd = 'Rscript %sscripts/Ztransform.R %s_filtered_windows' % (wkDir,mergedOutfile)
    runCMD(cmd)
    
    #break
# NOTE(review): this only touches the handle opened for the last chromosome, which
# calc_windows_fst_data has already closed, so it is redundant.
fstOutFile.close()

#Check how long it took
# Prints the wall-clock finish time only; no elapsed duration is computed from ts1.
print(datetime.datetime.fromtimestamp(time.time()).strftime('%H:%M:%S'))
 

START=  2017-08-16 15:03:52
Processing Simulation 0
Processing Simulation 1
Processing Simulation 2
Processing Simulation 3
Chr1
Chrom:  chr1
Window count 2451
Chr2
Chrom:  chr2
Window count 1706
Chr3
Chrom:  chr3
Window count 1835
Chr4
Chrom:  chr4
Window count 1763
Chr5
Chrom:  chr5
Window count 1776
Chr6
Chrom:  chr6
Window count 1549
Chr7
Chrom:  chr7
Window count 1617
Chr8
Chrom:  chr8
Window count 1484
Chr9
Chrom:  chr9
Window count 1219
Chr10
Chrom:  chr10
Window count 1384
Chr11
Chrom:  chr11
Window count 1485
Chr12
Chrom:  chr12
Window count 1447
Chr13
Chrom:  chr13
Window count 1262
Chr14
Chrom:  chr14
Window count 1217
Chr15
Chrom:  chr15
Window count 1281
Chr16
Chrom:  chr16
Window count 1190
Chr17
Chrom:  chr17
Window count 1283
Chr18
Chrom:  chr18
Window count 1114
Chr19
Chrom:  chr19
Window count 1072
Chr20
Chrom:  chr20
Window count 1160
Chr21
Chrom:  chr21
Window count 1015
Chr22
Chrom:  chr22
Window count 1226
Chr23
Chrom:  chr23
Window count 1043
Chr24
Chrom:  chr24


In [None]:
# Build, for each simulation 0-9, the path of its RofA Fst Z-score file under results/.
# NOTE(review): presumably this file is produced by scripts/Ztransform.R -- confirm.
# NOTE(review): the loop only rebinds inFile on every pass and never reads it; the cell
# was not executed and appears unfinished.
for i in range(0,10):
    simulation = i
    inFile = wkDir + 'results/' + 'Simulation.%i_Hudson_Fst_200kbWindow_50kbSlide.txt_filtered_windows_RofA_Fst_Zscores.txt' % (simulation)

