Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 143 additions & 4 deletions cms/combine/recalc_func.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
##
## 07.23.16 vitti@broadinstitute.org
## functions for transforming component score calculations as part of composite.py
## last updated 07.24.16 vitti@broadinstitute.org
from math import fabs
import sys
import os
Expand Down Expand Up @@ -48,8 +48,147 @@ def interpolate_haps(starts, ends, scores, ihh1s, ihh0s, thisPos, freqs):
else:
pass #advance window.
return float('nan'), float('nan'), float('nan'), float('nan')
def calc_winIhs(score_array):
"""returns mean of absolute value of all scores in window"""
def calc_windowscore(score_array):
"""returns mean of absolute value of all scores in window as part of winiHS (see Schlebusch et al 2012)"""
absolutes = [fabs(x) for x in score_array]
total = sum(absolutes)
return total/len(score_array)
def windows(windowsize, jumplen, infilename, writefilename):
'''from winiHS.py (/windelihh?)'''
posindex, scoreindex = 1, 6
ihh1_index, ihh0_index, freq1_index = 3,4,2

positions, scores = [], []
ihh1s, ihh0s, freq1s = [], [], []
infile = open(infilename, 'r')
for line in infile:
entries = line.split()
position, score = int(entries[posindex]), float(entries[scoreindex])
positions.append(position)
scores.append(score)

ihh1, ihh0, freq1 = float(entries[ihh1_index]), float(entries[ihh0_index]), float(entries[freq1_index])
ihh1s.append(ihh1)
ihh0s.append(ihh0)
freq1s.append(freq1) #not sure that this even counts for anything.

infile.close()

###################
## DEFINE WINDOWS #
#this is what I interpret
#Schlebusch to be doing (?)

numSnps = len(scores)
print(str(numSnps) + " scores available")
all_phys_windows, all_score_windows = [], []
all_ihh1_windows, all_ihh0_windows, all_freq1_windows = [], [], []
iSnp = 0
while iSnp < numSnps:
window_phys, window_scores = [], []
window_ihh1, window_ihh0, window_freq1 = [], [], []
while len(window_scores) < windowsize:
if iSnp < numSnps: #?
window_phys.append(positions[iSnp])
window_scores.append(scores[iSnp])
window_ihh1.append(ihh1s[iSnp])
window_ihh0.append(ihh0s[iSnp])
window_freq1.append(freq1s[iSnp])
else:
break
iSnp +=1
all_phys_windows.append(window_phys)
all_score_windows.append(window_scores)
all_ihh1_windows.append(window_ihh1)
all_ihh0_windows.append(window_ihh0)
all_freq1_windows.append(window_freq1)
iSnp += jumplen
#print(str(len(all_score_windows)))
numWindows = len(all_score_windows)
print("chunked " + str(numSnps) + " SNPs with available haplotype scores into " + str(numWindows) + " windows of size " + str(windowsize) + " SNPs with jump length " + str(jumplen) + ".")

################################
## CALC AVE AND WRITE TO FILE #
################################

writefile = open(writefilename, 'w')
for iWindow in range(numWindows):
positions = all_phys_windows[iWindow]
startPos, endPos = positions[0], positions[1]
scores = all_score_windows[iWindow]
winiHS = calc_winIhs(scores)

ihh1_scores = all_ihh1_windows[iWindow] #since these are all positive,
ihh0_scores = all_ihh0_windows[iWindow] #we can just reuse calc_winIhs func
freq1_vals = all_freq1_windows[iWindow] #to get averages.

winihh1 = calc_winIhs(ihh1_scores)
winihh0 = calc_winIhs(ihh0_scores)
freq_ave = calc_winIhs(freq1_vals)

#writeline = str(startPos) + "\t" + str(endPos) + "\t" + str(winiHS) +'\n'
writeline = str(startPos) + "\t" + str(endPos) + "\t" + str(winiHS) +'\t' + str(winihh1) + "\t" + str(winihh0) + "\t" + str(freq_ave) + "\n"
writefile.write(writeline)
writefile.close()
print("wrote to file: " + writefilename)
def interpolate_from_windows(inputTpedFilename, inputIhsFilename, inputWinihsFilename, outputFilename):
'''from interpolate.py'''
all_snps = []
openfile = open(inputTpedFilename, 'r')
for line in openfile:
entries = line.split()
pos = int(entries[1])
all_snps.append(pos)
openfile.close()
nsnps = len(all_snps)
print('loaded ' + str(nsnps) + ' snps from' + inputTpedFilename)

# load scores and windows from winIhs
openfile = open(inputWinihsFilename, 'r')
starts, ends, scores = [], [], []
ihh1s, ihh0s = [], []
freqs = []

for line in openfile:
entries = line.split()
startBp, endBp, winIhs = int(entries[0]), int(entries[1]), float(entries[2])

ihh1, ihh0 = float(entries[3]), float(entries[4])
freq_win = float(entries[5])

starts.append(startBp)
ends.append(endBp)
scores.append(winIhs)
ihh1s.append(ihh1)
ihh0s.append(ihh0)
freqs.append(freq_win)
openfile.close()

readfile = open(inputIhsFilename, 'r')
writefile = open(outputFilename, 'w')
readline = readfile.readline()
entries = readline.split()
thisScorePos = int(entries[0])

for iSnp in range(nsnps):
thisPos = all_snps[iSnp]
if thisPos < thisScorePos: #interpolate until we advance to the first one for which we have a score
interpolated_ihs, interpolated_ihh1, interpolated_ihh0, interpol_freq = interpolate(starts, ends, scores, ihh1s, ihh0s, thisPos, freqs)

unnormed_ihs= 0 #dummy to pass to selscan so norm will function
writeline = str(thisPos) + "\t" + str(thisPos) + "\t" + str(interpol_freq) + "\t" + str(interpolated_ihh1) + "\t" + str(interpolated_ihh0) + "\t" + str(unnormed_ihs) + "\t" + str(interpolated_ihs) + "\t9\n" #index interpolated in outputfile, in case it's not already obvious?
writefile.write(writeline)
elif thisPos == thisScorePos: #found a match
writefile.write(readline) #propagate to writefile
readline = readfile.readline() #advance to next SNP for which we have a full iHS calc
entries = readline.split()
if len(entries) < 1:
break
else:
thisScorePos = int(entries[0])
else: #thisPos > thisScorePos
print("ERROR: does your input iHS score file contain SNPs missing from your input TPED? Shame.")

readfile.close()
writefile.close()
print('wrote to ' + outputFilename)
152 changes: 5 additions & 147 deletions cms/composite.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
## top-level script for combining scores into composite statistics as part of CMS 2.0.
## last updated: 07.23.16 vitti@broadinstitute.org
## last updated: 07.24.16 vitti@broadinstitute.org

prefixstring = "{CMS2.0}>>\t\t" #for stderr (make global?)
from combine.recalc_func import write_delIHH_file, interpolate_haps, calc_winIhs
from combine.recalc_func import write_delIHH_file, interpolate_haps, windows, interpolate_from_windows
from combine.likes_func import get_likesfiles_frommaster
from dists.scores_func import calc_fst_deldaf
from dists.scores_func import calc_fst_deldaf
import subprocess
import argparse
import sys
Expand Down Expand Up @@ -84,160 +84,18 @@ def execute_freqscores(args):
calc_fst_deldaf(args.inTped1, args.inTped2, args.recomFile, args.outfile)
return
def execute_win_haps(args):
'''from winiHS.py (/windelihh?)'''
#if args.ihs:
#if args.delihh:
posindex, scoreindex = 1, 6
ihh1_index, ihh0_index, freq1_index = 3,4,2

windowsize = args.windowsize
jumplen = args.jumplen
infilename = args.infilename
writefilename = args.writefilename

positions, scores = [], []
ihh1s, ihh0s, freq1s = [], [], []
infile = open(infilename, 'r')
for line in infile:
entries = line.split()
position, score = int(entries[posindex]), float(entries[scoreindex])
positions.append(position)
scores.append(score)

ihh1, ihh0, freq1 = float(entries[ihh1_index]), float(entries[ihh0_index]), float(entries[freq1_index])
ihh1s.append(ihh1)
ihh0s.append(ihh0)
freq1s.append(freq1) #not sure that this even counts for anything.

infile.close()

###################
## DEFINE WINDOWS #
#this is what I interpret
#Schlebusch to be doing (?)

numSnps = len(scores)
print(str(numSnps) + " scores available")
all_phys_windows, all_score_windows = [], []
all_ihh1_windows, all_ihh0_windows, all_freq1_windows = [], [], []
iSnp = 0
while iSnp < numSnps:
window_phys, window_scores = [], []
window_ihh1, window_ihh0, window_freq1 = [], [], []
while len(window_scores) < windowsize:
if iSnp < numSnps: #?
window_phys.append(positions[iSnp])
window_scores.append(scores[iSnp])
window_ihh1.append(ihh1s[iSnp])
window_ihh0.append(ihh0s[iSnp])
window_freq1.append(freq1s[iSnp])
else:
break
iSnp +=1
all_phys_windows.append(window_phys)
all_score_windows.append(window_scores)
all_ihh1_windows.append(window_ihh1)
all_ihh0_windows.append(window_ihh0)
all_freq1_windows.append(window_freq1)
iSnp += jumplen
#print(str(len(all_score_windows)))
numWindows = len(all_score_windows)
print("chunked " + str(numSnps) + " SNPs with available haplotype scores into " + str(numWindows) + " windows of size " + str(windowsize) + " SNPs with jump length " + str(jumplen) + ".")

################################
## CALC AVE AND WRITE TO FILE #
################################

writefile = open(writefilename, 'w')
for iWindow in range(numWindows):
positions = all_phys_windows[iWindow]
startPos, endPos = positions[0], positions[1]
scores = all_score_windows[iWindow]
winiHS = calc_winIhs(scores)

ihh1_scores = all_ihh1_windows[iWindow] #since these are all positive,
ihh0_scores = all_ihh0_windows[iWindow] #we can just reuse calc_winIhs func
freq1_vals = all_freq1_windows[iWindow] #to get averages.

winihh1 = calc_winIhs(ihh1_scores)
winihh0 = calc_winIhs(ihh0_scores)
freq_ave = calc_winIhs(freq1_vals)


#writeline = str(startPos) + "\t" + str(endPos) + "\t" + str(winiHS) +'\n'
writeline = str(startPos) + "\t" + str(endPos) + "\t" + str(winiHS) +'\t' + str(winihh1) + "\t" + str(winihh0) + "\t" + str(freq_ave) + "\n"
writefile.write(writeline)
writefile.close()
print("wrote to file: " + writefilename)

windows(windowsize, jumplen, infilename, writefilename)
return
def execute_interpolate_hapscores(args):
'''from interpolate.py'''
inputTpedFilename = args.intpedfilename
inputIhsFilename = args.inihsfilename
inputWinihsFilename = args.inwinihsfilename
outputFilename = args.outfilename
all_snps = []
openfile = open(inputTpedFilename, 'r')
for line in openfile:
entries = line.split()
pos = int(entries[1])
all_snps.append(pos)
openfile.close()
nsnps = len(all_snps)
print('loaded ' + str(nsnps) + ' snps from' + inputTpedFilename)

# load scores and windows from winIhs
openfile = open(inputWinihsFilename, 'r')
starts, ends, scores = [], [], []
ihh1s, ihh0s = [], []
freqs = []

for line in openfile:
entries = line.split()
startBp, endBp, winIhs = int(entries[0]), int(entries[1]), float(entries[2])

ihh1, ihh0 = float(entries[3]), float(entries[4])
freq_win = float(entries[5])

starts.append(startBp)
ends.append(endBp)
scores.append(winIhs)
ihh1s.append(ihh1)
ihh0s.append(ihh0)
freqs.append(freq_win)
openfile.close()

readfile = open(inputIhsFilename, 'r')
writefile = open(outputFilename, 'w')
readline = readfile.readline()
entries = readline.split()
thisScorePos = int(entries[0])

for iSnp in range(nsnps):
thisPos = all_snps[iSnp]
if thisPos < thisScorePos: #interpolate until we advance to the first one for which we have a score
interpolated_ihs, interpolated_ihh1, interpolated_ihh0, interpol_freq = interpolate(starts, ends, scores, ihh1s, ihh0s, thisPos, freqs)

unnormed_ihs= 0 #dummy to pass to selscan so norm will function
writeline = str(thisPos) + "\t" + str(thisPos) + "\t" + str(interpol_freq) + "\t" + str(interpolated_ihh1) + "\t" + str(interpolated_ihh0) + "\t" + str(unnormed_ihs) + "\t" + str(interpolated_ihs) + "\t9\n" #index interpolated in outputfile, in case it's not already obvious?
writefile.write(writeline)
elif thisPos == thisScorePos: #found a match
writefile.write(readline) #propagate to writefile
readline = readfile.readline() #advance to next SNP for which we have a full iHS calc
entries = readline.split()
if len(entries) < 1:
break
else:
thisScorePos = int(entries[0])
else: #thisPos > thisScorePos
print("ERROR: does your input iHS score file contain SNPs missing from your input TPED? Shame.")

readfile.close()
writefile.close()
print('wrote to ' + outputFilename)


interpolate_from_windows(inputTpedFilename, inputIhsFilename, inputWinihsFilename, outputFilename)
return
def execute_delihh_from_ihs(args):
write_delIHH_file(args.readfile, args.writefile)
Expand Down
11 changes: 3 additions & 8 deletions cms/likes_from_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
## top-level script for generating probability distributions for component scores as part of CMS 2.0. Assumes snakemake
## last updated: 07.23.16 vitti@broadinstitute.org
## last updated: 07.26.16 vitti@broadinstitute.org

prefixstring = "{CMS2.0}>>\t\t" #for stderr (make global?)

Expand Down Expand Up @@ -114,9 +114,9 @@ def execute_run_neut_sims(args):
runStatsCommand += " --genmapRandomRegions"
if args.dropSings is not None:
neutSimCommand += " --drop-singletons " + str(args.dropSings)
neutSimCommand += " -n " + str(args.n) + " -o " + runDir + "rep"
neutSimCommand += " -n " + str(args.n) + " --tped -o " + runDir + "rep"
print(prefixstring + neutSimCommand)
subprocess.check_output(n=eutSimCommand.split())
subprocess.check_output(neutSimCommand.split())
return
def execute_get_sel_trajs(args):
'''adapted from JV run_sims_from_model_vers.py'''
Expand Down Expand Up @@ -266,11 +266,6 @@ def execute_likes_from_scores(args):
causal_scores, linked_scores, neut_scores = data[(bin_medians_str[ibin], 'causal_scores_final')], data[(bin_medians_str[ibin], 'linked_scores_final')], data[(bin_medians_str[ibin], 'neut_scores_final')]
n_causal, n_linked, n_neut, bin_causal, bins_linked, bins_neut = calc_hist_from_scores(causal_scores, linked_scores, neut_scores, xlims, args.thinToSize)
write_hist_to_files(args.outPrefix +"_" + bin_medians_str[ibin], histBins, n_causal, n_linked, n_neut)
#if args.plot:
# if comparison: #fst, xpehh, deldaf
# pass
# else: #ihs, delihh
# pass
return
def execute_visualize_likes(args):
'''currently hard-wired to view all'''
Expand Down
1 change: 1 addition & 0 deletions conda-environment_py3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ dependencies:
- scipy=0.17.1
- setuptools=23.0.0
- six=1.10.0
- snakemake
- snowballstemmer=1.2.1
- sphinx_rtd_theme=0.1.9
- sqlite=3.13.0
Expand Down