diff --git a/cms/combine/cms_data.c b/cms/combine/cms_data.c index d4cfa5ce..bed13189 100644 --- a/cms/combine/cms_data.c +++ b/cms/combine/cms_data.c @@ -1,5 +1,5 @@ // functions for handling cms component(+composite) score datastructures -// last updated: 1.02.17 vitti@broadinstitute.org +// last updated: 1.10.17 vitti@broadinstitute.org #include #include @@ -298,7 +298,7 @@ void get_ihs_data(ihs_data* data, char filename[]) { else if (itoken == 4) { data->ihh1[isnp] = atof(token); } - else if (itoken == 9) { + else if (itoken == 5) { data->ihs_unnormed[isnp] = atof(token); } else if (itoken == 10) { diff --git a/cms/combine/write_xpehh_from_ihh.c b/cms/combine/write_xpehh_from_ihh.c index a166f24d..b186fa62 100644 --- a/cms/combine/write_xpehh_from_ihh.c +++ b/cms/combine/write_xpehh_from_ihh.c @@ -25,8 +25,8 @@ int main(int argc, char **argv) { exit(0); } - fprintf(stderr, "found sign error in calculation on 12.28.16, creating potential issues with previous calcs. if using this program, slow down and verify\n"); - exit(0); + //fprintf(stderr, "found sign error in calculation on 12.28.16, creating potential issues with previous calcs. if using this program, slow down and verify\n"); +// exit(0); newLine = malloc((line_size+1) * sizeof(char)); assert(newLine != NULL); @@ -73,7 +73,7 @@ int main(int argc, char **argv) { if (pos1 == pos2){ if (ihh1 == 0){ihh1 = 1e-08;} if (ihh2 == 0){ihh2 = 1e-08;} - xpehh = log(ihh1/ihh2); + xpehh = log(ihh1/ihh2); // times neg 1? //fprintf(outf, chromstr); //fprintf(outf, "_"); fprintf(outf, "%d\t%d\t%f\t%f\t%f\t%f\t%f\t%f\n", pos1, pos1, gdpos1, ihh1, p1, ihh2, p2, xpehh); diff --git a/cms/composite.py b/cms/composite.py index 0aa74a40..c5cd398e 100644 --- a/cms/composite.py +++ b/cms/composite.py @@ -194,7 +194,8 @@ def execute_xp_from_ihh(args): inputtped1 = args.inIhh1 inputtped2 = args.inIhh2 outfilename = args.outfilename - cmd = "dists/write_xpehh_fromihh" + + cmd = args.cmsdir + "combine/write_xpehh_from_ihh" argstring = inputtped1 + " " + inputtped2 + " " + outfilename cmdstring = cmd + " " + argstring if args.printOnly: diff --git a/cms/dists/scores_func.py b/cms/dists/scores_func.py index 89e27014..596ede7b 100644 --- a/cms/dists/scores_func.py +++ b/cms/dists/scores_func.py @@ -33,7 +33,7 @@ def calc_delihh(readfilename, writefilename): #ancestral - derived else: locus, phys, freq_1, ihh_1, ihh_0, ihs_unnormed, ihs_normed, lastcol = entries - unstand_delIHH = fabs(float(ihh_1) - float(ihh_0)) + unstand_delIHH = fabs(float(ihh_1) - float(ihh_0)) #WAIT WHY FABS????????? writeline = locus + "\t" + phys + "\t" + freq_1 + "\t" + str(ihs_unnormed) + "\t" + str(unstand_delIHH) +"\t" + str(unstand_delIHH) + "\n" #6 columns for selscan norm writefile.write(writeline) writefile.close() @@ -125,8 +125,6 @@ def norm_sel_ihs(inputScoreFile, neutNormfilename): normfile.close() print("wrote to: " + normfilename) return - - def norm_neut_xpehh(inputScoreFile, outfileName, runProgram = "scans.py"): '''from func_clean.py''' cmdStr = "python " + runProgram + " selscan_norm_xpehh " + inputScoreFile + " > " + outfileName @@ -326,7 +324,6 @@ def choose_vals_from_files(filename, numCols, takeindices, comp, stripHeader = F allpos.extend(chosenpos) alltoreturn = [allpos, allscores] return alltoreturn - def choose_from_reps(repscores, reppositions, repanc, mode="max"): '''flexible function to choose for likes ''' ncomp = len(repscores) @@ -369,8 +366,6 @@ def choose_from_reps(repscores, reppositions, repanc, mode="max"): #print(scores) #print(itemtoreturn) return toreturn, allpositions - - def calc_hist_from_scores(causal_scores, linked_scores, neut_scores, xlims, givenBins, thinToSize = False): if thinToSize: limiting = min(len(causal_scores), len(linked_scores), len(neut_scores)) @@ -425,8 +420,6 @@ def calc_hist_from_scores(causal_scores, linked_scores, neut_scores, xlims, give # n_neut[ibin] = 1e-10 return n_causal, n_linked, n_neut, bins_causal, bins_linked, bins_neut - - def write_hists_to_files(writePrefix, givenBins, n_causal, n_linked, n_neut): assert len(givenBins) == (len(n_causal) + 1) for status in ['causal', 'linked', 'neut']: diff --git a/cms/likes_from_model.py b/cms/likes_from_model.py index 52c486ee..a033465b 100644 --- a/cms/likes_from_model.py +++ b/cms/likes_from_model.py @@ -195,6 +195,12 @@ def execute_run_sel_sims(args): return def execute_scores_from_sims(args): ''' adapted from JV scores_from_tped_vers.py. functions point to scans.py''' + """ PHASE OUT; replace with power.py functions that parallelize by REP (much more efficient)""" + + #print("YOUR SHIT'S BUSTED") + #sys.exit(0) + + inputFilename, outputFilename = args.inputFilename, args.outputFilename ################# diff --git a/cms/model/Makefile b/cms/model/Makefile index 2fe8cf24..bdfe6ac0 100644 --- a/cms/model/Makefile +++ b/cms/model/Makefile @@ -1,12 +1,12 @@ ## leaf Makefile for cms2.0/model -## last updated 11.4.16 vitti@broadinstitute.org +## last updated 1.25.17 vitti@broadinstitute.org ###################### ## DEFINE VARIABLES ## ###################### CC = gcc -CCFLAG = -O0 -ggdb3 -lm -Wall +CCFLAG = -O0 -ggdb3 -lm -lz -Wall ################## ## DEFINE RULES ## diff --git a/cms/model/calc_fst_deldaf.c b/cms/model/calc_fst_deldaf.c index fb4de80d..0e45b562 100644 --- a/cms/model/calc_fst_deldaf.c +++ b/cms/model/calc_fst_deldaf.c @@ -1,4 +1,4 @@ -// last updated 09.13.16 vitti@broadinstitute.org +// last updated 09.13.16 vitti@broadinstitute.org // 1.25.17 experimenting with zlib for large data (eg 1kg tpeds) #include #include @@ -43,7 +43,7 @@ int main(int argc, char **argv) { fst_sum = 0; nfst = 0; //count alleles for pop0 - get_coal_data_tped_vers(&data, inTped1, inRecomfile); + get_coal_data_tped_vers_gz(&data, inTped1, inRecomfile); //if (data.nsample == 0) {continue;} nall0[0] = calloc(data.nsnp, sizeof(int)); @@ -60,7 +60,7 @@ int main(int argc, char **argv) { free_coal_data(&data); //count alleles for pop1 - get_coal_data_tped_vers(&data, inTped2, inRecomfile); + get_coal_data_tped_vers_gz(&data, inTped2, inRecomfile); //if (data.nsample == 0) {continue;} nall0[1] = calloc(data.nsnp, sizeof(int)); diff --git a/cms/model/coal_data_tped_vers.c b/cms/model/coal_data_tped_vers.c index 13eaa7e4..b9510384 100644 --- a/cms/model/coal_data_tped_vers.c +++ b/cms/model/coal_data_tped_vers.c @@ -1,10 +1,10 @@ -// last updated 10.20.16 vitti@broadinstitute.org +// last updated 1.25.17 vitti@broadinstitute.org #include #include #include #include -//#include +#include #include "coal_data_tped_vers.h" /************************/ @@ -97,17 +97,15 @@ void get_coal_data_tped_vers(coal_data* data, char tpedfilename[], char recomfil //char cmd[600]; char *newLine, *token, *running; int isamp, isnp, itoken, iRecom; - double genrate; - FILE *inf=NULL; + double genrate; + FILE *inf=NULL; /************************** INITIALIZATION, ALLOCATION, COUNTING nsample nsnp nrecom - **************************/ - + **************************/ newLine = malloc((line_size+1) * sizeof(char)); assert(newLine != NULL); - data->nsample = 0; data->nsnp = 0; data->nRecom = 0; @@ -124,20 +122,16 @@ void get_coal_data_tped_vers(coal_data* data, char tpedfilename[], char recomfil //fprintf(stderr, "Getting information from file: %s\n", tpedfilename); + //handle zipped? + // Count number of SNPs in tped inf = fopen(tpedfilename, "r"); - - //handle zipped - //sprintf(cmd, "gunzip -c %s", tpedfilename); - //inf = popen(cmd, "r"); - if (inf == NULL) {fprintf(stderr, "Missing TPED file: %s\n", tpedfilename);} assert (inf != NULL); while (fgets(newLine, line_size, inf) != NULL) { assert(strlen(newLine) < line_size); data->nsnp++; } - //get number of samples from last line int ik = 0; char *pch=strchr(newLine,' '); @@ -146,13 +140,12 @@ void get_coal_data_tped_vers(coal_data* data, char tpedfilename[], char recomfil pch=strchr(pch+1,' '); } data->nsample = ik-3; - fclose(inf); fprintf(stderr, "nSNP: %d\n", data->nsnp); fprintf(stderr, "nSample: %d\n", data->nsample); // Count number of lines in recombination file - inf = fopen(recomfilename, "r"); assert(inf != NULL); + inf = fopen(recomfilename, "r"); assert(inf != NULL); fgets(newLine, line_size, inf); //header if (inf == NULL) {fprintf(stderr, "Missing recombination file: %s\n", recomfilename);} while (fgets(newLine, line_size, inf) != NULL) { @@ -192,8 +185,6 @@ void get_coal_data_tped_vers(coal_data* data, char tpedfilename[], char recomfil GET DATA FROM TPED *******************/ //handle zipped - //sprintf(cmd, "gunzip -c %s", tpedfilename); - ///inf = popen(cmd, "r"); inf = fopen(tpedfilename, "r"); if (inf == NULL) {fprintf(stderr, "Missing TPED file: %s\n", tpedfilename);} assert(inf != NULL); @@ -242,13 +233,159 @@ void get_coal_data_tped_vers(coal_data* data, char tpedfilename[], char recomfil data->genPos[iRecom] = (genrate / 1000000.); //per Mb rate iRecom++; break; - } - } + } // end else if + } // end for running } // end while fgets fclose(inf); free(newLine); } // end function +void get_coal_data_tped_vers_gz(coal_data* data, char tpedfilename[], char recomfilename[]) { + const int line_size = 999999999; // upper limit + const int numRecomLines = 500000; + //char cmd[600]; + char *newLine, *token, *running; + int isamp, isnp, itoken, iRecom; + double genrate; + //FILE *inf=NULL; + gzFile *inf=NULL; + + /************************** + INITIALIZATION, ALLOCATION, + COUNTING nsample nsnp nrecom + **************************/ + newLine = malloc((line_size+1) * sizeof(char)); + assert(newLine != NULL); + data->nsample = 0; + data->nsnp = 0; + data->nRecom = 0; + data->samp_id = NULL; + data->pos = NULL; + data->snp_id = NULL; + data->chrom = NULL; + data->genotype = NULL; + data->anc_base = NULL; + data->nallele = NULL; + data->genPos = NULL; + data->physPos = NULL; + data->genloc = NULL; + + fprintf(stderr, "Getting information from file: %s\n", tpedfilename); + // Count number of SNPs in tped + inf = gzopen(tpedfilename, "rb"); + if (inf == NULL) {fprintf(stderr, "Missing TPED file: %s\n", tpedfilename);} + assert (inf != NULL); + while (gzgets(newLine, line_size, inf) != NULL) { + assert(strlen(newLine) < line_size); + data->nsnp++; + } + //get number of samples from last line + int ik = 0; + char *pch=strchr(newLine,' '); + while (pch!=NULL) { + ik++; + pch=strchr(pch+1,' '); + } + data->nsample = ik-3; + gzclose(inf); + fprintf(stderr, "nSNP: %d\n", data->nsnp); + fprintf(stderr, "nSample: %d\n", data->nsample); + + // Count number of lines in recombination file + inf = fopen(recomfilename, "r"); assert(inf != NULL); + fgets(newLine, line_size, inf); //header + if (inf == NULL) {fprintf(stderr, "Missing recombination file: %s\n", recomfilename);} + while (fgets(newLine, line_size, inf) != NULL) { + assert(strlen(newLine) < line_size); + data->nRecom++; + } + fclose(inf); + + // Allocate memory; initialize + data->samp_id = malloc(data->nsample * sizeof(char*)); assert(data->samp_id != NULL); + data->genotype = malloc(data->nsample * sizeof(int*)); assert(data->genotype != NULL); + for (isamp = 0; isamp < data->nsample; isamp++) { + data->samp_id[isamp] = malloc(64 * sizeof(char)); assert(data->samp_id[isamp] != NULL); + data->genotype[isamp] = malloc(data->nsnp * sizeof(int)); assert(data->genotype[isamp] !=NULL); + } + data->snp_id = malloc(data->nsnp * sizeof(char*)); assert(data->snp_id != NULL); + for (isnp = 0; isnp < data->nsnp; isnp++) { + data->snp_id[isnp] = malloc(64 * sizeof(char)); assert(data->snp_id[isnp] != NULL); + } + data->pos = malloc(data->nsnp * sizeof(long int)); assert(data->pos != NULL); + data->chrom = malloc(data->nsnp * sizeof(int)); assert(data->chrom != NULL); + data->anc_base = malloc(data->nsnp * sizeof(int)); assert(data->anc_base != NULL); + data->snp_base[0] = malloc(data->nsnp * sizeof(char*)); assert(data->snp_base[0] != NULL); + data->snp_base[1] = malloc(data->nsnp * sizeof(char*)); assert(data->snp_base[1] != NULL); + for (isnp = 0; isnp < data->nsnp; isnp++) { + data->snp_base[0][isnp] = malloc(3 * sizeof(char)); assert(data->snp_base[0][isnp] != NULL); + data->snp_base[1][isnp] = malloc(3 * sizeof(char)); assert(data->snp_base[1][isnp] != NULL); + } + data->genloc = malloc(data->nsnp * sizeof(double)); assert(data->genloc != NULL); + data->snp_base[2] = calloc(data->nsnp, sizeof(char*)); // STRIKE? + data->snp_base[3] = calloc(data->nsnp, sizeof(char*)); // STRIKE? + data->nallele = malloc(data->nsnp * sizeof(int)); assert(data->nallele != NULL); + data->genPos = malloc(numRecomLines * sizeof(double)); assert(data->genPos !=NULL); + data->physPos = malloc(numRecomLines * sizeof(int)); assert(data->physPos != NULL); + + /******************* + GET DATA FROM TPED + *******************/ + //handle zipped + inf = gzopen(tpedfilename, "rb"); + if (inf == NULL) {fprintf(stderr, "Missing TPED file: %s\n", tpedfilename);} + assert(inf != NULL); + + isnp = 0; + while (gzgets(newLine, line_size, inf) != NULL) { + for (running = newLine, itoken = 0; (token = strsep(&running, " \t")) != NULL; itoken++) { + if (itoken == 0) { + //data->chrom[isnp] = chromosome; + data->nallele[isnp] = 2; //only biallelics in tped + } + else if (itoken == 1) { + strcpy(data->snp_id[isnp], token); + } + else if (itoken == 2){ + data->genloc[isnp] = atof(token); + } + + else if (itoken == 3) { + data->pos[isnp] = atoi(token); + } + else if (itoken >= 4) { + isamp = itoken - 4; //RIGHT? + data->genotype[isamp][isnp] = atoi(token); + } + } // END for running=newLine + isnp++; + } //END while(fgets(newLine)) + gzclose(inf); + + /************************** + GET DATA FROM RECOMB FILE + **************************/ + inf = fopen(recomfilename, "r"); + if (inf == NULL) {fprintf(stderr, "Missing recombination file: %s\n", recomfilename);} + assert(inf != NULL); + fgets(newLine, line_size, inf); //header + iRecom = 0; + while (fgets(newLine, line_size, inf) != NULL) { + for (running = newLine, itoken = 0; (token = strsep(&running, "\t")) != NULL; itoken++) { + if (itoken == 1) { + data->physPos[iRecom] = atoi(token); + } + else if (itoken == 2) { + genrate = atof(token); + data->genPos[iRecom] = (genrate / 1000000.); //per Mb rate + iRecom++; + break; + } // end else if + } // end for running + } // end while fgets + fclose(inf); + free(newLine); +} // end function void free_coal_data(coal_data* data) { int isamp, isnp; diff --git a/cms/model/coal_data_tped_vers.h b/cms/model/coal_data_tped_vers.h index af744eff..04de1aa2 100644 --- a/cms/model/coal_data_tped_vers.h +++ b/cms/model/coal_data_tped_vers.h @@ -1,7 +1,6 @@ // {{POP GEN DATA --> DEM MODEL}: CALC POP SUMMARY STATS} // {{POP GEN DATA --> COMPONENT SCORES}} -//sourcefile: coal_data_1kg_autosome_tped_thinned.h -// last updated: 06.07.16 vitti@broadinstitute.org +// last updated: 01.25.17 vitti@broadinstitute.org /************************/ /**DEFINE DATASTRUCTURE**/ @@ -33,4 +32,5 @@ int getLowerIndexOfItem(int *values, int numVals, int itemToFind); int getUpperIndexOfItem(int *values, int numVals, int itemToFind); /*manipulating datastrucutre*/ void get_coal_data_tped_vers(coal_data* data, char tpedfilename[], char recomfilename[]); +void get_coal_data_tped_vers_gz(coal_data* data, char tpedfilename[], char recomfilename[]); void free_coal_data(coal_data* data); \ No newline at end of file diff --git a/cms/model/params_func.py b/cms/model/params_func.py index 4a29cfaf..c52bb424 100644 --- a/cms/model/params_func.py +++ b/cms/model/params_func.py @@ -399,7 +399,7 @@ def write_paramfile(paramfilename, paramDict): return def get_dict_from_paramfile(paramfilename): #initialize empty lists so we can append from paramfile as we parse - paramDict = {'numPops':4,'labels':{1:'LWK', 2:'GWD', 3:'MSL', 4:'YRI'}, 'singrate':.5, 'presentSizes':[], 'num_indivs_per_sample':200} + paramDict = {'numPops':4,'labels':{1:'LWK', 2:'GWD', 3:'MSL', 4:'YRI'}, 'singrate':.25, 'presentSizes':[], 'num_indivs_per_sample':200} for pop in range(1,5): for parameter in ['Ne', 'Ne_times', 'Ne_change', 'bn', 'bn_times']: key = (parameter, pop) diff --git a/cms/power.py b/cms/power.py index 2633321f..8b938868 100644 --- a/cms/power.py +++ b/cms/power.py @@ -1,5 +1,5 @@ ## script to manipulate and analyze empirical/simulated CMS output -## last updated 12.31.16 vitti@broadinstitute.org +## last updated 2.7.17 vitti@broadinstitute.org from power.power_parser import full_parser_power from power.power_func import normalize, merge_windows, get_window, check_outliers, check_rep_windows, calc_pr, get_pval, plotManhattan, \ @@ -12,7 +12,7 @@ mp.use('agg') import matplotlib.pyplot as plt from tempfile import TemporaryFile -#from xlwt import Workbook, easyxf +from xlwt import Workbook, easyxf #from pybedtools import BedTool import numpy as np import argparse @@ -108,10 +108,15 @@ def execute_run_neut_repscores(args): tpeddir = args.tpedfolder simRecomFile = args.simRecomFile - tped = tpeddir + "rep" + str(repNum) + "_" + str(pop) + ".tped" + tped = tpeddir + "rep_" + str(repNum) + "_" + str(pop) + ".tped" #temp quick fix assert os.path.isfile(tped) #in_ihs_file, in_delihh_file, in_xp_file, in_fst_deldaf_file = get_component_score_files(model, repNum, pop, altpop, scenario = "neut", filebase = basedir) + for scorefiledir in ['ihs', 'delihh', 'nsl', 'xpehh', 'fst_deldaf']: + #dircmd = "mkdir -p " + basedir + scorefiledir + "/" + #subprocess.check_output( dircmd.split() ) + check_create_dir(basedir + scorefiledir) + ####### Calculate per-population ####### scores: iHS, delIHH, nSL ihs_commandstring = "python " + cmsdir + "scans.py selscan_ihs" @@ -123,10 +128,10 @@ def execute_run_neut_repscores(args): if proceed: print(ihs_fullcmd) execute(ihs_fullcmd) - delihh_commandstring = "python " + cmsdir + "likes_from_model.py scores_from_sims --delIhh" + delihh_commandstring = "python " + cmsdir + "likes_from_model.py scores_from_sims --delIhh" #WAIT WHAT? delihh_unnormedfile = basedir + "delihh/rep" + str(repNum) + "_" + str(pop) + ".txt" delihh_argstring = ihs_unnormedfile + " "+ delihh_unnormedfile - delihh_fullcmd = delihh_commandstring + " " + delihh_argstring + delihh_fullcmd = delihh_commandstring + " " + delihh_argstring + " --cmsdir " + args.cmsdir proceed = check_create_file(delihh_unnormedfile, args.checkOverwrite) if proceed: print(delihh_fullcmd) @@ -148,7 +153,7 @@ def execute_run_neut_repscores(args): altpops.remove(int(pop)) for altpop in altpops: xpehh_commandstring = "python " + cmsdir + "scans.py selscan_xpehh --threads 7" - tped2 = tpeddir + "rep" + str(repNum) + "_" + str(altpop) + ".tped" + tped2 = tpeddir + "rep_" + str(repNum) + "_" + str(altpop) + ".tped" xpehh_outfileprefix = basedir + "xpehh/rep" + str(repNum) + "_" + str(pop) + "_" + str(altpop) xpehh_unnormedfile = basedir + "xpehh/rep" + str(repNum) + "_" + str(pop) + "_" + str(altpop) + ".xpehh.out" xpehh_argumentstring = tped + " " + xpehh_outfileprefix + " " + tped2 @@ -156,11 +161,12 @@ def execute_run_neut_repscores(args): proceed = check_create_file(xpehh_unnormedfile, args.checkOverwrite) if proceed: print(xpehh_fullcmd) - execute(xpehh_fullcmd) + execute(xpehh_fullcmd) + fstdeldaf_commandstring = "python " + cmsdir + "likes_from_model.py scores_from_sims" fstdeldaf_outfilename = basedir + "fst_deldaf/rep" + str(repNum) + "_" + str(pop) + "_" + str(altpop) - fstdeldaf_argumentstring = tped + " --fst_deldaf " + tped2 + " " + fstdeldaf_outfilename + " --recomfile " + simRecomFile - fstdeldaf_fullcmd = fstdeldaf_commandstring + " " + fstdeldaf_argumentstring + fstdeldaf_argumentstring = tped + " --fst_deldaf " + tped2 + " " + fstdeldaf_outfilename + " --recomfile " + simRecomFile + " --cmsdir " + args.cmsdir + fstdeldaf_fullcmd = fstdeldaf_commandstring + " " + fstdeldaf_argumentstring proceed = check_create_file(fstdeldaf_outfilename, args.checkOverwrite) if proceed: print(fstdeldaf_fullcmd) @@ -175,7 +181,7 @@ def execute_run_sel_repscores(args): cmsdir = args.cmsdir tpeddir = args.tpedfolder simRecomFile = args.simRecomFile - tped = tpeddir + "rep" + str(repNum) + "_" + str(pop) + ".tped" + tped = tpeddir + "rep_" + str(repNum) + "_" + str(pop) + ".tped" assert os.path.isfile(tped) ####### Calculate per-population @@ -214,7 +220,7 @@ def execute_run_sel_repscores(args): altpops.remove(int(pop)) for altpop in altpops: xpehh_commandstring = "python " + cmsdir + "scans.py selscan_xpehh --threads 7" - tped2 = tpeddir + "rep" + str(repNum) + "_" + str(altpop) + ".tped" + tped2 = tpeddir + "rep_" + str(repNum) + "_" + str(altpop) + ".tped" xpehh_outfileprefix = basedir + "xpehh/rep" + str(repNum) + "_" + str(pop) + "_" + str(altpop) xpehh_unnormedfile = basedir + "xpehh/rep" + str(repNum) + "_" + str(pop) + "_" + str(altpop) + ".xpehh.out" xpehh_argumentstring = tped + " " + xpehh_outfileprefix + " " + tped2 @@ -411,70 +417,6 @@ def execute_sel_norm_from_binfile(args): print(fullcmd) execute(fullcmd) return -""" -def execute_run_poppair(args): - '''PHASE OUT''' - ''' from run_additioanal_poppair.py''' - model = args.model - selpop = args.simpop - altpop = args.altpop - repNum = args.nrep - writedir = args.writedir - cmsdir = args.cmsdir - scorebase = args.scorebase - cmd = "python " + cmsdir + "composite.py poppair" - - modeldir = writedir + model + "/" - check_create_dir(modeldir) - - neutdir = modeldir + "neut/" - check_create_dir(neutdir) - neutpairdir = neutdir + "pairs/" - check_create_dir(neutpairdir) - - #### NEUT SIMS - for irep in range(1, repNum+1): - in_ihs_file, in_nsl_file, in_delihh_file, in_xp_file, in_fst_deldaf_file = get_component_score_files(model, irep, selpop, altpop, "neut", normed=True, filebase=scorebase) - outfile = neutdir + "pairs/rep" + str(irep) + "_" + str(selpop) + "_" + str(altpop) + ".pair" - alreadyExists = False - if args.checkOverwrite: - if not os.path.isfile(outfile): #check for overwrite - alreadyExists = False - else: - alreadyExists = True - if alreadyExists == False: - argstring = in_ihs_file + " " + in_nsl_file + " " + in_delihh_file + " " + in_xp_file + " " + in_fst_deldaf_file + " " + outfile - fullcmd = cmd + " " + argstring - print(fullcmd) - if os.path.isfile(in_ihs_file) and os.path.isfile(in_nsl_file) and os.path.isfile(in_delihh_file) and os.path.isfile(in_xp_file) and os.path.isfile(in_fst_deldaf_file): - execute(fullcmd) - - #### SEL SIMS - selbins = ["0.10", "0.20", "0.30", "0.40", "0.50", "0.60", "0.70", "0.80", "0.90"] - for selbin in selbins: - for irep in range(1, repNum+1): - in_ihs_file, in_nsl_file, in_delihh_file, in_xp_file, in_fst_deldaf_file = get_component_score_files(model, irep, selpop, altpop, selbin, normed=True, filebase=scorebase) - selpopdir = modeldir + "sel" + str(selpop) + "/" - check_create_dir(selpopdir) - selbindir = selpopdir + "sel_" + str(selbin) + "/" - check_create_dir(selbindir) - pairbindir = modeldir + "pairs/" - check_create_dir(pairbindir) - outfile = pairbindir + "rep" + str(irep) + "_" + str(selpop) + "_" + str(altpop) + ".pair" - alreadyExists = False - if args.checkOverwrite: - if not os.path.isfile(outfile): #check for overwrite - alreadyExists = False - else: - alreadyExists = True - if alreadyExists == False: - argstring = in_ihs_file + " " + in_nsl_file + " " + in_delihh_file + " " + in_xp_file + " " + in_fst_deldaf_file + " " + outfile - fullcmd = cmd + " " + argstring - print(fullcmd) - if os.path.isfile(in_ihs_file) and os.path.isfile(in_nsl_file) and os.path.isfile(in_delihh_file) and os.path.isfile(in_xp_file) and os.path.isfile(in_fst_deldaf_file): - execute(fullcmd) - return -""" def execute_composite_sims(args): model = args.model selpop = args.simpop @@ -594,7 +536,8 @@ def execute_normsims(args): if os.path.isfile(outfile): normedfile = outfile + ".norm" - if not os.path.isfile(normedfile): + if True: + #if not os.path.isfile(normedfile): #CHANGE FOR --checkOverwrite openfile = open(outfile, 'r') writefile = open(normedfile, 'w') for line in openfile: @@ -613,7 +556,8 @@ def execute_normsims(args): rawfile = get_sel_repfile_name(model, irep, selpop, sel_freq_bin, suffix=suffix, normed = False, basedir=writedir) if os.path.isfile(rawfile): normedfile = rawfile + ".norm" - if not os.path.isfile(normedfile): + if True: + #if not os.path.isfile(normedfile): openfile = open(rawfile, 'r') writefile = open(normedfile, 'w') for line in openfile: @@ -681,9 +625,10 @@ def execute_composite_emp(args): def execute_normemp(args): selpop = args.emppop model = args.model - likessuffix = args.likessuffix + #likessuffix = args.likessuffix + suffix = args.suffix - scores2 = load_empscores(model, selpop, likessuffix, normed=False) + scores2 = load_empscores(model, selpop, normed=False, suffix=suffix) scores = [np.log(item) for item in scores2] #check for nans @@ -705,7 +650,7 @@ def execute_normemp(args): ############## chroms = range(1,23) for chrom in chroms: - unnormedfile = get_emp_cms_file(selpop, model, likessuffix, chrom, normed=False) + unnormedfile = get_emp_cms_file(selpop, model, chrom, normed=False, suffix=suffix) assert os.path.isfile(unnormedfile) normedfile = unnormedfile + ".norm" @@ -990,8 +935,9 @@ def execute_tpr(args): if os.path.isfile(repfilename): allrepfilenames.append(repfilename) print('loaded ' + str(len(allrepfilenames)) + " replicates...") - numToTake = min(500, len(allrepfilenames)) - chosen = np.random.choice(allrepfilenames, numToTake, replace=False) #take random sample + #numToTake = min(500, len(allrepfilenames)) + #chosen = np.random.choice(allrepfilenames, numToTake, replace=False) #take random sample + chosen = allrepfilenames #this was just to expedite, no? for repfilename in chosen: physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(repfilename) #physpos, genpos, ihs_normed, delihh_normed, nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(repfilename) @@ -1113,7 +1059,7 @@ def execute_roc(args): return -######## Apply significance cutoff +######## Apply significance cutoffs ######## to empirical results. def execute_gw_regions(args): model = args.model @@ -1122,7 +1068,8 @@ def execute_gw_regions(args): cutoff = args.cutoff pop = args.emppop windowlen = args.regionlen - vs = args.likessuffix + #vs = args.likessuffix + suffix = args.suffix chroms = range(1,23) signif_windows = [] @@ -1131,11 +1078,11 @@ def execute_gw_regions(args): #################### for chrom in chroms: chrom_signif = [] - normedempfilename = get_emp_cms_file(pop, model, vs, chrom, normed=True) + normedempfilename = get_emp_cms_file(pop, model, chrom, normed=True, suffix = suffix) if not os.path.isfile(normedempfilename): print("missing: " + normedempfilename) else: - physpos, genpos, ihs_normed, delihh_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(normedempfilename) + physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(normedempfilename) for iPos in range(len(physpos)): ################## ## CHECK REGION ## @@ -1180,16 +1127,19 @@ def execute_gw_regions(args): return def execute_regionlog(args): ''' writes an excel file ''' + model = args.model regionfiles = [] pops = ['YRI', 'CEU', 'CHB', 'BEB'] takepops = [] - for pop in pops: + for pop in pops: #too tired to soft-code rn + regionfilename = "/n/regal/sabeti_lab/jvitti/clear-synth/1kg_regions/" + model + "/" + pop + "_" + str(int(args.regionlen)) + "_" + str(int(args.thresshold)) + "_" + str(int(args.cutoff)) + ".txt" #or _alt #regionfilename = "/idi/sabeti-scratch/jvitti/cms2_power/regions/regions_103116_" + str(pop) +"_" + (args.model) + "_" + str(args.likessuffix) +"_" + str(int(args.regionlen)) + "_" + str(int(args.thresshold)) + "_" + str(args.cutoff) + ".txt" print(regionfilename) if os.path.isfile(regionfilename): regionfiles.append(regionfilename) takepops.append(pop) if len(regionfiles) == 0: + print("found no regions") return else: totalselregions = 0 @@ -1258,23 +1208,24 @@ def execute_regionlog(args): for icol in range(len(colWidths)): sheet1.col(icol).width = colWidths[icol] * 256 #~x char wide - book.save(args.savefile) + book.save(args.saveLog) book.save(TemporaryFile()) - print('wrote ' + str(totalselregions) + ' significant regions to: ' + args.savefile) + print('wrote ' + str(totalselregions) + ' significant regions to: ' + args.saveLog) return def execute_manhattan(args): selpop = args.emppop model = args.model - likessuffix = args.likessuffix + #likessuffix = args.likessuffix savename = args.savefilename + suffix = args.suffix #nRep = args.nrep ############################### ### LOAD NEUTRAL SIM VALUES ### ############################### - if likessuffix == "neut": - vsNeut = True - elif likessuffix == "linked": - vsNeut = False + #if likessuffix == "neut": + # vsNeut = True + #elif likessuffix == "linked": + # vsNeut = False modelpops = {'YRI':1, 'CEU':2, 'CHB':3, 'BEB':4} pop = modelpops[selpop] @@ -1298,7 +1249,7 @@ def execute_manhattan(args): nSnps = 0 for chrom in range(1,23): thesepos, thesescores = [], [] - emp_cms_filename = get_emp_cms_file(selpop, model, likessuffix, chrom, normed=True) + emp_cms_filename = get_emp_cms_file(selpop, model, chrom, normed=True, suffix=suffix) print('loading chr ' + str(chrom) + ": " + emp_cms_filename) if not os.path.isfile(emp_cms_filename): print("missing: " + emp_cms_filename) @@ -1323,46 +1274,46 @@ def execute_manhattan(args): print('saved to: ' + savename) return def execute_extended_manhattan(args): + plotscore = args.plotscore selpop = args.emppop model = args.model - likessuffix = args.likessuffix + suffix = args.suffix savename = args.savefilename numChr = 22 - if likessuffix == "neut": - vsNeut = True - elif likessuffix == "linked": - vsNeut = False + titlestring = args.titlestring + modelpops = {'YRI':1, 'CEU':2, 'CHB':3, 'BEB':4} pop = modelpops[selpop] + colorDict = {1:'#FFB933', 2:'#0EBFF0', 3:'#ADCD00', 4:'#8B08B0'} - f, axarr = plt.subplots(numChr, 1, sharex = True) + f, axarr = plt.subplots(numChr, 1, sharex = True, sharey=True, figsize=(7, 7)) + plt.suptitle(titlestring, fontsize=10) plt.xlabel('position') plt.ylabel('cms_gw normed score') - #plt.tick_params(axis='y', left='off') only one subplot? all_emp_pos, all_emp_scores = [], [] for chrom in range(1,numChr +1): - emp_cms_filename = get_emp_cms_file(selpop, model, likessuffix, chrom, normed=True) + emp_cms_filename = get_emp_cms_file(selpop, model, chrom, normed=True, suffix=suffix) print('loading chr ' + str(chrom) + ": " + emp_cms_filename) if not os.path.isfile(emp_cms_filename): print("missing: " + emp_cms_filename) break - physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(cmsfilename) - #physpos, genpos, ihs_normed, delihh_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(emp_cms_filename) + physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(emp_cms_filename) iax = chrom-1 ax = axarr[iax] - plotManhattan_extended(ax, cms_normed, physpos, chrom) + plot_data = eval(plotscore) + plotManhattan_extended(ax, plot_data, physpos, chrom) all_emp_pos.append(physpos) - all_emp_scores.append(cms_normed) + all_emp_scores.append(plot_data) + ################################ ## HILITE SIGNIFICANT REGIONS ## ################################ if args.regionsfile is not None: regionchrs, regionstarts, regionends = loadregions(args.regionsfile) - print('loaded ' + str(len(regionchrs)) + ' significant regions from ' + args.regionsfile) for iregion in range(len(regionchrs)): regionchr, regionstart, regionend = regionchrs[iregion], regionstarts[iregion], regionends[iregion] @@ -1377,10 +1328,38 @@ def execute_extended_manhattan(args): plotvals.append(locus[1]) if locus[0] > regionend: break - - axarr[ichrom].plot(plotpos, plotvals, color="red") + axarr[ichrom].plot(plotpos, plotvals, color=colorDict[pop], markersize=1) #axarr[ichrom].plot([regionstart, regionend], [0, 0], color="red") - #axarr.get_yaxis().set_visible(False) + + + if args.percentile is not None: + percentile = float(args.percentile) + print('plotting data with heuristic cutoff for ' + str(percentile) + " percentile...") + flat_emp_scores = [item for sublist in all_emp_scores for item in sublist if not np.isnan(item)] + score_cutoff = float(np.percentile(flat_emp_scores, percentile)) + print("score cutoff: " + str(score_cutoff)) + for chrom in range(1,numChr +1): + iax = chrom-1 + ax = axarr[iax] + maximumVal = ax.get_xlim()[1] + xpoints = np.array([0, maximumVal]) + ypoints = np.array([score_cutoff, score_cutoff]) + ax.plot(xpoints, ypoints ,linestyle = "dotted", color="grey", markersize=.3) + + #get empirical scores and positions for pass threshhold and plot them as above with color + these_scores, these_pos = all_emp_scores[iax], all_emp_pos[iax] + zipped = zip(these_scores, these_pos) + significant = [item for item in zipped if item[0] >= score_cutoff] + #print(str(len(significant))) + signif_vals = [item[0] for item in significant] + signif_pos = [item[1] for item in significant] + #for i in range(len(signif_pos)): + # print(str(signif_pos[i]) + "\t" + str(signif_vals[i])) + #print(str(signif_vals)) + #print(str(signif_pos)) + ax.plot(signif_pos, signif_vals, color=colorDict[pop], linestyle='None', marker=".", markersize=.3)#, markersize=1) + + plt.savefig(savename) print('saved to: ' + savename) return diff --git a/cms/power/parse_func.py b/cms/power/parse_func.py index 4fe5938f..58c1ca12 100644 --- a/cms/power/parse_func.py +++ b/cms/power/parse_func.py @@ -1,4 +1,4 @@ -## last updated 1.6.17 +## last updated 2.7.17 import subprocess import numpy as np @@ -73,11 +73,15 @@ def get_pr_filesnames(key, basedir): fprfile = basedir + "/fpr/" + model + "/sel" + str(pop) + "/fpr_" + str(regionlen) + "_" + str(percentage) + "_" + str(cutoff) tprfile = basedir + "/tpr/" + model + "/sel" + str(pop) + "/tpr_" + str(regionlen) + "_" + str(percentage) + "_" + str(cutoff) for filename in [tprfile, fprfile]: - #if not os.path.isfile(filename): - # print("missing: " + filename) + if not os.path.isfile(filename): + print("missing: " + filename) + else: + pass return fprfile, tprfile -def get_emp_cms_file(selpop, model, likessuffix, chrom, normed = False, basedir = "/idi/sabeti-scratch/jvitti/scores_composite4_b/"):#"/idi/sabeti-scratch/jvitti/synth/cms_composite/"): - filename = basedir + selpop +"_" + str(model) + "_" + likessuffix + ".chr" + str(chrom) + ".txt" +def get_emp_cms_file(selpop, model, chrom, normed = False, basedir = "/n/regal/sabeti_lab/jvitti/clear-synth/1kg_composite/", suffix = ""): + #"/n/regal/sabeti_lab/jvitti/clear-synth/1kg_composite"):#"/idi/sabeti-scratch/jvitti/synth/cms_composite/"): + #filename = basedir + selpop +"_" + str(model) + "_" + likessuffix + ".chr" + str(chrom) + ".txt" + filename = basedir + "chr" + str(chrom) + "_" + str(selpop) + "_strictMask_" + model + ".cms" + suffix if normed: filename += ".norm" if not os.path.isfile(filename): @@ -159,11 +163,11 @@ def load_simscores(model, pop, vsNeut = False, numRep = 500, normed =False): infostring += " scores." print(infostring) return all_simscores -def load_empscores(model, selpop, likessuffix, normed=False): +def load_empscores(model, selpop, normed=False, suffix=''): chroms = range(1,23) scores = [] for chrom in chroms: - scorefile = get_emp_cms_file(selpop, model, likessuffix, chrom, normed=normed) + scorefile = get_emp_cms_file(selpop, model, chrom, normed=normed, suffix=suffix) print('loading from ' + scorefile) assert os.path.isfile(scorefile) openfile = open(scorefile, 'r') @@ -262,140 +266,3 @@ def write_master_likesfile(writefilename, model, selpop, freq,basedir, miss = " writefile.close() print("wrote to: " + writefilename) return - - -""" -def get_component_score_files_old(model, repNum, selPop, altPop, scenario = "neut", filebase = "/idi/sabeti-scratch/jvitti/clean/scores/"): - '''PHASE OUT''' - if scenario == "neut": - in_ihs_file = filebase + model + "/neut/ihs/rep" + str(repNum) + "_" + str(selPop) + "_normed.txt" - in_delihh_file = filebase + model + "/neut/delihh/rep" + str(repNum) + "_" + str(selPop) + ".txt.norm" - in_xp_file = filebase + model + "/neut/xpehh/rep" + str(repNum) + "_" + str(selPop) + "_" + str(altPop) + "_normed.txt" - in_fst_deldaf_file = filebase + model + "/neut/fst_deldaf/rep" + str(repNum) + "_" + str(selPop) + "_" + str(altPop) - - else: - in_ihs_file = filebase + model + "/sel" + str(selPop) + "/ihs/sel_" + str(scenario) + "/rep" + str(repNum) + "_" + str(selPop) + "_truncOk.ihs.out.norm" - in_delihh_file = filebase + model + "/sel" + str(selPop) + '/delihh/sel_' + str(scenario) + '/rep' + str(repNum) + "_" + str(selPop) + ".txt.norm" - in_xp_file = filebase + model + "/sel" + str(selPop) + '/xpehh/sel_' + str(scenario) +'/rep' + str(repNum) + "_" + str(selPop) + "_" + str(altPop) + ".xpehh.out.norm" - in_fst_deldaf_file = filebase + model + "/sel" + str(selPop) + '/fst_deldaf/sel_' + str(scenario) +'/rep' + str(repNum) + "_" + str(selPop) + "_" + str(altPop) - - for returnfile in [in_ihs_file, in_delihh_file, in_xp_file, in_fst_deldaf_file]: - if not os.path.isfile(returnfile): - print("Missing: " + returnfile) - - return in_ihs_file, in_delihh_file, in_xp_file, in_fst_deldaf_file -def getSelFiles_asList(model, score, pop, listfilename, selbins, binclustername, altpop = "", overwrite = True, numPerBin = 500): - if not overwrite: - if os.path.isfile(listfilename): - print(listfilename + " exists.") - return listfilename - listfile = open(listfilename, 'w') - nFile = 0 - if score in ['ihs', 'delihh', 'nsl']: - assert(altpop == "") - if score in ['xpehh', 'deldaf', 'fst']: - assert(altpop != "") - if score in ['fst', 'deldaf']: - score = 'fst_deldaf' - - for selbin in selbins: - selfilebase = '/idi/sabeti-scratch/jvitti/scores/' + model + "/sel" + str(pop) + '/' + score + '/sel_' + str(selbin) + '/' - for irep in range(1, numPerBin+1): - if score == "ihs": - repfilename = selfilebase + "rep" + str(irep) + "_" + str(pop) + "_truncOk.ihs.out.norm" - elif score == "delihh": - repfilename = selfilebase + "rep" + str(irep) + "_" + str(pop) + ".txt.norm" - elif score == "nsl": - repfilename = selfilebase + "rep" + str(irep) + "_" + str(pop) + "." + score + ".out.norm" - elif score == "xpehh": - repfilename = selfilebase + "rep"+ str(irep) + "_" + str(pop) +"_" + str(altpop) + ".xpehh.out.norm" - elif score == "fst_deldaf": - repfilename = selfilebase + "rep"+ str(irep) + "_" + str(pop) +"_" + str(altpop) - if not os.path.isfile(repfilename): - print("missing: " + repfilename) - if os.path.isfile(repfilename): - listfile.write(repfilename + "\n") - nFile +=1 - listfile.close() - print('wrote ' + str(nFile) + ' files to ' + listfilename) - return listfilename -def getNeutFiles_asList(model, score, pop, listfilename, altpop = "", overwrite = True, numPerBin = 500): - if not overwrite: - if os.path.isfile(listfilename): - print(listfilename + " exists.") - return listfilename - listfile = open(listfilename, 'w') - nFile = 0 - if score in ['ihs', 'delihh', 'nsl']: - assert(altpop == "") - if score in ['xpehh', 'deldaf', 'fst']: - assert(altpop != "") - if score in ['fst', 'deldaf']: - score = 'fst_deldaf' - neutfilebase = '/idi/sabeti-scratch/jvitti/scores/' + model + "/neut/" + score + '/' - for irep in range(1, numPerBin+1): - if score == "ihs": - repfilename = neutfilebase + "rep" + str(irep) + "_" + str(pop) + "_normed.txt" - elif score == "delihh": - repfilename = neutfilebase + "rep" + str(irep) + "_" + str(pop) + ".txt.norm" - elif score == "nsl": - repfilename = neutfilebase + "rep" + str(irep) + "_" + str(pop) + "_normed.txt"#"." + score +# ".out.norm" - elif score == "xpehh": - repfilename = neutfilebase + "rep"+ str(irep) + "_" + str(pop) +"_" + str(altpop) + "_normed.txt" - elif score == "fst_deldaf": - repfilename = neutfilebase + "rep"+ str(irep) + "_" + str(pop) +"_" + str(altpop) - if not os.path.isfile(repfilename): - print("missing: " + repfilename) - if os.path.isfile(repfilename): - listfile.write(repfilename + "\n") - nFile +=1 - listfile.close() - print('wrote ' + str(nFile) + ' files to ' + listfilename) - return listfilename -def getNeutCompFiles(model, pop, writedir, writeprefix, overwrite=False, scoredir = "/idi/sabeti-scratch/jvitti/scores_composite2/"): - if writedir[-1] != "/": - writedir += "/" - writefilename = writedir + writeprefix + "_sel" + str(pop) +"_comp_neut.list" - if os.path.isfile(writefilename) and overwrite == False: - print("already exists: " + writefilename) - return writefilename - else: - modeldir = scoredir + model + "/neut/" - #print(modeldir) - modelcontents = os.listdir(modeldir) - selidstring = "_" + str(pop) +".cms.out" - filecontents = [item for item in modelcontents if selidstring in item and ".norm" not in item and ".decompose" not in item] - print('found ' + str(len(filecontents)) + ' files for pop...') - writefile = open(writefilename, 'w') - for filename in filecontents: - writefile.write(modeldir + filename + "\n") - writefile.close() - print('wrote to: ' + writefilename) - return writefilename -def getSelCompFiles(model, pop, writedir, writeprefix, selbins, binclustername, overwrite=False, scoredir = "/idi/sabeti-scratch/jvitti/scores_composite2/"): - if writedir[-1] != "/": - writedir += "/" - writefilename = writedir + writeprefix + "_sel" + str(pop) +"_comp_" + binclustername + ".list" - if os.path.isfile(writefilename) and overwrite == False: - print("already exists: " + writefilename) - return writefilename - else: - allcontents = [] - modeldir = scoredir + model + "/sel" + str(pop) + "/" - for selbin in selbins: - bindir = modeldir + "sel_" + str(selbin) + '/' - #print(bindir) - bincontents = os.listdir(bindir) - selidstring = "_" + str(pop) +"_vsneut.cms.out" - #print(selidstring) - filecontents = [bindir + item for item in bincontents if selidstring in item and ".norm" not in item and ".decompose" not in item] - allcontents.extend(filecontents) - print('found ' + str(len(allcontents)) + ' files for pop...') - writefile = open(writefilename, 'w') - for filename in allcontents: - writefile.write(filename + "\n") - writefile.close() - print('wrote to: ' + writefilename) - return writefilename -""" - diff --git a/cms/power/power_func.py b/cms/power/power_func.py index 1c05b16c..5a6517a2 100644 --- a/cms/power/power_func.py +++ b/cms/power/power_func.py @@ -1,8 +1,6 @@ -## /idi/sabeti-scratch/jvitti/cms2_power/ -## last updated 11.19.16 +## last updated 2.7.17 import numpy as np -#import scipy.stats import math ################# @@ -10,7 +8,7 @@ ################# def normalize(rawscore, mean, sd): rawscore, mean, sd = float(rawscore), float(mean), float(sd) - normalizedvalue = (rawscore - mean) / sd + normalizedvalue = (rawscore - mean) #/ sd return normalizedvalue ############################### @@ -91,9 +89,13 @@ def calc_pr(all_percentages, threshhold): numNeutReps_exceedThresh +=1 numNeutReps_exceedThresh, totalnumNeutReps = float(numNeutReps_exceedThresh), float(totalnumNeutReps) #print(str(numNeutReps_exceedThresh) + "\t" + str(totalnumNeutReps) + "\n") - fpr = numNeutReps_exceedThresh / totalnumNeutReps - #print("fpr: " + "\t" + str(fpr) + '\n') - return fpr + if totalnumNeutReps != 0: + pr = numNeutReps_exceedThresh / totalnumNeutReps + #print("fpr: " + "\t" + str(fpr) + '\n') + else: + pr = 0 + print('ERROR; empty set') + return pr def get_pval(all_simscores, thisScore): r = np.searchsorted(all_simscores,thisScore) @@ -150,12 +152,10 @@ def plotManhattan(ax, neut_rep_scores, emp_scores, chrom_pos, nSnps, maxSkipVal return ax def plotManhattan_extended(ax, emp_scores, chrom_pos, chrom): ''' makes a figure more like in Karlsson 2013 instead of Grossman 2013''' - - ax.plot(chrom_pos, emp_scores) + ax.plot(chrom_pos, emp_scores, linestyle='None', marker=".", markersize=.3, color="grey") ax.set_ylabel('chr' + str(chrom), fontsize=6, rotation='horizontal') labels = ax.get_yticklabels() ax.set_yticklabels(labels, fontsize=6) - #ax.get_xticks().set_visible(False) return ax def loadregions(regionfile): openfile = open(regionfile, 'r') @@ -173,12 +173,12 @@ def loadregions(regionfile): def quick_plot(ax, pos, val, ylabel,causal_index=-1): ax.scatter(pos, val, s=.8) if causal_index != -1: - print('huzzah!') + #print('huzzah!') ax.scatter(pos[causal_index], val[causal_index], color='r', s=4) for tick in ax.yaxis.get_major_ticks(): tick.label.set_fontsize('6') ax.set_ylabel(ylabel, fontsize='6') - ax.set_xlim([0, 1000000]) + ax.set_xlim([0, 1500000]) ax.yaxis.set_label_position('right') return ax def get_causal_rank(values, causal_val): diff --git a/cms/power/power_parser.py b/cms/power/power_parser.py index 65f329d6..9ac5c920 100644 --- a/cms/power/power_parser.py +++ b/cms/power/power_parser.py @@ -1,5 +1,5 @@ ## structures input to various functions of the script, power.py -## last updated 12.17.16 +## last updated 2.07.17 import argparse @@ -17,7 +17,6 @@ def full_parser_power(): run_sims_parser.add_argument('--paramfolder', type=str, action="store", help='location to read model parameters', default="/idi/sabeti-scratch/jvitti/params/") run_sel_sims_parser.add_argument('--trajdir', action="store") - run_neut_repscores_parser = subparsers.add_parser('run_neut_repscores', help ='run per-replicate component scores') run_sel_repscores_parser = subparsers.add_parser('run_sel_repscores', help='run per-replicate component scores') for run_repscores_parser in [run_neut_repscores_parser, run_sel_repscores_parser]: @@ -43,6 +42,8 @@ def full_parser_power(): composite_sims_parser = subparsers.add_parser('composite_sims', help='calculate composite scores for simulations') normsims_parser = subparsers.add_parser('normsims', help="normalize simulated composite scores to neutral") + normsims_parser.add_argument('--nrep_sel', type= int, action='store', default='500') + normsims_parser.add_argument('--nrep_neut', type= int, action='store', default='1000') ############################# ## LIKELIHOODS FROM SCORES ## @@ -69,8 +70,6 @@ def full_parser_power(): distviz_parser.add_argument('--empDist', action="store_true", default=False, help="visualize from empirical scores") distviz_parser.add_argument('--normed_cms', action="store_true", help="normalized values", default = False) - - ##################### ## QUANTIFY POWER ### ##################### @@ -81,9 +80,6 @@ def full_parser_power(): roc_parser.add_argument('--plot_curve', action="store_true", default=False) roc_parser.add_argument('--find_opt', action="store_true", default=False) roc_parser.add_argument('--maxFPR', type=float, action="store", default=.001) - #roc_parser.add_argument('--fprloc', type=str, action="store", default="/idi/sabeti-scratch/jvitti/cms2_power/fpr_4c/") - #roc_parser.add_argument('--tprloc', type=str, action="store", default="/idi/sabeti-scratch/jvitti/cms2_power/tpr_4c/") - cdf_parser.add_argument('--selPos', type=int, action='store', default=750000, help="position of the causal allele in simulates") @@ -95,11 +91,16 @@ def full_parser_power(): regionlog_parser = subparsers.add_parser('regionlog', help='write regions to excel sheet with gene overlap') regionlog_parser.add_argument('--gene_bedfile', help="name of file", type = str, action='store', default = "/idi/sabeti-scratch/jvitti/knownGenes_110116.txt") manhattan_parser = subparsers.add_parser('manhattan', help='generate manhattan plot of p-values of empirical results.') - manhattan_parser.add_argument('--zscores', action = 'store_true', help="plot -log10(p-values) estimated from neutral simulation") + manhattan_parser.add_argument('--zscores', action = 'store_true', help="plot -log10(p-values) estimated from neutral simulation") #nix manhattan_parser.add_argument('--maxSkipVal', help="expedite plotting by ignoring anything obviously insignificant", default=-10e10) manhattan_parser.add_argument('--poolModels', help="experimental - combine neut distributions from multiple demographic models") extended_manhattan_parser = subparsers.add_parser('extended_manhattan', help = "generate per-chrom plots as one fig") - extended_manhattan_parser.add_argument('--regionsfile', help="input file of regions designated as above threshhold") + extended_manhattan_parser.add_argument('--plotscore', help="string label for score to plot: {seldaf, ihs_normed, delihh_normed, nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed}", type=str, default="cms_normed") + extended_manhattan_parser.add_argument('--regionsfile', help="optional; input file of regions designated as above threshhold") + extended_manhattan_parser.add_argument('--percentile', help="percentile to hilite") + extended_manhattan_parser.add_argument('--titlestring', help="title for plot") + + ################## ## SHARED ARGS ### @@ -115,8 +116,6 @@ def full_parser_power(): makelikes_parser.add_argument('numPerBin', type=int, action='store', default=1000, help="number of replicates in each bin") makelikes_parser.add_argument('selPos', type=int, action='store', default=500000, help="position of the causal allele in simulates") makelikes_parser.add_argument('numLikesBins', type=int, action='store', default=60, help='number of histogram bins') - - for uselikes_parser in [write_master_likes_parser, composite_sims_parser, composite_emp_parser]: uselikes_parser.add_argument('--likes_basedir', default="/idi/sabeti-scratch/jvitti/likes_111516_b/", help="location of likelihood tables ") @@ -127,7 +126,6 @@ def full_parser_power(): regions_parser.add_argument('cutoff', type = float, action='store', help='minimum significant value for region definition', default="3.0") regions_parser.add_argument('--saveLog', type =str, help="save results as text file", ) - for sim_parser in [normsims_parser, repviz_parser, distviz_parser, fpr_parser, tpr_parser, run_neut_repscores_parser,run_norm_neut_repscores_parser, norm_from_binfile_parser, composite_sims_parser, run_neut_sims_parser, run_poppair_parser, run_sel_sims_parser, run_sel_repscores_parser, sel_norm_from_binfile_parser]: sim_parser.add_argument('--simpop', action='store', help='simulated population', default=1) @@ -135,27 +133,19 @@ def full_parser_power(): for emp_parser in [normemp_parser, manhattan_parser, extended_manhattan_parser, gw_regions_parser, distviz_parser]: emp_parser.add_argument('--emppop', action='store', help='empirical population', default="YRI") + for suffixed_parser in [normsims_parser, fpr_parser, tpr_parser, cdf_parser, normemp_parser, manhattan_parser, extended_manhattan_parser, gw_regions_parser, distviz_parser]: + suffixed_parser.add_argument('--suffix', type= str, action='store', default='') + for plot_parser in [repviz_parser, distviz_parser, manhattan_parser, extended_manhattan_parser, cdf_parser]: plot_parser.add_argument('--savefilename', action='store', help='path of file to save', default="/web/personal/vitti/test.png") for run_cms_parser in [run_neut_repscores_parser, run_norm_neut_repscores_parser, norm_from_binfile_parser, run_poppair_parser, composite_sims_parser, run_sel_repscores_parser, sel_norm_from_binfile_parser]: run_cms_parser.add_argument('--cmsdir', help='TEMPORARY, will become redundant with conda packaging', action = 'store', default= "/idi/sabeti-scratch/jvitti/cms/cms/") - for sel_sim_parser in [normsims_parser]: - sel_sim_parser.add_argument('--nrep_sel', type= int, action='store', default='500') - sel_sim_parser.add_argument('--nrep_neut', type= int, action='store', default='1000') - - - for sel_sim_parser in [normsims_parser, fpr_parser, tpr_parser, cdf_parser]: - sel_sim_parser.add_argument('--suffix', type= str, action='store', default='') - - - for commonparser in [normsims_parser, repviz_parser, distviz_parser, fpr_parser, normemp_parser, gw_regions_parser, manhattan_parser, run_norm_neut_repscores_parser,norm_from_binfile_parser, regionlog_parser, cdf_parser, tpr_parser, extended_manhattan_parser, run_neut_sims_parser, run_neut_repscores_parser, composite_sims_parser, run_poppair_parser, run_sel_sims_parser, run_sel_repscores_parser, sel_norm_from_binfile_parser]: commonparser.add_argument('--model', type=str, default="nulldefault") commonparser.add_argument('--nrep', type=int, default=1000) - commonparser.add_argument('--likessuffix', action='store', help='neut or linked', default="neut") return parser diff --git a/cms/tools/__init__.py b/cms/tools/__init__.py index ab332e15..2eaa4934 100644 --- a/cms/tools/__init__.py +++ b/cms/tools/__init__.py @@ -280,7 +280,7 @@ def __init__( last_path_component = os.path.basename(os.path.normpath(os.environ["CONDA_ENV_PATH"])) self.env_path = os.path.dirname(os.environ["CONDA_ENV_PATH"]) if last_path_component == "bin" else os.environ["CONDA_ENV_PATH"] elif "CONDA_DEFAULT_ENV" in os.environ and len(os.environ["CONDA_DEFAULT_ENV"]): - #_log.debug('CONDA_ENV_PATH not found, using CONDA_DEFAULT_ENV') + #_log.debug('CONDA_PREFIX not found, using CONDA_DEFAULT_ENV') conda_env_path = os.environ.get('CONDA_DEFAULT_ENV') # path to current conda environment if conda_env_path: if os.path.isdir(conda_env_path): @@ -421,9 +421,9 @@ def verify_install(self): def _attempt_install(self): try: # check for presence of conda command - util.misc.run_and_print(["conda", "-V"], loglevel=logging.INFO, check=True, env=self.conda_env) + util.misc.run_and_print(["conda", "-V"], buffered=True, check=True, env=self.conda_env, silent=True) except: - _log.debug("conda NOT installed") + _log.warning("conda NOT installed") self._is_attempted = True self.installed = False return @@ -475,7 +475,10 @@ def get_installed_version(self): return # return rather than raise so we can fall back to the next install method if data and len(data): - installed_package_string = data[0] + if isinstance(data[0], dict): + installed_package_string = data[0]["dist_name"] + else: + installed_package_string = data[0] package_info_re = re.compile(r"(?P.*)-(?P.*)-(?P.*)") matches = package_info_re.match(installed_package_string) if matches: @@ -708,4 +711,4 @@ def unpack(self, download_dir): else: shutil.move( os.path.join(download_dir, self.download_file), os.path.join(self.destination_dir, self.download_file) - ) + ) \ No newline at end of file