diff --git a/intervene/__init__.py b/intervene/__init__.py index 4e42380..969c4f2 100755 --- a/intervene/__init__.py +++ b/intervene/__init__.py @@ -1 +1 @@ -__version__ = '0.33' +__version__ = '0.34' diff --git a/intervene/intervene b/intervene/intervene index 3fdc2dc..34618a0 100755 --- a/intervene/intervene +++ b/intervene/intervene @@ -183,11 +183,11 @@ def main(): #venn module argument parser venn_parser = subparsers.add_parser('venn', usage='intervene venn [options]', - description='Create Venn diagram upto 6-way after intersection of genomic regions in or list sets.', + description='Create Venn diagram upto 6-way after intersection of genomic regions in (BED/GTF/GFF) format or list sets.', help='Venn diagram of intersection of genomic regions or list sets (upto 6-way).') venn_parser.add_argument('-i', dest='input', nargs="*", - help='Input genomic regions in format or list files. ' + help='Input genomic regions in (BED/GTF/GFF) format or list files. ' 'For files in a directory use *.. e.g. *.bed') venn_parser.add_argument('--type', dest='type', default='genomic', choices=('genomic','list'), @@ -224,12 +224,16 @@ def main(): #upset module argument parser upset_parser = subparsers.add_parser('upset', usage='intervene upset [options]', - description='Create UpSet diagram after intersection of genomic regions in or list sets.', + description='Create UpSet diagram after intersection of genomic regions in (BED/GTF/GFF) or list sets.', help='UpSet diagram of intersection of genomic regions or list sets.') - + upset_parser.add_argument('-i', dest='input', nargs="*", - help='Input genomic regions in format or list files. ' + help='Input genomic regions in (BED/GTF/GFF) format or list files. ' 'For files in a directory use *.. e.g. *.bed') + ''' + upset_parser.add_argument('-c', dest='compare', nargs=1, + help='Input genomic region file (BED/GTF/GFF) format to compare with input files.') + ''' upset_parser.add_argument('--type', dest='type', choices=('genomic','list'),default='genomic', help='Type of input sets. Genomic regions or lists of genes sets. ' @@ -295,11 +299,11 @@ def main(): #pairwise module argument parser pairwise_parser = subparsers.add_parser('pairwise', usage='intervene pairwise [options]', - description='Pairwise intersection and heatmap of N genomic region sets in format.', - help='Pairwise intersection and heatmap of N genomic region sets in format.') + description='Pairwise intersection and heatmap of N genomic region sets in (BED/GTF/GFF/VCF) format.', + help='Pairwise intersection and heatmap of N genomic region sets in format.') pairwise_parser.add_argument('-i', dest='input', nargs="*", - help='Input genomic regions in format. ' + help='Input genomic regions in (BED/GTF/GFF) format. ' 'For files in a directory use *.. e.g. *.bed') pairwise_parser.add_argument('--type', choices=('count','frac','jaccard','fisher','reldist'), @@ -311,18 +315,22 @@ def main(): 'Default is "%(default)s"') pairwise_parser.add_argument('--htype', dest="htype",choices=("tribar","color", "pie","circle", "square", "ellipse", "number", "shade"), - default='tribar',help='Heatmap plot type. ' + default='pie',help='Heatmap plot type. ' 'Default is "%(default)s"') - pairwise_parser.add_argument('--names', dest='labels', default='A,B,C,D,E,F', + pairwise_parser.add_argument('--names', dest='labels', help='Comma-separated list of names for input files. ' - 'Default is: --names=A,B,C,D,E,F') + 'Default is base name of input files') pairwise_parser.add_argument('--filenames', action='store_true', default=False, help='Use file names as labels instead. ' 'Default is "%(default)s"') - - #''' + + pairwise_parser.add_argument('--sort', action='store_true', default=False, + help='Set this only if your files are not sorted. ' + 'Default is "%(default)s"') + + ''' pairwise_parser.add_argument('--enrichment', action='store_true', help='Run randomizations (default 1000, specify otherwise ' 'with --iterations) on each pairwise comparison and ' @@ -334,7 +342,7 @@ def main(): pairwise_parser.add_argument('--processes', default=1, type=int, help='Number of processors to perform for enrichement ' 'scores') - #''' + ''' pairwise_parser.add_argument('--genome', help='Required argument if --type=fisher. ' 'Needs to be a string assembly name like "mm10" or "hg38"') pairwise_parser.add_argument('-o', '--output', dest='output', @@ -398,6 +406,13 @@ def main(): pairwise.pairwise_intersection(options) sys.exit(1) + ''' + if options.command =='upset': + if options.compare: + upset.one_vs_rest_intersection(options.input,options.compare,options.output) + sys.exit(1) + ''' + output_name = options.output+'/InterVene_'+options.command+'_'+str(venn_order(options.input))+'way_'+plot_type+'.'+options.figtype #check file name already exists and ask if user want to overwrite diff --git a/intervene/modules/pairwise/pairwise.py b/intervene/modules/pairwise/pairwise.py index 5be620a..9aa974e 100644 --- a/intervene/modules/pairwise/pairwise.py +++ b/intervene/modules/pairwise/pairwise.py @@ -32,8 +32,8 @@ def jaccard_of_a(a, b): return a.jaccard(b,u=True)['jaccard'] #Calculate the fisher -def fisher_of_a(a, b): - return a.fisher(b,genome='hg19').two_tail +def fisher_of_a(a, b, genome): + return a.fisher(b,genome=genome).two_tail #Calculate the reldist def reldist_of_a(a, b): @@ -48,7 +48,7 @@ def enrichment_score(a, b, genome_fn, iterations=1000, processes=1): results = a.randomstats(b, new=True, genome_fn=genome_fn, iterations=iterations, processes=processes) return (results['actual'] + 1) / (results['median randomized'] + 1) -def create_matrix(beds, func, verbose=False, **kwoptions): +def create_matrix(beds, func, verbose=False, sort_bed=False, **kwoptions): nfiles = len(beds) total = nfiles ** 2 i = 0 @@ -58,10 +58,13 @@ def create_matrix(beds, func, verbose=False, **kwoptions): matrix = collections.defaultdict(dict) for fa in beds: a = BedTool(fa) + if sort_bed: + a = a.sort() for fb in beds: i += 1 b = BedTool(fb) - + if sort_bed: + b = b.sort() if verbose: sys.stderr.write( '%(i)s of %(total)s: %(fa)s + %(fb)s\n' % locals()) @@ -74,6 +77,7 @@ def create_matrix(beds, func, verbose=False, **kwoptions): return matrix, bed_names, bed_sizes + def barplot(series, matrix, outfile, options, max_size=1): """Create a bar plot and place the lower triangle of a heatmap directly adjacent so that the bases of the bars line up with the diagonal of the @@ -215,15 +219,18 @@ def heatmap_triangle(dataframe, axes, hlabel='Fraction of overlap'): return caxes, D.index + def pairwise_intersection(options): + ''' if options.enrichment: FUNC = enrichment_score genome_fn = pybedtools.chromsizes_to_file(pybedtools.chromsizes(options.genome)) kwoptions = dict(genome_fn=genome_fn, iterations=options.iterations, processes=options.processes) + ''' - elif options.type == "frac": + if options.type == "frac": FUNC = frac_of_a kwoptions = {} @@ -233,7 +240,9 @@ def pairwise_intersection(options): elif options.type == 'fisher': FUNC = fisher_of_a - kwoptions = {} + kwoptions = dict(genome=options.genome) + + #kwoptions = {} elif options.type == 'reldist': FUNC = reldist_of_a @@ -243,17 +252,13 @@ def pairwise_intersection(options): FUNC = actual_intersection kwoptions = {} - t0 = time.time() #matrix = create_matrix(beds=options.input, func=FUNC, verbose=options.verbose, **kwoptions) - matrix, bed_names, bed_sizes = create_matrix(beds=options.input, func=FUNC, verbose=False, **kwoptions) + matrix, bed_names, bed_sizes = create_matrix(beds=options.input, func=FUNC, verbose=False, sort_bed=options.sort, **kwoptions) - #print(bed_sizes) - - t1 = time.time() nfiles = len(options.input) - output_name = options.output+'/InterVene_'+options.command+'_'+str(nfiles)+'_files_' + output_name = options.output+'/Intervene_'+options.command+'_'+str(nfiles)+'_files_' #if options.verbose: # sys.stderr.write('Time to construct %s x %s matrix: %.1fs' \ @@ -303,13 +308,18 @@ def pairwise_intersection(options): #df = pd.read_csv('data.csv',index_col=0, delim_whitespace=True) #matrix = pd.DataFrame(np.random.random((nrows, ncols)), columns=labels) outfile = options.output+'/'+str(options.type)+'_barplot_heatmap.'+options.figtype - barplot(series, matrix, outfile, options, max_size=max(bed_sizes)) + barplot(series, matrix, outfile, options, max_size=max(bed_sizes)) + + print('\nYou are done! Please check your results @ '+options.output+'. \nThank you for using Intervene!\n') + else: #print("Please check the matrix file "+matrix_file) cmd = 'heatmap_intervene.R %s %s %s %s %s' % (matrix_file,options.htype,options.type, output_name,options.figtype) os.system(cmd) - print('\nYou are done! Please check your results @ '+options.output+'. \nThank you for using InterVene!\n') + print('\nYou are done! Please check your results @ '+options.output+'. \nThank you for using Intervene!\n') + + diff --git a/intervene/modules/upset/upset.py b/intervene/modules/upset/upset.py index a8204fc..45ae978 100644 --- a/intervene/modules/upset/upset.py +++ b/intervene/modules/upset/upset.py @@ -9,6 +9,9 @@ import sys import os import tempfile +from intervene.modules.pairwise.pairwise import get_name +from pybedtools import BedTool + def create_r_script(labels, names, options): """ @@ -130,4 +133,46 @@ def draw_genomic(labels, names, output, fig_type): cmd = 'upset_plot_intervene.R %s %s %s %s %s ' % ('genomic',len(key),temp_f.name, output, fig_type) os.system(cmd) sys.exit(1) + + +def one_vs_rest_intersection(beds, peaks, output, **kwoptions): + ''' + Compares a set of peaks with several other peaks sets. + + ''' + names = [] + matrix_file = output+'/One_vs_all_peak_set_matrix.txt' + f = open(matrix_file, 'w') + + f.write('peak_id') + #f.write('peak_id\tchrom\tstart\tend') + + for bed in beds: + names.append(get_name(bed)) + f.write('\t' + str(get_name(bed))) + #main_int.append(names) + + peaks = BedTool(peaks[0]) + f.write('\n') + for i in peaks: + #region_int = [] + peak_id = str(i.chrom)+"_"+str(i.start)+"_"+str(i.end) + f.write(peak_id) + #f.write(peak_id + '\t' + i.chrom + '\t' + str(i.start) + '\t' + str(i.end)) + + for bed in beds: + b = BedTool(bed) + int_count = BedTool(str(i), from_string=True).intersect(b).count() + if (int_count > 0): + #region_int.append("1") + f.write('\t' + str(1)) + else: + #region_int.append("0") + f.write('\t' + str(0)) + f.write('\n') + #main_int.append(region_int) + #matrix[peak_id] = region_int + f.close() + + return matrix_file \ No newline at end of file