Added sort option

asntech · Feb 5, 2017 · e8c11c8 · e8c11c8
1 parent 29b7938
commit e8c11c8
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 29 deletions.
diff --git a/intervene/__init__.py b/intervene/__init__.py
@@ -1 +1 @@
-__version__ = '0.33'
+__version__ = '0.34'
diff --git a/intervene/intervene b/intervene/intervene
@@ -183,11 +183,11 @@ def main():
 
     #venn module argument parser
     venn_parser = subparsers.add_parser('venn', usage='intervene venn [options]', 
-        description='Create Venn diagram upto 6-way after intersection of genomic regions in <BED/GTF/GFF/VCF> or list sets.', 
+        description='Create Venn diagram upto 6-way after intersection of genomic regions in (BED/GTF/GFF) format or list sets.', 
         help='Venn diagram of intersection of genomic regions or list sets (upto 6-way).')
 
     venn_parser.add_argument('-i', dest='input', nargs="*",
-        help='Input genomic regions in <BED/GTF/GFF/VCF> format or list files. '
+        help='Input genomic regions in (BED/GTF/GFF) format or list files. '
         'For files in a directory use *.<extension>. e.g. *.bed')
 
     venn_parser.add_argument('--type', dest='type', default='genomic', choices=('genomic','list'),
@@ -224,12 +224,16 @@ def main():
 
     #upset module argument parser
     upset_parser = subparsers.add_parser('upset', usage='intervene upset [options]', 
-        description='Create UpSet diagram after intersection of genomic regions in <BED/GTF/GFF/VCF> or list sets.', 
+        description='Create UpSet diagram after intersection of genomic regions in (BED/GTF/GFF) or list sets.', 
         help='UpSet diagram of intersection of genomic regions or list sets.')
-
+    
     upset_parser.add_argument('-i', dest='input', nargs="*",
-        help='Input genomic regions in <BED/GTF/GFF/VCF> format or list files. '
+        help='Input genomic regions in (BED/GTF/GFF) format or list files. '
         'For files in a directory use *.<extension>. e.g. *.bed')
+    '''
+    upset_parser.add_argument('-c', dest='compare', nargs=1,
+        help='Input genomic region file (BED/GTF/GFF) format to compare with input files.')
+    '''
 
     upset_parser.add_argument('--type', dest='type', choices=('genomic','list'),default='genomic',
                   help='Type of input sets. Genomic regions or lists of genes sets. '
@@ -295,11 +299,11 @@ def main():
 
     #pairwise module argument parser
     pairwise_parser = subparsers.add_parser('pairwise', usage='intervene pairwise [options]', 
-        description='Pairwise intersection and heatmap of N genomic region sets in <BED/GTF/GFF/VCF> format.',
-        help='Pairwise intersection and heatmap of N genomic region sets in <BED/GTF/GFF/VCF> format.')
+        description='Pairwise intersection and heatmap of N genomic region sets in (BED/GTF/GFF/VCF) format.',
+        help='Pairwise intersection and heatmap of N genomic region sets in <BED/GTF/GFF> format.')
 
     pairwise_parser.add_argument('-i', dest='input', nargs="*",
-        help='Input genomic regions in <BED/GTF/GFF/VCF> format. '
+        help='Input genomic regions in (BED/GTF/GFF) format. '
         'For files in a directory use *.<extension>. e.g. *.bed')
 
     pairwise_parser.add_argument('--type', choices=('count','frac','jaccard','fisher','reldist'),
@@ -311,18 +315,22 @@ def main():
                        'Default is "%(default)s"')
 
     pairwise_parser.add_argument('--htype', dest="htype",choices=("tribar","color", "pie","circle", "square", "ellipse", "number", "shade"), 
-                   default='tribar',help='Heatmap plot type. '
+                   default='pie',help='Heatmap plot type. '
                            'Default is "%(default)s"')  
 
-    pairwise_parser.add_argument('--names', dest='labels', default='A,B,C,D,E,F',
+    pairwise_parser.add_argument('--names', dest='labels',
                   help='Comma-separated list of names for input files. '
-                       'Default is: --names=A,B,C,D,E,F')
+                       'Default is base name of input files')
 
     pairwise_parser.add_argument('--filenames', action='store_true', default=False,
                   help='Use file names as labels instead. '
                        'Default is "%(default)s"')
-
-    #'''
+
+    pairwise_parser.add_argument('--sort', action='store_true', default=False,
+                  help='Set this only if your files are not sorted. '
+                       'Default is "%(default)s"')
+
+    '''
     pairwise_parser.add_argument('--enrichment', action='store_true',
                     help='Run randomizations (default 1000, specify otherwise '
                     'with --iterations) on each pairwise comparison and '
@@ -334,7 +342,7 @@ def main():
     pairwise_parser.add_argument('--processes', default=1, type=int,
                     help='Number of processors to perform for enrichement '
                     'scores')
-    #'''
+    '''
     pairwise_parser.add_argument('--genome', help='Required argument if --type=fisher. '
                     'Needs to be a string assembly name like "mm10" or "hg38"')
     pairwise_parser.add_argument('-o', '--output', dest='output', 
@@ -398,6 +406,13 @@ def main():
         pairwise.pairwise_intersection(options)
         sys.exit(1)
 
+    '''
+    if options.command =='upset':
+      if options.compare:
+        upset.one_vs_rest_intersection(options.input,options.compare,options.output)
+        sys.exit(1)
+    '''
+
     output_name =  options.output+'/InterVene_'+options.command+'_'+str(venn_order(options.input))+'way_'+plot_type+'.'+options.figtype
 
     #check file name already exists and ask if user want to overwrite

diff --git a/intervene/modules/pairwise/pairwise.py b/intervene/modules/pairwise/pairwise.py
@@ -32,8 +32,8 @@ def jaccard_of_a(a, b):
     return a.jaccard(b,u=True)['jaccard']
 
 #Calculate the fisher 
-def fisher_of_a(a, b):
-    return a.fisher(b,genome='hg19').two_tail
+def fisher_of_a(a, b, genome):
+    return a.fisher(b,genome=genome).two_tail
 
 #Calculate the reldist 
 def reldist_of_a(a, b):
@@ -48,7 +48,7 @@ def enrichment_score(a, b, genome_fn, iterations=1000, processes=1):
     results = a.randomstats(b, new=True, genome_fn=genome_fn, iterations=iterations, processes=processes)
     return (results['actual'] + 1) / (results['median randomized'] + 1)
 
-def create_matrix(beds, func, verbose=False, **kwoptions):
+def create_matrix(beds, func, verbose=False, sort_bed=False, **kwoptions):
     nfiles = len(beds)
     total = nfiles ** 2
     i = 0
@@ -58,10 +58,13 @@ def create_matrix(beds, func, verbose=False, **kwoptions):
     matrix = collections.defaultdict(dict)
     for fa in beds:
         a = BedTool(fa)
+        if sort_bed:
+            a = a.sort()
         for fb in beds:
             i += 1
             b = BedTool(fb)
-
+            if sort_bed:
+                b = b.sort()
             if verbose:
                 sys.stderr.write(
                         '%(i)s of %(total)s: %(fa)s + %(fb)s\n' % locals())
@@ -74,6 +77,7 @@ def create_matrix(beds, func, verbose=False, **kwoptions):
     return matrix, bed_names, bed_sizes
 
 
+
 def barplot(series, matrix, outfile, options, max_size=1):
     """Create a bar plot and place the lower triangle of a heatmap directly
     adjacent so that the bases of the bars line up with the diagonal of the
@@ -215,15 +219,18 @@ def heatmap_triangle(dataframe, axes, hlabel='Fraction of overlap'):
 
     return caxes, D.index
 
+
 def pairwise_intersection(options):
 
+    '''
     if options.enrichment:
         FUNC = enrichment_score
         genome_fn = pybedtools.chromsizes_to_file(pybedtools.chromsizes(options.genome))
         kwoptions = dict(genome_fn=genome_fn, iterations=options.iterations,
                 processes=options.processes)
+    '''
 
-    elif options.type == "frac":
+    if options.type == "frac":
         FUNC = frac_of_a
         kwoptions = {}
 
@@ -233,7 +240,9 @@ def pairwise_intersection(options):
 
     elif options.type == 'fisher':
         FUNC = fisher_of_a
-        kwoptions = {}
+        kwoptions = dict(genome=options.genome)
+
+        #kwoptions = {}
 
     elif options.type == 'reldist':
         FUNC = reldist_of_a
@@ -243,17 +252,13 @@ def pairwise_intersection(options):
         FUNC = actual_intersection
         kwoptions = {}
 
-    t0 = time.time()
     #matrix = create_matrix(beds=options.input, func=FUNC, verbose=options.verbose, **kwoptions)
-    matrix, bed_names, bed_sizes = create_matrix(beds=options.input, func=FUNC, verbose=False, **kwoptions)
+    matrix, bed_names, bed_sizes = create_matrix(beds=options.input, func=FUNC, verbose=False, sort_bed=options.sort, **kwoptions)
 
-    #print(bed_sizes)
-
-    t1 = time.time()
 
     nfiles = len(options.input)
 
-    output_name =  options.output+'/InterVene_'+options.command+'_'+str(nfiles)+'_files_'
+    output_name =  options.output+'/Intervene_'+options.command+'_'+str(nfiles)+'_files_'
 
     #if options.verbose:
     #    sys.stderr.write('Time to construct %s x %s matrix: %.1fs' \
@@ -303,13 +308,18 @@ def pairwise_intersection(options):
         #df = pd.read_csv('data.csv',index_col=0, delim_whitespace=True)
         #matrix = pd.DataFrame(np.random.random((nrows, ncols)), columns=labels)
         outfile = options.output+'/'+str(options.type)+'_barplot_heatmap.'+options.figtype
-        barplot(series, matrix, outfile, options, max_size=max(bed_sizes))  
+        barplot(series, matrix, outfile, options, max_size=max(bed_sizes))
+
+        print('\nYou are done! Please check your results @ '+options.output+'. \nThank you for using Intervene!\n')
+
 
     else:
         #print("Please check the matrix file "+matrix_file)
         cmd = 'heatmap_intervene.R %s %s %s %s %s' % (matrix_file,options.htype,options.type, output_name,options.figtype)
         os.system(cmd)
 
-        print('\nYou are done! Please check your results @ '+options.output+'. \nThank you for using InterVene!\n')
+        print('\nYou are done! Please check your results @ '+options.output+'. \nThank you for using Intervene!\n')
+
+
 
 
diff --git a/intervene/modules/upset/upset.py b/intervene/modules/upset/upset.py
@@ -9,6 +9,9 @@
 import sys
 import os
 import tempfile
+from intervene.modules.pairwise.pairwise import get_name
+from pybedtools import BedTool
+
 
 def create_r_script(labels, names, options):
     """
@@ -130,4 +133,46 @@ def draw_genomic(labels, names, output, fig_type):
     cmd = 'upset_plot_intervene.R %s %s %s %s %s ' % ('genomic',len(key),temp_f.name, output, fig_type)
     os.system(cmd)
     sys.exit(1)
+
+
+def one_vs_rest_intersection(beds, peaks, output, **kwoptions):
+    '''
+    Compares a set of peaks with several other peaks sets.
+
+    '''
+    names = []
+    matrix_file = output+'/One_vs_all_peak_set_matrix.txt'
+    f = open(matrix_file, 'w')
+
+    f.write('peak_id')
+    #f.write('peak_id\tchrom\tstart\tend')
+
+    for bed in beds:
+        names.append(get_name(bed))
+        f.write('\t' + str(get_name(bed)))
+    #main_int.append(names)
+
+    peaks = BedTool(peaks[0])
+    f.write('\n')
+    for i in peaks:
+        #region_int = []
+        peak_id = str(i.chrom)+"_"+str(i.start)+"_"+str(i.end)
+        f.write(peak_id)
+        #f.write(peak_id + '\t' + i.chrom + '\t' + str(i.start) + '\t' + str(i.end))
+
+        for bed in beds:
+            b = BedTool(bed)
+            int_count = BedTool(str(i), from_string=True).intersect(b).count()
+            if (int_count > 0):
+                #region_int.append("1")
+                f.write('\t' + str(1))
+            else:
+                #region_int.append("0")
+                f.write('\t' + str(0))
+        f.write('\n')
+        #main_int.append(region_int)
+        #matrix[peak_id] = region_int
+    f.close()
+
+    return matrix_file