
Merge pull request #56 from biolink/assocmodel-enhancement
Assocmodel enhancement
cmungall committed Jul 3, 2017
2 parents c026697 + c893324 commit c4e4ddb
Showing 12 changed files with 255 additions and 35 deletions.
1 change: 1 addition & 0 deletions bin/biogolr-search.py
@@ -107,6 +107,7 @@ def main():
for r in docs:
print(str(r))
else:
logging.info("FQ={}".format(args.fq))
q = GolrSearchQuery(args.search,
is_go=args.legacy_solr,
fq=args.fq,
39 changes: 29 additions & 10 deletions bin/ontobio-assoc.py
@@ -88,12 +88,14 @@ def main():
subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help')

# EXTRACT ONTOLOGY
parser_n = subparsers.add_parser('subontology', help='Extract sub-ontology')
parser_n = subparsers.add_parser('subontology',
help='Extract sub-ontology, include only annotated nodes or their descendants')
parser_n.add_argument('-M', '--minimal', dest='minimal', action='store_true', default=False, help='If set, remove non-MRCA nodes')
parser_n.set_defaults(function=extract_ontology)

# ENRICHMENT
parser_n = subparsers.add_parser('enrichment', help='Perform an enrichment test')
parser_n = subparsers.add_parser('enrichment',
help='Perform an enrichment test over a sample set of annotated entities')
parser_n.add_argument('-q', '--query',type=str, help='query all genes for this class an use as subject')
parser_n.add_argument('-H', '--hypotheses',nargs='*', help='list of classes to test against')
parser_n.add_argument('-s', '--sample_file', type=str, help='file containing list of gene IDs in sample set')
@@ -103,40 +105,46 @@ def main():
parser_n.set_defaults(function=run_enrichment_test)

# PHENOLOG
parser_n = subparsers.add_parser('phenolog', help='Perform multiple enrichment tests')
parser_n = subparsers.add_parser('phenolog',
help='Perform multiple enrichment tests, using a second ontology and assoc set to build gene sets')
parser_n.add_argument('-R', '--resource2',type=str, required=True, help='path to second GAF')
parser_n.add_argument('-F', '--file2',type=str, required=True, help='handle for second ontology')
parser_n.set_defaults(function=run_phenolog)

# QUERY
parser_n = subparsers.add_parser('query', help='Query based on positive and negative terms')
parser_n = subparsers.add_parser('query',
help='Query for entities (e.g. genes) based on positive and negative terms')
parser_n.add_argument('-q', '--query',nargs='*', help='positive classes')
parser_n.add_argument('-N', '--negative',type=str, help='negative classes')
parser_n.set_defaults(function=run_query)

# QUERY ASSOCIATIONS
parser_n = subparsers.add_parser('associations', help='Query for association pairs')
parser_n = subparsers.add_parser('associations',
help='Query for associations for a set of entities (e.g. genes)')
parser_n.add_argument('subjects',nargs='*', help='subject ids')
parser_n.set_defaults(function=run_query_associations)

# INTERSECTIONS
parser_n = subparsers.add_parser('intersections', help='Query intersections')
parser_n = subparsers.add_parser('intersections',
help='Query intersections')
parser_n.add_argument('-X', '--xterms',nargs='*', help='x classes')
parser_n.add_argument('-Y', '--yterms',nargs='*', help='y classes')
parser_n.add_argument('--useids',type=bool, default=False, help='if true, use IDs not labels on axes')
parser_n.add_argument('terms',nargs='*', help='all terms (x and y)')
parser_n.set_defaults(function=plot_intersections)

# INTERSECTION DENDROGRAM
parser_n = subparsers.add_parser('dendrogram', help='Plot dendrogram from intersections')
# INTERSECTION DENDROGRAM (TODO: merge into previous?)
parser_n = subparsers.add_parser('intersection-dendrogram',
help='Plot dendrogram from intersections')
parser_n.add_argument('-X', '--xterms',nargs='*', help='x classes')
parser_n.add_argument('-Y', '--yterms',nargs='*', help='y classes')
parser_n.add_argument('--useids',type=bool, default=False, help='if true, use IDs not labels on axes')
parser_n.add_argument('terms',nargs='*', help='all terms (x and y)')
parser_n.set_defaults(function=plot_term_intersection_dendrogram)

# SIMILARITY MATRIX
parser_n = subparsers.add_parser('simmatrix', help='Plot dendrogram for similarities between subjects')
# SIMILARITY MATRIX (may move to another module)
parser_n = subparsers.add_parser('simmatrix',
help='Plot dendrogram for similarities between subjects')
parser_n.add_argument('-X', '--xsubjects',nargs='*', help='x subjects')
parser_n.add_argument('-Y', '--ysubjects',nargs='*', help='y subjects')
parser_n.add_argument('--useids',type=bool, default=False, help='if true, use IDs not labels on axes')
@@ -202,6 +210,9 @@ def extract_ontology(ont, aset, args):


def run_enrichment_test(ont, aset, args):
"""
Runs aset.enrichment_test, printing results
"""
subjects = args.sample_ids
background = None
if args.sample_file is not None:
@@ -217,13 +228,17 @@ def run_enrichment_test(ont, aset, args):
print("{:8.3g} {} {:40s}".format(r['p'],r['c'],str(r['n'])))

def run_phenolog(ont, aset, args):
"""
Like run_enrichment_test, but uses classes from a 2nd ontology/assocset to build the gene set.
"""
ofactory = OntologyFactory()
ont2 = ofactory.create(args.resource2)

afactory = AssociationSetFactory()
aset2 = afactory.create(ontology=ont2,
file=args.file2)

# only test for genes (or other subjects of statements) in common
common = set(aset.subjects).intersection(aset2.subjects)
num_common = len(common)
logging.info("Genes in common between two KBs: {}/\{} = {}".format(len(aset.subjects), len(aset2.subjects), num_common))
@@ -242,6 +257,9 @@ def run_phenolog(ont, aset, args):


def run_query(ont, aset, args):
"""
Basic querying by positive/negative class lists
"""
subjects = aset.query(args.query, args.negative)
for s in subjects:
print("{} {}".format(s, str(aset.label(s))))
@@ -276,6 +294,7 @@ def run_query_associations(ont, aset, args):
py.plot(data, filename='labelled-heatmap')
#plot_dendrogram(z, xaxis, yaxis)

# TODO: fix this really dumb implementation
def tuple_to_matrix(tups):
import numpy as np
xset = set()
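
Each subcommand above binds its handler with set_defaults(function=...), and main() dispatches to args.function with the parsed arguments (the handlers shown here take (ont, aset, args)). A minimal, self-contained sketch of that dispatch pattern, simplified to a single-argument handler and not ontobio's actual code:

import argparse

def run_query(args):
    print("querying positive classes:", args.query)

def main():
    parser = argparse.ArgumentParser(description='demo of subcommand dispatch')
    subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help')

    parser_q = subparsers.add_parser('query',
                                     help='Query for entities based on positive and negative terms')
    parser_q.add_argument('-q', '--query', nargs='*', help='positive classes')
    parser_q.set_defaults(function=run_query)

    args = parser.parse_args()
    if not hasattr(args, 'function'):
        parser.print_help()
        return
    # each subparser registered its own handler; call it with the parsed args
    args.function(args)

if __name__ == '__main__':
    main()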
2 changes: 1 addition & 1 deletion docs/analyses.rst
@@ -72,7 +72,7 @@ analyses. Some examples:
Create a gene set for all genes in "regulation of bone development"
(GO:1903010). Find other terms for which this is enriched (in human)

.. code-block::
.. code-block:: console
# find all mouse genes that have 'abnormal synaptic transmission' phenotype
# (using remote sparql service for MP, and default (Monarch) for associations
56 changes: 47 additions & 9 deletions ontobio/assoc_factory.py
@@ -15,6 +15,7 @@
from ontobio.assocmodel import AssociationSet, AssociationSetMetadata
import ontobio.io.gafparser as px
from ontobio.io.gafparser import GafParser
from collections import defaultdict

SHELF_LIFE = datetime.timedelta(days=3)

@@ -96,7 +97,32 @@ def create_from_tuples(self, tuples, **args):
aset = AssociationSet(subject_label_map=subject_label_map, association_map=amap, **args)
return aset

def create_from_file(self, file=None, fmt='gaf', **args):
def create_from_assocs(self, assocs, **args):
"""
Creates from a list of association objects
"""
amap = defaultdict(list)
subject_label_map = {}
for a in assocs:
subj = a['subject']
subj_id = subj['id']
subj_label = subj['label']
subject_label_map[subj_id] = subj_label
if not a['negated']:
amap[subj_id].append(a['object']['id'])

aset = AssociationSet(subject_label_map=subject_label_map, association_map=amap, **args)
aset.associations_by_subj = defaultdict(list)
aset.associations_by_subj_obj = defaultdict(list)
for a in assocs:
sub_id = a['subject']['id']
obj_id = a['object']['id']
aset.associations_by_subj[sub_id].append(a)
aset.associations_by_subj_obj[(sub_id,obj_id)].append(a)

return aset

def create_from_file(self, file=None, fmt='gaf', skim=True, **args):
"""
Creates from a file.
@@ -108,9 +134,6 @@ def create_from_file(self, file=None, fmt='gaf', **args):
name of format e.g. gaf
"""
if isinstance(file,str):
file = open(file,"r")

p = None
if fmt == 'gaf':
p = px.GafParser()
@@ -121,16 +144,20 @@ def create_from_file(self, file=None, fmt='gaf', **args):
else:
logging.error("Format not recognized: {}".format(fmt))
logging.info("Parsing {} with {}/{}".format(file, fmt, p))
results = p.skim(file)
return self.create_from_tuples(results, **args)
if skim:
results = p.skim(file)
return self.create_from_tuples(results, **args)
else:
assocs = p.parse(file)
return self.create_from_assocs(assocs, **args)



def create_from_gaf(self, file, **args):
"""
Creates from a GAF file
"""
p = GafParser()
results = p.skim(file)
return self.create_from_tuples(results, **args)
return self.create_from_file(file, fmt='gaf', **args)

def create_from_phenopacket(self, file):
"""
@@ -143,6 +170,17 @@ def create_from_simple_json(self, file):
Creates from a simple json rendering
"""
pass

def create_from_remote_file(self, group, snapshot=True, **args):
"""
Creates from remote GAF
"""
import requests
url = "http://snapshot.geneontology.org/annotations/{}.gaf.gz".format(group)
r = requests.get(url, stream=True)
p = px.GafParser()
results = p.skim(r.raw)
return self.create_from_tuples(results, **args)


@cachier(stale_after=SHELF_LIFE)
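
The new create_from_assocs groups parsed association objects by subject, skipping negated ones, and indexes the raw associations by subject and by (subject, object). A standalone sketch of that grouping logic on a hand-written association dict (illustrative IDs, same shape as the objects used above):

from collections import defaultdict

assocs = [
    {'subject': {'id': 'MGI:1', 'label': 'Shh'},
     'object': {'id': 'GO:0007224'}, 'negated': False},
    {'subject': {'id': 'MGI:1', 'label': 'Shh'},
     'object': {'id': 'GO:0008589'}, 'negated': True},  # negated: excluded from the map
]

amap = defaultdict(list)              # subject id -> object (class) ids
subject_label_map = {}
associations_by_subj_obj = defaultdict(list)

for a in assocs:
    subj_id = a['subject']['id']
    subject_label_map[subj_id] = a['subject']['label']
    if not a['negated']:
        amap[subj_id].append(a['object']['id'])
    associations_by_subj_obj[(subj_id, a['object']['id'])].append(a)

print(dict(amap))            # {'MGI:1': ['GO:0007224']}
print(subject_label_map)     # {'MGI:1': 'Shh'}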
51 changes: 43 additions & 8 deletions ontobio/assocmodel.py
@@ -15,6 +15,7 @@
import logging
import scipy.stats # TODO - move
import scipy as sp # TODO - move
import pandas as pd

class UnknownSubjectException():
pass
@@ -43,6 +44,7 @@ def __init__(self, ontology=None, association_map={}, subject_label_map=None, me
self.subject_label_map = subject_label_map
self.subject_to_inferred_map = {}
self.meta = meta # TODO
self.associations_by_subj = None
self.associations_by_subj_obj = None
self.strict = False
self.index()
@@ -59,6 +61,11 @@ def index(self):
You do not need to call this yourself; called on initialization
"""
self.subjects = list(self.association_map.keys())

# ensure annotations unique
for (subj,terms) in self.association_map.items():
self.association_map[subj] = list(set(self.association_map[subj]))

logging.info("Indexing {} items".format(len(self.subjects)))
n = 0
all_objs = set()
@@ -249,6 +256,11 @@ def query_intersections(self, x_terms=[], y_terms=[], symmetric=False):
return ilist

def intersectionlist_to_matrix(self, ilist, xterms, yterms):
"""
WILL BE DEPRECATED
Replace with method to return pandas dataframe
"""
z = [ [0] * len(xterms) for i1 in range(len(yterms)) ]

xmap = {}
@@ -268,6 +280,24 @@ def intersectionlist_to_matrix(self, ilist, xterms, yterms):

logging.debug("Z={}".format(z))
return (z,xterms,yterms)

def as_dataframe(self):
"""
Return association set as pandas DataFrame
Each row is a subject (e.g. gene)
Each column is the inferred class used to describe the subject
"""
entries = []
subjs = self.subjects
for s in subjs:
vmap = {}
for c in self.inferred_types(s):
vmap[c] = 1
entries.append(vmap)
df = pd.DataFrame(entries, index=subjs)
df = df.fillna(0)
return df

def label(self, id):
"""
@@ -289,22 +319,27 @@ def subontology(self, minimal=False):
"""
return self.ontology.subontology(self.objects, minimal=minimal)

# TODO
def get_assocations(self, subj, obj):
def associations(self, subject, object=None):
"""
Given a subject-object pair (e.g. gene id to ontology class id), return all association
objects that match.
Status: not yet implemented
"""
if self.associations_by_subj_obj is not None:
return self.associations_by_subj_obj[subj][obj]
if object is None:
if self.associations_by_subj is not None:
return self.associations_by_subj[subject]
else:
return []
else:
return []

if self.associations_by_subj_obj is not None:
return self.associations_by_subj_obj[(subject,object)]
else:
return []

# TODO: consider moving to other module
def enrichment_test(self, subjects=[], background=None, hypotheses=None, threshold=0.05, labels=False, direction='greater'):
"""
Performs term enrichment analysis
Performs term enrichment analysis.
Arguments
---------
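
as_dataframe() turns the association set into a subject-by-class 0/1 matrix, with one row per subject and one column per inferred class. A sketch of that layout using made-up subject-to-class lists in place of aset.inferred_types(s) (which would also include ancestor classes):

import pandas as pd

inferred = {
    'MGI:1': ['GO:0007224', 'GO:0007165'],   # illustrative IDs
    'MGI:2': ['GO:0007165'],
}

entries = []
subjs = list(inferred.keys())
for s in subjs:
    # one column per class; 1 marks that the subject is annotated to it
    entries.append({c: 1 for c in inferred[s]})

df = pd.DataFrame(entries, index=subjs).fillna(0)
print(df)   # rows = subjects, columns = classes, values are 0/1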
10 changes: 10 additions & 0 deletions ontobio/golr/golr_associations.py
@@ -286,3 +286,13 @@ def calculate_information_content(**kwargs):
freq = fc/pop_size
icmap[f] = -math.log(freq, 2)
return icmap

from ontobio.vocabulary.relations import HomologyTypes

def get_homologs(gene, relation=HomologyTypes.Ortholog.value):
search_associations(subject_category='gene',
object_category='gene',
relation=relation,
subject=gene)
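
Note that as committed, get_homologs issues the search but does not return its result. A hedged caller-side sketch that returns the value, using only the keyword arguments shown above (the wrapper name and the example gene ID are illustrative; running it needs network access to the configured Golr endpoint):

from ontobio.golr.golr_associations import search_associations
from ontobio.vocabulary.relations import HomologyTypes

def get_homologs_returning(gene, relation=HomologyTypes.Ortholog.value):
    # same query as get_homologs above, but the response is handed back to the caller
    return search_associations(subject_category='gene',
                               object_category='gene',
                               relation=relation,
                               subject=gene)

# homologs = get_homologs_returning('NCBIGene:6469')   # illustrative gene ID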


5 changes: 3 additions & 2 deletions ontobio/golr/golr_query.py
@@ -252,7 +252,7 @@ def __init__(self,
url=None,
solr=None,
config=None,
fq=None,
fq={},
hl=True,
facet_fields=None,
search_fields=None,
@@ -283,7 +283,7 @@ def __init__(self,
def solr_params(self):
#facet_fields = [ map_field(fn, self.field_mapping) for fn in self.facet_fields ]

fq = {}
fq = self.fq
if self.category is not None:
fq['category'] = self.category

@@ -324,6 +324,7 @@ def solr_params(self):
'facet': 'on',
'facet.field': self.facet_fields,
'facet.limit': 25,
'facet.mincount': 1,
'fl': ",".join(select_fields),
"defType": "edismax",
"qf": qf,
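
solr_params now seeds fq from self.fq, whose default changed from None to {}. A general Python caveat, shown as a standalone sketch and not ontobio code: a mutable default argument is created once at definition time and shared across calls, so in-place updates such as fq['category'] = ... can leak between instances that relied on the default.

class Query:
    def __init__(self, fq={}):
        self.fq = fq

    def solr_params(self):
        self.fq['category'] = 'gene'   # mutates the shared default dict
        return self.fq

a = Query()
a.solr_params()
b = Query()
print(b.fq)   # {'category': 'gene'}; b sees a's mutation

# the conventional safe idiom:
class SafeQuery:
    def __init__(self, fq=None):
        self.fq = fq if fq is not None else {}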
