Merge pull request #55 from biolink/enrichment-docs

WIP: Enrichment docs
biolink · Jul 3, 2017 · c026697 · c026697
2 parents 360f2df + 0b844b8
commit c026697
Show file tree

Hide file tree

Showing 4 changed files with 134 additions and 17 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,11 @@
 CHANGES
 =======
 
+0.2.10
+------
+
+* Fixed bug with handling of replaced_by fields in obsolete nodes, #51
+
 0.2.9
 -----
 

diff --git a/bin/ontobio-assoc.py b/bin/ontobio-assoc.py
@@ -70,6 +70,8 @@ def main():
                         help='ECO class')
     parser.add_argument('-p', '--properties', nargs='*', type=str, required=False,
                         help='Properties')
+    parser.add_argument('-P', '--plot', type=bool, default=False,
+                        help='if set, plot output (requires plotly)')
     parser.add_argument('-y', '--yamlconfig', type=str, required=False,
                         help='Path to setup/configuration yaml file')
     parser.add_argument('-S', '--slim', type=str, default='', required=False,
@@ -94,7 +96,10 @@ def main():
     parser_n = subparsers.add_parser('enrichment', help='Perform an enrichment test')
     parser_n.add_argument('-q', '--query',type=str, help='query all genes for this class an use as subject')
     parser_n.add_argument('-H', '--hypotheses',nargs='*', help='list of classes to test against')
-    parser_n.add_argument('subjects',nargs='*')
+    parser_n.add_argument('-s', '--sample_file', type=str, help='file containing list of gene IDs in sample set')
+    parser_n.add_argument('-b', '--background_file', type=str, help='file containing list of gene IDs in background set')
+    parser_n.add_argument('-t', '--threshold', type=float, help='p-value threshold')
+    parser_n.add_argument('sample_ids', nargs='*', help='list of gene IDs in sample set')
     parser_n.set_defaults(function=run_enrichment_test)
 
     # PHENOLOG
@@ -195,12 +200,19 @@ def extract_ontology(ont, aset, args):
     w.outfile = args.outfile
     w.write(ont)
 
+
 def run_enrichment_test(ont, aset, args):
-    subjects = args.subjects
+    subjects = args.sample_ids
+    background = None
+    if args.sample_file is not None:
+        subjects = read_ids_from_file(args.sample_file)
+    if args.background_file is not None:
+        subjects = read_ids_from_file(args.background_file)
     if args.query is not None:
         subjects = aset.query([args.query])
     print("SUBJECTS q={} : {}".format(args.query, subjects))
-    enr = aset.enrichment_test(subjects=subjects, hypotheses=args.hypotheses, labels=True)
+    enr = aset.enrichment_test(subjects=subjects, background=background, hypotheses=args.hypotheses, threshold=args.threshold, labels=True)
+    print('ENRICHED_TERMS={}'.format(len(enr)))
     for r in enr:
         print("{:8.3g} {} {:40s}".format(r['p'],r['c'],str(r['n'])))
 
@@ -230,22 +242,24 @@ def run_phenolog(ont, aset, args):
 
 
 def run_query(ont, aset, args):
-    import plotly.plotly as py
-    import plotly.graph_objs as go
     subjects = aset.query(args.query, args.negative)
     for s in subjects:
         print("{} {}".format(s, str(aset.label(s))))
-    tups = aset.query_associations(subjects=subjects)
-    z, xaxis, yaxis = tuple_to_matrix(tups)
-    spacechar = " "
-    xaxis = mk_axis(xaxis, aset, args, spacechar=" ")
-    yaxis = mk_axis(yaxis, aset, args, spacechar=" ")
-    logging.info("PLOTTING: {} x {} = {}".format(xaxis, yaxis, z))
-    trace = go.Heatmap(z=z,
-                       x=xaxis,
-                       y=yaxis)
-    data=[trace]
-    py.plot(data, filename='labelled-heatmap')
+
+    if args.plot:
+        import plotly.plotly as py
+        import plotly.graph_objs as go
+        tups = aset.query_associations(subjects=subjects)
+        z, xaxis, yaxis = tuple_to_matrix(tups)
+        spacechar = " "
+        xaxis = mk_axis(xaxis, aset, args, spacechar=" ")
+        yaxis = mk_axis(yaxis, aset, args, spacechar=" ")
+        logging.info("PLOTTING: {} x {} = {}".format(xaxis, yaxis, z))
+        trace = go.Heatmap(z=z,
+                           x=xaxis,
+                           y=yaxis)
+        data=[trace]
+        py.plot(data, filename='labelled-heatmap')
 
 def run_query_associations(ont, aset, args):
     import plotly.plotly as py
@@ -438,5 +452,11 @@ def label_or_id(x, kb):
     else:
         return label
 
+def read_ids_from_file(fn):
+    f = open(fn, 'r')
+    ids = [id.strip() for id in f.readlines()]
+    f.close()
+    return ids
+
 if __name__ == "__main__":
     main()
diff --git a/docs/analyses.rst b/docs/analyses.rst
@@ -13,11 +13,103 @@ Enrichment
 
 See the `Notebook example <http://nbviewer.jupyter.org/github/biolink/ontobio/blob/master/notebooks/Phenotype_Enrichment.ipynb>`_
 
+OntoBio allows for generalized gene set enrichment: given a set of
+annotations that map genes to descriptor terms, and an input set of
+genes, and a background set, find what terms are enriched in the input
+set compared to the background.
+
+With OntoBio, enrichment tests work for any annotation corpus, not
+necessarily just gene-oriented. For example,
+disease-phenotype. However, care must be taken with underlying
+assumptions with non-gene sets.
+
+The very first thing you need to do before an enrichment analysis is
+fetch both an `Ontology` object and an `AsssociationSet` object. This
+could be a mix of local files or remote service/database. See
+:ref:`inputs` for details.
+
+Assume that we are using a remote ontology and local GAF:     
+
+.. code-block:: python
+
+    from ontobio import OntologyFactory
+    from ontobio import AssociationSetFactory
+    ofactory = OntologyFactory()
+    afactory = AssociationSetFactory()
+    ont = ofactory.create('go')
+    aset = afactory.create_from_gaf('my.gaf', ontology=ont)
+
+Assume also that we have a set of sample and background gene IDs, the
+test is:    
+
+.. code-block:: python
+
+    enr = aset.enrichment_test(subjects=gene_ids, background=background_gene_ids, threshold=0.00005, labels=True)    
+
+This returns a list of dicts (**TODO** - decide if we want to make
+this an object and follow a standard class model)
+
+**NOTE** the input gene IDs *must* be the same ones used in the
+AssociationSet. If you load from a GAF, this is the IDs that are
+formed by combining col1 and col2, separated by a
+":". E.g. UniProtKB:P123456
+
+What if you have different IDs? Or what if you just have a list of
+gene symbols? In this case you will need to *map* these names or IDs,
+the subject of the next section.
+
+Reproducibility
+~~~~~~~~~~~~~~~
+
+For reproducible analyses, use a **versioned PURL** for the ontology
+
+Command line wrapper
+~~~~~~~~~~~~~~~~~~~~
+
+You can use the `ontobio-assoc` command to run enrichment
+analyses. Some examples:
+
+Create a gene set for all genes in "regulation of bone development"
+(GO:1903010). Find other terms for which this is enriched (in human)
+
+.. code-block:: 
+
+    # find all mouse genes that have 'abnormal synaptic transmission' phenotype
+    # (using remote sparql service for MP, and default (Monarch) for associations
+    ontobio-assoc.py -v -r mp -T NCBITaxon:10090 -C gene phenotype query -q MP:0003635 > genes.txt
+
+    # get IDs
+    cut -f1 -d ' ' genes.txt > genes.ids
+
+    # enrichment, using GO
+    ontobio-assoc.py  -r go -T NCBITaxon:10090 -C gene function enrichment -s genes.ids
+
+    # resulting GO terms are not very surprising...
+    2.48e-12 GO:0045202 synapse
+    2.87e-11 GO:0044456 synapse part
+    3.66e-08 GO:0007270 neuron-neuron synaptic transmission
+    3.95e-08 GO:0098793 presynapse
+    1.65e-07 GO:0099537 trans-synaptic signaling
+    1.65e-07 GO:0007268 chemical synaptic transmission
+    
+
+Further reading
+~~~~~~~~~~~~~~~
+
+For API docs, see `enrichment_test in AssociationSet model <http://ontobio.readthedocs.io/en/latest/api.html#assocation-object-model>`_
+
+Identifier Mapping
+------------------
+
+**TODO**
+
 Semantic Similarity
 -------------------
 
 **TODO**
 
+To follow progress, see `this PR <https://github.com/biolink/ontobio/pull/49>`_
+
 Slimming
 --------
 

diff --git a/ontobio/__init__.py b/ontobio/__init__.py
@@ -1,6 +1,6 @@
 from __future__ import absolute_import
 
-__version__ = '0.2.9'
+__version__ = '0.2.10'
 
 from .ontol_factory import OntologyFactory
 from .io.ontol_renderers import GraphRenderer