
Merge pull request #56 from biolink/assocmodel-enhancement
Assocmodel enhancement
cmungall committed Jul 3, 2017
2 parents c026697 + c893324 commit c4e4ddb
Showing 12 changed files with 255 additions and 35 deletions.
1 change: 1 addition & 0 deletions bin/biogolr-search.py
@@ -107,6 +107,7 @@ def main():
for r in docs:
print(str(r))
else:
logging.info("FQ={}".format(args.fq))
q = GolrSearchQuery(args.search,
is_go=args.legacy_solr,
fq=args.fq,
39 changes: 29 additions & 10 deletions bin/ontobio-assoc.py
@@ -88,12 +88,14 @@ def main():
subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help')

# EXTRACT ONTOLOGY
parser_n = subparsers.add_parser('subontology', help='Extract sub-ontology')
parser_n = subparsers.add_parser('subontology',
help='Extract sub-ontology, include only annotated nodes or their descendants')
parser_n.add_argument('-M', '--minimal', dest='minimal', action='store_true', default=False, help='If set, remove non-MRCA nodes')
parser_n.set_defaults(function=extract_ontology)

# ENRICHMENT
parser_n = subparsers.add_parser('enrichment', help='Perform an enrichment test')
parser_n = subparsers.add_parser('enrichment',
help='Perform an enrichment test over a sample set of annotated entities')
parser_n.add_argument('-q', '--query',type=str, help='query all genes for this class an use as subject')
parser_n.add_argument('-H', '--hypotheses',nargs='*', help='list of classes to test against')
parser_n.add_argument('-s', '--sample_file', type=str, help='file containing list of gene IDs in sample set')
@@ -103,40 +105,46 @@ def main():
parser_n.set_defaults(function=run_enrichment_test)

# PHENOLOG
parser_n = subparsers.add_parser('phenolog', help='Perform multiple enrichment tests')
parser_n = subparsers.add_parser('phenolog',
help='Perform multiple enrichment tests, using a second ontology and assoc set to build gene sets')
parser_n.add_argument('-R', '--resource2',type=str, required=True, help='path to second GAF')
parser_n.add_argument('-F', '--file2',type=str, required=True, help='handle for second ontology')
parser_n.set_defaults(function=run_phenolog)

# QUERY
parser_n = subparsers.add_parser('query', help='Query based on positive and negative terms')
parser_n = subparsers.add_parser('query',
help='Query for entities (e.g. genes) based on positive and negative terms')
parser_n.add_argument('-q', '--query',nargs='*', help='positive classes')
parser_n.add_argument('-N', '--negative',type=str, help='negative classes')
parser_n.set_defaults(function=run_query)

# QUERY ASSOCIATIONS
parser_n = subparsers.add_parser('associations', help='Query for association pairs')
parser_n = subparsers.add_parser('associations',
help='Query for associations for a set of entities (e.g. genes)')
parser_n.add_argument('subjects',nargs='*', help='subject ids')
parser_n.set_defaults(function=run_query_associations)

# INTERSECTIONS
parser_n = subparsers.add_parser('intersections', help='Query intersections')
parser_n = subparsers.add_parser('intersections',
help='Query intersections')
parser_n.add_argument('-X', '--xterms',nargs='*', help='x classes')
parser_n.add_argument('-Y', '--yterms',nargs='*', help='y classes')
parser_n.add_argument('--useids',type=bool, default=False, help='if true, use IDs not labels on axes')
parser_n.add_argument('terms',nargs='*', help='all terms (x and y)')
parser_n.set_defaults(function=plot_intersections)

# INTERSECTION DENDROGRAM
parser_n = subparsers.add_parser('dendrogram', help='Plot dendrogram from intersections')
# INTERSECTION DENDROGRAM (TODO: merge into previous?)
parser_n = subparsers.add_parser('intersection-dendrogram',
help='Plot dendrogram from intersections')
parser_n.add_argument('-X', '--xterms',nargs='*', help='x classes')
parser_n.add_argument('-Y', '--yterms',nargs='*', help='y classes')
parser_n.add_argument('--useids',type=bool, default=False, help='if true, use IDs not labels on axes')
parser_n.add_argument('terms',nargs='*', help='all terms (x and y)')
parser_n.set_defaults(function=plot_term_intersection_dendrogram)

# SIMILARITY MATRIX
parser_n = subparsers.add_parser('simmatrix', help='Plot dendrogram for similarities between subjects')
# SIMILARITY MATRIX (may move to another module)
parser_n = subparsers.add_parser('simmatrix',
help='Plot dendrogram for similarities between subjects')
parser_n.add_argument('-X', '--xsubjects',nargs='*', help='x subjects')
parser_n.add_argument('-Y', '--ysubjects',nargs='*', help='y subjects')
parser_n.add_argument('--useids',type=bool, default=False, help='if true, use IDs not labels on axes')
@@ -202,6 +210,9 @@ def extract_ontology(ont, aset, args):


def run_enrichment_test(ont, aset, args):
"""
Runs aset.enrichment_test, printing results
"""
subjects = args.sample_ids
background = None
if args.sample_file is not None:
@@ -217,13 +228,17 @@ def run_enrichment_test(ont, aset, args):
print("{:8.3g} {} {:40s}".format(r['p'],r['c'],str(r['n'])))

def run_phenolog(ont, aset, args):
"""
Like run_enrichment_test, but uses classes from a 2nd ontology/assocset to build the gene set.
"""
ofactory = OntologyFactory()
ont2 = ofactory.create(args.resource2)

afactory = AssociationSetFactory()
aset2 = afactory.create(ontology=ont2,
file=args.file2)

# only test for genes (or other subjects of statements) in common
common = set(aset.subjects).intersection(aset2.subjects)
num_common = len(common)
logging.info("Genes in common between two KBs: {}/\{} = {}".format(len(aset.subjects), len(aset2.subjects), num_common))
@@ -242,6 +257,9 @@ def run_phenolog(ont, aset, args):


def run_query(ont, aset, args):
"""
Basic querying by positive/negative class lists
"""
subjects = aset.query(args.query, args.negative)
for s in subjects:
print("{} {}".format(s, str(aset.label(s))))
@@ -276,6 +294,7 @@ def run_query_associations(ont, aset, args):
py.plot(data, filename='labelled-heatmap')
#plot_dendrogram(z, xaxis, yaxis)

# TODO: fix this really dumb implementation
def tuple_to_matrix(tups):
import numpy as np
xset = set()
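
Each subcommand above binds its handler with set_defaults(function=...), and main() dispatches to args.function with the parsed arguments (the handlers shown here take (ont, aset, args)). A minimal, self-contained sketch of that dispatch pattern, simplified to a single-argument handler and not ontobio's actual code:

import argparse

def run_query(args):
    print("querying positive classes:", args.query)

def main():
    parser = argparse.ArgumentParser(description='demo of subcommand dispatch')
    subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help')

    parser_q = subparsers.add_parser('query',
                                     help='Query for entities based on positive and negative terms')
    parser_q.add_argument('-q', '--query', nargs='*', help='positive classes')
    parser_q.set_defaults(function=run_query)

    args = parser.parse_args()
    if not hasattr(args, 'function'):
        parser.print_help()
        return
    # each subparser registered its own handler; call it with the parsed args
    args.function(args)

if __name__ == '__main__':
    main()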
2 changes: 1 addition & 1 deletion docs/analyses.rst
@@ -72,7 +72,7 @@ analyses. Some examples:
Create a gene set for all genes in "regulation of bone development"
(GO:1903010). Find other terms for which this is enriched (in human)

.. code-block::
.. code-block:: console
# find all mouse genes that have 'abnormal synaptic transmission' phenotype
# (using remote sparql service for MP, and default (Monarch) for associations
56 changes: 47 additions & 9 deletions ontobio/assoc_factory.py
@@ -15,6 +15,7 @@
from ontobio.assocmodel import AssociationSet, AssociationSetMetadata
import ontobio.io.gafparser as px
from ontobio.io.gafparser import GafParser
from collections import defaultdict

SHELF_LIFE = datetime.timedelta(days=3)

@@ -96,7 +97,32 @@ def create_from_tuples(self, tuples, **args):
aset = AssociationSet(subject_label_map=subject_label_map, association_map=amap, **args)
return aset

def create_from_file(self, file=None, fmt='gaf', **args):
def create_from_assocs(self, assocs, **args):
"""
Creates from a list of association objects
"""
amap = defaultdict(list)
subject_label_map = {}
for a in assocs:
subj = a['subject']
subj_id = subj['id']
subj_label = subj['label']
subject_label_map[subj_id] = subj_label
if not a['negated']:
amap[subj_id].append(a['object']['id'])

aset = AssociationSet(subject_label_map=subject_label_map, association_map=amap, **args)
aset.associations_by_subj = defaultdict(list)
aset.associations_by_subj_obj = defaultdict(list)
for a in assocs:
sub_id = a['subject']['id']
obj_id = a['object']['id']
aset.associations_by_subj[sub_id].append(a)
aset.associations_by_subj_obj[(sub_id,obj_id)].append(a)

return aset

def create_from_file(self, file=None, fmt='gaf', skim=True, **args):
"""
Creates from a file.
@@ -108,9 +134,6 @@ def create_from_file(self, file=None, fmt='gaf', **args):
name of format e.g. gaf
"""
if isinstance(file,str):
file = open(file,"r")

p = None
if fmt == 'gaf':
p = px.GafParser()
@@ -121,16 +144,20 @@ def create_from_file(self, file=None, fmt='gaf', **args):
else:
logging.error("Format not recognized: {}".format(fmt))
logging.info("Parsing {} with {}/{}".format(file, fmt, p))
results = p.skim(file)
return self.create_from_tuples(results, **args)
if skim:
results = p.skim(file)
return self.create_from_tuples(results, **args)
else:
assocs = p.parse(file)
return self.create_from_assocs(assocs, **args)



def create_from_gaf(self, file, **args):
"""
Creates from a GAF file
"""
p = GafParser()
results = p.skim(file)
return self.create_from_tuples(results, **args)
return self.create_from_file(file, fmt='gaf', **args)

def create_from_phenopacket(self, file):
"""
@@ -143,6 +170,17 @@ def create_from_simple_json(self, file):
Creates from a simple json rendering
"""
pass

def create_from_remote_file(self, group, snapshot=True, **args):
"""
Creates from remote GAF
"""
import requests
url = "http://snapshot.geneontology.org/annotations/{}.gaf.gz".format(group)
r = requests.get(url, stream=True)
p = px.GafParser()
results = p.skim(r.raw)
return self.create_from_tuples(results, **args)


@cachier(stale_after=SHELF_LIFE)
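
The new create_from_assocs groups parsed association objects by subject, skipping negated ones, and indexes the raw associations by subject and by (subject, object). A standalone sketch of that grouping logic on a hand-written association dict (illustrative IDs, same shape as the objects used above):

from collections import defaultdict

assocs = [
    {'subject': {'id': 'MGI:1', 'label': 'Shh'},
     'object': {'id': 'GO:0007224'}, 'negated': False},
    {'subject': {'id': 'MGI:1', 'label': 'Shh'},
     'object': {'id': 'GO:0008589'}, 'negated': True},  # negated: excluded from the map
]

amap = defaultdict(list)              # subject id -> object (class) ids
subject_label_map = {}
associations_by_subj_obj = defaultdict(list)

for a in assocs:
    subj_id = a['subject']['id']
    subject_label_map[subj_id] = a['subject']['label']
    if not a['negated']:
        amap[subj_id].append(a['object']['id'])
    associations_by_subj_obj[(subj_id, a['object']['id'])].append(a)

print(dict(amap))            # {'MGI:1': ['GO:0007224']}
print(subject_label_map)     # {'MGI:1': 'Shh'}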
51 changes: 43 additions & 8 deletions ontobio/assocmodel.py
@@ -15,6 +15,7 @@
import logging
import scipy.stats # TODO - move
import scipy as sp # TODO - move
import pandas as pd

class UnknownSubjectException():
pass
@@ -43,6 +44,7 @@ def __init__(self, ontology=None, association_map={}, subject_label_map=None, me
self.subject_label_map = subject_label_map
self.subject_to_inferred_map = {}
self.meta = meta # TODO
self.associations_by_subj = None
self.associations_by_subj_obj = None
self.strict = False
self.index()
@@ -59,6 +61,11 @@ def index(self):
You do not need to call this yourself; called on initialization
"""
self.subjects = list(self.association_map.keys())

# ensure annotations unique
for (subj,terms) in self.association_map.items():
self.association_map[subj] = list(set(self.association_map[subj]))

logging.info("Indexing {} items".format(len(self.subjects)))
n = 0
all_objs = set()
@@ -249,6 +256,11 @@ def query_intersections(self, x_terms=[], y_terms=[], symmetric=False):
return ilist

def intersectionlist_to_matrix(self, ilist, xterms, yterms):
"""
WILL BE DEPRECATED
Replace with method to return pandas dataframe
"""
z = [ [0] * len(xterms) for i1 in range(len(yterms)) ]

xmap = {}
@@ -268,6 +280,24 @@ def intersectionlist_to_matrix(self, ilist, xterms, yterms):

logging.debug("Z={}".format(z))
return (z,xterms,yterms)

def as_dataframe(self):
"""
Return association set as pandas DataFrame
Each row is a subject (e.g. gene)
Each column is the inferred class used to describe the subject
"""
entries = []
subjs = self.subjects
for s in subjs:
vmap = {}
for c in self.inferred_types(s):
vmap[c] = 1
entries.append(vmap)
df = pd.DataFrame(entries, index=subjs)
df = df.fillna(0)
return df

def label(self, id):
"""
@@ -289,22 +319,27 @@ def subontology(self, minimal=False):
"""
return self.ontology.subontology(self.objects, minimal=minimal)

# TODO
def get_assocations(self, subj, obj):
def associations(self, subject, object=None):
"""
Given a subject-object pair (e.g. gene id to ontology class id), return all association
objects that match.
Status: not yet implemented
"""
if self.associations_by_subj_obj is not None:
return self.associations_by_subj_obj[subj][obj]
if object is None:
if self.associations_by_subj is not None:
return self.associations_by_subj[subject]
else:
return []
else:
return []

if self.associations_by_subj_obj is not None:
return self.associations_by_subj_obj[(subject,object)]
else:
return []

# TODO: consider moving to other module
def enrichment_test(self, subjects=[], background=None, hypotheses=None, threshold=0.05, labels=False, direction='greater'):
"""
Performs term enrichment analysis
Performs term enrichment analysis.
Arguments
---------
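
as_dataframe() turns the association set into a subject-by-class 0/1 matrix, with one row per subject and one column per inferred class. A sketch of that layout using made-up subject-to-class lists in place of aset.inferred_types(s) (which would also include ancestor classes):

import pandas as pd

inferred = {
    'MGI:1': ['GO:0007224', 'GO:0007165'],   # illustrative IDs
    'MGI:2': ['GO:0007165'],
}

entries = []
subjs = list(inferred.keys())
for s in subjs:
    # one column per class; 1 marks that the subject is annotated to it
    entries.append({c: 1 for c in inferred[s]})

df = pd.DataFrame(entries, index=subjs).fillna(0)
print(df)   # rows = subjects, columns = classes, values are 0/1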
10 changes: 10 additions & 0 deletions ontobio/golr/golr_associations.py
@@ -286,3 +286,13 @@ def calculate_information_content(**kwargs):
freq = fc/pop_size
icmap[f] = -math.log(freq, 2)
return icmap

from ontobio.vocabulary.relations import HomologyTypes

def get_homologs(gene, relation=HomologyTypes.Ortholog.value):
search_associations(subject_category='gene',
object_category='gene',
relation=relation,
subject=gene)
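
Note that as committed, get_homologs issues the search but does not return its result. A hedged caller-side sketch that returns the value, using only the keyword arguments shown above (the wrapper name and the example gene ID are illustrative; running it needs network access to the configured Golr endpoint):

from ontobio.golr.golr_associations import search_associations
from ontobio.vocabulary.relations import HomologyTypes

def get_homologs_returning(gene, relation=HomologyTypes.Ortholog.value):
    # same query as get_homologs above, but the response is handed back to the caller
    return search_associations(subject_category='gene',
                               object_category='gene',
                               relation=relation,
                               subject=gene)

# homologs = get_homologs_returning('NCBIGene:6469')   # illustrative gene ID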


5 changes: 3 additions & 2 deletions ontobio/golr/golr_query.py
@@ -252,7 +252,7 @@ def __init__(self,
url=None,
solr=None,
config=None,
fq=None,
fq={},
hl=True,
facet_fields=None,
search_fields=None,
@@ -283,7 +283,7 @@ def __init__(self,
def solr_params(self):
#facet_fields = [ map_field(fn, self.field_mapping) for fn in self.facet_fields ]

fq = {}
fq = self.fq
if self.category is not None:
fq['category'] = self.category

@@ -324,6 +324,7 @@ def solr_params(self):
'facet': 'on',
'facet.field': self.facet_fields,
'facet.limit': 25,
'facet.mincount': 1,
'fl': ",".join(select_fields),
"defType": "edismax",
"qf": qf,
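
solr_params now seeds fq from self.fq, whose default changed from None to {}. A general Python caveat, shown as a standalone sketch and not ontobio code: a mutable default argument is created once at definition time and shared across calls, so in-place updates such as fq['category'] = ... can leak between instances that relied on the default.

class Query:
    def __init__(self, fq={}):
        self.fq = fq

    def solr_params(self):
        self.fq['category'] = 'gene'   # mutates the shared default dict
        return self.fq

a = Query()
a.solr_params()
b = Query()
print(b.fq)   # {'category': 'gene'}; b sees a's mutation

# the conventional safe idiom:
class SafeQuery:
    def __init__(self, fq=None):
        self.fq = fq if fq is not None else {}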
