Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[ENH] Add support for custom gene sets, reference genes and enrichment #67

Merged
merged 5 commits into from
Jun 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions orangecontrib/bioinformatics/geneset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ def list_all(**kwargs):
all_available = [filename_parse(f_name) for domain, f_name
in serverfiles.ServerFiles().listfiles(DOMAIN) + serverfiles.listfiles(DOMAIN)]
if organism:
return [(hier, org) for hier, org in all_available if org == organism]
return [hier for hier, org in all_available if org == organism]
else:
return all_available


def load_gene_sets(hierarchy, tax_id):
    """ Load the gene sets stored for one hierarchy of one organism.

    The file name is built with ``filename(hierarchy, tax_id)``, resolved to a
    local path through ``serverfiles.localpath_download`` and parsed with
    ``GeneSets.from_gmt_file_format``.

    Args:
        hierarchy: gene set hierarchy, passed unchanged to ``filename``.
        tax_id: organism identifier, passed unchanged to ``filename``.

    Returns:
        The object produced by ``GeneSets.from_gmt_file_format``.
    """
    file_path = serverfiles.localpath_download(DOMAIN, filename(hierarchy, tax_id))
    return GeneSets.from_gmt_file_format(file_path)
72 changes: 68 additions & 4 deletions orangecontrib/bioinformatics/geneset/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
""" GeneSets utility functions """
from typing import List, Tuple
import numpy as np


from typing import List, Tuple, NamedTuple

from orangecontrib.bioinformatics.geneset.config import GENE_SET_ATTRIBUTES
from orangecontrib.bioinformatics.utils import ensure_type
from orangecontrib.bioinformatics.utils.statistics import Hypergeometric


def filename(hierarchy, organism): # type: (Tuple[str, str], str) -> str
Expand Down Expand Up @@ -34,6 +39,17 @@ def filename_parse(fn): # type: (str) -> (Tuple[Tuple[str, str], str])
return hierarchy, org


# Module-level Hypergeometric instance; GeneSet.set_enrichment calls its
# p_value method to obtain the p-value of an enrichment.
HYPERGEOMETRIC = Hypergeometric()

# Record returned by GeneSet.set_enrichment:
#   query            - genes of the gene set that are also in the query
#   reference        - genes of the gene set that are also in the reference
#   p_value          - value returned by HYPERGEOMETRIC.p_value
#   enrichment_score - query ratio divided by reference ratio (may be NaN)
# change this when python 3.4 is not supported anymore
# NOTE(review): presumably "this" means the functional NamedTuple syntax,
# to be replaced by the class-based syntax - confirm.
enrichment_result = NamedTuple('enrichment_result', [
('query', set),
('reference', set),
('p_value', float),
('enrichment_score', float)
])


class GeneSet:
__slots__ = GENE_SET_ATTRIBUTES

Expand Down Expand Up @@ -68,6 +84,27 @@ def __eq__(self, other):

return False

def set_enrichment(self, reference, query):  # type: (List, List) -> enrichment_result
    """ Measure how strongly this gene set is represented in ``query`` compared to ``reference``.

    Args:
        reference: non-empty collection of reference genes.
        query: collection of query genes; may be empty.

    Returns:
        enrichment_result holding the genes of this set found in ``query``, the genes
        of this set found in ``reference``, the p-value reported by
        ``HYPERGEOMETRIC.p_value`` and the enrichment score (share of query hits divided
        by share of reference hits; NaN when the query is empty or the reference share
        is zero).
    """
    assert len(reference) > 0

    hits_in_query = self.genes.intersection(query)
    hits_in_reference = self.genes.intersection(reference)

    if query:
        query_share = len(hits_in_query) / len(query)
    else:
        query_share = np.nan

    if reference:
        reference_share = len(hits_in_reference) / len(reference)
    else:
        reference_share = np.nan

    # a zero reference share would mean dividing by zero, so the score is left undefined
    if reference_share:
        score = query_share / reference_share
    else:
        score = np.nan

    p_value = HYPERGEOMETRIC.p_value(
        len(hits_in_query), len(reference), len(hits_in_reference), len(query)
    )
    return enrichment_result(set(hits_in_query), set(hits_in_reference), p_value, score)

def gmt_description(self):
""" Represent GeneSet as line in GMT file format

Expand Down Expand Up @@ -134,12 +171,39 @@ def common_hierarchy(self):
for org in hierarchies:
return org

def delete_sets_by_hierarchy(self, hier):
    """ Remove every gene set that belongs to hierarchy ``hier`` from this collection.

    Nothing happens when no gene set is mapped to ``hier``.

    Args:
        hier: hierarchy key, looked up in the mapping returned by
            :meth:`map_hierarchy_to_sets`.
    """
    selected_sets = self.map_hierarchy_to_sets().get(hier)
    if not selected_sets:
        return

    # selected_sets is a separate container built by map_hierarchy_to_sets,
    # so removing from self while iterating over it is safe
    for gene_set in selected_sets:
        self.remove(gene_set)

def map_hierarchy_to_sets(self):
    """ Group the gene sets of this collection by their hierarchy.

    Returns:
        dict mapping each hierarchy to a new :class:`GeneSets` that holds the gene
        sets of that hierarchy, or an empty dict when a GeneSetException is raised
        while grouping.
    """
    try:
        split_by_hier = {hier: GeneSets() for hier in self.hierarchies()}
        # plain loop: the grouping is a side effect, no result list is needed
        for gs in self:
            split_by_hier[gs.hierarchy].update([gs])
        return split_by_hier

    except GeneSetException:
        return {}

def split_by_hierarchy(self):
    """ Split gene sets by hierarchies. Return a list of :class:`GeneSets` objects.

    The grouping is delegated to :meth:`map_hierarchy_to_sets`, which returns an
    empty dict when a GeneSetException is raised; in that case the result is an
    empty list.
    """
    return list(self.map_hierarchy_to_sets().values())

def genes(self):
    """ Collect the genes of every gene set in this collection.

    Returns:
        A new set with all genes from GeneSets (empty when there are no gene sets).
    """
    # union builds a fresh set, so the gene sets' own ``genes`` are never mutated
    return set().union(*(gene_set.genes for gene_set in self))

def to_gmt_file_format(self, file_path): # type: (str) -> None
""" The GMT file format is a tab delimited file format that describes gene sets.
Expand Down
Loading