-
Notifications
You must be signed in to change notification settings - Fork 30
/
assoc_factory.py
208 lines (168 loc) · 6.66 KB
/
assoc_factory.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""
Factory class for generating association sets from a variety of sources.
Supports golr queries, local association files (GAF, GPAD, HPOA), in-memory
tuples or association objects, and remote GAF snapshots.
"""
import networkx as nx
import pathlib
import logging
import os
import subprocess
import hashlib
from ontobio.golr.golr_associations import bulk_fetch
from ontobio.assocmodel import AssociationSet, AssociationSetMetadata
from ontobio.io.hpoaparser import HpoaParser
from ontobio.io.gpadparser import GpadParser
from ontobio.io.gafparser import GafParser
from ontobio.util.user_agent import get_user_agent
from collections import defaultdict
import json
logger = logging.getLogger(__name__)
class AssociationSetFactory():
    """
    Factory for creating AssociationSets

    Associations can be fetched from golr (GO and Monarch), parsed from
    GAF/GPAD/HPOA files, or assembled from in-memory tuples or association
    objects. Other stores are possible.
    """

    def __init__(self):
        """
        The factory keeps no state, so there is nothing to initialize.
        """

    def create(self, ontology=None, subject_category=None, object_category=None,
               evidence=None, taxon=None, relation=None, file=None, fmt=None,
               skim=True):
        """
        creates an AssociationSet

        If `file` is given, the associations are read from that file via
        `create_from_file`. Otherwise this uses an eager binding to a
        `ontobio.golr` instance: all compact associations for the particular
        combination of parameters are fetched.

        Arguments
        ---------
        ontology: an `Ontology` object
        subject_category: string representing category of subjects (e.g. gene, disease, variant)
        object_category: string representing category of objects (e.g. function, phenotype, disease)
        evidence: evidence filter, passed through to the bulk fetch
        taxon: string holding NCBITaxon:nnnn ID
        relation: accepted for interface compatibility; currently not used
        file: optional file or filename to load associations from instead of golr
        fmt: format name for `file` (e.g. gaf); None means infer from the file suffixes
        skim: passed to `create_from_file`; only relevant when `file` is given

        Returns
        -------
        AssociationSet
        """
        meta = AssociationSetMetadata(subject_category=subject_category,
                                      object_category=object_category,
                                      taxon=taxon)

        if file is not None:
            return self.create_from_file(file=file,
                                         fmt=fmt,
                                         ontology=ontology,
                                         meta=meta,
                                         skim=skim)

        logger.info("Fetching assocs from store")
        assocs = bulk_fetch_cached(subject_category=subject_category,
                                   object_category=object_category,
                                   evidence=evidence,
                                   taxon=taxon)

        logger.info("Creating map for {} subjects".format(len(assocs)))

        amap = {}
        subject_label_map = {}
        for a in assocs:
            subj = a['subject']
            subject_label_map[subj] = a['subject_label']
            # NOTE(review): plain assignment means that if the same subject
            # occurs in more than one record, only the objects of the last
            # record are kept -- confirm this is intended.
            amap[subj] = a['objects']

        return AssociationSet(ontology=ontology,
                              meta=meta,
                              subject_label_map=subject_label_map,
                              association_map=amap)

    def create_from_tuples(self, tuples, **args):
        """
        Creates from a list of (subj,subj_name,obj) tuples

        Extra keyword arguments are forwarded to the `AssociationSet`
        constructor.
        """
        amap = {}
        subject_label_map = {}
        for a in tuples:
            subj = a[0]
            subject_label_map[subj] = a[1]
            amap.setdefault(subj, []).append(a[2])
        return AssociationSet(subject_label_map=subject_label_map,
                              association_map=amap,
                              **args)

    def create_from_assocs(self, assocs, **args):
        """
        Creates from a list of association objects

        Each object must provide `to_hash_assoc()`. Negated associations are
        left out of the subject-to-objects map but are still recorded in the
        `associations_by_subj` and `associations_by_subj_obj` indexes set on
        the returned AssociationSet. Extra keyword arguments are forwarded to
        the `AssociationSet` constructor.
        """
        assocs = [a.to_hash_assoc() for a in assocs]

        # Diagnostic dump of the first record: sent to the debug log rather
        # than stdout, and skipped for empty input (indexing [0] would fail).
        if assocs and logger.isEnabledFor(logging.DEBUG):
            logger.debug("First association: %s", json.dumps(assocs[0], indent=4))

        amap = defaultdict(list)
        subject_label_map = {}
        by_subj = defaultdict(list)
        by_subj_obj = defaultdict(list)
        for a in assocs:
            subj = a['subject']
            subj_id = subj['id']
            obj_id = a['object']['id']
            subject_label_map[subj_id] = subj['label']
            if not a['negated']:
                amap[subj_id].append(obj_id)
            by_subj[subj_id].append(a)
            by_subj_obj[(subj_id, obj_id)].append(a)

        aset = AssociationSet(subject_label_map=subject_label_map,
                              association_map=amap,
                              **args)
        aset.associations_by_subj = by_subj
        aset.associations_by_subj_obj = by_subj_obj
        return aset

    def create_from_file(self, file=None, fmt='gaf', skim=True, **args):
        """
        Creates from a file. If fmt is set to None then the file suffixes will
        be used to choose a parser.

        Arguments
        ---------
        file : str or file
            input file or filename
        fmt : str
            name of format e.g. gaf (with or without a leading dot)
        skim : bool
            if True use the parser's `skim` and build from tuples; otherwise
            do a full `parse` and build from association objects

        Raises
        ------
        ValueError
            if no parser is registered for the requested or inferred format
        """
        if fmt is not None and not fmt.startswith('.'):
            fmt = '.{}'.format(fmt)

        parsers = {
            '.gaf': GafParser,
            '.gpad': GpadParser,
            '.hpoa': HpoaParser,
        }

        if fmt is None:
            filename = file if isinstance(file, str) else file.name
            suffixes = pathlib.Path(filename).suffixes
            # First registered extension that occurs among the file suffixes
            # (so e.g. "x.gaf.gz" still selects the GAF parser).
            parser_cls = next(
                (cls for ext, cls in parsers.items() if ext in suffixes), None)
        else:
            parser_cls = parsers.get(fmt)

        if parser_cls is None:
            logger.error("Format not recognized: {}".format(fmt))
            # Fail explicitly instead of falling through to an unbound parser.
            raise ValueError(
                "Format not recognized: {} (file: {})".format(fmt, file))

        parser = parser_cls()
        logger.info("Parsing {} with {}/{}".format(file, fmt, parser))

        if skim:
            results = parser.skim(file)
            return self.create_from_tuples(results, **args)

        assocs = parser.parse(file, skipheader=True)
        return self.create_from_assocs(assocs, **args)

    def create_from_gaf(self, file, **args):
        """
        Creates from a GAF file
        """
        return self.create_from_file(file, fmt='gaf', **args)

    def create_from_phenopacket(self, file):
        """
        Creates from a phenopacket file

        Not implemented yet; always returns None.
        """
        pass

    def create_from_simple_json(self, file):
        """
        Creates from a simple json rendering

        Not implemented yet; always returns None.
        """
        pass

    def create_from_remote_file(self, group, snapshot=True, **args):
        """
        Creates from remote GAF

        Downloads `<group>.gaf.gz` from snapshot.geneontology.org and skims it
        with a GafParser.

        Arguments
        ---------
        group : str
            name of the annotation group, used to build the download URL
        snapshot : bool
            accepted for interface compatibility; currently not used (the
            snapshot URL is always used)

        Raises
        ------
        requests.HTTPError
            if the server answers with an error status
        """
        import requests
        url = "http://snapshot.geneontology.org/annotations/{}.gaf.gz".format(group)
        r = requests.get(url,
                         stream=True,
                         headers={'User-Agent': get_user_agent(modules=[requests],
                                                               caller_name=__name__)})
        # Do not hand an HTTP error page to the GAF parser.
        r.raise_for_status()
        p = GafParser()
        # NOTE(review): r.raw is the undecoded response stream of a .gz file;
        # presumably GafParser.skim copes with gzip input -- confirm.
        results = p.skim(r.raw)
        return self.create_from_tuples(results, **args)
def bulk_fetch_cached(**args):
    """
    Fetch associations in bulk from the store.

    Logs the request and hands every keyword argument unchanged to
    `bulk_fetch`, returning its result. Despite the name and the log
    message, this function itself applies no caching.
    """
    logger.info("Fetching assocs from store (will be cached)")
    fetched = bulk_fetch(**args)
    return fetched