-
Notifications
You must be signed in to change notification settings - Fork 22
/
database.py
437 lines (373 loc) · 17.7 KB
/
database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
'''
database access functions (:mod:`calour.database`)
==================================================

.. currentmodule:: calour.database

Functions
^^^^^^^^^
.. autosummary::
   :toctree: generated

   add_terms_to_features
   enrichment
'''
# ----------------------------------------------------------------------------
# Copyright (c) 2016--, Calour development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from logging import getLogger
from abc import ABC
import importlib
from .util import get_config_value, get_config_file, get_config_sections
from .experiment import Experiment
logger = getLogger(__name__)
def _get_database_class(dbname, exp=None, config_file_name=None):
    '''Get the database interface object for the given database name

    Uses the calour config file (calour.config) keys ``class_name`` and
    ``module_name`` of the section named ``dbname`` to locate the interface class,
    imports the module and instantiates the class.

    Parameters
    ----------
    dbname : str
        the database name. common options are:
            'dbbact' : the amplicon sequence manual annotation database
            'spongeworld' : the sponge microbiome database
            'redbiome' : the qiita automatic amplicon sequence database
        Names are listed in the calour.config file as section names
    exp : Experiment or None, optional
        passed as the single argument to the database class constructor
    config_file_name: str or None, optional
        None (default) to use the default calour config file.
        str to use the file named str as the config file

    Returns
    -------
    calour.database.Database or None
        An instance of the database interface class for the requested dbname.
        None (after logging a warning) if the interface module cannot be imported
        or if dbname has no ``class_name``/``module_name`` entries in the config file.
    '''
    class_name = get_config_value('class_name', section=dbname, config_file_name=config_file_name)
    module_name = get_config_value('module_name', section=dbname, config_file_name=config_file_name)
    if class_name is not None and module_name is not None:
        try:
            # import the database module
            db_module = importlib.import_module(module_name)
        except ImportError:
            module_website = get_config_value('website', section=dbname, config_file_name=config_file_name)
            module_installation = get_config_value('installation', section=dbname, config_file_name=config_file_name)
            logger.warning('Database interface %s not installed.\nSkipping.\n'
                           'You can install the database using:\n%s\n'
                           'For details see: %s', module_name, module_installation, module_website)
            return None
        # get the class and create the interface instance linked to exp
        DBClass = getattr(db_module, class_name)
        return DBClass(exp)

    # not found, so print available database names.
    # A section counts as a database only if it defines both class_name and module_name
    # (same criterion as used above for dbname).
    # NOTE(review): get_config_sections()/get_config_file() are called without
    # config_file_name, so they presumably refer to the default config file - confirm.
    databases = []
    for csection in get_config_sections():
        class_name = get_config_value('class_name', section=csection, config_file_name=config_file_name)
        module_name = get_config_value('module_name', section=csection, config_file_name=config_file_name)
        if class_name is not None and module_name is not None:
            databases.append(csection)
    if len(databases) == 0:
        logger.warning('calour config file %s does not contain any database sections. Skipping', get_config_file())
        return None
    logger.warning('Database %s not found in config file (%s).\nSkipping.\n'
                   'Current databases in config file: %s', dbname, get_config_file(), databases)
    return None
def add_terms_to_features(exp: Experiment, dbname, use_term_list=None, field_name='common_term', term_type=None, ignore_exp=None, **kwargs):
    '''Add a field to the feature metadata, with most common term for each feature

    Create a new feature_metadata field, with the most common term (out of term_list) for each feature in experiment.
    It adds annotations in-place.

    Parameters
    ----------
    exp : Experiment
        The experiment whose features are annotated (modified in-place)
    dbname : str
        the database to get the terms from (section name in the calour config file)
    use_term_list : list of str or None, optional
        Use only terms appearing in this list
        None (default) to use all terms
    field_name : str, optional
        Name of feature_metadata field to store the annotations.
    term_type : str or None, optional
        type of the annotation summary to get from the database (db specific)
        None to get default type
    ignore_exp : list of int or None, optional
        list of experiments to ignore when adding the terms
    **kwargs: database specific additional parameters (see database interface get_feature_terms() function for specific terms)

    Returns
    -------
    Experiment
        with feature_metadata field containing the most common database term for each feature.
        Features with no terms get 'na'. When use_term_list is supplied, features with no
        positive-scoring term from the list get 'other'.

    Raises
    ------
    ValueError
        if the database interface for dbname is not available (not in config file or not installed)
    '''
    logger.debug('Adding terms to features for database %s' % dbname)
    db = _get_database_class(dbname, exp)
    if db is None:
        # _get_database_class() already logged the details; fail with a clear error
        # instead of an AttributeError on None below
        raise ValueError('database %s not found or not installed' % dbname)
    features = exp.feature_metadata.index.values
    logger.debug('found %d features' % len(features))

    # get the per feature term scores
    term_list = db.get_feature_terms(features, exp=exp, term_type=term_type, ignore_exp=ignore_exp, **kwargs)
    logger.debug('got %d terms from database' % len(term_list))

    # find the most enriched term (out of the list) for each feature
    feature_terms = []
    for cfeature in features:
        # features unknown to the database or without any terms are marked 'na'
        if cfeature not in term_list or len(term_list[cfeature]) == 0:
            feature_terms.append('na')
            continue
        cscores = term_list[cfeature]
        if use_term_list is None:
            # highest scoring term out of all the terms of the feature
            bterm = max(cscores, key=cscores.get)
        else:
            # highest scoring term out of use_term_list; only scores > 0 can replace 'other'
            bterm = 'other'
            bscore = 0
            for cterm in use_term_list:
                if cterm not in cscores:
                    continue
                cscore = cscores[cterm]
                if cscore > bscore:
                    bscore = cscore
                    bterm = cterm
        feature_terms.append(bterm)
    exp.feature_metadata[field_name] = feature_terms
    return exp
def enrichment(exp: Experiment, features, dbname, *args, **kwargs):
    '''Get the list of enriched annotation terms in features compared to all other features in exp.

    Uses the database specific enrichment analysis method.

    Parameters
    ----------
    exp : Experiment
        The experiment containing the features
    features : list of str
        The features to test for enrichment (compared to all other features in exp)
    dbname : str
        the database to use for the annotation terms and enrichment analysis
    *args : tuple
    **kwargs : dict
        Additional database specific parameters (see per-database module documentation for .enrichment() method)

    Returns
    -------
    The return value of the database interface .enrichment() method. As documented here:
    pandas.DataFrame with info about significantly enriched terms. The columns include:
        * 'feature' : the feature ID (str)
        * 'pval' : the p-value for the enrichment (float)
        * 'odif' : the effect size (float)
        * 'observed' : the number of observations of this term in group1 (int)
        * 'expected' : the expected (based on all features) number of observations of this term in group1 (float)
        * 'frac_group1' : fraction of total terms in group 1 which are the specific term (float)
        * 'frac_group2' : fraction of total terms in group 2 which are the specific term (float)
        * 'num_group1' : number of total terms in group 1 which are the specific term (float)
        * 'num_group2' : number of total terms in group 2 which are the specific term (float)
        * 'description' : the term (str)
    numpy.Array where rows are features (ordered like the dataframe), columns are terms, and value is score
        for term in feature
    pandas.DataFrame with info about the features used. columns:
        * 'group' : int, the group (1/2) to which the feature belongs
        * 'sequence': str

    Raises
    ------
    ValueError
        if the database interface for dbname is not available (not in config file or not installed),
        or if it does not support enrichment analysis
    '''
    db = _get_database_class(dbname, exp=exp)
    if db is None:
        # _get_database_class() already logged the details; fail with a clear error
        # instead of an AttributeError on None below
        raise ValueError('database %s not found or not installed' % dbname)
    if not db.can_do_enrichment:
        raise ValueError('database %s does not support enrichment analysis' % dbname)
    return db.enrichment(exp, features, *args, **kwargs)
class Database(ABC):
    '''Generic base class for calour database interfaces.

    Database interface modules subclass this class and override the methods they support.
    The implementations here are generic fallbacks: they log a debug message and return
    an empty result, return 'Not implemented', or raise NotImplementedError.
    '''

    def __init__(self, exp=None, database_name=None, methods=('get', 'annotate', 'enrichment')):
        '''Initialize the database interface

        Parameters
        ----------
        exp : Experiment or None, optional
            The experiment link for the database (if needed)
        database_name : str, optional
            name of the database (for showing errors, etc.)
        methods : iterable of str, optional
            The integration level this database interface supports.
            'get' if database interface supports get_seq_annotation_strings()
            'annotate' if database interface supports add_annotation()
            'enrichment' if database interface supports get_feature_terms()
        '''
        self.database_name = database_name
        # stored as a set (a copy), so the caller's iterable is never shared or modified
        self._methods = set(methods)
        self._exp = exp

    @property
    def annotatable(self):
        '''True if the database supports adding annotations via the add_annotation() function
        '''
        return 'annotate' in self._methods

    @property
    def can_do_enrichment(self):
        '''True if the database supports getting a dict of terms per feature via the get_feature_terms() function
        '''
        return 'enrichment' in self._methods

    def get_seq_annotation_strings(self, feature):
        '''Get nice string summaries of annotations for a given sequence

        Parameters
        ----------
        feature : str
            the feature ID to query the database about

        Returns
        -------
        list of [(dict,str)] (annotationdetails,annotationsummary)
            a list of:
                annotationdetails : dict
                    'annotationid' : int, the annotation id in the database
                    'annotationtype : str
                    ...
                annotationsummary : str
                    a short user readable summary of the annotation.
                    This will be displayed for the user
            The generic implementation returns an empty list.
        '''
        logger.debug('Generic function for get_annotation_strings')
        return []

    def get_annotation_website(self, annotation):
        '''Get the database website address of information about the annotation.

        Used for the Jupyter GUI when clicking on an annotation in the list
        (to open in a new browser tab)

        Parameters
        ----------
        annotation : dict
            keys/values are database specific (supplied by the database interface when calling get_annotation_strings() ).
            These keys/values can be used by the database interface here to determine which website address to return.

        Returns
        -------
        str or None
            The webaddress of the html page with details about the annotation,
            or None if not available (the generic implementation always returns None)
        '''
        logger.debug('Generic function for get_annotation_website')
        return None

    def show_annotation_info(self, annotation):
        '''Show details about the annotation.

        This should use a database interface created GUI to show more details about the annotation.
        Called from the qt5 heatmap GUI when double clicking on a database annotation.
        A common GUI can be a new browser window with details about the annotation.
        The generic implementation does nothing.

        Parameters
        ----------
        annotation : dict
            keys/values are database specific (supplied by the database interface when calling get_annotation_strings() ).
            These keys/values can be used by the database interface here to determine which website address to return.
        '''
        logger.debug('Generic function for show annotation info')
        return

    def add_annotation(self, features, exp):
        '''Add an entry to the database about a set of features.

        This is an optional function for databases that support manual annotations (level L4).
        supporting this option is indicated by the "annotate" method in __init__()
        It is called from the qt5 heatmap GUI when pressing the "Annotate" button.
        All GUI should be supplied by the database interface.

        Parameters
        ----------
        features : list of str
            the features to add to the database
        exp : Experiment
            the experiment where the features are coming from

        Returns
        -------
        err : str
            empty if ok, otherwise the error encountered

        Raises
        ------
        NotImplementedError
            always, in this generic implementation
        '''
        logger.debug('Generic function for add_annotations')
        raise NotImplementedError

    def upadte_annotation(self, annotation, exp=None):
        '''Update an existing annotation

        NOTE: the method name is misspelled ("upadte"). It is kept for backward compatibility;
        update_annotation() is the correctly spelled equivalent.

        This is an optional function for databases that support manual annotations (level L4).
        supporting this option is indicated by the "annotate" method in __init__().
        It is called when right clicking on an annotation in the qt5 GUI and selecting "update".
        All GUI should be supplied by the database interface.

        Parameters
        ----------
        annotation : dict
            The annotation to update (keys/values are database specific)
        exp : Experiment, optional
            The calour experiment from which the annotation is coming from

        Returns
        -------
        str
            empty if ok, otherwise the error encountered

        Raises
        ------
        NotImplementedError
            always, in this generic implementation
        '''
        logger.debug('Generic function for update_annotation')
        raise NotImplementedError

    def update_annotation(self, annotation, exp=None):
        '''Update an existing annotation (correctly spelled alias of upadte_annotation())

        Delegates to upadte_annotation(), so database interfaces that override the
        legacy (misspelled) method are supported through this name as well.

        Parameters
        ----------
        annotation : dict
            The annotation to update (keys/values are database specific)
        exp : Experiment, optional
            The calour experiment from which the annotation is coming from

        Returns
        -------
        str
            empty if ok, otherwise the error encountered
        '''
        return self.upadte_annotation(annotation, exp=exp)

    def delete_annotation(self, annotation_details):
        '''Delete an annotation from the database (if allowed). All features associated with this annotation
        lose this annotation.

        This is an optional function for databases that support manual annotations (level L4).
        supporting this option is indicated by the "annotate" method in __init__()
        It is called when right clicking on an annotation in the qt5 GUI and selecting "delete".

        Parameters
        ----------
        annotation_details : dict
            The details about the annotation to delete (annotationdetails from get_seq_annotation_strings() )
            Should contain a unique identifier for the annotation (created/used by the database)

        Returns
        -------
        str
            empty if ok, otherwise the error encountered
            (the generic implementation always returns 'Not implemented')
        '''
        logger.debug('Generic function for delete_annotation')
        return 'Not implemented'

    def remove_feature_from_annotation(self, features, annotation_details):
        '''remove a feature from the annotation in the database (if allowed). If after the removal the annotation contains no features,
        it will be removed from the database. Otherwise, the annotation remains for the features not removed from it.

        This is an optional function for databases that support manual annotations (level L4).
        supporting this option is indicated by the "annotate" method in __init__()
        It is called when right clicking on an annotation in the qt5 GUI and selecting "remove feature".

        Parameters
        ----------
        features : list of str
            The feature ids to remove
        annotation_details : dict
            The details about the annotation to delete (annotationdetails from get_seq_annotation_strings() )
            Should contain a unique identifier for the annotation (created/used by the database)

        Returns
        -------
        str
            empty if ok, otherwise the error encountered
            (the generic implementation always returns 'Not implemented')
        '''
        logger.debug('Generic function for remove_features_from_annotation')
        return 'Not implemented'

    def get_feature_terms(self, features, exp=None, term_type=None, ignore_exp=None, **kwargs):
        '''Get list of terms per feature

        Parameters
        ----------
        features : list of str
            the features to get the terms for
        exp : Experiment, optional
            not None to store results in the exp (to save time for multiple queries)
        term_type : str or None, optional
            type of the annotation summary to get from the database (db specific).
            Accepted (and ignored) by the generic implementation so that
            calour.database.add_terms_to_features(), which always passes it, can call it.
        ignore_exp : list of int or None, optional
            list of experiments to ignore. Accepted (and ignored) by the generic implementation.
        **kwargs : dict
            database specific additional parameters. Ignored by the generic implementation.

        Returns
        -------
        feature_terms : dict of term scores associated with each feature.
            Key is the feature (str), and the value is a dict of score for each term (i.e. key is the term str, value is the score for this term in this feature)
            The generic implementation returns an empty dict.
        '''
        logger.debug('Generic function for get_feature_terms')
        return {}

    def enrichment(self, exp, features, *args, **kwargs):
        '''Get the list of enriched terms in features compared to all features in exp.

        This is an optional function for databases that support enrichment analysis (level L3).

        Parameters
        ----------
        exp : Experiment
            The experiment to compare the features to
        features : list of str
            The features (from exp) to test for enrichment
        *args : tuple
        **kwargs : dict
            Additional database specific parameters

        Returns
        -------
        pandas.DataFrame
            Its columns include:
            feature : str. the feature
            pval : float. the p-value for the enrichment
            odif : float. the effect size for the enrichment
            term : str. the enriched term
            The generic implementation returns None.
        '''
        logger.debug('Generic function for enrichment')
        return None

    def show_term_details(self, term, exp, features, *args, **kwargs):
        '''
        Show details about the specific term in the database and in what features it appears.

        This is an optional function, and is called when a user double clicks
        an enriched term in the qt5 enrichment analysis (for an integration level L3 database).
        It shows details why this term was denoted as enriched. This is a database specific implementation.

        Parameters
        ----------
        term : str
            The term to get the details for
        exp : Experiment
            The calour experiment for showing the term details in
        features: list of str
            The features in the experiment for which to show the term details

        Returns
        -------
        None in the generic implementation
        '''
        logger.debug('Generic function for term details')
        return None