Skip to content

Commit

Permalink
Update terms index fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
wshayes committed Sep 6, 2018
1 parent 7044e2a commit 1efcd04
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 60 deletions.
9 changes: 0 additions & 9 deletions bel/db/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

cur_dir_name = os.path.dirname(os.path.realpath(__file__))
mappings_terms_fn = f'{cur_dir_name}/es_mappings_terms.yml'
settings_terms_fn = f'{cur_dir_name}/es_settings_terms.yml'
terms_alias = 'terms'


Expand Down Expand Up @@ -54,17 +53,9 @@ def create_terms_index(es, index_name: str):
with open(mappings_terms_fn, 'r') as f:
mappings_terms = yaml.load(f)

with open(settings_terms_fn, 'r') as f:
settings_terms = yaml.load(f)

try:
es.indices.create(index=index_name, body=mappings_terms)

# Update settings - have to close before and then open after
es.indices.close(index=index_name)
es.indices.put_settings(index=index_name, body=settings_terms)
es.indices.open(index=index_name)

except Exception as e:
log.error(f'Could not create elasticsearch terms index: {e}')

Expand Down
34 changes: 34 additions & 0 deletions bel/db/es_mappings_terms.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,37 @@
settings:
analysis:
filter:
autocomplete_filter:
type: edge_ngram
min_gram: 1
max_gram: 20
analyzer:
autocomplete:
type: custom
tokenizer: autocomplete_tokenizer
filter:
- lowercase
- autocomplete_filter
autocomplete_search:
type: custom
tokenizer: autocomplete_tokenizer
filter:
- lowercase

# Use a simple_pattern_split tokenizer so alt_ids such as SP:H4_HUMAN can be matched.
# This also allows autocomplete to work on terms like IL-6, which the standard
# tokenizer/analyzer split into the separate tokens "IL" and "6".
tokenizer:
autocomplete_tokenizer:
type: simple_pattern_split
pattern: " |:"
normalizer:
lowercase:
type: custom
char_filter: []
filter:
- lowercase

mappings:
term:
properties:
Expand Down
81 changes: 33 additions & 48 deletions bel/resources/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,17 @@
terms_alias = 'terms'


def load_terms(fo: IO, metadata: dict):
def load_terms(fo: IO, metadata: dict, forceupdate: bool):
"""Load terms into Elasticsearch and ArangoDB
Forceupdate will create a new index in Elasticsearch regardless of whether
an index with the resource version already exists.
Args:
fo: file obj - terminology file
metadata: dict containing the metadata for terminology
forceupdate: force full update - i.e. don't leave Elasticsearch indexes
alone if their version ID matches
"""

version = metadata['metadata']['version']
Expand All @@ -38,18 +43,17 @@ def load_terms(fo: IO, metadata: dict):
es = bel.db.elasticsearch.get_client()

es_version = version.replace('T', '').replace('-', '').replace(':', '')
index_prefix = metadata['metadata']['namespace'].lower()
index_name = f"terms_{index_prefix}_{es_version}"
index_prefix = f"terms_{metadata['metadata']['namespace'].lower()}"
index_name = f"{index_prefix}_{es_version}"

# Create index with mapping
for idx in range(0, 10):
if not elasticsearch.index_exists(es, f'{index_name}_{idx}'):
index_name = f'{index_name}_{idx}'
break

log.info(f'Creating Elasticsearch index {index_name}')

elasticsearch.create_terms_index(es, index_name)
if not elasticsearch.index_exists(es, index_name):
elasticsearch.create_terms_index(es, index_name)
elif forceupdate: # force an update to the index
index_name += '_alt'
elasticsearch.create_terms_index(es, index_name)
else:
return # Skip loading if not forced and not a new namespace

terms_iterator = terms_iterator_for_elasticsearch(fo, index_name)
elasticsearch.bulk_load_docs(es, terms_iterator)
Expand All @@ -69,10 +73,7 @@ def load_terms(fo: IO, metadata: dict):
with timy.Timer('Load Term Equivalences') as timer:
arango_client = arangodb.get_client()
belns_db = arangodb.get_belns_handle(arango_client)
try:
arangodb.batch_load_docs(belns_db, terms_iterator_for_arangodb(fo, version), on_duplicate='update')
except Exception as e:
log.error(e, exc_info=True)
arangodb.batch_load_docs(belns_db, terms_iterator_for_arangodb(fo, version))

# TODO - delete old equivalences based on namespace and version
# delete resources matching namespace and NOT current version
Expand Down Expand Up @@ -105,34 +106,16 @@ def terms_iterator_for_arangodb(fo, version):
if species_list and species_id and species_id not in species_list:
continue

source = term['namespace']
term_id = term['id']
term_key = arangodb.arango_id_to_key(term_id)

(ns, val) = term_id.split(':', maxsplit=1)

if 'equivalences' in term:
source = term['namespace']
term_id = term['id']
term_key = arangodb.arango_id_to_key(term_id)

yield (arangodb.equiv_nodes_name, {'_key': term_key, 'name': term_id, 'namespace': ns, 'source': source, 'primary': True, 'version': version})
(ns, val) = term_id.split(':', maxsplit=1)

for eqv in term['equivalences']:
(ns, val) = eqv.split(':', maxsplit=1)
eqv_key = arangodb.arango_id_to_key(eqv)

# Primary indicates the source
yield (arangodb.equiv_nodes_name, {'_key': eqv_key, 'name': eqv, 'namespace': ns, 'source': source, 'version': version})

arango_edge = {
'_from': f"{arangodb.equiv_nodes_name}/{term_key}",
'_to': f"{arangodb.equiv_nodes_name}/{eqv_key}",
'_key': bel.utils._create_hash(f'{term_key}>>{eqv_key}'),
'type': 'equivalent_to',
'source': source,
'version': version,
}
yield (arangodb.equiv_edges_name, arango_edge)
yield (arangodb.equiv_nodes_name, {'_key': term_key, 'name': term_id, 'namespace': ns, 'source': source, 'version': version})

for eqv in term['alt_ids']:
for eqv in term['equivalences']:
(ns, val) = eqv.split(':', maxsplit=1)
eqv_key = arangodb.arango_id_to_key(eqv)

Expand All @@ -148,9 +131,6 @@ def terms_iterator_for_arangodb(fo, version):
}
yield (arangodb.equiv_edges_name, arango_edge)

else: # Add primary ID to nodes in equivalents - otherwise it won't have a primary: true (EG has no equivalents)
yield (arangodb.equiv_nodes_name, {'_key': term_key, 'name': term_id, 'namespace': ns, 'source': source, 'primary': True, 'version': version})


def terms_iterator_for_elasticsearch(fo: IO, index_name: str):
"""Add index_name to term documents for bulk load"""
Expand All @@ -171,6 +151,13 @@ def terms_iterator_for_elasticsearch(fo: IO, index_name: str):
if species_list and species_id and species_id not in species_list:
continue

all_term_ids = set()
for term_id in [term['id']] + term.get('alt_ids', []):
all_term_ids.add(term_id)
all_term_ids.add(lowercase_term_id(term_id))

term['alt_ids'] = copy.copy(list(all_term_ids))

yield {
'_op_type': 'index',
'_index': index_name,
Expand All @@ -189,9 +176,7 @@ def lowercase_term_id(term_id: str) -> str:
Returns:
str: lowercased, e.g. MESH:atherosclerosis
"""
try:
(ns, val) = term_id.split(':', maxsplit=1)
term_id = f'{ns}:{val.lower()}'
return term_id
except Exception:
return term_id
(ns, val) = term_id.split(':', maxsplit=1)
term_id = f'{ns}:{val.lower()}'

return term_id
14 changes: 11 additions & 3 deletions bel/resources/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,16 @@
timy.timy_config.tracking_mode = timy.TrackingMode.LOGGING


def load_resource(resource_url):
"""Load BEL Resource file"""
def load_resource(resource_url: str, forceupdate: bool = False):
"""Load BEL Resource file
Forceupdate will create a new index in Elasticsearch regardless of whether
an index with the resource version already exists.
Args:
resource_url: URL from which to download the resource to load into the BEL API
forceupdate: force full update - i.e. don't leave Elasticsearch indexes alone if their version ID matches
"""

log.info(f'Loading resource {resource_url}')

Expand All @@ -43,7 +51,7 @@ def load_resource(resource_url):

# Load resource files
if metadata['metadata']['type'] == 'namespace':
bel.resources.namespace.load_terms(fo, metadata)
bel.resources.namespace.load_terms(fo, metadata, forceupdate)

elif metadata['metadata']['type'] == 'ortholog':
bel.resources.ortholog.load_orthologs(fo, metadata)
Expand Down

0 comments on commit 1efcd04

Please sign in to comment.