Skip to content

Commit

Permalink
Merge pull request #52 from cancervariants/delta-model
Browse files Browse the repository at this point in the history
Delta model for CIViC and MOAlmanac
  • Loading branch information
korikuzma committed Jan 29, 2021
2 parents 1332370 + a88f6de commit fb80940
Show file tree
Hide file tree
Showing 7 changed files with 408 additions and 55 deletions.
98 changes: 51 additions & 47 deletions metakb/deltas/civic.py → metakb/delta.py
Original file line number Diff line number Diff line change
@@ -1,119 +1,122 @@
"""A module for computing CIViC deltas."""
"""A module for computing deltas."""
from metakb import PROJECT_ROOT
import json
import logging
from jsondiff import diff
from datetime import date
import pkg_resources
from metakb.harvesters import CIViC

from metakb.harvesters import CIViC, MOAlmanac
HARVESTER_CLASS = {
'civic': CIViC,
'moa': MOAlmanac
}
logger = logging.getLogger('metakb')
logger.setLevel(logging.DEBUG)


class CIVICDelta:
"""A class for computing CIViC deltas."""
class Delta:
"""A class for computing deltas."""

def __init__(self, main_json, *args, **kwargs):
"""Initialize the CIVICDelta class.
def __init__(self, main_json, src, *args, **kwargs):
"""Initialize the Delta class.
:param str main_json: The path to the main CIViC composite json file.
:param str main_json: The path to the main composite json file.
:param str src: The source to compute the delta on ('civic' or 'moa').
"""
self._src = src.lower()
assert self._src in HARVESTER_CLASS.keys()
self._main_json = main_json
if '_updated_json' in kwargs:
# The path to the updated CIViC harvester composite json file.
# The path to the updated harvester composite json file.
self._updated_json = kwargs['_updated_json']
else:
self._updated_json = None

def compute_delta(self):
"""Compute delta for CIViC and store computed delta in a JSON file.
"""Compute delta and store computed delta in a JSON file.
:return: A dictionary of ids to delete, update, or insert to the main
harvester.
"""
# Main harvester
with open(self._main_json, 'r') as f:
main_civic = json.load(f)
main_json = json.load(f)

current_date = date.today().strftime('%Y%m%d')

# updated harvester
if self._updated_json:
# Updated harvester file already exists
with open(self._updated_json, 'r') as f:
updated_civic = json.load(f)
updated_json = json.load(f)
else:
# Want to create updated harvester file
fn = f"civic_harvester_{current_date}.json"
c = CIViC()
c.harvest(fn=fn)
fn = f"{self._src}_harvester_{current_date}.json"
HARVESTER_CLASS[self._src]().harvest(fn=fn)

with open(f"{PROJECT_ROOT}/data/civic/{fn}", 'r') as f:
updated_civic = json.load(f)
with open(f"{PROJECT_ROOT}/data/{self._src}/{fn}", 'r') as f:
updated_json = json.load(f)

delta = {
'_meta': {
# TODO: Is this needed?
'civicpy_version':
pkg_resources.get_distribution("civicpy").version,
# TODO: Check version.
'metakb_version': '1.0.1',
# TODO: Might change. Assuming we harvest when computing delta
'date_harvested': current_date
}
}
civic_records = ['evidence', 'genes', 'variants', 'assertions']

for civic_record in civic_records:
delta[civic_record] = {
if self._src == 'civic':
delta['_meta']['civicpy_version'] = '1.1.2'
elif self._src == 'moa':
delta['_meta']['moa_api_version'] = '0.2'

for record_type in main_json.keys():
delta[record_type] = {
'DELETE': [],
'INSERT': [],
'UPDATE': []
}
updated = updated_civic[civic_record]
main = main_civic[civic_record]
updated = updated_json[record_type]
main = main_json[record_type]
updated_ids = self._get_ids(updated)
main_ids = self._get_ids(main)

additional_ids = list(set(updated_ids) - set(main_ids))
self._ins_del_delta(delta, civic_record, 'INSERT', additional_ids,
self._ins_del_delta(delta, record_type, 'INSERT', additional_ids,
updated)
remove_ids = list(set(main_ids) - set(updated_ids))
self._ins_del_delta(delta, civic_record, 'DELETE', remove_ids,
self._ins_del_delta(delta, record_type, 'DELETE', remove_ids,
main)

self._update_delta(delta, civic_record, updated, main)
self._update_delta(delta, record_type, updated, main)

self._create_json(delta, current_date)
return delta

def _ins_del_delta(self, delta, civic_record, key, ids_list, data):
def _ins_del_delta(self, delta, record_type, key, ids_list, data):
"""Store records that will be deleted or inserted.
:param dict delta: The CIViC deltas
:param str civic_record: The type of CIViC record
:param dict delta: The deltas
:param str record_type: The type of record
:param str key: 'INSERT' or 'DELETE'
:param list ids_list: A list of ids
:param dict data: CIViC harvester data
:param dict data: Harvester data
"""
for record in data:
if record['id'] in ids_list:
delta[civic_record][key].append(record)
delta[record_type][key].append(record)

def _update_delta(self, delta, civic_record, updated, main):
"""Store CIViC deltas.
def _update_delta(self, delta, record_type, updated, main):
"""Store deltas.
:param dict delta: The CIViC deltas
:param str civic_record: The type of CIViC record
:param dict delta: The deltas
:param str record_type: The type of record
:param dict updated: updated harvester data
:param dict main: Main harvester data
"""
for updated_record in updated:
for main_record in main:
if main_record['id'] == updated_record['id']:
if updated_record != main_record:
delta[civic_record]['UPDATE'].append({
delta[record_type]['UPDATE'].append({
str(main_record['id']):
diff(main_record, updated_record, marshal=True)
})
Expand All @@ -122,7 +125,7 @@ def _update_delta(self, delta, civic_record, updated, main):
def _get_ids(self, records):
"""Return list of ids from data.
:param dict records: A dictionary of CIViC records
:param dict records: A dictionary of records
:return: A list of ids
"""
ids = list()
Expand All @@ -133,13 +136,14 @@ def _get_ids(self, records):
return ids

def _create_json(self, delta, current_date):
"""Create a JSON of CIViC deltas.
"""Create a JSON of deltas.
:param dict delta: A dictionary containing CIViC deltas.
:param dict delta: A dictionary containing deltas.
:param str current_date: The current date
"""
civic_dir = PROJECT_ROOT / 'data' / 'civic'
civic_dir.mkdir(exist_ok=True, parents=True)
src_dir = PROJECT_ROOT / 'data' / self._src
src_dir.mkdir(exist_ok=True, parents=True)

with open(f"{civic_dir}/civic_deltas_{current_date}.json", 'w+') as f:
with open(f"{src_dir}/{self._src}_deltas_{current_date}.json",
'w+') as f:
json.dump(delta, f)
2 changes: 0 additions & 2 deletions metakb/deltas/__init__.py

This file was deleted.

2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ coverage<5
pytest-cov==2.9.0
coveralls==2.0.0
sphinxjp.themes.basicstrap==0.5.0
setuptools~=50.3.2
setuptools~=50.3.2
78 changes: 78 additions & 0 deletions tests/data/deltas/main_moa.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"assertions": [
{
"id": 3,
"context": null,
"description": "The U.S. Food and Drug Administration (FDA) granted approval for dasatinib for adults with Philidelphia chromosome-positive acute lymphphoblastic leukemia (Ph+ ALL) with resistance or intolerance to prior therapy.",
"disease": {
"name": "Acute Lymphoid Leukemia",
"oncotree_code": null,
"oncotree_term": null
},
"therapy_name": "Dasatinib",
"therapy_type": "Targeted therapy",
"clinical_significance": "sensitivity",
"predictive_implication": "FDA-Approved",
"feature_ids": [3],
"favorable_prognosis": null,
"created_on": "01/07/21",
"last_updated": "2020-11-12",
"submitted_by": "breardon@broadinstitute.org",
"validated": true,
"source_ids": [2],
"variant": {
"id": 3,
"feature_type": "rearrangement",
"rearrangement_type": "Fusion",
"gene1": "BCR",
"gene2": "ABL1",
"locus": null,
"feature": "BCR--ABL1 Fusion"
},
"test update delete": "delete"
}
],
"sources": [
{
"id": 2,
"type": "FDA",
"assertion_id": [2],
"doi": null,
"nct": null,
"pmid": "None",
"url": "https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/021986s021lbl.pdf",
"citation": "Bristol-Myers Squibb Company. Sprycel (dasatinib) [package insert]. U.S. Food and Drug Administration website. https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/021986s021lbl.pdf. Revised December 2018. Accessed November 12, 2020."
}
],
"variants": [
{
"id": 3,
"feature_type": "rearrangement",
"rearrangement_type": "Fusion",
"gene1": "BCR",
"gene2": "ABL1",
"locus": null,
"feature": "BCR--ABL1 Fusion"
},
{
"id": 296,
"feature_type": "somatic_variant",
"gene": "EZH2",
"chromosome": "7",
"start_position": "148508728.0",
"end_position": "148508728.0",
"reference_allele": "A",
"alternate_allele": "T",
"cdna_change": "c.1936T>A",
"protein_change": "p.Y646N",
"variant_annotation": "Missense",
"exon": "16.0",
"rsid": null,
"feature": "EZH2 p.Y646N (Missense)"
},
{
"id": 5,
"feature_type": "test_removal"
}
]
}
83 changes: 83 additions & 0 deletions tests/data/deltas/updated_moa.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
{
"assertions": [
{
"id": 3,
"context": null,
"description": "The U.S. Food and Drug Administration (FDA) granted approval for dasatinib for adults with Philidelphia chromosome-positive acute lymphphoblastic leukemia (Ph+ ALL) with resistance or intolerance to prior therapy.",
"disease": {
"name": "Acute Lymphoid Leukemia",
"oncotree_code": "ALL",
"oncotree_term": "Acute Lymphoid Leukemia"
},
"therapy_name": "Dasatinib",
"therapy_type": "Targeted therapy",
"clinical_significance": "sensitivity",
"predictive_implication": "FDA-Approved",
"feature_ids": [3],
"favorable_prognosis": null,
"created_on": "01/16/21",
"last_updated": "2020-11-12",
"submitted_by": "breardon@broadinstitute.org",
"validated": true,
"source_ids": [2],
"variant": {
"id": 3,
"feature_type": "rearrangement",
"rearrangement_type": "Fusion",
"gene1": "BCR",
"gene2": "ABL1",
"locus": null,
"feature": "BCR--ABL1 Fusion"
}
}
],
"sources": [
{
"id": 2,
"type": "FDA",
"assertion_id": [2, 3],
"doi": null,
"nct": null,
"pmid": "None",
"url": "https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/021986s021lbl.pdf",
"citation": "Bristol-Myers Squibb Company. Sprycel (dasatinib) [package insert]. U.S. Food and Drug Administration website. https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/021986s021lbl.pdf. Revised December 2018. Accessed November 12, 2020."
},
{
"id": 22,
"type": "Journal",
"assertion_id": [30, 288],
"doi": "10.1371/journal.pgen.1004135",
"nct": null,
"pmid": "24550739",
"url": "https://doi.org/10.1371/journal.pgen.1004135",
"citation": "Borad MJ, Champion MD, Egan JB, et al. Integrated genomic characterization reveals novel, therapeutically relevant drug targets in FGFR and EGFR pathways in sporadic intrahepatic cholangiocarcinoma. PLoS Genet. 2014;10(2):e1004135."
}
],
"variants": [
{
"id": 3,
"feature_type": "rearrangement",
"rearrangement_type": "Fusion",
"gene1": "BCR",
"gene2": "ABL1",
"locus": null,
"feature": "BCR--ABL1 Fusion"
},
{
"id": 296,
"feature_type": "somatic_variant",
"gene": "EZH2",
"chromosome": "7",
"start_position": "148508728.0",
"end_position": "148508728.0",
"reference_allele": "A",
"alternate_allele": "T",
"cdna_change": "c.1936T>A",
"protein_change": "p.Y646N",
"variant_annotation": "Missense",
"exon": "16.0",
"rsid": null,
"feature": "EZH2 p.Y646N (Missense)"
}
]
}

0 comments on commit fb80940

Please sign in to comment.