In [1]:
import json
import re
import ast
import zipfile

In [2]:
# Accumulates, across the Variation Normalizer cells below, the ID portion
# (text after ".vid:") of every CIViC variant that could not be captured.
all_variant_ids_we_cant_get: set = set()

In [3]:
# Unpack the zipped log archive into the current working directory
# (presumably it contains the metakb.log read by the next cell -- confirm).
with zipfile.ZipFile("metakb.log.zip", mode="r") as log_archive:
    log_archive.extractall(path=".")

# Get total counts for CIViC data

In [4]:
# Reset every total first: a TOTAL line missing from the log must not leave a
# stale value from an earlier run of this cell, nor an undefined name.
total_eids = None
total_approved_eids = None
total_approved_predictive_eids = None
total_vids = None
total_vids_approved_predictive = None

# Copy only the lines mentioning "metakb" into a smaller log that the later
# cells analyze, and pick the totals out of the "TOTAL" lines on the way.
with open("metakb_log_analysis.log", "w+") as wf:
    with open("metakb.log", "r") as rf:
        for line in rf:
            if "metakb" not in line:
                continue
            wf.write(line)

            if "TOTAL" not in line:
                continue

            # The count is whatever follows the last ": " on the line
            count = int(line.split(": ")[-1])
            # Order matters: the longer phrases are tested before the shorter
            # phrases they contain ("CIViC EIDs", "CIViC Variants")
            if "CIViC Variants in Approved Predictive Evidence" in line:
                total_vids_approved_predictive = count
            elif "approved CIViC EIDs" in line:
                total_approved_eids = count
            elif "approved predictive CIViC EIDs" in line:
                total_approved_predictive_eids = count
            elif "CIViC EIDs" in line:
                total_eids = count
            elif "CIViC Variants" in line:
                total_vids = count

# Fail here, with the names of the missing totals, rather than with a
# NameError/TypeError in whichever later cell first uses one of them.
missing_totals = [
    label
    for label, value in (
        ("CIViC EIDs", total_eids),
        ("approved CIViC EIDs", total_approved_eids),
        ("approved predictive CIViC EIDs", total_approved_predictive_eids),
        ("CIViC Variants", total_vids),
        ("CIViC Variants in Approved Predictive Evidence", total_vids_approved_predictive),
    )
    if value is None
]
if missing_totals:
    raise ValueError("metakb.log has no TOTAL line for: " + ", ".join(missing_totals))

print("Total CIViC EIDs:", total_eids)
print("Total approved CIViC EIDs:", total_approved_eids)
print("Total approved predictive CIViC EIDs:", total_approved_predictive_eids)
print("Total CIViC Variants:", total_vids)
print("Total CIViC Variants in approved predictive CIViC EIDs", total_vids_approved_predictive)

Total CIViC EIDs: 9456
Total approved CIViC EIDs: 4022
Total approved predictive CIViC EIDs: 2478
Total CIViC Variants: 3362
Total CIViC Variants in approved predictive CIViC EIDs 944


In [5]:
# Approved predictive evidence items as a percentage of all CIViC EIDs
pct_eids_capturable = total_approved_predictive_eids / total_eids * 100
print("% of EIDs we could capture:", pct_eids_capturable)

# Variants in approved predictive evidence as a percentage of all CIViC variants
pct_vids_capturable = total_vids_approved_predictive / total_vids * 100
print("% of VIDs we could capture:", pct_vids_capturable)

% of EIDs we could capture: 26.20558375634518
% of VIDs we could capture: 28.078524687685903


# Variation Normalizer Analysis

In [6]:
# CIViC variant identifiers ("civic.vid:<n>") that the log reports as
# "Variation Normalizer unable to normalize"
civic_vids_unable_to_normalize = set()

with open("civic_unable_to_normalize_variation.txt", "w+") as report, \
        open("metakb_log_analysis.log", "r") as analysis_log:
    report.write("--------CIViC variants that variation normalizer could not normalize---------")
    for log_line in analysis_log:
        # Only CIViC transform lines carrying the "unable to normalize" message
        if "metakb.transform.civic" not in log_line:
            continue
        if "Variation Normalizer unable to normalize" not in log_line:
            continue

        vid_curie = re.findall(r"civic.vid:\d+", log_line)[0].strip()
        # The query is the text following the last "using query " on the line
        failed_query = log_line.split("using query ")[-1].strip()

        civic_vids_unable_to_normalize.add(vid_curie)
        # The shared set keeps only the part after ".vid:"
        all_variant_ids_we_cant_get.add(vid_curie.split(".vid:")[-1])
        report.write(f"\n{vid_curie} : {failed_query}")

n_unable_to_normalize = len(civic_vids_unable_to_normalize)
print("# of CIViC VIDs we can't normalize:", n_unable_to_normalize)
print("% of CIViC VIDs we can't normalize:", n_unable_to_normalize / total_vids_approved_predictive * 100)

# of CIViC VIDs we can't normalize: 227
% of CIViC VIDs we can't normalize: 24.046610169491526


In [7]:
# CIViC variant identifiers ("civic.vid:<n>") that the log reports as
# "Variation Normalizer does not support"
civic_vids_not_supported = set()

with open("civic_does_not_support_variation.txt", "w+") as report, \
        open("metakb_log_analysis.log", "r") as analysis_log:
    report.write("--------CIViC variants that are not yet supported---------")
    for log_line in analysis_log:
        # Only CIViC transform lines carrying the "does not support" message
        if "metakb.transform.civic" not in log_line:
            continue
        if "Variation Normalizer does not support" not in log_line:
            continue

        vid_curie = re.findall(r"civic.vid:\d+", log_line)[0].strip()
        # The variant text is whatever follows the last ":" on the line
        unsupported_query = log_line.split(":")[-1].strip()

        civic_vids_not_supported.add(vid_curie)
        # The shared set keeps only the part after ".vid:"
        all_variant_ids_we_cant_get.add(vid_curie.split(".vid:")[-1])
        report.write(f"\n{vid_curie} : {unsupported_query}")

n_not_supported = len(civic_vids_not_supported)
print("# of CIViC VIDs that aren't supported:", n_not_supported)
print("% of CIViC VIDs that aren't supported:", n_not_supported / total_vids_approved_predictive * 100)

# of CIViC VIDs that aren't supported: 339
% of CIViC VIDs that aren't supported: 35.91101694915254


In [8]:
# Count, for each keyword below, how many lines of the "not supported" report
# contain it as a whitespace-separated, case-insensitive token. A line counts
# at most once per keyword.
unable_to_normalize = {
    "mutation", "amplification", "exon", "overexpression",
    "frameshift", "promoter", "deletion", "type", "insertion",
    "expression", "duplication", "copy", "underexpression",
    "number", "variation", "repeat", "rearrangement", "activation",
    "mislocalization", "translocation", "wild",
    "polymorphism", "frame", "shift", "loss", "function", "levels",
    "inactivation", "snp", "fusion", "dup", "truncation",
    "homozygosity", "gain", "phosphorylation",
}
unable_to_normalize_counts = dict()
with open("civic_does_not_support_variation.txt", "r") as rf:
    rf.readline()  # skip the first line of the report (its header)
    for vname in rf:
        for k in set(vname.lower().split()) & unable_to_normalize:
            unable_to_normalize_counts[k] = unable_to_normalize_counts.get(k, 0) + 1

# Most frequent keyword first. Ties are broken alphabetically so the displayed
# order is the same on every run; set iteration order for strings changes
# between interpreter sessions and must not leak into the result.
unable_to_normalize_counts = dict(
    sorted(unable_to_normalize_counts.items(), key=lambda x: (-x[1], x[0]))
)
unable_to_normalize_counts

{'mutation': 71,
 'expression': 67,
 'overexpression': 55,
 'amplification': 34,
 'exon': 23,
 'underexpression': 17,
 'fusion': 16,
 'loss': 15,
 'deletion': 12,
 'phosphorylation': 5,
 'homozygosity': 3,
 'frameshift': 3,
 'promoter': 3,
 'rearrangement': 3,
 'levels': 3,
 'insertion': 2,
 'repeat': 1,
 'inactivation': 1,
 'mislocalization': 1,
 'polymorphism': 1,
 'truncation': 1,
 'duplication': 1,
 'frame': 1,
 'shift': 1,
 'translocation': 1,
 'type': 1,
 'wild': 1}

In [9]:
# Every variant ID collected by the two Variation Normalizer cells above, as a
# percentage of the variants in approved predictive evidence
pct_vids_not_captured = len(all_variant_ids_we_cant_get) / total_vids_approved_predictive * 100
print("Total % of VIDs we can't capture:", pct_vids_not_captured)

Total % of VIDs we can't capture: 59.95762711864406


In [10]:
# Evidence items (EIDs) the log reports as having no variation descriptor,
# collected overall and grouped by the variant named on the same line
eids_missed_due_to_variant = set()
variant_id_to_eids_missed = dict()

with open("metakb_log_analysis.log", "r") as analysis_log:
    for log_line in analysis_log:
        if "has no variation descriptor" not in log_line:
            continue

        eid = re.findall(r"EID\d+", log_line)[0].strip()
        eids_missed_due_to_variant.add(eid)

        # The variant ID is the text following the last "variant_id" on the line
        variant_id = log_line.split("variant_id")[-1].strip()
        variant_id_to_eids_missed.setdefault(variant_id, set()).add(eid)

# Variants responsible for the most missed EIDs come first
variant_id_to_eids_missed = dict(
    sorted(
        variant_id_to_eids_missed.items(),
        key=lambda entry: len(entry[1]),
        reverse=True,
    )
)
with open("civic_variant_id_to_eids_missed.txt", "w+") as report:
    for vid, missed_eids in variant_id_to_eids_missed.items():
        report.write(f"civic.variant:{vid} (count of EIDs {len(missed_eids)}) : {missed_eids}\n")

print("# of EIDs missed due to Variation Normalizer:", len(eids_missed_due_to_variant))

# of EIDs missed due to Variation Normalizer: 1549


# Therapy Normalizer Analysis

In [11]:
# Collections filled from the analysis log:
#   drug_ids_cant_normalize    - "civic.drug:<n>" IDs the Therapy Normalizer failed on
#   drug_id_to_queries         - the queries logged for each of those IDs
#   eids_missed_due_to_therapy - EIDs logged as having no therapeutic descriptor
#   drug_id_to_eids_missed     - those EIDs grouped by each drug listed on the line
drug_ids_cant_normalize = set()
eid_to_drugs = dict()
drug_id_to_queries = dict()
eids_missed_due_to_therapy = set()
drug_id_to_eids_missed = dict()

with open("metakb_log_analysis.log", "r") as analysis_log:
    for log_line in analysis_log:
        if "Therapy Normalizer unable to normalize" in log_line:
            drug_id = re.findall(r"civic.drug:\d+", log_line)[0].strip()
            drug_ids_cant_normalize.add(drug_id)

            # The tail of the line after "queries " is parsed as a Python literal
            raw_queries = log_line.split("queries ")[-1]
            drug_id_to_queries[drug_id] = ast.literal_eval(raw_queries)

        elif "has no therapeutic descriptor" in log_line:
            eid = re.findall(r"EID\d+", log_line)[0].strip()
            eids_missed_due_to_therapy.add(eid)

            # The tail of the line after "drugs: " is parsed as a Python literal
            raw_drugs = log_line.split("drugs: ")[-1]
            for drug_id in set(ast.literal_eval(raw_drugs)):
                drug_id_to_eids_missed.setdefault(drug_id, set()).add(eid)


with open("civic_unable_to_normalize_drugs.txt", "w+") as report:
    report.write(f"--------Unable to normalize CIViC drug COUNTS (Total drugs: {len(drug_ids_cant_normalize)})---------")
    for failed_drug in drug_ids_cant_normalize:
        report.write(f"\n{failed_drug}: {drug_id_to_queries.get(failed_drug)}")

# Drugs responsible for the most missed EIDs come first
drug_id_to_eids_missed = dict(
    sorted(
        drug_id_to_eids_missed.items(),
        key=lambda entry: len(entry[1]),
        reverse=True,
    )
)
with open("civic_drug_id_to_eids_missed.txt", "w+") as report:
    for missed_drug, missed_eids in drug_id_to_eids_missed.items():
        report.write(f"{missed_drug} (count of EIDs {len(missed_eids)}) : {missed_eids}\n")

print(f"# of CIViC drugs we can't normalize: {len(drug_ids_cant_normalize)}")
print(f"# of EIDs missed due to Therapy Normalizer: {len(eids_missed_due_to_therapy)}")

# of CIViC drugs we can't normalize: 54
# of EIDs missed due to Therapy Normalizer: 105


# Summary

In [12]:
# Both figures are percentages of the approved predictive CIViC EIDs, so both
# are labelled "%" (the second was previously labelled "# of").
perc_eids_missed_therapy = len(eids_missed_due_to_therapy) / total_approved_predictive_eids * 100
perc_eids_missed_variation = len(eids_missed_due_to_variant) / total_approved_predictive_eids * 100
print(f"% of EIDs missed due to Therapy Normalizer: {perc_eids_missed_therapy}")
print(f"% of EIDs missed due to Variation Normalizer: {perc_eids_missed_variation}")



% of EIDs missed due to Therapy Normalizer: 4.23728813559322
# of EIDs missed due to Variation Normalizer: 62.51008878127522
