In [1]:
import json
import re
import ast
import zipfile

In [2]:
with zipfile.ZipFile("moa.log.zip", "r") as zip_ref:
    zip_ref.extractall(".")

# Get total counts for MOA data

In [3]:
with open("moa_analysis.log", "w+") as wf:
    with open("moa.log", "r") as rf:
        for line in rf:
            if "metakb" in line:
                wf.write(line)

            if "TOTAL" in line:
                count = int(line.split(": ")[-1])

                if "MOA Genes able to normalize" in line:
                    total_genes_normalized = count
                elif "MOA Variants able to normalize" in line:
                    total_variants_normalized = count              
                elif "MOA assertions able to transform" in line:
                    total_assertions_transformed = count      
                elif "MOA Assertions" in line:
                    total_assertions = count
                elif "MOA Variants" in line:
                    total_variants = count
                elif "MOA Genes" in line:
                    total_genes = count

print("TOTAL MOA Variants:", total_variants)
print("TOTAL MOA Variants able to normalize:", total_variants_normalized, f"({total_variants_normalized/total_variants*100} %)")
print("\nTOTAL MOA Genes:", total_genes)
print("TOTAL MOA Genes able to normalize:", total_genes_normalized, f"({total_genes_normalized / total_genes * 100} %)")
print("\nTOTAL MOA Assertions:", total_assertions)
print("TOTAL MOA Assertions able to transform:", total_assertions_transformed, f"({total_assertions_transformed / total_assertions * 100} %)")

TOTAL MOA Variants: 420
TOTAL MOA Variants able to normalize: 146 (34.76190476190476 %)

TOTAL MOA Genes: 145
TOTAL MOA Genes able to normalize: 145 (100.0 %)

TOTAL MOA Assertions: 864
TOTAL MOA Assertions able to transform: 189 (21.875 %)


# Variant Analysis

In [4]:
moa_vids_unable_to_normalize = set()

with open("moa_unable_to_normalize_variation.txt", "w+") as wf:
    with open("moa_analysis.log", "r") as rf:
        for line in rf:
            if "metakb.transform.moa" in line and "Variation Normalizer unable to normalize" in line:
                moa_vid = re.findall(r"moa.variant:\d+", line)[0].strip()
                query = line.split(":")[-1].strip()
                moa_vids_unable_to_normalize.add(moa_vid)
                wf.write(f"{moa_vid} : {query}\n")

print("# of MOA Variants that we can't normalize:", len(moa_vids_unable_to_normalize))
print("% of MOA Variants that we can't normalize:", len(moa_vids_unable_to_normalize) / total_variants * 100)

# of MOA Variants that we can't normalize: 11
% of MOA Variants that we can't normalize: 2.619047619047619


In [5]:
moa_vids_not_supported = set()
no_gene_provided_count = 0

with open("moa_does_not_support_variation.txt", "w+") as wf:
    with open("moa_analysis.log", "r") as rf:
        for line in rf:
            if "metakb.transform.moa" in line and "Variation Normalizer does not support" in line:
                if "(no gene provided)" in line:
                    no_gene_provided_count += 1
                moa_vid = re.findall(r"moa.variant:\d+", line)[0].strip()
                query = line.split(": ")[-1].strip()
                moa_vids_not_supported.add(moa_vid)
                wf.write(f"{moa_vid} : {query}\n")

print("# of MOA Variants that aren't supported:", len(moa_vids_not_supported))
print("  # of MOA Variants with no gene provided:", no_gene_provided_count)
print("% of MOA Variants that aren't supported:", len(moa_vids_not_supported) / total_variants * 100)

# of MOA Variants that aren't supported: 263
  # of MOA Variants with no gene provided: 12
% of MOA Variants that aren't supported: 62.61904761904762


In [6]:
# See how this affects AIDs
aids_missed_due_to_variant = set()
variant_id_to_aids_missed = dict()

with open("moa_analysis.log", "r") as rf:
    for line in rf:
        if "has no variation descriptor" in line:
            aid = re.findall(r"moa.assertion:\d+", line)[0].strip()
            aids_missed_due_to_variant.add(aid)
            variant_id = line.split("variant_id")[-1].strip()

            if variant_id in variant_id_to_aids_missed:
                variant_id_to_aids_missed[variant_id].add(aid)
            else:
                variant_id_to_aids_missed[variant_id] = {aid}

variant_id_to_aids_missed = dict(sorted(variant_id_to_aids_missed.items(), key=lambda x: len(x[1]), reverse=True))
with open("moa_variant_id_to_aids_missed.txt", "w+") as wf:
    for k, v in variant_id_to_aids_missed.items():
        wf.write(f"moa.variant:{k} (count of AIDs {len(v)}) : {v}\n")
            
print(f"# of AIDs missed due to Variation Normalizer:", len(aids_missed_due_to_variant)) 

# of AIDs missed due to Variation Normalizer: 619


# Therapy Analysis

In [7]:
# No therapy_name provided means we can't run thru therapy normalizer
aids_no_therapy_name = set()
with open("moa_analysis.log", "r") as rf:
    for line in rf:
        if "has no therapy_name" in line:
            aid = re.findall(r"moa.assertion:\d+", line)[0].strip()
            aids_no_therapy_name.add(aid)

count_aid_no_therapy = len(aids_no_therapy_name)
print(f"# of AIDs with no therapy name: {count_aid_no_therapy}")
print(f"% of AIDs with no therapy name: {count_aid_no_therapy / total_assertions * 100}")
sorted(aids_no_therapy_name)

# of AIDs with no therapy name: 42
% of AIDs with no therapy name: 4.861111111111112


['moa.assertion:146',
 'moa.assertion:162',
 'moa.assertion:191',
 'moa.assertion:250',
 'moa.assertion:334',
 'moa.assertion:335',
 'moa.assertion:336',
 'moa.assertion:341',
 'moa.assertion:390',
 'moa.assertion:488',
 'moa.assertion:489',
 'moa.assertion:490',
 'moa.assertion:491',
 'moa.assertion:495',
 'moa.assertion:535',
 'moa.assertion:536',
 'moa.assertion:538',
 'moa.assertion:539',
 'moa.assertion:540',
 'moa.assertion:541',
 'moa.assertion:543',
 'moa.assertion:544',
 'moa.assertion:545',
 'moa.assertion:546',
 'moa.assertion:547',
 'moa.assertion:548',
 'moa.assertion:549',
 'moa.assertion:550',
 'moa.assertion:551',
 'moa.assertion:552',
 'moa.assertion:553',
 'moa.assertion:554',
 'moa.assertion:555',
 'moa.assertion:556',
 'moa.assertion:558',
 'moa.assertion:559',
 'moa.assertion:560',
 'moa.assertion:561',
 'moa.assertion:654',
 'moa.assertion:655',
 'moa.assertion:661',
 'moa.assertion:680']

In [8]:
therapy_names_cant_normalize = set()
aid_to_drugs = dict()
aids_missed_due_to_therapy = set()
drug_id_to_aids_missed = dict()
therapy_name_to_queries = dict()

with open("moa_analysis.log", "r") as rf:
    for line in rf:
        if "Therapy Normalizer unable to normalize" in line:
            therapy_name = line.split(":")[-1].strip()
            therapy_names_cant_normalize.add(therapy_name)
            
        elif "has no therapeutic descriptor" in line:
            aid = re.findall(r"moa.assertion:\d+", line)[0].strip()
            aids_missed_due_to_therapy.add(aid)
            therapy_name = line.split("therapy_name")[-1]

            therapy_names = [tn.strip() for tn in therapy_name.split("+")]
            for therapy_name in therapy_names:
                if therapy_name not in therapy_names_cant_normalize:
                    continue
                
                if therapy_name in drug_id_to_aids_missed:
                    drug_id_to_aids_missed[therapy_name].add(aid)
                else:
                    drug_id_to_aids_missed[therapy_name] = {aid}

print(f"# of MOA therapy names we can't normalize: {len(therapy_names_cant_normalize)}")
print(f"# of AIDs missed due to Therapy Normalizer: {len(aids_missed_due_to_therapy)}\n")

print("---MOA Therapy Normalizer Unable to Normalize---")
for drug in therapy_names_cant_normalize:
    print(drug)

drug_id_to_aids_missed = dict(sorted(drug_id_to_aids_missed.items(), key=lambda x: len(x[1]), reverse=True))
print("\n---MOA Therapy to AIDs Missed---")
for k, v in drug_id_to_aids_missed.items():
    print(f"{k.strip()} (count of AIDs {len(v)}) : {v}")

# of MOA therapy names we can't normalize: 3
# of AIDs missed due to Therapy Normalizer: 4

---MOA Therapy Normalizer Unable to Normalize---
EXEL-8232
GANT61
Mito-CP

---MOA Therapy to AIDs Missed---
GANT61 (count of AIDs 2) : {'moa.assertion:183', 'moa.assertion:182'}
Mito-CP (count of AIDs 1) : {'moa.assertion:174'}
EXEL-8232 (count of AIDs 1) : {'moa.assertion:421'}


# Disease Analysis

In [9]:
disease_cant_normalize = set()
aid_to_disease = dict()
aids_missed_due_to_disease = set()
disease_to_aids_missed = dict()
disease_to_queries = dict()

with open("moa_analysis.log", "r") as rf:
    for line in rf:
        if "Disease Normalizer unable to normalize" in line:
            disease = line.split("unable to normalize:")[-1].strip()
            disease_cant_normalize.add(disease)
            
        elif "has no disease descriptor" in line:
            aid = re.findall(r"moa.assertion:\d+", line)[0].strip()
            aids_missed_due_to_disease.add(aid)
            disease = ast.literal_eval(line.split("disease")[-1])

            if not any(v for v in disease.values()):
                disease_str = "No disease provided"
            else:
                disease_str = ";".join(sorted({v for k,v in disease.items() if v}))

            if disease_str in disease_to_aids_missed:
                disease_to_aids_missed[disease_str].add(aid)
            else:
                disease_to_aids_missed[disease_str] = {aid}           

print(f"# of MOA disease we can't normalize: {len(disease_cant_normalize)}")
print(f"# of AIDs missed due to Disease Normalizer: {len(aids_missed_due_to_disease)}\n")

print("---MOA Disease Normalizer Unable to Normalize---")
for disease in disease_cant_normalize:
    print(disease)

disease_to_aids_missed = dict(sorted(disease_to_aids_missed.items(), key=lambda x: len(x[1]), reverse=True))
print("\n---MOA Disease to AIDs Missed---")
for k, v in disease_to_aids_missed.items():
    print(f"{k.strip()} (count of AIDs {len(v)}) : {v}")

# of MOA disease we can't normalize: 1
# of AIDs missed due to Disease Normalizer: 8

---MOA Disease Normalizer Unable to Normalize---
['oncotree:TALL', 'T-Cell Acute Lymphoid Leukemia', 'T-Cell Acute Lymphoid Leukemia']

---MOA Disease to AIDs Missed---
T-Cell Acute Lymphoid Leukemia;TALL (count of AIDs 6) : {'moa.assertion:344', 'moa.assertion:343', 'moa.assertion:345', 'moa.assertion:342', 'moa.assertion:347', 'moa.assertion:346'}
No disease provided (count of AIDs 1) : {'moa.assertion:112'}
Any solid tumor (count of AIDs 1) : {'moa.assertion:821'}


# Proposition Analysis

In [10]:
aid_no_prop = dict()
with open("moa_analysis.log", "r") as rf:
    for line in rf:
        if "No predicate found for" in line:
            aid = re.findall(r"moa.assertion:\d+", line)[0].strip()
            clin_sig = line.split("clinical significance:")[-1].strip()
            aid_no_prop[aid] = ast.literal_eval(clin_sig)

count_aid_no_prop = len(aid_no_prop.keys())
print(f"# of AIDs missed due to no proposition: {count_aid_no_prop}")
print(f"% of AIDs missed due to no proposition: {count_aid_no_prop / total_assertions * 100}")
aid_no_prop

# of AIDs missed due to no proposition: 2
% of AIDs missed due to no proposition: 0.23148148148148145


{'moa.assertion:369': None, 'moa.assertion:459': None}

# Summary

In [11]:
perc_aids_missed_no_therapy = count_aid_no_therapy / total_assertions * 100
perc_aids_missed_therapy = len(aids_missed_due_to_therapy) / total_assertions * 100
perc_aids_missed_variation = len(aids_missed_due_to_variant) / total_assertions * 100
perc_aids_missed_disease = len(aids_missed_due_to_disease) / total_assertions * 100
perc_aids_missed_no_prop = count_aid_no_prop / total_assertions * 100

print(f"% of AIDs missed due to no therapy name: {perc_aids_missed_no_therapy}")
print(f"% of AIDs missed due to Therapy Normalizer: {perc_aids_missed_therapy}")
print(f"% of AIDs missed due to Disease Normalizer: {perc_aids_missed_disease}")
print(f"% of AIDs missed due to Variation Normalizer: {perc_aids_missed_variation}")   
print(f"% of AIDs missed due to no proposition: {perc_aids_missed_no_prop}")

% of AIDs missed due to no therapy name: 4.861111111111112
% of AIDs missed due to Therapy Normalizer: 0.4629629629629629
% of AIDs missed due to Disease Normalizer: 0.9259259259259258
% of AIDs missed due to Variation Normalizer: 71.64351851851852
% of AIDs missed due to no proposition: 0.23148148148148145
