In [302]:
from neo4j import GraphDatabase
import pandas as pd
from tqdm import tqdm
import plotly
import plotly.express as px
import plotly.io as pio

# Evidence Statement Overlap in MetaKB v2
Previous work identified that while six prominent somatic cancer variant knowledgebases were highly disparate in content structure, the actual content within had large degrees of overlap in individual variants, diseases, and drugs as a result of harmonization. Since this original work, we have demonstrated improvements in harmonization of terminology and seek to expand on this analysis by demonstrating overlap at the conceptual level. This includes harmonization of variants, diseases, and therapeutics individually but also together as an evidence level statement.

### Table of Contents:
* [Evidence Statements](#evidence-statements)
    * [Connect to Neo4j Database](#connect-to-the-neo4j-database)
    * [Structure Data Response](#structure-response-data-for-analysis)
    * [Grab Additional Data](#grab-additional-data-connected-to-study-nodes)
    * [Create Evidence Statements](#create-evidence-statements)
    * [Make the Data Graph](#make-the-data-graphs)
* [Individual Components](#individual-components)
    * [Variant Overlap](#variants)
    * [Therapeutic Overlap](#therapeutics)
    * [Disease Overlap](#diseases)
    * [Uniqueness (Singletons)](####uniqueness-singletons)
    * [Therapeutic/Disease Overlap](#therapeutics-on-disease)
    * [Therapeutic/Variant Overlap](#variants-with-therapeutics)
    * [Variant/Disease Overlap](#variants-in-disease)
    * [Uniqueness (Paired)](#uniqueness-pairs)
* [Evidence Strength](#evidence-strength)
    * Incomplete
* [Concordance](#concordance)
    * Incomplete

## Evidence Statements
Exhibit overlap of concepts at the statement level (variant + predicate + drug + disease qualifier)

#### Connect to the Neo4j Database

In [62]:
def create_db_connection(uri, user, password):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    return driver

def execute_query(driver, query):
    with driver.session() as session:
        result = session.run(query)
        return [record for record in result]

# Connection Objects
uri = "bolt://localhost:7687"
user = "neo4j"
password = "password"
driver = create_db_connection(uri, user, password)

query_studies = """MATCH (s:StudyStatement)
    RETURN properties(s) AS Study
"""
result = execute_query(driver, query_studies)

driver.close()

#### Structure Response Data for Analysis
We are performing our analysis from the perspective of the study nodes. We will first create a dataframe of all the existing study nodes in metakb and then sequentially attach pieces of information, such as defining contexts, variants, therapeutics, etc. We can currently do it this way due to the fact that every study currently has a one-to-one relationship for these nodes and not a one-to-many. If this changes, we will have to adjust this going forward.

In [63]:
data = []
for record in result:
    study = record.get('Study', {})
    row = {
        'id': study.get('id', None),
        'description': study.get('description', None),
        'predicate': study.get('predicate', None)
        # 'id': variation.get('id', None),
        # 'expression_hgvs_c': variation.get('expression_hgvs_c', None),
        # 'expression_hgvs_p': variation.get('expression_hgvs_p', None),
        # 'digest': variation.get('digest', None),
        # 'label': variation.get('label', None),
        # 'type': variation.get('type', None),
        # 'expression_hgvs_g': variation.get('expression_hgvs_g', None),
    }
    data.append(row)

df = pd.DataFrame(data)

df[0:5]

Unnamed: 0,id,description,predicate
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo


In [81]:
df

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,Lung Non-small Cell Carcinoma,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr..."
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,Skin Melanoma,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr..."
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,Lung Non-small Cell Carcinoma,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,Lung Non-small Cell Carcinoma,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr..."
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,Lung Non-small Cell Carcinoma,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr..."
...,...,...,...,...,...,...,...
1138,moa.assertion:990,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.YqiDFzE2K6fcXhXkCGYHIQ75RgutPq_7,ncit:C152914,Oligodendroglioma,"(ga4gh:VA.YqiDFzE2K6fcXhXkCGYHIQ75RgutPq_7, pr..."
1139,moa.assertion:991,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,"(CombinationTherapy, moa.ctid:1TlyFhyafDojGZE0...",Non-Small Cell Lung Cancer,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr..."
1140,moa.assertion:993,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,"(CombinationTherapy, moa.ctid:9k0z3QBtBa8PgGFl...",Non-Small Cell Lung Cancer,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr..."
1141,moa.assertion:996,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,Non-Small Cell Lung Cancer,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr..."


#### Grab Additional Data Connected to Study Nodes

In [65]:
def grab_subject_variant(study_id):
    query_subject_variant = f"""MATCH (s:StudyStatement) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_VARIANT]-(c)
        MATCH (c)-[:HAS_DEFINING_CONTEXT]-(v)
        RETURN properties(s) AS Study,
                properties(c) AS Variation,
                properties(v) AS Context
    """
    result = execute_query(driver, query_subject_variant)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    subject = result[0]['Context']['id']
    return(subject)

def grab_object_therapeutic(study_id): 
    query = f"""MATCH (s:StudyStatement) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_THERAPEUTIC]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Therapeutic
    """
    result = execute_query(driver, query)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')

    # Check for empty result
    if not(result): 
        return(None)

    # Grab concept identifier (if not single therapeutic, handle other cases)
    if 'normalizer_id' in result[0]['Therapeutic']:
        therapeutic = result[0]['Therapeutic']['normalizer_id']
        return(therapeutic)
    
    ther_type = result[0]['Therapeutic']['type']

    # TODO: Consider additional ways to handle these group therapy types?
    if ther_type == 'CombinationTherapy':
        # DO SOMETHING
        return((ther_type, result[0]['Therapeutic']['id']))
        
    if ther_type == 'TherapeuticSubstituteGroup':
        # DO SOMETHING ELSE
        return((ther_type, result[0]['Therapeutic']['id']))

def grab_qualifier_disease(study_id): 
    query = f"""MATCH (s:StudyStatement) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_TUMOR_TYPE]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Disease
    """
    result = execute_query(driver, query)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    
    try:
        disease = result[0]['Disease']['disease_normalizer_id']
    except: # TODO: This is gross, handle it better
        try:
            disease = result[0]['Disease']['label']
        except:
            disease = result[0]['Disease']['id']
    return(disease)


In [66]:
df['subject'] = None
df['subject'] = df['id'].apply(grab_subject_variant)

df['obj'] = None
df['obj'] = df['id'].apply(grab_object_therapeutic)

df['qualifier'] = None
df['qualifier'] = df['id'].apply(grab_qualifier_disease)


Using a driver after it has been closed is deprecated. Future versions of the driver will raise an error.


Using a driver after it has been closed is deprecated. Future versions of the driver will raise an error.


Using a driver after it has been closed is deprecated. Future versions of the driver will raise an error.



#### Create Evidence Statements 
We will now start the harmonization analysis of evidence statements in metakb. To do this, we will use the individual data points we grabbed to create therapeutic evidence statements. After we create them, we will look for duplicates to determine overlap across studies and sources. After statements are created, use the .value_counts() method to get quick tallys of overlap. 

In [67]:
df['statement_full'] = None
for idx, row in df.iterrows():
    sentence = (row['subject'], row['predicate'], row['obj'], row['qualifier'])
    df.at[idx, 'statement_full'] = sentence
    
df[0:5]

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,Lung Non-small Cell Carcinoma,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr..."
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,Skin Melanoma,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr..."
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,Lung Non-small Cell Carcinoma,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,Lung Non-small Cell Carcinoma,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr..."
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,Lung Non-small Cell Carcinoma,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr..."


In [68]:
print(f'Total Evidence Statements: {len(df)}')

Total Evidence Statements: 1143


In [69]:
data = df['statement_full'].value_counts().rename_axis('Statement').reset_index(name='counts')
data

Unnamed: 0,Statement,counts
0,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, as...",8
1,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr...",7
2,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, as...",7
3,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr...",6
4,"(ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab, pr...",6
...,...,...
988,"(ga4gh:VA.aC9fkvhXcUHMGbm-h7A7kVVCoa5JLj7R, pr...",1
989,"(ga4gh:VA.jxXA9Qdh-T0IFIMxKVYl5dVT7XAOD-P7, pr...",1
990,"(ga4gh:VA.Dd_SVbvci6MXhWspBp1t9GI3DfUt_eDn, pr...",1
991,"(ga4gh:VA.AXrGOKDo8ORbkbD1hAtwxLGaCaVgLnVT, pr...",1


In [70]:
# Sanity Check
df[df['subject']=='ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L'].reset_index(drop=True)

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full
0,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,Skin Melanoma,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr..."
1,civic.eid:9851,The open-label phase 3 BEACON CRC trial includ...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"(CombinationTherapy, civic.ctid:P1PY89shAjemg7...",Colorectal Cancer,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr..."
2,civic.eid:3017,Patients with BRAF V600E-mutant NSCLC (n=57) w...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"(CombinationTherapy, civic.ctid:oBrlcO23adoVXv...",Lung Non-small Cell Carcinoma,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr..."
3,civic.eid:95,Dabrafenib with trametinib provides higher res...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"(CombinationTherapy, civic.ctid:oBrlcO23adoVXv...",Melanoma,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr..."
4,civic.eid:102,Unlike other studies that suggest a poorer out...,associatedWithWorseOutcomeFor,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,,Papillary Thyroid Carcinoma,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, as..."
...,...,...,...,...,...,...,...
115,moa.assertion:182,Cetuximab (Erbitux) is not recommended by the ...,predictsResistanceTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:318341,Colorectal Adenocarcinoma,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr..."
116,moa.assertion:184,Vemurafenib alone had insufficient activity in...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"(CombinationTherapy, moa.ctid:fDe-2sEQxMUyi876...",Colorectal Adenocarcinoma,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr..."
117,moa.assertion:865,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"(CombinationTherapy, moa.ctid:IAMn2xc8QDzi0_zv...","Low-Grade Glioma, NOS","(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr..."
118,moa.assertion:911,The U.S. Food and Drug Administration (FDA) ap...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"(CombinationTherapy, moa.ctid:PGEv4QIKz-vDJF0e...",Non-Small Cell Lung Cancer,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr..."


### Make the Data Graphs
Now that we have all of our duplicates, let's make some graphs. We will make a bar graph colorized by predicate type to visually show how much overlap we have across different statements, statement types. Additionally, we will make a quick and dirty pie chart to show percentage of evidence that is considered duplicate throughout MetaKB. 
  
NOTE: There is probably a more elegant way to do this without creating an additional dataframe, we should look into this in the future but for now this will work.

In [71]:
# Find just the duplicated (>=2)
data[0:92]

Unnamed: 0,Statement,counts
0,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, as...",8
1,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr...",7
2,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, as...",7
3,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr...",6
4,"(ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab, pr...",6
...,...,...
87,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr...",2
88,"(ga4gh:VA.mipQFzApQNp2KyVXcZig8b4uu7kFofIZ, pr...",2
89,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr...",2
90,"(ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8, pr...",2


In [72]:
# Sanity check for how many records have overlap (or are considered a duplicate)
data['counts'][0:92].sum()

241

In [73]:
data['variant'] = None
data['predicate'] = None
data['therapeutic'] = None
data['disease'] = None

# Plotly handles tuples weird, use this to graph in the future
data['joined'] = None

for idx, row in data.iterrows():
    data.at[idx, 'variant'] = row['Statement'][0]
    data.at[idx, 'predicate'] = row['Statement'][1]
    data.at[idx, 'therapeutic'] = row['Statement'][2]
    data.at[idx, 'disease'] = row['Statement'][3]
    data.at[idx, 'joined'] = ', '.join(str(item) if item is not None else 'None' for item in row['Statement'])


data 

Unnamed: 0,Statement,counts,variant,predicate,therapeutic,disease,joined
0,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, as...",8,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,associatedWithWorseOutcomeFor,,Papillary Thyroid Carcinoma,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, ass..."
1,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr...",7,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,predictsSensitivityTo,rxcui:1147220,Melanoma,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pre..."
2,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, as...",7,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,associatedWithWorseOutcomeFor,,Colorectal Cancer,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, ass..."
3,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr...",6,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,predictsSensitivityTo,rxcui:337525,Lung Non-small Cell Carcinoma,"ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pre..."
4,"(ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab, pr...",6,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab,predictsResistanceTo,rxcui:318341,Colorectal Cancer,"ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab, pre..."
...,...,...,...,...,...,...,...
988,"(ga4gh:VA.aC9fkvhXcUHMGbm-h7A7kVVCoa5JLj7R, pr...",1,ga4gh:VA.aC9fkvhXcUHMGbm-h7A7kVVCoa5JLj7R,predictsSensitivityTo,rxcui:2267574,Lung Non-small Cell Carcinoma,"ga4gh:VA.aC9fkvhXcUHMGbm-h7A7kVVCoa5JLj7R, pre..."
989,"(ga4gh:VA.jxXA9Qdh-T0IFIMxKVYl5dVT7XAOD-P7, pr...",1,ga4gh:VA.jxXA9Qdh-T0IFIMxKVYl5dVT7XAOD-P7,predictsSensitivityTo,rxcui:2267574,Lung Non-small Cell Carcinoma,"ga4gh:VA.jxXA9Qdh-T0IFIMxKVYl5dVT7XAOD-P7, pre..."
990,"(ga4gh:VA.Dd_SVbvci6MXhWspBp1t9GI3DfUt_eDn, pr...",1,ga4gh:VA.Dd_SVbvci6MXhWspBp1t9GI3DfUt_eDn,predictsSensitivityTo,rxcui:1312397,Colorectal Adenocarcinoma,"ga4gh:VA.Dd_SVbvci6MXhWspBp1t9GI3DfUt_eDn, pre..."
991,"(ga4gh:VA.AXrGOKDo8ORbkbD1hAtwxLGaCaVgLnVT, pr...",1,ga4gh:VA.AXrGOKDo8ORbkbD1hAtwxLGaCaVgLnVT,predictsResistanceTo,rxcui:1721560,Lung Adenocarcinoma,"ga4gh:VA.AXrGOKDo8ORbkbD1hAtwxLGaCaVgLnVT, pre..."


In [74]:
# Mark evidence statements as duplicate or not to be able to pie graph
def duplicate_detect(count):
    if count > 1:
        return('Duplicate')
    else:
        return('Not Duplicate')
    
data['is_duplicate'] = None
data['is_duplicate'] = data['counts'].apply(duplicate_detect)

In [75]:
data

Unnamed: 0,Statement,counts,variant,predicate,therapeutic,disease,joined,is_duplicate
0,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, as...",8,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,associatedWithWorseOutcomeFor,,Papillary Thyroid Carcinoma,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, ass...",Duplicate
1,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr...",7,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,predictsSensitivityTo,rxcui:1147220,Melanoma,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pre...",Duplicate
2,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, as...",7,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,associatedWithWorseOutcomeFor,,Colorectal Cancer,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, ass...",Duplicate
3,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr...",6,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,predictsSensitivityTo,rxcui:337525,Lung Non-small Cell Carcinoma,"ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pre...",Duplicate
4,"(ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab, pr...",6,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab,predictsResistanceTo,rxcui:318341,Colorectal Cancer,"ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab, pre...",Duplicate
...,...,...,...,...,...,...,...,...
988,"(ga4gh:VA.aC9fkvhXcUHMGbm-h7A7kVVCoa5JLj7R, pr...",1,ga4gh:VA.aC9fkvhXcUHMGbm-h7A7kVVCoa5JLj7R,predictsSensitivityTo,rxcui:2267574,Lung Non-small Cell Carcinoma,"ga4gh:VA.aC9fkvhXcUHMGbm-h7A7kVVCoa5JLj7R, pre...",Not Duplicate
989,"(ga4gh:VA.jxXA9Qdh-T0IFIMxKVYl5dVT7XAOD-P7, pr...",1,ga4gh:VA.jxXA9Qdh-T0IFIMxKVYl5dVT7XAOD-P7,predictsSensitivityTo,rxcui:2267574,Lung Non-small Cell Carcinoma,"ga4gh:VA.jxXA9Qdh-T0IFIMxKVYl5dVT7XAOD-P7, pre...",Not Duplicate
990,"(ga4gh:VA.Dd_SVbvci6MXhWspBp1t9GI3DfUt_eDn, pr...",1,ga4gh:VA.Dd_SVbvci6MXhWspBp1t9GI3DfUt_eDn,predictsSensitivityTo,rxcui:1312397,Colorectal Adenocarcinoma,"ga4gh:VA.Dd_SVbvci6MXhWspBp1t9GI3DfUt_eDn, pre...",Not Duplicate
991,"(ga4gh:VA.AXrGOKDo8ORbkbD1hAtwxLGaCaVgLnVT, pr...",1,ga4gh:VA.AXrGOKDo8ORbkbD1hAtwxLGaCaVgLnVT,predictsResistanceTo,rxcui:1721560,Lung Adenocarcinoma,"ga4gh:VA.AXrGOKDo8ORbkbD1hAtwxLGaCaVgLnVT, pre...",Not Duplicate


In [76]:
# Sanity check a breakdown of the predicates
data['predicate'].value_counts()

predicate
predictsSensitivityTo             569
predictsResistanceTo              367
associatedWithWorseOutcomeFor      43
associatedWithBetterOutcomeFor     14
Name: count, dtype: int64

In [77]:
# Bar Graph of Evidence Overlap
fig = px.bar(data[0:92], x='joined', y='counts', color='predicate', title='Overlap of Full Evidence Statements in MetaKB v2')
fig.update_layout(yaxis_title='# Overlapping Statements',
                  xaxis=dict(tickfont=dict(size=4)),
                  bargap=0.2)
fig.show()
pio.write_image(fig, "Full_Evidence_Statement_overlap_bar.png", format='png', width=1500, height=500, scale=5)

In [78]:
# TODO: Tuples/Triplets of different components of the full evidence statement for more overlap
# fig = px.bar(data[0:86], x=['Variant','Therapeutic'], y='counts', color='predicate', title='Overlap of Full Evidence Statements in MetaKB v2')
# fig.update_layout(yaxis_title='# Overlapping Statements') 
# fig.show()

In [79]:
fig = px.pie(data, values='counts', names='is_duplicate', title='% of Identical Evidence Statements in MetaKB v2')
fig.update_traces(textinfo='percent+label')
fig.show()
pio.write_image(fig, "Duplicate_statements_metakbv2_pie.png", format='png', width=800, height=800, scale=5)

In [80]:
# TODO: Loop in evidence level, coding table (HAS_STRENGTH) ---> https://docs.google.com/spreadsheets/d/1FpUmoXmDLVXsNgqog6A9q6o0jIOi_760j02rxIQGFMo/edit?gid=0#gid=0
# Check to see if regulatory approval exists in the extensions, does it have specific indications? If the disease occurs outside the indication, it gets lowered to a different level. 

## Individual Components
Exhibit overlap in concepts at individual and pairwise component levels. This is important to demonstrate potential overlap and utility for planned aggregation features on the HTML interface and to demonstrate uniqueness of concepts obtained from definitive sources.

In [330]:
def check_civic(record, column, df):
    tdf = df[df[column]==record]
    return (any('civic.eid' in s for s in list(tdf['id'])))

def check_moa(record, column, df):
    tdf = df[df[column]==record]
    return (any('moa.assertion' in s for s in list(tdf['id'])))

def calculate_total_kbs(df):
    df['total_kbs'] = None
    for idx, row in df.iterrows():
        counter = 0
        if row['in_civic?']:
            counter += 1
        if row['in_moa?']:
            counter += 1
        df.at[idx, 'total_kbs'] = counter
    return(df)

#### Variants

In [None]:
df['subject'].value_counts()[0:166] # Dups

subject
ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L    120
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ     49
ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-     41
ga4gh:VA.5GEqm-zIzvvPheyGb9rlu5DyIueIubXm     25
ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj     21
                                            ... 
ga4gh:VA.zElnUUWNdxH5elaDdhFo8dYA6dVjCZNs      2
ga4gh:VA.lSZe19pmSXAaE602TVOL2sZZKGxemTKw      2
ga4gh:VA.iCbjI9KjbK2qqD4_iZdEOJKe0WC97hdm      2
ga4gh:VA.EstQP5mWZWkmgYMkqEJMSnbZ12K88Y3v      2
ga4gh:VA.SnJZPfiwIGwkyiAx-XnLElqheOPwlMA0      2
Name: count, Length: 166, dtype: int64

In [140]:
tdf = df['subject'].value_counts().rename_axis('Variant').reset_index(name='counts')
tdf[0:166]

Unnamed: 0,Variant,counts
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,120
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,49
2,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,41
3,ga4gh:VA.5GEqm-zIzvvPheyGb9rlu5DyIueIubXm,25
4,ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj,21
...,...,...
161,ga4gh:VA.zElnUUWNdxH5elaDdhFo8dYA6dVjCZNs,2
162,ga4gh:VA.lSZe19pmSXAaE602TVOL2sZZKGxemTKw,2
163,ga4gh:VA.iCbjI9KjbK2qqD4_iZdEOJKe0WC97hdm,2
164,ga4gh:VA.EstQP5mWZWkmgYMkqEJMSnbZ12K88Y3v,2


In [142]:
tdf['is_duplicate'] = tdf['counts'].apply(duplicate_detect)
tdf[0:166]

Unnamed: 0,Variant,counts,is_duplicate
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,120,Duplicate
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,49,Duplicate
2,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,41,Duplicate
3,ga4gh:VA.5GEqm-zIzvvPheyGb9rlu5DyIueIubXm,25,Duplicate
4,ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj,21,Duplicate
...,...,...,...
161,ga4gh:VA.zElnUUWNdxH5elaDdhFo8dYA6dVjCZNs,2,Duplicate
162,ga4gh:VA.lSZe19pmSXAaE602TVOL2sZZKGxemTKw,2,Duplicate
163,ga4gh:VA.iCbjI9KjbK2qqD4_iZdEOJKe0WC97hdm,2,Duplicate
164,ga4gh:VA.EstQP5mWZWkmgYMkqEJMSnbZ12K88Y3v,2,Duplicate


In [None]:
for variation in tdf['Variant'][0:4]:
    print(variation)
    # V600E
    # L858R
    # T790M
    # H1047R

ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ
ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-
ga4gh:VA.5GEqm-zIzvvPheyGb9rlu5DyIueIubXm


In [146]:
# Bar Graph of Evidence Overlap
fig = px.bar(tdf[0:166], x='Variant', y='counts', title='Overlap of Variant Evidence in MetaKB v2')
fig.update_layout(yaxis_title='# of Study Statements containing Variant',
                  xaxis=dict(tickfont=dict(size=4)),
                  bargap=0.2)
fig.show()
pio.write_image(fig, "Variant_overlap_bar.png", format='png', width=1500, height=500, scale=5)

In [151]:
fig = px.pie(tdf, values='counts', names='is_duplicate', title='% Evidence containing duplicate Variant in MetaKB v2')
fig.update_traces(textinfo='percent+label')
fig.show()
pio.write_image(fig, "Duplicate_Variants_metakbv2_pie.png", format='png', width=800, height=800, scale=5)

#### Therapeutics

In [None]:
df['obj'].value_counts()[0:115] # Duplicates

obj
rxcui:1147220                                                                 71
rxcui:337525                                                                  59
rxcui:282388                                                                  48
rxcui:318341                                                                  39
rxcui:328134                                                                  31
                                                                              ..
ncit:C115109                                                                   2
rxcui:2550714                                                                  2
(CombinationTherapy, moa.ctid:C91ScST5e8OGj8fXpwjf1rACl3_viw9L)                2
(TherapeuticSubstituteGroup, civic.tsgid:bf_BuyNMwaqXScLEAS5F0_LZZf1t1H2f)     2
(CombinationTherapy, civic.ctid:-TUOzS0ClpVaUo5oisn2Bt0H0ZHPi1cJ)              2
Name: count, Length: 115, dtype: int64

In [197]:
tdf = df['obj'].value_counts().rename_axis('Therapeutics').reset_index(name='counts')
tdf[0:115]

Unnamed: 0,Therapeutics,counts
0,rxcui:1147220,71
1,rxcui:337525,59
2,rxcui:282388,48
3,rxcui:318341,39
4,rxcui:328134,31
...,...,...
110,ncit:C115109,2
111,rxcui:2550714,2
112,"(CombinationTherapy, moa.ctid:C91ScST5e8OGj8fX...",2
113,"(TherapeuticSubstituteGroup, civic.tsgid:bf_Bu...",2


In [198]:
tdf['is_duplicate'] = tdf['counts'].apply(duplicate_detect)
tdf[0:115]

Unnamed: 0,Therapeutics,counts,is_duplicate
0,rxcui:1147220,71,Duplicate
1,rxcui:337525,59,Duplicate
2,rxcui:282388,48,Duplicate
3,rxcui:318341,39,Duplicate
4,rxcui:328134,31,Duplicate
...,...,...,...
110,ncit:C115109,2,Duplicate
111,rxcui:2550714,2,Duplicate
112,"(CombinationTherapy, moa.ctid:C91ScST5e8OGj8fX...",2,Duplicate
113,"(TherapeuticSubstituteGroup, civic.tsgid:bf_Bu...",2,Duplicate


In [200]:
# Bar Graph of Evidence Overlap
fig = px.bar(tdf[0:115], x='Therapeutics', y='counts', title='Overlap of Therapeutic Evidence in MetaKB v2')
fig.update_layout(yaxis_title='# of Study Statements containing Therapeutic',
                  xaxis=dict(tickfont=dict(size=4)),
                  bargap=0.2)
fig.show()
pio.write_image(fig, "Therapeutic_overlap_bar.png", format='png', width=1500, height=500, scale=5)

In [202]:
fig = px.pie(tdf, values='counts', names='is_duplicate', title='% Evidence containing duplicate Therapeutic in MetaKB v2')
fig.update_traces(textinfo='percent+label')
fig.show()
pio.write_image(fig, "Duplicate_Therapeutic_metakbv2_pie.png", format='png', width=800, height=800, scale=5)

#### Diseases

In [214]:
df['qualifier'].value_counts()[0:66] # Duplicates


qualifier
Colorectal Cancer                 150
Lung Non-small Cell Carcinoma     138
Melanoma                          117
Cancer                             73
Gastrointestinal Stromal Tumor     63
                                 ... 
Osteosarcoma                        2
Thyroid Gland Carcinoma             2
Ovarian Serous Carcinoma            2
Ewing Sarcoma Of Bone               2
Rectum Cancer                       2
Name: count, Length: 66, dtype: int64

In [215]:
tdf = df['qualifier'].value_counts().rename_axis('Disease').reset_index(name='counts')
tdf[0:66]

Unnamed: 0,Disease,counts
0,Colorectal Cancer,150
1,Lung Non-small Cell Carcinoma,138
2,Melanoma,117
3,Cancer,73
4,Gastrointestinal Stromal Tumor,63
...,...,...
61,Osteosarcoma,2
62,Thyroid Gland Carcinoma,2
63,Ovarian Serous Carcinoma,2
64,Ewing Sarcoma Of Bone,2


In [216]:
tdf['is_duplicate'] = tdf['counts'].apply(duplicate_detect)
tdf[0:66]

Unnamed: 0,Disease,counts,is_duplicate
0,Colorectal Cancer,150,Duplicate
1,Lung Non-small Cell Carcinoma,138,Duplicate
2,Melanoma,117,Duplicate
3,Cancer,73,Duplicate
4,Gastrointestinal Stromal Tumor,63,Duplicate
...,...,...,...
61,Osteosarcoma,2,Duplicate
62,Thyroid Gland Carcinoma,2,Duplicate
63,Ovarian Serous Carcinoma,2,Duplicate
64,Ewing Sarcoma Of Bone,2,Duplicate


In [219]:
# Bar Graph of Evidence Overlap
fig = px.bar(tdf[0:66], x='Disease', y='counts', title='Overlap of Disease Evidence in MetaKB v2')
fig.update_layout(yaxis_title='# of Study Statements containing Disease',
                  xaxis=dict(tickfont=dict(size=8)),
                  bargap=0.2)
fig.show()
pio.write_image(fig, "Disease_overlap_bar.png", format='png', width=1500, height=500, scale=5)

In [220]:
fig = px.pie(tdf, values='counts', names='is_duplicate', title='% Evidence containing duplicate Therapeutic in MetaKB v2')
fig.update_traces(textinfo='percent+label')
fig.show()
pio.write_image(fig, "Duplicate_Disease_metakbv2_pie.png", format='png', width=800, height=800, scale=5)

#### Uniqueness (Singletons)
While pie graphs above show uniqueness in a sense, going to convert this to a bar graph to compare across singletons on the same axis. Additionally, in the future when there are more sources we will want to edit this to also track number of sources reporting a value.

In [331]:
uq_diseases = df['qualifier'].value_counts().rename_axis('Disease').reset_index(name='counts')
uq_therapeutics = df['obj'].value_counts().rename_axis('Therapeutics').reset_index(name='counts')
uq_variants = df['subject'].value_counts().rename_axis('Variant').reset_index(name='counts')

In [332]:
uq_diseases['is_duplicate'] = uq_diseases['counts'].apply(duplicate_detect)
uq_therapeutics['is_duplicate'] = uq_therapeutics['counts'].apply(duplicate_detect)
uq_variants['is_duplicate'] = uq_variants['counts'].apply(duplicate_detect)

In [333]:
uq_diseases['in_civic?'] = uq_diseases['Disease'].apply(check_civic, args=('qualifier',df))
uq_diseases['in_moa?'] = uq_diseases['Disease'].apply(check_moa, args=('qualifier',df))
uq_diseases = calculate_total_kbs(uq_diseases)

uq_therapeutics['in_civic?'] = uq_therapeutics['Therapeutics'].apply(check_civic, args=('obj',df))
uq_therapeutics['in_moa?'] = uq_therapeutics['Therapeutics'].apply(check_moa, args=('obj',df))
uq_therapeutics = calculate_total_kbs(uq_therapeutics)

uq_variants['in_civic?'] = uq_variants['Variant'].apply(check_civic, args=('subject',df))
uq_variants['in_moa?'] = uq_variants['Variant'].apply(check_moa, args=('subject',df))
uq_variants = calculate_total_kbs(uq_variants)



In [334]:
uq_variants

Unnamed: 0,Variant,counts,is_duplicate,in_civic?,in_moa?,total_kbs
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,120,Duplicate,True,True,2
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,49,Duplicate,True,True,2
2,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,41,Duplicate,True,True,2
3,ga4gh:VA.5GEqm-zIzvvPheyGb9rlu5DyIueIubXm,25,Duplicate,True,True,2
4,ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj,21,Duplicate,True,False,1
...,...,...,...,...,...,...
445,ga4gh:VA.jxXA9Qdh-T0IFIMxKVYl5dVT7XAOD-P7,1,Not Duplicate,True,False,1
446,ga4gh:VA.aC9fkvhXcUHMGbm-h7A7kVVCoa5JLj7R,1,Not Duplicate,True,False,1
447,ga4gh:VA.cj7G9KWqisifzLXsGHpSU2u3yuvbMhyb,1,Not Duplicate,True,False,1
448,ga4gh:VA.ZjIF_aKWFimqIkAZ38a5n-kX84IMYL5Z,1,Not Duplicate,True,False,1


In [None]:
# uq = {}

# uq['labels'] = ['Disease','Therapeutics','Variants']
# uq['1 KB'] = 

In [335]:
# Create grouped counts by gene / drug / variant with counts for Present in 1 KB, Present in 2 KB
uq = {}
uq['labels'] = ['Disease','Therapeutics','Variants']
uq['1 KB'] = [uq_diseases[uq_diseases['total_kbs']==1]['counts'].sum(),
                uq_therapeutics[uq_therapeutics['total_kbs']==1]['counts'].sum(),
                uq_variants[uq_variants['total_kbs']==1]['counts'].sum()]
uq['2 KBs'] = [uq_diseases[uq_diseases['total_kbs']==2]['counts'].sum(),
                    uq_therapeutics[uq_therapeutics['total_kbs']==2]['counts'].sum(),
                    uq_variants[uq_variants['total_kbs']==2]['counts'].sum()]

uq = pd.DataFrame(uq)

# Calculate Percentages
100 * (uq['2 KBs'][0] / (uq['2 KBs'][0] + uq['2 KBs'][0]))

uq['percent_1KB'] = [100 * (uq['1 KB'][0] / (uq['1 KB'][0] + uq['2 KBs'][0])),
                     100 * (uq['1 KB'][1] / (uq['1 KB'][1] + uq['2 KBs'][1])),
                     100 * (uq['1 KB'][2] / (uq['1 KB'][2] + uq['2 KBs'][2]))]
uq['percent_2KB'] = [100 * (uq['2 KBs'][0] / (uq['1 KB'][0] + uq['2 KBs'][0])),
                     100 * (uq['2 KBs'][1] / (uq['1 KB'][1] + uq['2 KBs'][1])),
                     100 * (uq['2 KBs'][2] / (uq['1 KB'][2] + uq['2 KBs'][2]))]

uq['percent_1KB'] = [f"{p:.2f}%" for p in uq['percent_1KB']]
uq['percent_2KB'] = [f"{p:.2f}%" for p in uq['percent_2KB']]

uq



Unnamed: 0,labels,1 KB,2 KBs,percent_1KB,percent_2KB
0,Disease,887,256,77.60%,22.40%
1,Therapeutics,552,510,51.98%,48.02%
2,Variants,708,435,61.94%,38.06%


In [336]:
fig = px.bar(uq, x='labels', y=['1 KB','2 KBs'], title='Uniqueness of Records in MetaKB v2', color_discrete_map={'1 KB': '#063970', '2 KBs': '#2596be'},text_auto=True)
# #76b5c5, #abdbe3, #154c79, #eeeee4

fig.data[0].text = uq['percent_1KB']
fig.data[0].texttemplate = '%{text}'
fig.data[1].text = uq['percent_2KB']
fig.data[1].texttemplate = '%{text}'

fig.update_layout(xaxis_title='Type of Record',
                  yaxis_title='# of Records',
                  legend_title_text='Record Present in:'
                  )
fig.show()

pio.write_image(fig, "Single_record_uniqueness.png", format='png', width=600, height=300, scale=5)

#### Therapeutics on Disease

In [None]:
df[['obj','qualifier']].value_counts()[0:153] # Duplicates

obj                                                                         qualifier                    
rxcui:318341                                                                Colorectal Cancer                37
rxcui:1147220                                                               Melanoma                         31
rxcui:337525                                                                Lung Non-small Cell Carcinoma    22
rxcui:2289380                                                               Melanoma                         20
ncit:C165479                                                                Invasive Breast Carcinoma        20
                                                                                                             ..
(TherapeuticSubstituteGroup, civic.tsgid:bf_BuyNMwaqXScLEAS5F0_LZZf1t1H2f)  Breast Cancer                     2
(CombinationTherapy, civic.ctid:7g5X81wE2go4GE_fB45SjFamVMgpnwsA)           Colorectal Cancer                 

In [154]:
tdf = df[['obj','qualifier']].value_counts().rename_axis(['Therapeutic','Disease']).reset_index(name='counts')
tdf[0:153]

Unnamed: 0,Therapeutic,Disease,counts
0,rxcui:318341,Colorectal Cancer,37
1,rxcui:1147220,Melanoma,31
2,rxcui:337525,Lung Non-small Cell Carcinoma,22
3,rxcui:2289380,Melanoma,20
4,ncit:C165479,Invasive Breast Carcinoma,20
...,...,...,...
148,"(TherapeuticSubstituteGroup, civic.tsgid:bf_Bu...",Breast Cancer,2
149,"(CombinationTherapy, civic.ctid:7g5X81wE2go4GE...",Colorectal Cancer,2
150,drugbank:DB12638,Acute Lymphoid Leukemia,2
151,"(CombinationTherapy, moa.ctid:C91ScST5e8OGj8fX...",Acute Myeloid Leukemia,2


In [157]:
tdf['joined'] = None
for idx, row in tdf.iterrows():
    tdf.at[idx,'joined'] = f'{str(row["Therapeutic"])}, {str(row["Disease"])}'

tdf

Unnamed: 0,Therapeutic,Disease,counts,joined
0,rxcui:318341,Colorectal Cancer,37,"rxcui:318341, Colorectal Cancer"
1,rxcui:1147220,Melanoma,31,"rxcui:1147220, Melanoma"
2,rxcui:337525,Lung Non-small Cell Carcinoma,22,"rxcui:337525, Lung Non-small Cell Carcinoma"
3,rxcui:2289380,Melanoma,20,"rxcui:2289380, Melanoma"
4,ncit:C165479,Invasive Breast Carcinoma,20,"ncit:C165479, Invasive Breast Carcinoma"
...,...,...,...,...
390,rxcui:1312397,Colorectal Adenocarcinoma,1,"rxcui:1312397, Colorectal Adenocarcinoma"
391,"(TherapeuticSubstituteGroup, civic.tsgid:Bp7LS...",Gastrointestinal Stromal Tumor,1,"('TherapeuticSubstituteGroup', 'civic.tsgid:Bp..."
392,"(TherapeuticSubstituteGroup, civic.tsgid:7UdV-...",Melanoma,1,"('TherapeuticSubstituteGroup', 'civic.tsgid:7U..."
393,rxcui:1363267,Lung Non-small Cell Carcinoma,1,"rxcui:1363267, Lung Non-small Cell Carcinoma"


In [158]:
tdf['is_duplicate'] = tdf['counts'].apply(duplicate_detect)
tdf

Unnamed: 0,Therapeutic,Disease,counts,joined,is_duplicate
0,rxcui:318341,Colorectal Cancer,37,"rxcui:318341, Colorectal Cancer",Duplicate
1,rxcui:1147220,Melanoma,31,"rxcui:1147220, Melanoma",Duplicate
2,rxcui:337525,Lung Non-small Cell Carcinoma,22,"rxcui:337525, Lung Non-small Cell Carcinoma",Duplicate
3,rxcui:2289380,Melanoma,20,"rxcui:2289380, Melanoma",Duplicate
4,ncit:C165479,Invasive Breast Carcinoma,20,"ncit:C165479, Invasive Breast Carcinoma",Duplicate
...,...,...,...,...,...
390,rxcui:1312397,Colorectal Adenocarcinoma,1,"rxcui:1312397, Colorectal Adenocarcinoma",Not Duplicate
391,"(TherapeuticSubstituteGroup, civic.tsgid:Bp7LS...",Gastrointestinal Stromal Tumor,1,"('TherapeuticSubstituteGroup', 'civic.tsgid:Bp...",Not Duplicate
392,"(TherapeuticSubstituteGroup, civic.tsgid:7UdV-...",Melanoma,1,"('TherapeuticSubstituteGroup', 'civic.tsgid:7U...",Not Duplicate
393,rxcui:1363267,Lung Non-small Cell Carcinoma,1,"rxcui:1363267, Lung Non-small Cell Carcinoma",Not Duplicate


In [170]:
# Bar Graph of Evidence Overlap
fig = px.bar(tdf[0:153], x='joined', y='counts', title='Overlap of Therapeutic/Disease Evidence in MetaKB v2')
fig.update_layout(yaxis_title='# of Study Statements containing Therapeutic/Disease',
                  xaxis_title='Therapeutic/Disease Aggregated Evidence',
                  xaxis=dict(tickfont=dict(size=4)),
                  yaxis_title_font=dict(size=12),
                  bargap=0.2)
fig.show()
pio.write_image(fig, "Therapeutic_Disease_aggregation_overlap_bar.png", format='png', width=1500, height=500, scale=5)


In [173]:
fig = px.pie(tdf, values='counts', names='is_duplicate', title='% Evidence containing duplicate Therapeutic/Disease in MetaKB v2')
fig.update_traces(textinfo='percent+label')
fig.show()
pio.write_image(fig, "Duplicate_therapeutic_disease_metakbv2_pie.png", format='png', width=800, height=800, scale=5)

#### Variants with Therapeutics

In [184]:
df[['subject','obj']].value_counts()[0:114]

subject                                    obj                                                              
ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L  rxcui:1147220                                                        30
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ  rxcui:337525                                                         14
ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L  (CombinationTherapy, civic.ctid:oBrlcO23adoVXv51xh-5Wigy0QyDWtfr)    12
ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-  rxcui:1721560                                                        11
ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab  rxcui:318341                                                          8
                                                                                                                ..
ga4gh:VA.-dMnJf9oUBfl9De0llc3LqJaGdFzfATK  rxcui:2103478                                                         2
ga4gh:VA.3ybIkH_QhR6C7QmnTeD2-P6U1Pu7M__0  ncit:C165479                               

In [185]:
tdf = df[['subject','obj']].value_counts().rename_axis(['Variant','Therapeutic']).reset_index(name='counts')
tdf[0:114]

Unnamed: 0,Variant,Therapeutic,counts
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,30
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,14
2,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"(CombinationTherapy, civic.ctid:oBrlcO23adoVXv...",12
3,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,11
4,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab,rxcui:318341,8
...,...,...,...
109,ga4gh:VA.-dMnJf9oUBfl9De0llc3LqJaGdFzfATK,rxcui:2103478,2
110,ga4gh:VA.3ybIkH_QhR6C7QmnTeD2-P6U1Pu7M__0,ncit:C165479,2
111,ga4gh:VA.2NMrhxfirY2q6Zj0wtBNovJQkBwykfPF,rxcui:4492,2
112,ga4gh:VA.1oFFaTlmKzBPBb4fiyAGEilaWJ4sce8_,rxcui:2370147,2


In [186]:
tdf['joined'] = None
for idx, row in tdf.iterrows():
    tdf.at[idx,'joined'] = f'{str(row["Variant"])}, {str(row["Therapeutic"])}'

tdf

Unnamed: 0,Variant,Therapeutic,counts,joined
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,30,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, rxc..."
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,14,"ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, rxc..."
2,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"(CombinationTherapy, civic.ctid:oBrlcO23adoVXv...",12,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, ('C..."
3,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,11,"ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, rxc..."
4,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab,rxcui:318341,8,"ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab, rxc..."
...,...,...,...,...
815,ga4gh:VA.NaKQmIhVRGhJalSzF8gIqm5IfhEq2n1Q,ncit:C171616,1,"ga4gh:VA.NaKQmIhVRGhJalSzF8gIqm5IfhEq2n1Q, nci..."
816,ga4gh:VA.NbmGko5VG3K44V5sy-uVWrQaBdnqJ227,"(TherapeuticSubstituteGroup, civic.tsgid:R22Zf...",1,"ga4gh:VA.NbmGko5VG3K44V5sy-uVWrQaBdnqJ227, ('T..."
817,ga4gh:VA.NbmGko5VG3K44V5sy-uVWrQaBdnqJ227,ncit:C171616,1,"ga4gh:VA.NbmGko5VG3K44V5sy-uVWrQaBdnqJ227, nci..."
818,ga4gh:VA.O4W3Vd9CFRXUOi-IgiHWMuVdbMQXR_G7,rxcui:1546019,1,"ga4gh:VA.O4W3Vd9CFRXUOi-IgiHWMuVdbMQXR_G7, rxc..."


In [187]:
tdf['is_duplicate'] = tdf['counts'].apply(duplicate_detect)
tdf

Unnamed: 0,Variant,Therapeutic,counts,joined,is_duplicate
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,30,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, rxc...",Duplicate
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,14,"ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, rxc...",Duplicate
2,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"(CombinationTherapy, civic.ctid:oBrlcO23adoVXv...",12,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, ('C...",Duplicate
3,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,11,"ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, rxc...",Duplicate
4,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab,rxcui:318341,8,"ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab, rxc...",Duplicate
...,...,...,...,...,...
815,ga4gh:VA.NaKQmIhVRGhJalSzF8gIqm5IfhEq2n1Q,ncit:C171616,1,"ga4gh:VA.NaKQmIhVRGhJalSzF8gIqm5IfhEq2n1Q, nci...",Not Duplicate
816,ga4gh:VA.NbmGko5VG3K44V5sy-uVWrQaBdnqJ227,"(TherapeuticSubstituteGroup, civic.tsgid:R22Zf...",1,"ga4gh:VA.NbmGko5VG3K44V5sy-uVWrQaBdnqJ227, ('T...",Not Duplicate
817,ga4gh:VA.NbmGko5VG3K44V5sy-uVWrQaBdnqJ227,ncit:C171616,1,"ga4gh:VA.NbmGko5VG3K44V5sy-uVWrQaBdnqJ227, nci...",Not Duplicate
818,ga4gh:VA.O4W3Vd9CFRXUOi-IgiHWMuVdbMQXR_G7,rxcui:1546019,1,"ga4gh:VA.O4W3Vd9CFRXUOi-IgiHWMuVdbMQXR_G7, rxc...",Not Duplicate


In [188]:
# Bar Graph of Evidence Overlap
fig = px.bar(tdf[0:114], x='joined', y='counts', title='Overlap of Variant/Therapeutic Evidence in MetaKB v2')
fig.update_layout(yaxis_title='# of Study Statements containing Variant/Therapeutic',
                  xaxis_title='Variant/Therapeutic Aggregated Evidence',
                  xaxis=dict(tickfont=dict(size=4)),
                  yaxis_title_font=dict(size=12),
                  bargap=0.2)
fig.show()
pio.write_image(fig, "Variant_Therapeutic_aggregation_overlap_bar.png", format='png', width=1500, height=500, scale=5)


In [189]:
fig = px.pie(tdf, values='counts', names='is_duplicate', title='% Evidence containing duplicate Variant/Therapeutic in MetaKB v2')
fig.update_traces(textinfo='percent+label')
fig.show()
pio.write_image(fig, "Duplicate_Variant_Therapeutic_metakbv2_pie.png", format='png', width=800, height=800, scale=5)

#### Variants in Disease

In [296]:
df[['subject','qualifier']].value_counts()[0:171]

subject                                    qualifier                    
ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L  Melanoma                         35
                                           Colorectal Cancer                33
ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-  Lung Non-small Cell Carcinoma    25
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ  Lung Non-small Cell Carcinoma    22
ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab  Colorectal Cancer                13
                                                                            ..
ga4gh:VA.WCo3uQpMnNSbBMy5pKdSccudXs8ooZw9  Cancer                            2
ga4gh:VA.t8CxWZxFJbfWnqruBINaDXruSd6aNKZH  Colorectal Cancer                 2
ga4gh:VA.2VAvTmPON2L7JffxjE68jAmmmwxyzDpB  Oligodendroglioma                 2
ga4gh:VA.udBCHwlrf8xNiRy_19bLi-h5LhnZLgCt  Non-Small Cell Lung Cancer        2
ga4gh:VA.YqiDFzE2K6fcXhXkCGYHIQ75RgutPq_7  Acute Myeloid Leukemia            2
Name: count, Length: 171, dtype: int64

In [297]:
tdf = df[['subject','qualifier']].value_counts().rename_axis(['Variant','Disease']).reset_index(name='counts')
tdf[0:171]

Unnamed: 0,Variant,Disease,counts
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Melanoma,35
1,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Colorectal Cancer,33
2,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,Lung Non-small Cell Carcinoma,25
3,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,Lung Non-small Cell Carcinoma,22
4,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab,Colorectal Cancer,13
...,...,...,...
166,ga4gh:VA.WCo3uQpMnNSbBMy5pKdSccudXs8ooZw9,Cancer,2
167,ga4gh:VA.t8CxWZxFJbfWnqruBINaDXruSd6aNKZH,Colorectal Cancer,2
168,ga4gh:VA.2VAvTmPON2L7JffxjE68jAmmmwxyzDpB,Oligodendroglioma,2
169,ga4gh:VA.udBCHwlrf8xNiRy_19bLi-h5LhnZLgCt,Non-Small Cell Lung Cancer,2


In [298]:
tdf['joined'] = None
for idx, row in tdf.iterrows():
    tdf.at[idx,'joined'] = f'{str(row["Variant"])}, {str(row["Disease"])}'

tdf

Unnamed: 0,Variant,Disease,counts,joined
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Melanoma,35,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, Mel..."
1,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Colorectal Cancer,33,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, Col..."
2,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,Lung Non-small Cell Carcinoma,25,"ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, Lun..."
3,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,Lung Non-small Cell Carcinoma,22,"ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, Lun..."
4,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab,Colorectal Cancer,13,"ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab, Col..."
...,...,...,...,...
649,ga4gh:VA.N6UBWVy36H0uBm0x0cMpKOfYW4ILfAp3,Lung Adenocarcinoma,1,"ga4gh:VA.N6UBWVy36H0uBm0x0cMpKOfYW4ILfAp3, Lun..."
650,ga4gh:VA.MnEGjyrxZra3XnDnlR22sCzBum9tLtgs,Cancer,1,"ga4gh:VA.MnEGjyrxZra3XnDnlR22sCzBum9tLtgs, Cancer"
651,ga4gh:VA.Mb5FmolTKggTIxTjxIbG9aC0KngBkprC,Skin Melanoma,1,"ga4gh:VA.Mb5FmolTKggTIxTjxIbG9aC0KngBkprC, Ski..."
652,ga4gh:VA.MSbl4et_1iQVG2_R0TIpDDlt703Y-suA,Colorectal Cancer,1,"ga4gh:VA.MSbl4et_1iQVG2_R0TIpDDlt703Y-suA, Col..."


In [299]:
tdf['is_duplicate'] = tdf['counts'].apply(duplicate_detect)
tdf

Unnamed: 0,Variant,Disease,counts,joined,is_duplicate
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Melanoma,35,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, Mel...",Duplicate
1,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Colorectal Cancer,33,"ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, Col...",Duplicate
2,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,Lung Non-small Cell Carcinoma,25,"ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, Lun...",Duplicate
3,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,Lung Non-small Cell Carcinoma,22,"ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, Lun...",Duplicate
4,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab,Colorectal Cancer,13,"ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab, Col...",Duplicate
...,...,...,...,...,...
649,ga4gh:VA.N6UBWVy36H0uBm0x0cMpKOfYW4ILfAp3,Lung Adenocarcinoma,1,"ga4gh:VA.N6UBWVy36H0uBm0x0cMpKOfYW4ILfAp3, Lun...",Not Duplicate
650,ga4gh:VA.MnEGjyrxZra3XnDnlR22sCzBum9tLtgs,Cancer,1,"ga4gh:VA.MnEGjyrxZra3XnDnlR22sCzBum9tLtgs, Cancer",Not Duplicate
651,ga4gh:VA.Mb5FmolTKggTIxTjxIbG9aC0KngBkprC,Skin Melanoma,1,"ga4gh:VA.Mb5FmolTKggTIxTjxIbG9aC0KngBkprC, Ski...",Not Duplicate
652,ga4gh:VA.MSbl4et_1iQVG2_R0TIpDDlt703Y-suA,Colorectal Cancer,1,"ga4gh:VA.MSbl4et_1iQVG2_R0TIpDDlt703Y-suA, Col...",Not Duplicate


In [300]:
# Bar Graph of Evidence Overlap
fig = px.bar(tdf[0:171], x='joined', y='counts', title='Overlap of Variant/Disease Evidence in MetaKB v2')
fig.update_layout(yaxis_title='# of Study Statements containing Variant/Disease',
                  xaxis_title='Variant/Disease Aggregated Evidence',
                  xaxis=dict(tickfont=dict(size=4)),
                  yaxis_title_font=dict(size=12),
                  bargap=0.2)
fig.show()
pio.write_image(fig, "Variant_Disease_aggregation_overlap_bar.png", format='png', width=1500, height=500, scale=5)


In [301]:
fig = px.pie(tdf, values='counts', names='is_duplicate', title='% Evidence containing duplicate Variant/Disease in MetaKB v2')
fig.update_traces(textinfo='percent+label')
fig.show()
pio.write_image(fig, "Duplicate_Variant_Disease_metakbv2_pie.png", format='png', width=800, height=800, scale=5)

#### Uniqueness (Pairs)
Repeat the uniqueness exercise, just use pairs of record types instead of singletons.

In [None]:
uq_therdis = df[['obj','qualifier']].value_counts().rename_axis(['Therapeutic','Disease']).reset_index(name='counts')
uq_thervar = df[['obj','subject']].value_counts().rename_axis(['Therapeutic','Variant']).reset_index(name='counts')
uq_vardis = df[['subject','qualifier']].value_counts().rename_axis(['Variant','Disease']).reset_index(name='counts')

uq_therdis['joined'] = None
for idx, row in uq_therdis.iterrows():
    uq_therdis.at[idx,'joined'] = f'{str(row["Therapeutic"])}, {str(row["Disease"])}'

uq_thervar['joined'] = None
for idx, row in uq_thervar.iterrows():
    uq_thervar.at[idx,'joined'] = f'{str(row["Therapeutic"])}, {str(row["Variant"])}'

uq_vardis['joined'] = None
for idx, row in uq_vardis.iterrows():
    uq_vardis.at[idx,'joined'] = f'{str(row["Variant"])}, {str(row["Disease"])}'

In [338]:
uq_therdis['is_duplicate'] = uq_therdis['counts'].apply(duplicate_detect)
uq_thervar['is_duplicate'] = uq_thervar['counts'].apply(duplicate_detect)
uq_vardis['is_duplicate'] = uq_vardis['counts'].apply(duplicate_detect)


In [None]:
def create_test_column(df, column1, column2):
    test_df = df
    test_df['joined'] = None
    for idx, row in df.iterrows():
        test_df.at[idx, 'joined'] = f'{str(row[column1])}, {str(row[column2])}'
    return(test_df)

test_df = create_test_column(df, 'obj', 'qualifier')
uq_therdis['in_civic?'] = uq_therdis['joined'].apply(check_civic, args=('joined',test_df))
uq_therdis['in_moa?'] = uq_therdis['joined'].apply(check_moa, args=('joined',test_df))
uq_therdis = calculate_total_kbs(uq_therdis)

test_df = create_test_column(df, 'obj', 'subject')
uq_thervar['in_civic?'] = uq_thervar['joined'].apply(check_civic, args=('joined',test_df))
uq_thervar['in_moa?'] = uq_thervar['joined'].apply(check_moa, args=('joined',test_df))
uq_thervar = calculate_total_kbs(uq_thervar)

test_df = create_test_column(df, 'subject', 'qualifier')
uq_vardis['in_civic?'] = uq_vardis['joined'].apply(check_civic, args=('joined',test_df))
uq_vardis['in_moa?'] = uq_vardis['joined'].apply(check_moa, args=('joined',test_df))
uq_vardis = calculate_total_kbs(uq_vardis)

In [294]:
# Create grouped counts by pair with counts for Present in 1 KB, Present in 2 KB
uq = {}
uq['labels'] = ['Therapeutic/Disease','Therapeutic/Variant','Variant/Disease']

uq['1 KB'] = [uq_therdis[uq_therdis['is_duplicate']!='Duplicate']['counts'].sum(),
                uq_thervar[uq_thervar['is_duplicate']!='Duplicate']['counts'].sum(),
                uq_vardis[uq_vardis['is_duplicate']!='Duplicate']['counts'].sum()]

uq['2 KBs'] = [uq_therdis[uq_therdis['is_duplicate']=='Duplicate']['counts'].sum(),
                    uq_thervar[uq_thervar['is_duplicate']=='Duplicate']['counts'].sum(),
                    uq_vardis[uq_vardis['is_duplicate']=='Duplicate']['counts'].sum()]

uq = pd.DataFrame(uq)

# Calculate Percentages
100 * (uq['2 KBs'][0] / (uq['2 KBs'][0] + uq['2 KBs'][0]))

uq['percent_1KB'] = [100 * (uq['1 KB'][0] / (uq['1 KB'][0] + uq['2 KBs'][0])),
                     100 * (uq['1 KB'][1] / (uq['1 KB'][1] + uq['2 KBs'][1])),
                     100 * (uq['1 KB'][2] / (uq['1 KB'][2] + uq['2 KBs'][2]))]
uq['percent_2KB'] = [100 * (uq['2 KBs'][0] / (uq['1 KB'][0] + uq['2 KBs'][0])),
                     100 * (uq['2 KBs'][1] / (uq['1 KB'][1] + uq['2 KBs'][1])),
                     100 * (uq['2 KBs'][2] / (uq['1 KB'][2] + uq['2 KBs'][2]))]

uq['percent_1KB'] = [f"{p:.2f}%" for p in uq['percent_1KB']]
uq['percent_2KB'] = [f"{p:.2f}%" for p in uq['percent_2KB']]

uq



Unnamed: 0,labels,1 KB,2 KBs,percent_1KB,percent_2KB
0,Therapeutic/Disease,242,820,22.79%,77.21%
1,Therapeutic/Variant,706,356,66.48%,33.52%
2,Variant/Disease,483,660,42.26%,57.74%


In [295]:
fig = px.bar(uq, x='labels', y=['1 KB','2 KBs'], title='Uniqueness of Records in MetaKB v2', color_discrete_map={'1 KB': '#063970', '2 KBs': '#2596be'},text_auto=True)
# #76b5c5, #abdbe3, #154c79, #eeeee4

fig.data[0].text = uq['percent_1KB']
fig.data[0].texttemplate = '%{text}'
fig.data[1].text = uq['percent_2KB']
fig.data[1].texttemplate = '%{text}'

fig.update_layout(xaxis_title='Type of Record',
                  yaxis_title='# of Records',
                  legend_title_text='Record Present in:'
                  )
fig.show()

pio.write_image(fig, "Paired_record_uniqueness.png", format='png', width=600, height=300, scale=5)

## Evidence Strength
TODO

In [82]:
df

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,Lung Non-small Cell Carcinoma,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr..."
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,Skin Melanoma,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr..."
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,Lung Non-small Cell Carcinoma,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,Lung Non-small Cell Carcinoma,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr..."
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,Lung Non-small Cell Carcinoma,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr..."
...,...,...,...,...,...,...,...
1138,moa.assertion:990,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.YqiDFzE2K6fcXhXkCGYHIQ75RgutPq_7,ncit:C152914,Oligodendroglioma,"(ga4gh:VA.YqiDFzE2K6fcXhXkCGYHIQ75RgutPq_7, pr..."
1139,moa.assertion:991,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,"(CombinationTherapy, moa.ctid:1TlyFhyafDojGZE0...",Non-Small Cell Lung Cancer,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr..."
1140,moa.assertion:993,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,"(CombinationTherapy, moa.ctid:9k0z3QBtBa8PgGFl...",Non-Small Cell Lung Cancer,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr..."
1141,moa.assertion:996,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,Non-Small Cell Lung Cancer,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr..."


In [51]:
def grab_evidence_strength(study_id):
    query_subject_variant = f"""MATCH (s:Study) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_STRENGTH]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Strength
    """
    result = execute_query(driver, query_subject_variant)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    subject = result[0]['Strength']['label']

    return(subject)

In [52]:
df['evidence_strength'] = None
df['evidence_strength'] = df['id'].apply(grab_evidence_strength)



Using a driver after it has been closed is deprecated. Future versions of the driver will raise an error.



In [53]:
df

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence
...,...,...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ pred...,e000002,FDA recognized evidence
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 pred...,e000002,FDA recognized evidence
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence


In [56]:
df['counts'] = None

for idx, row in df.iterrows():
    tdf = df[df['statement_full']==row['statement_full']]
    df.at[idx,'counts'] = len(tdf)

df



Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7
...,...,...,...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ pred...,e000002,FDA recognized evidence,1
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 pred...,e000002,FDA recognized evidence,1
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4


In [64]:
df['evidences_cross'] = None

for idx, row in df.iterrows():
    tdf = df[df['statement_full']==row['statement_full']]
    df.at[idx,'evidences_cross'] = list(tdf['evidence_strength'])

df


Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2,"[authoritative evidence, observational study e..."
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2,"[authoritative evidence, clinical cohort evide..."
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc..."
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7,"[authoritative evidence, clinical cohort evide..."
...,...,...,...,...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ pred...,e000002,FDA recognized evidence,1,[FDA recognized evidence]
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 pred...,e000002,FDA recognized evidence,1,[FDA recognized evidence]
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7,"[authoritative evidence, clinical cohort evide..."
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4,"[clinical cohort evidence, clinical cohort evi..."


In [69]:
duplicates = df[df['counts']>1].reset_index(drop=True)
duplicates

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2,"[authoritative evidence, observational study e..."
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2,"[authoritative evidence, clinical cohort evide..."
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc..."
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7,"[authoritative evidence, clinical cohort evide..."
...,...,...,...,...,...,...,...,...,...,...,...
220,moa.assertion:813,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5,rxcui:2049873,ncit:C3171,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5 pred...,e000002,FDA recognized evidence,3,"[clinical cohort evidence, FDA recognized evid..."
221,moa.assertion:944,The U.S. Food and Drug Administration (FDA) ap...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C53972,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000002,FDA recognized evidence,2,"[interventional study evidence, FDA recognized..."
222,moa.assertion:950,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:1721560,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4,"[preclinical evidence, interventional study ev..."
223,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7,"[authoritative evidence, clinical cohort evide..."


In [71]:
duplicates['evidences_cross'].explode()[3]

3           authoritative evidence
3           authoritative evidence
3         clinical cohort evidence
3         clinical cohort evidence
3             preclinical evidence
3          FDA recognized evidence
3    interventional study evidence
3     observational study evidence
Name: evidences_cross, dtype: object

In [75]:
from collections import Counter

def check_agreement(evidence_list):
    return len(set(evidence_list)) == 1

def get_buckets(evidence_list):
    unique_evidences = set(evidence_list)
    if len(unique_evidences) == 1:
        return {}  # Return an empty dict if all elements are the same
    else:
        return dict(Counter(evidence_list))


In [78]:
# is_agreement 
# num_of_buckets


duplicates['is_agreement'] = duplicates['evidences_cross'].apply(check_agreement)
duplicates['buckets'] = duplicates['evidences_cross'].apply(get_buckets)
duplicates

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross,is_agreement,buckets
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2,"[authoritative evidence, observational study e...",False,"{'authoritative evidence': 1, 'observational s..."
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2,"[authoritative evidence, clinical cohort evide...",False,"{'authoritative evidence': 1, 'clinical cohort..."
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7,"[authoritative evidence, clinical cohort evide...",False,"{'authoritative evidence': 1, 'clinical cohort..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,moa.assertion:813,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5,rxcui:2049873,ncit:C3171,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5 pred...,e000002,FDA recognized evidence,3,"[clinical cohort evidence, FDA recognized evid...",False,"{'clinical cohort evidence': 1, 'FDA recognize..."
221,moa.assertion:944,The U.S. Food and Drug Administration (FDA) ap...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C53972,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000002,FDA recognized evidence,2,"[interventional study evidence, FDA recognized...",False,"{'interventional study evidence': 1, 'FDA reco..."
222,moa.assertion:950,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:1721560,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4,"[preclinical evidence, interventional study ev...",False,"{'preclinical evidence': 1, 'interventional st..."
223,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7,"[authoritative evidence, clinical cohort evide...",False,"{'authoritative evidence': 1, 'clinical cohort..."


In [81]:
duplicates['is_agreement'].value_counts()

is_agreement
False    157
True      68
Name: count, dtype: int64

In [85]:
tdf = duplicates[duplicates['is_agreement']==True] # top = BRAF
tdf['statement_full'].value_counts()

statement_full
ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L predictsSensitivityTo treatment by civic.ctid:oBrlcO23adoVXv51xh-5Wigy0QyDWtfr for the disease ncit:C3224     4
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ predictsSensitivityTo treatment by rxcui:1430438 for the disease ncit:C3512                                   3
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ predictsSensitivityTo treatment by rxcui:337525 for the disease ncit:C9305                                    3
ga4gh:VA.yHQVVwZjma693Ev6lQtew1axCWVwIi8K predictsResistanceTo treatment by rxcui:2289380 for the disease ncit:C3224                                    2
ga4gh:VA.E09IUKElemW2uoVRT5qacU7RMqT8Um5m predictsSensitivityTo treatment by ncit:C165479 for the disease ncit:C9245                                    2
ga4gh:VA.CpnlaV2B8565obATF-UlE706sBYp0D6M predictsSensitivityTo treatment by rxcui:2289380 for the disease ncit:C3224                                   2
ga4gh:VA.yHQVVwZjma693Ev6lQtew1axCWVwIi8K predictsResistanceT

In [86]:
tdf = duplicates[duplicates['is_agreement']==False] # top = EGFR T790M
tdf['statement_full'].value_counts()

statement_full
ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- predictsSensitivityTo treatment by rxcui:1721560 for the disease ncit:C2926                                  8
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ predictsSensitivityTo treatment by rxcui:337525 for the disease ncit:C2926                                   7
ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L predictsSensitivityTo treatment by rxcui:1147220 for the disease ncit:C3224                                  7
ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab predictsResistanceTo treatment by rxcui:318341 for the disease ncit:C4978                                    6
ga4gh:VA.Dy7soaZQU1vH9Eb93xG_pJyhu7xTDDC9 predictsResistanceTo treatment by rxcui:282388 for the disease ncit:C3868                                    5
ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj predictsResistanceTo treatment by rxcui:6718 for the disease ncit:C3242                                      4
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ predictsSensitivityTo tre

In [91]:
# Eight pieces of evidence all use the same components but attribute to 6 different evidence strengths

tdf[tdf['statement_full']=='ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- predictsSensitivityTo treatment by rxcui:1721560 for the disease ncit:C2926']

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross,is_agreement,buckets
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
23,civic.eid:965,This phase I/II trial (NCT01802632) involved 2...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000005,clinical cohort evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
45,civic.eid:966,This study summarized 9 EGFR-mutant patients f...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000005,clinical cohort evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
132,civic.eid:963,"Cell line, xenograft, and transgenic models we...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000009,preclinical evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
190,moa.assertion:242,Osimertinib is a kinase inhibitor indicated fo...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000002,FDA recognized evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
193,moa.assertion:256,Osimertinib is being evaluated in patients who...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000006,interventional study evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
195,moa.assertion:259,Osimertinib is FDA-Approved for metastatic non...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000007,observational study evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."


## Concordance
TODO

In [203]:
def grab_source(study_id):
    query_subject_variant = f"""MATCH (s:Study) WHERE s.id = '{study_id}'
        MATCH (s)-[:IS_REPORTED_IN]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Source
    """
    result = execute_query(driver, query_subject_variant)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    try:
        subject = result[0]['Source']['title']
    except:
        subject = result[0]['Source']['id']
    return(subject)

In [204]:
df['source'] = None
df['source'] = df['id'].apply(grab_source)



Using a driver after it has been closed is deprecated. Future versions of the driver will raise an error.



IndexError: list index out of range

In [99]:
df

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross,source
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2,"[authoritative evidence, observational study e...",EGFR T790M resistance mutation in non small-ce...
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2,"[authoritative evidence, clinical cohort evide...",Improved survival with vemurafenib in melanoma...
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",Osimertinib: First Global Approval.
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",Osimertinib or Platinum-Pemetrexed in EGFR T79...
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7,"[authoritative evidence, clinical cohort evide...",U.S. Food and Drug Administration approval sum...
...,...,...,...,...,...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ pred...,e000002,FDA recognized evidence,1,[FDA recognized evidence],"Takeda Pharmaceuticals America, Inc. Iclusig (..."
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 pred...,e000002,FDA recognized evidence,1,[FDA recognized evidence],"Day One Biopharmaceuticals, Inc. Ojemda (tovor..."
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7,"[authoritative evidence, clinical cohort evide...","OSI Pharmaceuticals, LLC. Tarceva (erlotinib) ..."
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4,"[clinical cohort evidence, clinical cohort evi...",Astrazeneca Pharmaceuticals LP. Iressa (gefiti...


In [100]:
df['source'].value_counts()

source
Effects of KRAS, BRAF, NRAS, and PIK3CA mutations on the efficacy of cetuximab plus chemotherapy in chemotherapy-refractory metastatic colorectal cancer: a retrospective consortium analysis.                                                                                                                                            31
Negative feedback-defective PRPS1 mutants drive thiopurine resistance in relapsed childhood ALL.                                                                                                                                                                                                                                          26
Referenced with permission from the NCCN Clinical Practice Guidelines in Oncology (NCCN Guidelines®) for Chronic Myelogenous Leukemia V.2.2016. © National Comprehensive Cancer Network, Inc. 2016. All rights reserved. Accessed August 9 2016. To view the most recent and complete version of the guideline, go online to NCCN.org. 

In [101]:
duplicates = df[df['counts']>1].reset_index(drop=True)
duplicates

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross,source
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2,"[authoritative evidence, observational study e...",EGFR T790M resistance mutation in non small-ce...
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2,"[authoritative evidence, clinical cohort evide...",Improved survival with vemurafenib in melanoma...
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",Osimertinib: First Global Approval.
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",Osimertinib or Platinum-Pemetrexed in EGFR T79...
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7,"[authoritative evidence, clinical cohort evide...",U.S. Food and Drug Administration approval sum...
...,...,...,...,...,...,...,...,...,...,...,...,...
220,moa.assertion:813,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5,rxcui:2049873,ncit:C3171,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5 pred...,e000002,FDA recognized evidence,3,"[clinical cohort evidence, FDA recognized evid...",Servier Pharmaceuticals LLC. Tibsovo (ivosiden...
221,moa.assertion:944,The U.S. Food and Drug Administration (FDA) ap...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C53972,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000002,FDA recognized evidence,2,"[interventional study evidence, FDA recognized...","Genentech, Inc. Zelboraf (vemurafenib) [packag..."
222,moa.assertion:950,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:1721560,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4,"[preclinical evidence, interventional study ev...","AstraZeneca Pharmaceuticals, LP. Tagrisso (osi..."
223,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7,"[authoritative evidence, clinical cohort evide...","OSI Pharmaceuticals, LLC. Tarceva (erlotinib) ..."


In [102]:
duplicates['source'].value_counts()

source
PIK3CA mutations in colorectal cancer are associated with clinical resistance to EGFR-targeted monoclonal antibodies.                                                                                                                                    6
Effects of KRAS, BRAF, NRAS, and PIK3CA mutations on the efficacy of cetuximab plus chemotherapy in chemotherapy-refractory metastatic colorectal cancer: a retrospective consortium analysis.                                                           5
Novartis Pharmaceuticals Corporation. Tafinlar (dabrafenib) [package insert]. U.S. Food and Drug Administration website. https://www.accessdata.fda.gov/drugsatfda_docs/label/2020/202806s015lbl.pdf. Revised April 2020. Accessed November 12, 2020.    5
Activation of N-ras and K-ras induced by interleukin-6 in a myeloma cell line: implications for disease progression and therapeutic response.                                                                                                   

In [108]:
duplicates[duplicates['source']=='Reduction of serum IGF-I levels in patients affected with Monoclonal Gammopathies of undetermined significance or Multiple Myeloma. Comparison with bFGF, VEGF and K-ras gene mutation.']

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross,source
64,civic.eid:2009,In a study of patients receiving melphalan-bas...,predictsResistanceTo,ga4gh:VA.6_uW58_HmIcOOQshfkvhuGjCSFL5H3fs,rxcui:6718,ncit:C3242,ga4gh:VA.6_uW58_HmIcOOQshfkvhuGjCSFL5H3fs pred...,e000005,clinical cohort evidence,4,"[clinical cohort evidence, preclinical evidenc...",Reduction of serum IGF-I levels in patients af...
65,civic.eid:2247,In a study of patients receiving melphalan-bas...,predictsResistanceTo,ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj,rxcui:6718,ncit:C3242,ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj pred...,e000005,clinical cohort evidence,4,"[clinical cohort evidence, preclinical evidenc...",Reduction of serum IGF-I levels in patients af...
66,civic.eid:2258,In a study of patients receiving melphalan-bas...,predictsResistanceTo,ga4gh:VA.udBCHwlrf8xNiRy_19bLi-h5LhnZLgCt,rxcui:6718,ncit:C3242,ga4gh:VA.udBCHwlrf8xNiRy_19bLi-h5LhnZLgCt pred...,e000005,clinical cohort evidence,4,"[clinical cohort evidence, preclinical evidenc...",Reduction of serum IGF-I levels in patients af...
67,civic.eid:2274,In a study of patients receiving melphalan-bas...,predictsResistanceTo,ga4gh:VA.ZqmNu5AN2PRYFl0K9eBQ_bo2pJAGKSoa,rxcui:6718,ncit:C3242,ga4gh:VA.ZqmNu5AN2PRYFl0K9eBQ_bo2pJAGKSoa pred...,e000005,clinical cohort evidence,4,"[clinical cohort evidence, preclinical evidenc...",Reduction of serum IGF-I levels in patients af...


In [109]:
#   for group of source duplicates
#       if moa exists AND civic exists
#           check for duplication / evidence strength level
#  
#   for group of source duplicates
#       grab group of variants (subject)
#           check for single record vs multiple records
#           if more than 1 record
#               get length of records