In [1]:
from neo4j import GraphDatabase
import pandas as pd
import requests
from tqdm import tqdm
import re

In [24]:
# Function to create a connection to the Neo4j database
def create_db_connection(uri, user, password):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    return driver

# Function to execute a Cypher query
def execute_query(driver, query):
    with driver.session() as session:
        result = session.run(query)
        return [record for record in result]

# Connect to the Neo4j database
uri = "bolt://localhost:7687"
user = "neo4j"
password = "password"  # Replace 'your_password' with your actual password
driver = create_db_connection(uri, user, password)


query_studies = """MATCH (s:Study)
    RETURN properties(s) AS Study
"""

# Execute the query
result = execute_query(driver, query_studies)

# Close the connection
driver.close()


In [4]:
result[0]['Study']

{'alleleOrigin': 'somatic',
 'id': 'civic.eid:238',
 'description': 'The T790M mutation in EGFR has been shown to confer resistance to the tyrosine kinase inhibitor erlotinib, and patients harboring this mutation that are placed on the drug are likely to relapse.',
 'direction': 'supports',
 'predicate': 'predictsResistanceTo',
 'type': 'VariantTherapeuticResponseStudy'}

In [25]:
data = []
for record in result:
    study = record.get('Study', {})
    row = {
        'id': study.get('id', None),
        'description': study.get('description', None),
        'predicate': study.get('predicate', None)
        # 'id': variation.get('id', None),
        # 'expression_hgvs_c': variation.get('expression_hgvs_c', None),
        # 'expression_hgvs_p': variation.get('expression_hgvs_p', None),
        # 'digest': variation.get('digest', None),
        # 'label': variation.get('label', None),
        # 'type': variation.get('type', None),
        # 'expression_hgvs_g': variation.get('expression_hgvs_g', None),
    }
    data.append(row)

df = pd.DataFrame(data)

df[0:5]

Unnamed: 0,id,description,predicate
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo


In [26]:
def grab_subject_variant(study_id):
    query_subject_variant = f"""MATCH (s:Study) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_VARIANT]-(c)
        MATCH (c)-[:HAS_DEFINING_CONTEXT]-(v)
        RETURN properties(s) AS Study,
                properties(c) AS Variation,
                properties(v) AS Context
    """
    result = execute_query(driver, query_subject_variant)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    subject = result[0]['Context']['id']

    return(subject)

def grab_object_therapeutic(study_id): 
    query = f"""MATCH (s:Study) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_THERAPEUTIC]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Therapeutic
    """
    result = execute_query(driver, query)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    try:
        therapeutic = result[0]['Therapeutic']['therapy_normalizer_id']
    except: # TODO: This is gross, handle it better
        try:
            therapeutic = result[0]['Therapeutic']['label']
        except:
            therapeutic = result[0]['Therapeutic']['id']
    return(therapeutic)

def grab_qualifier_disease(study_id): 
    query = f"""MATCH (s:Study) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_TUMOR_TYPE]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Disease
    """
    result = execute_query(driver, query)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    
    try:
        disease = result[0]['Disease']['disease_normalizer_id']
    except: # TODO: This is gross, handle it better
        try:
            disease = result[0]['Disease']['label']
        except:
            disease = result[0]['Disease']['id']


    return(disease)


In [27]:
df['subject'] = None
df['subject'] = df['id'].apply(grab_subject_variant)

df['obj'] = None
df['obj'] = df['id'].apply(grab_object_therapeutic)

df['qualifier'] = None
df['qualifier'] = df['id'].apply(grab_qualifier_disease)

  with driver.session() as session:
  with driver.session() as session:
  with driver.session() as session:


In [8]:
df

Unnamed: 0,id,description,predicate,subject,obj,qualifier
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926
...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926


In [32]:
df['statement_full'] = None
for idx, row in df.iterrows():
    sentence = f"{row['subject']} {row['predicate']} treatment by {row['obj']} for the disease {row['qualifier']}"
    df.at[idx, 'statement_full'] = sentence
    
df

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...
...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ pred...
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 pred...
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...


In [10]:
print(f'Total Evidence Statements: {len(df)}')

Total Evidence Statements: 1042


In [33]:
data = df['statement_full'].value_counts().rename_axis('Statement').reset_index(name='counts')
data

Unnamed: 0,Statement,counts
0,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,8
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,7
2,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,7
3,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab pred...,6
4,ga4gh:VA.Dy7soaZQU1vH9Eb93xG_pJyhu7xTDDC9 pred...,5
...,...,...
898,ga4gh:VA.mecsBfk2t315ZcdZCTD7TTRVezaXskCy pred...,1
899,ga4gh:VA.e0t-Kq4iX8IsDH1F0zj6xClkCKCJpwwk pred...,1
900,ga4gh:VA.RWiEzkpxrOKfQHfoE6T5CEpJPgqgA_YB pred...,1
901,ga4gh:VA.0X2BApoy4FSxtnvy5az1bXu8YMs8E__x pred...,1


In [34]:
df[df['subject']=='ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L'].reset_index(drop=True)

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full
0,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...
1,civic.eid:9851,The open-label phase 3 BEACON CRC trial includ...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,civic.ctid:P1PY89shAjemg7jquQ0V9pg1VnYnkPeK,ncit:C4978,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...
2,civic.eid:3017,Patients with BRAF V600E-mutant NSCLC (n=57) w...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,civic.ctid:oBrlcO23adoVXv51xh-5Wigy0QyDWtfr,ncit:C2926,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...
3,civic.eid:95,Dabrafenib with trametinib provides higher res...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,civic.ctid:oBrlcO23adoVXv51xh-5Wigy0QyDWtfr,ncit:C3224,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...
4,civic.eid:1398,The BRIM-3 Phase III trial NCT01006980 assesse...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3224,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...
...,...,...,...,...,...,...,...
96,moa.assertion:182,Cetuximab (Erbitux) is not recommended by the ...,predictsResistanceTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:318341,ncit:C5105,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...
97,moa.assertion:184,Vemurafenib alone had insufficient activity in...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,moa.ctid:fDe-2sEQxMUyi876GGxkoeGnzFayuyFo,ncit:C5105,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...
98,moa.assertion:865,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,moa.ctid:IAMn2xc8QDzi0_zvPzrGlronyv01Coqs,oncotree:LGGNOS,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...
99,moa.assertion:911,The U.S. Food and Drug Administration (FDA) ap...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,moa.ctid:PGEv4QIKz-vDJF0eyrAfZYTl1fFtDNok,ncit:C2926,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...


### Data Graph

In [17]:
data[0:86]

Unnamed: 0,Statement,counts
0,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,8
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,7
2,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,7
3,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab pred...,6
4,ga4gh:VA.Dy7soaZQU1vH9Eb93xG_pJyhu7xTDDC9 pred...,5
...,...,...
81,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,2
82,ga4gh:VA.yHQVVwZjma693Ev6lQtew1axCWVwIi8K pred...,2
83,ga4gh:VA.udBCHwlrf8xNiRy_19bLi-h5LhnZLgCt pred...,2
84,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,2


In [18]:
data['counts'][0:86].sum()

225

In [37]:
data['Statement'][0].split(' ')

['ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-',
 'predictsSensitivityTo',
 'treatment',
 'by',
 'rxcui:1721560',
 'for',
 'the',
 'disease',
 'ncit:C2926']

In [38]:
data['variant'] = None
data['predicate'] = None
data['therapeutic'] = None
data['disease'] = None


for idx, row in data.iterrows():
    terms = row['Statement'].split(' ')
    data.at[idx, 'variant'] = terms[0]
    data.at[idx, 'predicate'] = terms[1]
    data.at[idx, 'therapeutic'] = terms[4]
    data.at[idx, 'disease'] = terms[8]

data 

Unnamed: 0,Statement,counts,variant,predicate,therapeutic,disease
0,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,8,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,predictsSensitivityTo,rxcui:1721560,ncit:C2926
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,7,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,predictsSensitivityTo,rxcui:337525,ncit:C2926
2,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,7,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,predictsSensitivityTo,rxcui:1147220,ncit:C3224
3,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab pred...,6,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab,predictsResistanceTo,rxcui:318341,ncit:C4978
4,ga4gh:VA.Dy7soaZQU1vH9Eb93xG_pJyhu7xTDDC9 pred...,5,ga4gh:VA.Dy7soaZQU1vH9Eb93xG_pJyhu7xTDDC9,predictsResistanceTo,rxcui:282388,ncit:C3868
...,...,...,...,...,...,...
898,ga4gh:VA.mecsBfk2t315ZcdZCTD7TTRVezaXskCy pred...,1,ga4gh:VA.mecsBfk2t315ZcdZCTD7TTRVezaXskCy,predictsSensitivityTo,rxcui:337525,ncit:C2926
899,ga4gh:VA.e0t-Kq4iX8IsDH1F0zj6xClkCKCJpwwk pred...,1,ga4gh:VA.e0t-Kq4iX8IsDH1F0zj6xClkCKCJpwwk,predictsSensitivityTo,rxcui:337525,ncit:C2926
900,ga4gh:VA.RWiEzkpxrOKfQHfoE6T5CEpJPgqgA_YB pred...,1,ga4gh:VA.RWiEzkpxrOKfQHfoE6T5CEpJPgqgA_YB,predictsSensitivityTo,rxcui:337525,ncit:C2926
901,ga4gh:VA.0X2BApoy4FSxtnvy5az1bXu8YMs8E__x pred...,1,ga4gh:VA.0X2BApoy4FSxtnvy5az1bXu8YMs8E__x,predictsResistanceTo,rxcui:337525,ncit:C2926


In [39]:
def duplicate_detect(count):
    if count > 1:
        return('Duplicate')
    else:
        return('Not Duplicate')
    
data['is_duplicate'] = None
data['is_duplicate'] = data['counts'].apply(duplicate_detect)

In [40]:
import plotly
import plotly.express as px
import plotly.io as pio

fig = px.bar(data[0:86], x='Statement', y='counts', color='predicate', title='Overlap of Full Evidence Statements in MetaKB v2')
fig.update_layout(yaxis_title='# Overlapping Statements')
fig.show()
# pio.write_image(fig, "Full_Evidence_Statement_overlap_bar.png", format='png', width=1200, height=400, scale=5)

In [None]:
# TODO: Tuples/Triplets of different components of the full evidence statement for more overlap
# fig = px.bar(data[0:86], x=['Variant','Therapeutic'], y='counts', color='predicate', title='Overlap of Full Evidence Statements in MetaKB v2')
# fig.update_layout(yaxis_title='# Overlapping Statements') 
# fig.show()

In [41]:
fig = px.pie(data, values='counts', names='is_duplicate', title='% of Identical Evidence Statements in MetaKB v2')
fig.update_traces(textinfo='percent+label')
fig.show()
# pio.write_image(fig, "Duplicate_statements_metakbv2_pie.png", format='png', width=800, height=800, scale=5)

In [None]:
# TODO: Loop in evidence level, coding table (HAS_STRENGTH) ---> https://docs.google.com/spreadsheets/d/1FpUmoXmDLVXsNgqog6A9q6o0jIOi_760j02rxIQGFMo/edit?gid=0#gid=0
# Check to see if regulatory approval exists in the extensions, does it have specific indications? If the disease occurs outside the indication, it gets lowered to a different level. 

## Evidence Level

In [43]:
df

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...
...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ pred...
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 pred...
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...


In [51]:
def grab_evidence_strength(study_id):
    query_subject_variant = f"""MATCH (s:Study) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_STRENGTH]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Strength
    """
    result = execute_query(driver, query_subject_variant)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    subject = result[0]['Strength']['label']

    return(subject)

In [52]:
df['evidence_strength'] = None
df['evidence_strength'] = df['id'].apply(grab_evidence_strength)



Using a driver after it has been closed is deprecated. Future versions of the driver will raise an error.



In [53]:
df

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence
...,...,...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ pred...,e000002,FDA recognized evidence
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 pred...,e000002,FDA recognized evidence
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence


In [56]:
df['counts'] = None

for idx, row in df.iterrows():
    tdf = df[df['statement_full']==row['statement_full']]
    df.at[idx,'counts'] = len(tdf)

df



Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7
...,...,...,...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ pred...,e000002,FDA recognized evidence,1
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 pred...,e000002,FDA recognized evidence,1
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4


In [64]:
df['evidences_cross'] = None

for idx, row in df.iterrows():
    tdf = df[df['statement_full']==row['statement_full']]
    df.at[idx,'evidences_cross'] = list(tdf['evidence_strength'])

df


Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2,"[authoritative evidence, observational study e..."
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2,"[authoritative evidence, clinical cohort evide..."
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc..."
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7,"[authoritative evidence, clinical cohort evide..."
...,...,...,...,...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ pred...,e000002,FDA recognized evidence,1,[FDA recognized evidence]
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 pred...,e000002,FDA recognized evidence,1,[FDA recognized evidence]
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7,"[authoritative evidence, clinical cohort evide..."
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4,"[clinical cohort evidence, clinical cohort evi..."


In [69]:
duplicates = df[df['counts']>1].reset_index(drop=True)
duplicates

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2,"[authoritative evidence, observational study e..."
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2,"[authoritative evidence, clinical cohort evide..."
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc..."
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7,"[authoritative evidence, clinical cohort evide..."
...,...,...,...,...,...,...,...,...,...,...,...
220,moa.assertion:813,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5,rxcui:2049873,ncit:C3171,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5 pred...,e000002,FDA recognized evidence,3,"[clinical cohort evidence, FDA recognized evid..."
221,moa.assertion:944,The U.S. Food and Drug Administration (FDA) ap...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C53972,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000002,FDA recognized evidence,2,"[interventional study evidence, FDA recognized..."
222,moa.assertion:950,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:1721560,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4,"[preclinical evidence, interventional study ev..."
223,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7,"[authoritative evidence, clinical cohort evide..."


In [71]:
duplicates['evidences_cross'].explode()[3]

3           authoritative evidence
3           authoritative evidence
3         clinical cohort evidence
3         clinical cohort evidence
3             preclinical evidence
3          FDA recognized evidence
3    interventional study evidence
3     observational study evidence
Name: evidences_cross, dtype: object

In [75]:
from collections import Counter

def check_agreement(evidence_list):
    return len(set(evidence_list)) == 1

def get_buckets(evidence_list):
    unique_evidences = set(evidence_list)
    if len(unique_evidences) == 1:
        return {}  # Return an empty dict if all elements are the same
    else:
        return dict(Counter(evidence_list))


In [78]:
# is_agreement 
# num_of_buckets


duplicates['is_agreement'] = duplicates['evidences_cross'].apply(check_agreement)
duplicates['buckets'] = duplicates['evidences_cross'].apply(get_buckets)
duplicates

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross,is_agreement,buckets
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2,"[authoritative evidence, observational study e...",False,"{'authoritative evidence': 1, 'observational s..."
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2,"[authoritative evidence, clinical cohort evide...",False,"{'authoritative evidence': 1, 'clinical cohort..."
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7,"[authoritative evidence, clinical cohort evide...",False,"{'authoritative evidence': 1, 'clinical cohort..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,moa.assertion:813,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5,rxcui:2049873,ncit:C3171,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5 pred...,e000002,FDA recognized evidence,3,"[clinical cohort evidence, FDA recognized evid...",False,"{'clinical cohort evidence': 1, 'FDA recognize..."
221,moa.assertion:944,The U.S. Food and Drug Administration (FDA) ap...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C53972,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000002,FDA recognized evidence,2,"[interventional study evidence, FDA recognized...",False,"{'interventional study evidence': 1, 'FDA reco..."
222,moa.assertion:950,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:1721560,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4,"[preclinical evidence, interventional study ev...",False,"{'preclinical evidence': 1, 'interventional st..."
223,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7,"[authoritative evidence, clinical cohort evide...",False,"{'authoritative evidence': 1, 'clinical cohort..."


In [81]:
duplicates['is_agreement'].value_counts()

is_agreement
False    157
True      68
Name: count, dtype: int64

In [85]:
tdf = duplicates[duplicates['is_agreement']==True] # top = BRAF
tdf['statement_full'].value_counts()

statement_full
ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L predictsSensitivityTo treatment by civic.ctid:oBrlcO23adoVXv51xh-5Wigy0QyDWtfr for the disease ncit:C3224     4
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ predictsSensitivityTo treatment by rxcui:1430438 for the disease ncit:C3512                                   3
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ predictsSensitivityTo treatment by rxcui:337525 for the disease ncit:C9305                                    3
ga4gh:VA.yHQVVwZjma693Ev6lQtew1axCWVwIi8K predictsResistanceTo treatment by rxcui:2289380 for the disease ncit:C3224                                    2
ga4gh:VA.E09IUKElemW2uoVRT5qacU7RMqT8Um5m predictsSensitivityTo treatment by ncit:C165479 for the disease ncit:C9245                                    2
ga4gh:VA.CpnlaV2B8565obATF-UlE706sBYp0D6M predictsSensitivityTo treatment by rxcui:2289380 for the disease ncit:C3224                                   2
ga4gh:VA.yHQVVwZjma693Ev6lQtew1axCWVwIi8K predictsResistanceT

In [86]:
tdf = duplicates[duplicates['is_agreement']==False] # top = EGFR T790M
tdf['statement_full'].value_counts()

statement_full
ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- predictsSensitivityTo treatment by rxcui:1721560 for the disease ncit:C2926                                  8
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ predictsSensitivityTo treatment by rxcui:337525 for the disease ncit:C2926                                   7
ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L predictsSensitivityTo treatment by rxcui:1147220 for the disease ncit:C3224                                  7
ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab predictsResistanceTo treatment by rxcui:318341 for the disease ncit:C4978                                    6
ga4gh:VA.Dy7soaZQU1vH9Eb93xG_pJyhu7xTDDC9 predictsResistanceTo treatment by rxcui:282388 for the disease ncit:C3868                                    5
ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj predictsResistanceTo treatment by rxcui:6718 for the disease ncit:C3242                                      4
ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ predictsSensitivityTo tre

In [91]:
# Eight pieces of evidence all use the same components but attribute to 6 different evidence strengths

tdf[tdf['statement_full']=='ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- predictsSensitivityTo treatment by rxcui:1721560 for the disease ncit:C2926']

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross,is_agreement,buckets
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
23,civic.eid:965,This phase I/II trial (NCT01802632) involved 2...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000005,clinical cohort evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
45,civic.eid:966,This study summarized 9 EGFR-mutant patients f...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000005,clinical cohort evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
132,civic.eid:963,"Cell line, xenograft, and transgenic models we...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000009,preclinical evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
190,moa.assertion:242,Osimertinib is a kinase inhibitor indicated fo...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000002,FDA recognized evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
193,moa.assertion:256,Osimertinib is being evaluated in patients who...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000006,interventional study evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."
195,moa.assertion:259,Osimertinib is FDA-Approved for metastatic non...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000007,observational study evidence,8,"[authoritative evidence, authoritative evidenc...",False,"{'authoritative evidence': 2, 'clinical cohort..."


## Concordance

In [92]:
df['triplet'] = None

for idx, row in df.iterrows():
    df.at[idx, 'triplet'] = 

    pass

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2,"[authoritative evidence, observational study e..."
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2,"[authoritative evidence, clinical cohort evide..."
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc..."
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc..."
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7,"[authoritative evidence, clinical cohort evide..."
...,...,...,...,...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ pred...,e000002,FDA recognized evidence,1,[FDA recognized evidence]
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 pred...,e000002,FDA recognized evidence,1,[FDA recognized evidence]
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7,"[authoritative evidence, clinical cohort evide..."
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4,"[clinical cohort evidence, clinical cohort evi..."


In [97]:
def grab_source(study_id):
    query_subject_variant = f"""MATCH (s:Study) WHERE s.id = '{study_id}'
        MATCH (s)-[:IS_REPORTED_IN]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Source
    """
    result = execute_query(driver, query_subject_variant)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    try:
        subject = result[0]['Source']['title']
    except:
        subject = result[0]['Source']['id']
    return(subject)

In [98]:
df['source'] = None
df['source'] = df['id'].apply(grab_source)



Using a driver after it has been closed is deprecated. Future versions of the driver will raise an error.



In [99]:
df

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross,source
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2,"[authoritative evidence, observational study e...",EGFR T790M resistance mutation in non small-ce...
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2,"[authoritative evidence, clinical cohort evide...",Improved survival with vemurafenib in melanoma...
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",Osimertinib: First Global Approval.
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",Osimertinib or Platinum-Pemetrexed in EGFR T79...
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7,"[authoritative evidence, clinical cohort evide...",U.S. Food and Drug Administration approval sum...
...,...,...,...,...,...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ pred...,e000002,FDA recognized evidence,1,[FDA recognized evidence],"Takeda Pharmaceuticals America, Inc. Iclusig (..."
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 pred...,e000002,FDA recognized evidence,1,[FDA recognized evidence],"Day One Biopharmaceuticals, Inc. Ojemda (tovor..."
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7,"[authoritative evidence, clinical cohort evide...","OSI Pharmaceuticals, LLC. Tarceva (erlotinib) ..."
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4,"[clinical cohort evidence, clinical cohort evi...",Astrazeneca Pharmaceuticals LP. Iressa (gefiti...


In [100]:
df['source'].value_counts()

source
Effects of KRAS, BRAF, NRAS, and PIK3CA mutations on the efficacy of cetuximab plus chemotherapy in chemotherapy-refractory metastatic colorectal cancer: a retrospective consortium analysis.                                                                                                                                            31
Negative feedback-defective PRPS1 mutants drive thiopurine resistance in relapsed childhood ALL.                                                                                                                                                                                                                                          26
Referenced with permission from the NCCN Clinical Practice Guidelines in Oncology (NCCN Guidelines®) for Chronic Myelogenous Leukemia V.2.2016. © National Comprehensive Cancer Network, Inc. 2016. All rights reserved. Accessed August 9 2016. To view the most recent and complete version of the guideline, go online to NCCN.org. 

In [101]:
duplicates = df[df['counts']>1].reset_index(drop=True)
duplicates

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross,source
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,2,"[authoritative evidence, observational study e...",EGFR T790M resistance mutation in non small-ce...
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000001,authoritative evidence,2,"[authoritative evidence, clinical cohort evide...",Improved survival with vemurafenib in melanoma...
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",Osimertinib: First Global Approval.
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- pred...,e000001,authoritative evidence,8,"[authoritative evidence, authoritative evidenc...",Osimertinib or Platinum-Pemetrexed in EGFR T79...
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000001,authoritative evidence,7,"[authoritative evidence, clinical cohort evide...",U.S. Food and Drug Administration approval sum...
...,...,...,...,...,...,...,...,...,...,...,...,...
220,moa.assertion:813,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5,rxcui:2049873,ncit:C3171,ga4gh:VA.lAST62ntkEaXy6RDDOF1V4C3jd-jaAJ5 pred...,e000002,FDA recognized evidence,3,"[clinical cohort evidence, FDA recognized evid...",Servier Pharmaceuticals LLC. Tibsovo (ivosiden...
221,moa.assertion:944,The U.S. Food and Drug Administration (FDA) ap...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C53972,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L pred...,e000002,FDA recognized evidence,2,"[interventional study evidence, FDA recognized...","Genentech, Inc. Zelboraf (vemurafenib) [packag..."
222,moa.assertion:950,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:1721560,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,4,"[preclinical evidence, interventional study ev...","AstraZeneca Pharmaceuticals, LP. Tagrisso (osi..."
223,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ pred...,e000002,FDA recognized evidence,7,"[authoritative evidence, clinical cohort evide...","OSI Pharmaceuticals, LLC. Tarceva (erlotinib) ..."


In [102]:
duplicates['source'].value_counts()

source
PIK3CA mutations in colorectal cancer are associated with clinical resistance to EGFR-targeted monoclonal antibodies.                                                                                                                                    6
Effects of KRAS, BRAF, NRAS, and PIK3CA mutations on the efficacy of cetuximab plus chemotherapy in chemotherapy-refractory metastatic colorectal cancer: a retrospective consortium analysis.                                                           5
Novartis Pharmaceuticals Corporation. Tafinlar (dabrafenib) [package insert]. U.S. Food and Drug Administration website. https://www.accessdata.fda.gov/drugsatfda_docs/label/2020/202806s015lbl.pdf. Revised April 2020. Accessed November 12, 2020.    5
Activation of N-ras and K-ras induced by interleukin-6 in a myeloma cell line: implications for disease progression and therapeutic response.                                                                                                   

In [108]:
duplicates[duplicates['source']=='Reduction of serum IGF-I levels in patients affected with Monoclonal Gammopathies of undetermined significance or Multiple Myeloma. Comparison with bFGF, VEGF and K-ras gene mutation.']

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full,evidence_strength_code,evidence_strength,counts,evidences_cross,source
64,civic.eid:2009,In a study of patients receiving melphalan-bas...,predictsResistanceTo,ga4gh:VA.6_uW58_HmIcOOQshfkvhuGjCSFL5H3fs,rxcui:6718,ncit:C3242,ga4gh:VA.6_uW58_HmIcOOQshfkvhuGjCSFL5H3fs pred...,e000005,clinical cohort evidence,4,"[clinical cohort evidence, preclinical evidenc...",Reduction of serum IGF-I levels in patients af...
65,civic.eid:2247,In a study of patients receiving melphalan-bas...,predictsResistanceTo,ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj,rxcui:6718,ncit:C3242,ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj pred...,e000005,clinical cohort evidence,4,"[clinical cohort evidence, preclinical evidenc...",Reduction of serum IGF-I levels in patients af...
66,civic.eid:2258,In a study of patients receiving melphalan-bas...,predictsResistanceTo,ga4gh:VA.udBCHwlrf8xNiRy_19bLi-h5LhnZLgCt,rxcui:6718,ncit:C3242,ga4gh:VA.udBCHwlrf8xNiRy_19bLi-h5LhnZLgCt pred...,e000005,clinical cohort evidence,4,"[clinical cohort evidence, preclinical evidenc...",Reduction of serum IGF-I levels in patients af...
67,civic.eid:2274,In a study of patients receiving melphalan-bas...,predictsResistanceTo,ga4gh:VA.ZqmNu5AN2PRYFl0K9eBQ_bo2pJAGKSoa,rxcui:6718,ncit:C3242,ga4gh:VA.ZqmNu5AN2PRYFl0K9eBQ_bo2pJAGKSoa pred...,e000005,clinical cohort evidence,4,"[clinical cohort evidence, preclinical evidenc...",Reduction of serum IGF-I levels in patients af...


In [109]:
#   for group of source duplicates
#       if moa exists AND civic exists
#           check for duplication / evidence strength level
#  
#   for group of source duplicates
#       grab group of variants (subject)
#           check for single record vs multiple records
#           if more than 1 record
#               get length of records