In [3]:
from neo4j import GraphDatabase
import pandas as pd
import requests
from tqdm import tqdm
import re

In [10]:
# Function to create a connection to the Neo4j database
def create_db_connection(uri, user, password):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    return driver

# Function to execute a Cypher query
def execute_query(driver, query):
    with driver.session() as session:
        result = session.run(query)
        return [record for record in result]

# Connect to the Neo4j database
uri = "bolt://localhost:7687"
user = "neo4j"
password = "password"  # Replace 'your_password' with your actual password
driver = create_db_connection(uri, user, password)


query_studies = """MATCH (s:Study)
    RETURN properties(s) AS Study
"""

# Execute the query
result = execute_query(driver, query_studies)

# Close the connection
driver.close()


In [8]:
result[0]['Study']

{'alleleOrigin': 'somatic',
 'id': 'civic.eid:238',
 'description': 'The T790M mutation in EGFR has been shown to confer resistance to the tyrosine kinase inhibitor erlotinib, and patients harboring this mutation that are placed on the drug are likely to relapse.',
 'direction': 'supports',
 'predicate': 'predictsResistanceTo',
 'type': 'VariantTherapeuticResponseStudy'}

In [11]:
data = []
for record in result:
    study = record.get('Study', {})
    row = {
        'id': study.get('id', None),
        'description': study.get('description', None),
        'predicate': study.get('predicate', None)
        # 'id': variation.get('id', None),
        # 'expression_hgvs_c': variation.get('expression_hgvs_c', None),
        # 'expression_hgvs_p': variation.get('expression_hgvs_p', None),
        # 'digest': variation.get('digest', None),
        # 'label': variation.get('label', None),
        # 'type': variation.get('type', None),
        # 'expression_hgvs_g': variation.get('expression_hgvs_g', None),
    }
    data.append(row)

df = pd.DataFrame(data)

df[0:5]

Unnamed: 0,id,description,predicate
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo


In [55]:
def grab_subject_variant(study_id):
    query_subject_variant = f"""MATCH (s:Study) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_VARIANT]-(c)
        MATCH (c)-[:HAS_DEFINING_CONTEXT]-(v)
        RETURN properties(s) AS Study,
                properties(c) AS Variation,
                properties(v) AS Context
    """
    result = execute_query(driver, query_subject_variant)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    subject = result[0]['Context']['id']

    return(subject)

def grab_object_therapeutic(study_id): 
    query = f"""MATCH (s:Study) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_THERAPEUTIC]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Therapeutic
    """
    result = execute_query(driver, query)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    try:
        therapeutic = result[0]['Therapeutic']['therapy_normalizer_id']
    except: # TODO: This is gross, handle it better
        try:
            therapeutic = result[0]['Therapeutic']['label']
        except:
            therapeutic = result[0]['Therapeutic']['id']
    return(therapeutic)

def grab_qualifier_disease(study_id): 
    query = f"""MATCH (s:Study) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_TUMOR_TYPE]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Disease
    """
    result = execute_query(driver, query)
    driver.close()
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')
    
    try:
        disease = result[0]['Disease']['disease_normalizer_id']
    except: # TODO: This is gross, handle it better
        try:
            disease = result[0]['Disease']['label']
        except:
            disease = result[0]['Disease']['id']


    return(disease)


In [56]:
df['subject'] = None
df['subject'] = df['id'].apply(grab_subject_variant)

df['obj'] = None
df['obj'] = df['id'].apply(grab_object_therapeutic)

df['qualifier'] = None
df['qualifier'] = df['id'].apply(grab_qualifier_disease)

  with driver.session() as session:


In [58]:
df

Unnamed: 0,id,description,predicate,subject,obj,qualifier
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926
...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926


In [61]:
df['statement_full'] = None
for idx, row in df.iterrows():
    sentence = f"{row['subject']} with treatment by {row['obj']} {row['predicate']} the disease {row['qualifier']}"
    df.at[idx, 'statement_full'] = sentence
    
df

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full
0,civic.eid:238,The T790M mutation in EGFR has been shown to c...,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:337525,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...
1,civic.eid:1409,Phase 3 randomized clinical trial comparing ve...,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,ncit:C3510,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L with...
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...
4,civic.eid:2994,"On May 14, 2013, the U.S. Food and Drug Admini...",predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ with...
...,...,...,...,...,...,...,...
1037,moa.assertion:961,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,rxcui:1364347,ncit:C3174,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ with...
1038,moa.assertion:963,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8,ncit:C106254,oncotree:LGGNOS,ga4gh:VA.pfWn9x9oFBRzGda1xXcOrE-BrX0R__N8 with...
1039,moa.assertion:967,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ with...
1040,moa.assertion:969,The U.S. Food and Drug Administration (FDA) gr...,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:328134,ncit:C2926,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ with...


In [63]:
print(f'Total Evidence Statements: {len(df)}')

Total Evidence Statements: 1042


In [68]:
data = df['statement_full'].value_counts().rename_axis('Statement').reset_index(name='counts')
data

Unnamed: 0,Statement,counts
0,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...,8
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ with...,7
2,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L with...,7
3,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab with...,6
4,ga4gh:VA.Dy7soaZQU1vH9Eb93xG_pJyhu7xTDDC9 with...,5
...,...,...
898,ga4gh:VA.mecsBfk2t315ZcdZCTD7TTRVezaXskCy with...,1
899,ga4gh:VA.e0t-Kq4iX8IsDH1F0zj6xClkCKCJpwwk with...,1
900,ga4gh:VA.RWiEzkpxrOKfQHfoE6T5CEpJPgqgA_YB with...,1
901,ga4gh:VA.0X2BApoy4FSxtnvy5az1bXu8YMs8E__x with...,1


In [70]:
data['Statement'][0]

'ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with treatment by rxcui:1721560 predictsSensitivityTo the disease ncit:C2926'

In [71]:
df[df['statement_full']=='ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with treatment by rxcui:1721560 predictsSensitivityTo the disease ncit:C2926']

Unnamed: 0,id,description,predicate,subject,obj,qualifier,statement_full
2,civic.eid:1592,Osimertinib has been approved for the treatmen...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...
3,civic.eid:1867,"Randomized, international, open-label, phase 3...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...
37,civic.eid:965,This phase I/II trial (NCT01802632) involved 2...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...
79,civic.eid:966,This study summarized 9 EGFR-mutant patients f...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...
560,civic.eid:963,"Cell line, xenograft, and transgenic models we...",predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...
917,moa.assertion:242,Osimertinib is a kinase inhibitor indicated fo...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...
922,moa.assertion:256,Osimertinib is being evaluated in patients who...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...
924,moa.assertion:259,Osimertinib is FDA-Approved for metastatic non...,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,ncit:C2926,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...


In [73]:
for description in df[df['statement_full']=='ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with treatment by rxcui:1721560 predictsSensitivityTo the disease ncit:C2926']['description']:
    print(f'{description}\n')

Osimertinib has been approved for the treatment of EGFR T790M mutant NSCLC.

Randomized, international, open-label, phase 3 trial (NCT02151981) in 419 patients with T790M-positive advanced NSCLC and disease progression after first-line EGFR-TKI therapy. Patients were randomized, in a 2:1 ratio, to oral osimertinib or chemotherapy (pemetrexed plus either carboplatin or cisplatin every 3 weeks for up to six cycles, maintenance pemetrexed was allowed). Primary end point was PFS, which was significantly longer with osimertinib than with chemotherapy (10.1 months vs. 4.4 months; HR 0.30; 95% CI: 0.23 to 0.41; P<0.001). Objective response rate was significantly better with osimertinib (71%; 95% CI, 65 to 76) than with chemotherapy (31%; 95% CI, 24 to 40) (odds ratio for objective response, 5.39; 95% CI, 3.47 to 8.48; P<0.001).

This phase I/II trial (NCT01802632) involved 253 non-small cell lung cancer patients with activating EGFR mutations, who had progressed on first generation tyrosine k

### Data Graph

In [84]:
data[0:86]

Unnamed: 0,Statement,counts
0,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...,8
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ with...,7
2,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L with...,7
3,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab with...,6
4,ga4gh:VA.Dy7soaZQU1vH9Eb93xG_pJyhu7xTDDC9 with...,5
...,...,...
81,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L with...,2
82,ga4gh:VA.yHQVVwZjma693Ev6lQtew1axCWVwIi8K with...,2
83,ga4gh:VA.udBCHwlrf8xNiRy_19bLi-h5LhnZLgCt with...,2
84,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L with...,2


In [102]:
data['counts'][0:86].sum()

225

In [86]:
data['Statement'][0].split(' ')[0] # Variant Hash
data['Statement'][0].split(' ')[4] # Therapeutic 
data['Statement'][0].split(' ')[5] # Predicate 
data['Statement'][0].split(' ')[8] # Disease



['ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-',
 'with',
 'treatment',
 'by',
 'rxcui:1721560',
 'predictsSensitivityTo',
 'the',
 'disease',
 'ncit:C2926']

In [89]:
data['variant'] = None
data['predicate'] = None
data['therapeutic'] = None
data['disease'] = None


for idx, row in data.iterrows():
    terms = row['Statement'].split(' ')
    data.at[idx, 'variant'] = terms[0]
    data.at[idx, 'predicate'] = terms[5]
    data.at[idx, 'therapeutic'] = terms[4]
    data.at[idx, 'disease'] = terms[8]

data 

Unnamed: 0,Statement,counts,variant,therapeutic,predicate,disease
0,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- with...,8,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,rxcui:1721560,predictsSensitivityTo,ncit:C2926
1,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ with...,7,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,rxcui:337525,predictsSensitivityTo,ncit:C2926
2,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L with...,7,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,rxcui:1147220,predictsSensitivityTo,ncit:C3224
3,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab with...,6,ga4gh:VA.ORvaSNcFK71WOVr_gi2vv6oPCcXgljab,rxcui:318341,predictsResistanceTo,ncit:C4978
4,ga4gh:VA.Dy7soaZQU1vH9Eb93xG_pJyhu7xTDDC9 with...,5,ga4gh:VA.Dy7soaZQU1vH9Eb93xG_pJyhu7xTDDC9,rxcui:282388,predictsResistanceTo,ncit:C3868
...,...,...,...,...,...,...
898,ga4gh:VA.mecsBfk2t315ZcdZCTD7TTRVezaXskCy with...,1,ga4gh:VA.mecsBfk2t315ZcdZCTD7TTRVezaXskCy,rxcui:337525,predictsSensitivityTo,ncit:C2926
899,ga4gh:VA.e0t-Kq4iX8IsDH1F0zj6xClkCKCJpwwk with...,1,ga4gh:VA.e0t-Kq4iX8IsDH1F0zj6xClkCKCJpwwk,rxcui:337525,predictsSensitivityTo,ncit:C2926
900,ga4gh:VA.RWiEzkpxrOKfQHfoE6T5CEpJPgqgA_YB with...,1,ga4gh:VA.RWiEzkpxrOKfQHfoE6T5CEpJPgqgA_YB,rxcui:337525,predictsSensitivityTo,ncit:C2926
901,ga4gh:VA.0X2BApoy4FSxtnvy5az1bXu8YMs8E__x with...,1,ga4gh:VA.0X2BApoy4FSxtnvy5az1bXu8YMs8E__x,rxcui:337525,predictsResistanceTo,ncit:C2926


In [105]:
def duplicate_detect(count):
    if count > 1:
        return('Duplicate')
    else:
        return('Not Duplicate')
    
data['is_duplicate'] = None
data['is_duplicate'] = data['counts'].apply(duplicate_detect)

In [119]:
import plotly
import plotly.express as px

fig = px.bar(data[0:86], x='Statement', y='counts', color='predicate', title='Overlap of Full Evidence Statements in MetaKB v2')
fig.update_layout(yaxis_title='# Overlapping Statements')
fig.show()

In [None]:
# TODO: Tuples/Triplets of different components of the full evidence statement for more overlap
# fig = px.bar(data[0:86], x=['Variant','Therapeutic'], y='counts', color='predicate', title='Overlap of Full Evidence Statements in MetaKB v2')
# fig.update_layout(yaxis_title='# Overlapping Statements') 
# fig.show()

In [124]:
fig = px.pie(data, values='counts', names='is_duplicate', title='% of Identical Evidence Statements in MetaKB v2')
fig.update_traces(textinfo='percent+label')
fig.show()