In [1]:
import os

import pandas as pd
import plotly
import plotly.express as px
import plotly.io as pio
from neo4j import GraphDatabase
from tqdm import tqdm

## Harmonizing Different Evidence Statement Types in MetaKB v2
The first take at this analysis utilized just broad statements with attached therapeutics with no regard for the statement type. (To be honest, I don't think it even had a notion of a type at the time.)  
  
For this notebook, we will retrieve study statements and analyze uniqueness by resource across the different categories of evidence (as defined by va-spec). Additionally, I am going to try to set this notebook up as a pipeline so that the entire analysis runs automatically with a single button.

### Table of Contents
* [Data Retrieval](#data-retrieval)
    * [Get Study Statements](#get-study-statements)
    * [Additional Queries](#additional-queries)
* [Variant Prognostic Study Statements](#variant-prognostic-study-statements)
* [Variant Therapeutic Response Study Statements](#variant-therapeutic-response-study-statements)

## Data Retrieval


### Get Study Statements

In [24]:
def create_db_connection(uri, user, password):
    """Build a Neo4j driver for ``uri`` authenticated with ``(user, password)``.

    The driver is returned as-is; nothing in this function closes it.
    """
    credentials = (user, password)
    return GraphDatabase.driver(uri, auth=credentials)

def execute_query(query, parameters=None):
    """Run a Cypher query and return every record as a list.

    A new driver is opened for each call and is always closed again, even
    when the query raises.

    Connection settings are read from the environment variables
    ``NEO4J_URI``, ``NEO4J_USERNAME`` and ``NEO4J_PASSWORD``. When a variable
    is unset, the local-development default is used.

    Args:
        query: Cypher query text.
        parameters: Optional mapping of Cypher parameters (referenced as
            ``$name`` in the query). Prefer this over formatting values into
            the query string.

    Returns:
        list: The records produced by the query, in result order.
    """
    uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    user = os.environ.get("NEO4J_USERNAME", "neo4j")
    password = os.environ.get("NEO4J_PASSWORD", "password")

    driver = create_db_connection(uri, user, password)
    try:
        with driver.session() as session:
            # Materialize the records before the session is closed.
            return list(session.run(query, parameters))
    finally:
        # Release the driver even if opening the session or the query fails.
        driver.close()
    
# Cypher query: return the property map of every StudyStatement node as `Study`.
q_all_statements = """MATCH (s:StudyStatement)
    RETURN properties(s) AS Study
"""

# Run the query; `result` holds the list of records returned by execute_query.
result = execute_query(q_all_statements)


In [25]:
# Output column -> StudyStatement property it is read from.
# A property that is absent on a node comes through as None.
column_to_property = {
    'id': 'id',
    'studyType': 'type',
    'description': 'description',
    'alleleOriginQualifier': 'alleleOriginQualifier',
    'direction': 'direction',
    'predicate': 'predicate',
}

# One row per record; a record without a 'Study' entry yields an all-None row.
data = [
    {column: study.get(prop) for column, prop in column_to_property.items()}
    for study in (record.get('Study', {}) for record in result)
]

df = pd.DataFrame(data)

df

Unnamed: 0,id,studyType,description,alleleOriginQualifier,direction,predicate
0,civic.eid:238,VariantTherapeuticResponseStudyStatement,The T790M mutation in EGFR has been shown to c...,somatic,supports,predictsResistanceTo
1,civic.eid:1409,VariantTherapeuticResponseStudyStatement,Phase 3 randomized clinical trial comparing ve...,somatic,supports,predictsSensitivityTo
2,civic.eid:1592,VariantTherapeuticResponseStudyStatement,Osimertinib has been approved for the treatmen...,somatic,supports,predictsSensitivityTo
3,civic.eid:1867,VariantTherapeuticResponseStudyStatement,"Randomized, international, open-label, phase 3...",somatic,supports,predictsSensitivityTo
4,civic.eid:2994,VariantTherapeuticResponseStudyStatement,"On May 14, 2013, the U.S. Food and Drug Admini...",somatic,supports,predictsSensitivityTo
...,...,...,...,...,...,...
1138,moa.assertion:990,VariantTherapeuticResponseStudyStatement,The U.S. Food and Drug Administration (FDA) gr...,somatic,,predictsSensitivityTo
1139,moa.assertion:991,VariantTherapeuticResponseStudyStatement,The U.S. Food and Drug Administration (FDA) gr...,somatic,,predictsSensitivityTo
1140,moa.assertion:993,VariantTherapeuticResponseStudyStatement,The U.S. Food and Drug Administration (FDA) gr...,somatic,,predictsSensitivityTo
1141,moa.assertion:996,VariantTherapeuticResponseStudyStatement,The U.S. Food and Drug Administration (FDA) gr...,somatic,,predictsSensitivityTo


In [16]:
# Tally how many statements fall under each study type.
df['studyType'].value_counts()

studyType
VariantTherapeuticResponseStudyStatement    1062
VariantPrognosticStudyStatement               81
Name: count, dtype: int64

### Additional Queries
These helper queries fill in additional data from the nodes related to each statement; which ones apply depends on the study type.

In [None]:
def grab_subject_variant(study_id):
    """Return the id of the defining-context node of a statement's variant.

    Follows ``HAS_VARIANT`` from the StudyStatement and then
    ``HAS_DEFINING_CONTEXT`` from that node, and returns the ``id`` property
    of the node reached.

    Args:
        study_id: Value of the StudyStatement ``id`` property to look up.

    Returns:
        The ``id`` of the first matching context, or ``None`` when the query
        returns no rows. When several rows match, a notice is printed and the
        first one is used.
    """
    # NOTE(review): study_id is interpolated directly into the Cypher text.
    # That is fine for ids read back from the database, but this should use
    # query parameters if it is ever fed external input.
    query = f"""MATCH (s:StudyStatement) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_VARIANT]-(c)
        MATCH (c)-[:HAS_DEFINING_CONTEXT]-(v)
        RETURN properties(s) AS Study,
                properties(c) AS Variation,
                properties(v) AS Context
    """
    result = execute_query(query)

    # No variant/context found: return None (as grab_object_therapeutic does)
    # instead of failing with IndexError on result[0].
    if not result:
        return None

    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')

    return result[0]['Context']['id']


def grab_object_therapeutic(study_id):
    """Return an identifier for the therapeutic attached to a study statement.

    Args:
        study_id: Value of the StudyStatement ``id`` property to look up.

    Returns:
        * the ``normalizer_id`` property when the therapeutic node has one;
        * a ``(type, id)`` tuple when the node's ``type`` is
          ``CombinationTherapy`` or ``TherapeuticSubstituteGroup``;
        * ``None`` when the query returns no rows or the node matches neither
          case above.

        When several rows match, a notice is printed and the first one is used.
    """
    # NOTE(review): study_id is interpolated directly into the Cypher text.
    # That is fine for ids read back from the database, but this should use
    # query parameters if it is ever fed external input.
    query = f"""MATCH (s:StudyStatement) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_THERAPEUTIC]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Therapeutic
    """
    result = execute_query(query)

    # Check for empty result
    if not result:
        return None

    if len(result) > 1:
        print(f'{study_id} has {len(result)} therapeutics possible')

    therapeutic = result[0]['Therapeutic']

    # Grab concept identifier (if not single therapeutic, handle other cases)
    if 'normalizer_id' in therapeutic:
        return therapeutic['normalizer_id']

    # TODO: Consider additional ways to handle these group therapy types?
    # Both group types are currently reported the same way: (type, group id).
    ther_type = therapeutic.get('type')
    if ther_type in ('CombinationTherapy', 'TherapeuticSubstituteGroup'):
        return (ther_type, therapeutic['id'])

    # Unrecognized therapeutic shape.
    return None
    

def grab_qualifier_disease(study_id):
    """Return an identifier for the disease (tumor type) of a study statement.

    The first property present on the disease node is returned, in this
    order: ``disease_normalizer_id``, ``label``, ``id``.

    Args:
        study_id: Value of the StudyStatement ``id`` property to look up.

    Returns:
        The selected property value, or ``None`` when the query returns no
        rows. When several rows match, a notice is printed and the first one
        is used.

    Raises:
        KeyError: If the disease node has none of the three properties.
    """
    # NOTE(review): study_id is interpolated directly into the Cypher text.
    # That is fine for ids read back from the database, but this should use
    # query parameters if it is ever fed external input.
    query = f"""MATCH (s:StudyStatement) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_TUMOR_TYPE]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Disease
    """
    result = execute_query(query)

    # No disease attached: return None rather than failing on result[0].
    if not result:
        return None

    if len(result) > 1:
        print(f'{study_id} has {len(result)} diseases possible')

    disease = result[0]['Disease']

    # TODO: This id should be normalizer_id not disease_normalizer_id. Will likely need to redo analysis
    # Explicit priority lookup instead of nested bare excepts, so unrelated
    # errors are no longer swallowed.
    for key in ('disease_normalizer_id', 'label'):
        if key in disease:
            return disease[key]
    return disease['id']


## Variant Prognostic Study Statements



In [32]:
# Keep only the prognostic statements and renumber the rows from 0.
is_prognostic = df['studyType'].eq('VariantPrognosticStudyStatement')
data = df.loc[is_prognostic].reset_index(drop=True)
print(f'Studies found: {len(data)}')
data.head(5)

Studies found: 81


Unnamed: 0,id,studyType,description,alleleOriginQualifier,direction,predicate
0,civic.eid:102,VariantPrognosticStudyStatement,Unlike other studies that suggest a poorer out...,somatic,disputes,associatedWithWorseOutcomeFor
1,civic.eid:103,VariantPrognosticStudyStatement,V600E is associated with adverse pathological ...,somatic,supports,associatedWithWorseOutcomeFor
2,civic.eid:656,VariantPrognosticStudyStatement,In patients with papillary thyroid cancer harb...,somatic,supports,associatedWithWorseOutcomeFor
3,civic.eid:1420,VariantPrognosticStudyStatement,Study examined rare SNPs on MAP2K7 among a dis...,germline,supports,associatedWithWorseOutcomeFor
4,civic.eid:26,VariantPrognosticStudyStatement,"In acute myloid leukemia patients, D816 mutati...",somatic,supports,associatedWithWorseOutcomeFor


In [33]:
# Look up the subject variant and the disease qualifier for every statement.
# Each lookup issues its own database query per statement id.
statement_ids = data['id']
data = data.assign(
    Variant=statement_ids.apply(grab_subject_variant),
    Disease=statement_ids.apply(grab_qualifier_disease),
)
data.head(5)

Unnamed: 0,id,studyType,description,alleleOriginQualifier,direction,predicate,Variant,Disease
0,civic.eid:102,VariantPrognosticStudyStatement,Unlike other studies that suggest a poorer out...,somatic,disputes,associatedWithWorseOutcomeFor,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Papillary Thyroid Carcinoma
1,civic.eid:103,VariantPrognosticStudyStatement,V600E is associated with adverse pathological ...,somatic,supports,associatedWithWorseOutcomeFor,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Colorectal Cancer
2,civic.eid:656,VariantPrognosticStudyStatement,In patients with papillary thyroid cancer harb...,somatic,supports,associatedWithWorseOutcomeFor,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Papillary Thyroid Carcinoma
3,civic.eid:1420,VariantPrognosticStudyStatement,Study examined rare SNPs on MAP2K7 among a dis...,germline,supports,associatedWithWorseOutcomeFor,ga4gh:VA.nE0U2q_0hSEFMnTRq7MM3ZrVByPOuTdE,Lung Cancer
4,civic.eid:26,VariantPrognosticStudyStatement,"In acute myloid leukemia patients, D816 mutati...",somatic,supports,associatedWithWorseOutcomeFor,ga4gh:VA.nhiDwIq1klrGm3wtWO4a4BiS0jdW79Wd,Acute Myeloid Leukemia


In [38]:
# Derive the source prefix from the statement id: the text before the first '.'
# (e.g. 'civic.eid:102' -> 'civic'). A single-character pattern is split
# literally, so '.' is not treated as a regex wildcard here.
data['source'] = data['id'].str.split('.', n=1).str[0]
data['source'].value_counts()

source
civic    81
Name: count, dtype: int64

In [39]:
# Count statements per predicate value.
data['predicate'].value_counts()

predicate
associatedWithWorseOutcomeFor     66
associatedWithBetterOutcomeFor    15
Name: count, dtype: int64

In [40]:
# Count statements per subject variant identifier.
data['Variant'].value_counts()

Variant
ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L    19
ga4gh:VA.DqV_8uo_pXQ_oDiBS0RjW01imBmUDEXp     6
ga4gh:VA.uOzCG-Blg0EEZfX9z2ZxvpGcWp7r2NrI     5
ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj     4
ga4gh:VA.5GEqm-zIzvvPheyGb9rlu5DyIueIubXm     2
ga4gh:VA.WCo3uQpMnNSbBMy5pKdSccudXs8ooZw9     2
ga4gh:VA.OUtnlXzYAPjSCFTJdzslpJisv5mVZEWG     2
ga4gh:VA.NbmGko5VG3K44V5sy-uVWrQaBdnqJ227     2
ga4gh:VA.4hfGvwsO6MSx7QZOah2F7tcDOihO-sS6     2
ga4gh:VA.l7RwuFlchr3Ol0VBFIpnUPA-STxTS1tK     2
ga4gh:VA.g1Zv9RlyZLRqT-9pmYzVlvHW9cOA-dJC     2
ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-     2
ga4gh:VA.nhiDwIq1klrGm3wtWO4a4BiS0jdW79Wd     2
ga4gh:VA.6xzzXheebNYrE6lgDlRK2XN0NHrnoyOg     1
ga4gh:VA.dN6ljLlOSzt8ktlVPONDiykp5NXsQK13     1
ga4gh:VA.xb30dObTCWb854GJXbRM8z6rznkk_EQ4     1
ga4gh:VA.DhYFjPzh5PYmi9VVBykABjDGQtda800a     1
ga4gh:VA.JCIYeAy0qB-xdSekii9hbcEtCVfiqKVj     1
ga4gh:VA.-dMnJf9oUBfl9De0llc3LqJaGdFzfATK     1
ga4gh:VA.hEybNB_CeKflfFhT5AKOU5i1lgZPP-aS     1
ga4gh:VA.Audz_0Y7aDtAI6drdpRSZBe

In [41]:
# Count statements per disease value.
data['Disease'].value_counts()

Disease
Colorectal Cancer                       14
Breast Cancer                           13
Papillary Thyroid Carcinoma              8
Lung Non-small Cell Carcinoma            8
Spindle Cell Rhabdomyosarcoma            4
Desmoid Tumor                            4
Melanoma                                 3
Childhood Acute Lymphocytic Leukemia     2
Acute Myeloid Leukemia                   2
Pancreatic Cancer                        2
Pancreatic Ductal Carcinoma              1
Neuroblastoma                            1
B-lymphoblastic Leukemia/lymphoma        1
Intrahepatic Cholangiocarcinoma          1
Lung Adenocarcinoma                      1
Lung Carcinoma                           1
Cervical Cancer                          1
Rectum Cancer                            1
Malignant Exocrine Pancreas Neoplasm     1
Rhabdomyosarcoma                         1
Multiple Myeloma                         1
Glioblastoma                             1
Childhood Low-grade Glioma               1
Chr

In [42]:
# Count statements per unique (Variant, predicate, Disease) combination.
data[['Variant','predicate','Disease']].value_counts()

Variant                                    predicate                       Disease                             
ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L  associatedWithWorseOutcomeFor   Papillary Thyroid Carcinoma             8
                                                                           Colorectal Cancer                       7
ga4gh:VA.uOzCG-Blg0EEZfX9z2ZxvpGcWp7r2NrI  associatedWithWorseOutcomeFor   Spindle Cell Rhabdomyosarcoma           4
ga4gh:VA.g1Zv9RlyZLRqT-9pmYzVlvHW9cOA-dJC  associatedWithWorseOutcomeFor   Breast Cancer                           2
ga4gh:VA.5GEqm-zIzvvPheyGb9rlu5DyIueIubXm  associatedWithBetterOutcomeFor  Breast Cancer                           2
ga4gh:VA.4hfGvwsO6MSx7QZOah2F7tcDOihO-sS6  associatedWithWorseOutcomeFor   Breast Cancer                           2
ga4gh:VA.WCo3uQpMnNSbBMy5pKdSccudXs8ooZw9  associatedWithWorseOutcomeFor   Colorectal Cancer                       2
ga4gh:VA.NbmGko5VG3K44V5sy-uVWrQaBdnqJ227  associatedWithWorseOutcome

In [43]:
# TODO: Graph these results

## Variant Therapeutic Response Study Statements

## Variant Diagnostic Study Statements

In [None]:
# TODO: Pull this in with an updated CDM

## Variant Oncogenicity Study Statements

In [None]:
# TODO: This does not exist in the Neo4j yet

## Cohort Allele Frequency Study Statements

In [None]:
# TODO: This does not exist in the Neo4j yet

## Experimental Variant Functional Impact Study Statements

In [None]:
# TODO: This does not exist in the Neo4j yet