## Investigate MetaKB Dataset: Gene Contexts
To better understand the MetaKB aggregate dataset, we perform graph-directed lookups via Neo4j to identify insights or avenues of discussion that were not previously accessible.
  
This notebook focuses on gene contexts and attempts to understand how different genes and alleles are represented across evidence items from multiple datasets. Initial ideas are to look at the frequency of genes across datasets and to break down the evidence contributed by CIViC vs. MOAlmanac (MOA) to observe overlap or uniqueness.
  
**Current Data Version**: 5.20.0

### Grab Gene Context Data 

In [1]:
import os

from neo4j import GraphDatabase

def create_db_connection(uri, user, password):
    """Build a Neo4j driver for ``uri``, authenticating with ``user`` and ``password``.

    The caller owns the returned driver and is responsible for closing it.
    """
    return GraphDatabase.driver(uri, auth=(user, password))

def execute_query(driver, query):
    """Run ``query`` in a new session on ``driver`` and return its records as a list.

    The result is fully consumed inside the ``with`` block, so the returned
    list remains usable after the session has been closed.
    """
    with driver.session() as db_session:
        return list(db_session.run(query))

# Connect to the Neo4j database.
# Connection settings are read from the environment so that real credentials
# never need to be written into the notebook; the fallbacks are the values
# this notebook previously hardcoded for a local development database.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")
driver = create_db_connection(uri, user, password)

# Match every Gene together with a node (node1) linked to it by
# HAS_GENE_CONTEXT, and that node's HAS_STRENGTH (node2), HAS_VARIANT (node3)
# and IS_SPECIFIED_BY (node4) neighbours.
query = """MATCH 
  (g:Gene)-[r:HAS_GENE_CONTEXT]-(node1),
  (node1)-[str:HAS_STRENGTH]-(node2),
  (node1)-[var:HAS_VARIANT]-(node3),
  (node1)-[spec:IS_SPECIFIED_BY]-(node4)

RETURN g,COUNT(r), properties(node1),COUNT(str), properties(node2),COUNT(var), properties(node3), COUNT(spec), properties(node4)
"""

# Execute the query; the driver is closed even if the query raises so that a
# failed run does not leave a connection open in the kernel.
try:
    result = execute_query(driver, query)
finally:
    driver.close()




In [3]:
import pandas as pd

def _record_to_row(record):
    """Flatten one query record into a flat dict, one key per DataFrame column.

    Args:
        record: A query result record exposing ``.data()``, whose dict holds
            the ``g`` and ``properties(node1..4)`` entries returned by the query.

    Returns:
        dict: Column name -> value for this record.

    Raises:
        KeyError: If any field other than ``alleleOrigin`` is missing.
    """
    # Convert the record once; previously .data() was re-invoked for every column.
    fields = record.data()
    gene = fields['g']
    statement = fields['properties(node1)']
    strength = fields['properties(node2)']
    variant = fields['properties(node3)']
    method = fields['properties(node4)']

    return {
        'gene_label': gene['label'],
        'gene_normalizer_id': gene['gene_normalizer_id'],
        'gene_id': gene['id'],
        # alleleOrigin is the one property read leniently: absent -> None.
        'allele_origin': statement.get('alleleOrigin'),
        'allele_id': statement['id'],
        'direction': statement['direction'],
        'predicate': statement['predicate'],
        'ev_type': statement['type'],
        'evidence_label': strength['label'],
        'evidence_code': strength['code'],
        'variant_id': variant['id'],
        'spec_id': method['id'],
        'spec_label': method['label'],
    }


# One flat row per returned record, built in a single pass.
data = [_record_to_row(record) for record in result]

df = pd.DataFrame(data)
df


Unnamed: 0,gene_label,gene_normalizer_id,gene_id,allele_origin,allele_id,direction,predicate,ev_type,evidence_label,evidence_code,variant_id,spec_id,spec_label
0,ABCB1,hgnc:40,civic.gid:4244,germline,civic.eid:675,supports,predictsSensitivityTo,VariantTherapeuticResponseStudy,clinical cohort evidence,e000005,civic.mpid:259,civic.method:2019,CIViC Curation SOP (2019)
1,ABL1,hgnc:76,civic.gid:4,somatic,civic.eid:4788,supports,predictsSensitivityTo,VariantTherapeuticResponseStudy,preclinical evidence,e000009,civic.mpid:1565,civic.method:2019,CIViC Curation SOP (2019)
2,ABL1,hgnc:76,civic.gid:4,somatic,civic.eid:4787,supports,predictsSensitivityTo,VariantTherapeuticResponseStudy,preclinical evidence,e000009,civic.mpid:1564,civic.method:2019,CIViC Curation SOP (2019)
3,ABL1,hgnc:76,civic.gid:4,somatic,civic.eid:6376,supports,predictsSensitivityTo,VariantTherapeuticResponseStudy,preclinical evidence,e000009,civic.mpid:1564,civic.method:2019,CIViC Curation SOP (2019)
4,ABL1,hgnc:76,civic.gid:4,somatic,civic.eid:6977,supports,predictsResistanceTo,VariantTherapeuticResponseStudy,case study evidence,e000008,civic.mpid:1547,civic.method:2019,CIViC Curation SOP (2019)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037,PIK3CA,hgnc:8975,moa.normalize.gene:PIK3CA,somatic,moa.assertion:461,none,predictsSensitivityTo,VariantTherapeuticResponseStudy,preclinical evidence,e000009,moa.variant:461,moa.method:2021,MOAlmanac (2021)
1038,PIK3CA,hgnc:8975,moa.normalize.gene:PIK3CA,somatic,moa.assertion:460,none,predictsSensitivityTo,VariantTherapeuticResponseStudy,preclinical evidence,e000009,moa.variant:460,moa.method:2021,MOAlmanac (2021)
1039,PIK3CA,hgnc:8975,moa.normalize.gene:PIK3CA,somatic,moa.assertion:459,none,predictsSensitivityTo,VariantTherapeuticResponseStudy,preclinical evidence,e000009,moa.variant:459,moa.method:2021,MOAlmanac (2021)
1040,RET,hgnc:9967,moa.normalize.gene:RET,somatic,moa.assertion:520,none,predictsSensitivityTo,VariantTherapeuticResponseStudy,observational study evidence,e000007,moa.variant:520,moa.method:2021,MOAlmanac (2021)


### Inspect & Graph

In [4]:
# Number of rows per gene label, most frequent first.
label_frequencies = df['gene_label'].value_counts()

# Reshape into a two-column frame: 'gene' and its row count 'counts'.
data = label_frequencies.rename_axis('gene').reset_index(name='counts')
data

Unnamed: 0,gene,counts
0,EGFR,163
1,BRAF,160
2,PIK3CA,103
3,KRAS,89
4,KIT,81
...,...,...
67,BRCA1,1
68,ARID1A,1
69,AR,1
70,AKT3,1


In [5]:
def get_spec_dist(gene, source_df=None):
    """Count a gene's rows under each curation method label.

    Args:
        gene: Value compared for equality against the ``gene_label`` column.
        source_df: DataFrame with ``gene_label`` and ``spec_label`` columns.
            Defaults to the notebook-level ``df`` when omitted.

    Returns:
        tuple[int, int]: ``(civic_counts, moa_counts)`` -- the number of rows
        for ``gene`` whose ``spec_label`` is ``'CIViC Curation SOP (2019)'``
        and ``'MOAlmanac (2021)'`` respectively. A label with no rows
        contributes 0.
    """
    if source_df is None:
        source_df = df

    label_counts = source_df.loc[
        source_df['gene_label'] == gene, 'spec_label'
    ].value_counts()

    # .get(..., 0) handles an absent label explicitly instead of relying on a
    # bare ``except`` that would also hide unrelated errors.
    civic_counts = int(label_counts.get('CIViC Curation SOP (2019)', 0))
    moa_counts = int(label_counts.get('MOAlmanac (2021)', 0))

    return civic_counts, moa_counts

In [7]:
# Per-gene (civic, moa) counts, built directly from the tuples returned by
# get_spec_dist and aligned to data's index so the column assignment lines up.
new_data = pd.DataFrame(
    data['gene'].apply(get_spec_dist).tolist(),
    columns=['civic_counts', 'moa_counts'],
    index=data.index,
)

data[['civic_counts', 'moa_counts']] = new_data

# Share of each gene's rows contributed by each source, computed column-wise.
# Rows whose total is 0 are reported as 0 rather than a division artefact.
has_rows = data['counts'] != 0
data['percent_civic'] = (data['civic_counts'] / data['counts'] * 100).where(has_rows, 0)
data['percent_moa'] = (data['moa_counts'] / data['counts'] * 100).where(has_rows, 0)

data

Unnamed: 0,gene,counts,civic_counts,moa_counts,percent_civic,percent_moa
0,EGFR,163,143,20,87.730061,12.269939
1,BRAF,160,123,37,76.875000,23.125000
2,PIK3CA,103,83,20,80.582524,19.417476
3,KRAS,89,86,3,96.629213,3.370787
4,KIT,81,69,12,85.185185,14.814815
...,...,...,...,...,...,...
67,BRCA1,1,1,0,100.000000,0.000000
68,ARID1A,1,1,0,100.000000,0.000000
69,AR,1,1,0,100.000000,0.000000
70,AKT3,1,1,0,100.000000,0.000000


In [8]:
import plotly.express as px

# Columns stacked per gene, and the colour assigned to each of them.
source_columns = ['civic_counts', 'moa_counts']
source_colors = {'civic_counts': 'blue', 'moa_counts': 'red'}

fig = px.bar(
    data,
    x='gene',
    y=source_columns,
    title="Gene Context Contributions by Source",
    labels={'value': 'Counts', 'variable': 'Source'},
    color_discrete_map=source_colors,
    text_auto=True,
)

# Stack the two sources into one bar per gene and label the axes and legend.
fig.update_layout(
    barmode='stack',
    xaxis_title="Gene",
    yaxis_title="# Of Contexts",
    legend_title="Source",
)

# Optional static export (left disabled):
# fig.write_image('gene_contexts_by_source.png', width=1600, height=900, scale=2)
fig.show()
