In [2]:
import pandas as pd
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

In [3]:
gene_df = pd.read_csv("../results/ner_disambiguated_genes.csv")
diseases_df = pd.read_csv("../results/ner_disambiguated_diseases.csv")

In [4]:
gene_ner_words = Counter(gene_df[gene_df["word"].notnull()]["word"])
gene_ner_dis_words = Counter(gene_df[gene_df["gene"].notnull()]["gene"])

In [5]:
diseases_ner_words = Counter(diseases_df[diseases_df["word"].notnull()]["word"])
diseases_ner_dis_words = Counter(diseases_df[diseases_df["diseases"].notnull()]["diseases"])

In [None]:
gene_pubmed = gene_df[gene_df["gene"].notnull()].groupby(["gene"])["pubmed_id"].unique().reset_index()
disease_pubmed = diseases_df[diseases_df["diseases"].notnull()].groupby(["diseases"])["pubmed_id"].unique().reset_index()
pubmed_genes = gene_df[gene_df["gene"].notnull()].groupby(["pubmed_id"])["gene"].unique().reset_index()
pubmed_diseases= diseases_df[diseases_df["diseases"].notnull()].groupby(["pubmed_id"])["diseases"].unique().reset_index()

In [8]:
pubmed_data = pubmed_genes.merge(pubmed_diseases, on=["pubmed_id"], how="left")
pubmed_data.head()

Unnamed: 0,pubmed_id,gene,diseases
0,1312696,"[TP53, CDH1, PTCSC1]",[Lung Cancer]
1,1317264,[APC],"[Gastric Cancer, Colorectal Neoplasia]"
2,1322785,"[TP53, TERC, AR]",[Gastric Cancer]
3,1329510,[FA],"[Childhood Cancer, Colorectal Neoplasia, Hepat..."
4,1338691,[APC],"[Colorectal Neoplasia, Gastric Cancer, Colorec..."


In [17]:
genes = list(gene_pubmed[gene_pubmed["gene"].notnull()]["gene"].unique())
diseases = list(disease_pubmed[disease_pubmed["diseases"].notnull()]["diseases"].unique())
pubmed_ids = list(pubmed_data.pubmed_id.values)

In [56]:
nl = "\r\n"
sl = "\'"
disease_relationship = "diseases_in"
gene_relationship = "genes_in"

pubmed_gene_triples_ls = []
pubmed_disease_triples_ls = []
pubmed_triples_ls = []
for _, rows in pubmed_data.iterrows():
    tmp_gene = rows.gene
    tmp_pubmed_id = rows.pubmed_id
    tmp_disease = rows.diseases
    if (tmp_disease is not None) and (type(tmp_disease) is not float):
        pubmed_disease_triples_ls.append(", ".join([f'(pubmed_{tmp_pubmed_id})-[:{disease_relationship}]->({d.replace("-", "_").replace(" ", "_").replace("/", "_").replace(nl, "_").replace(")", "").replace("(", "_").replace(sl, "")})' for d in tmp_disease]))
        pubmed_gene_triples_ls.append(", ".join([f'(pubmed_{tmp_pubmed_id})-[:{gene_relationship}]->({g.replace("-", "_").replace(" ", "_").replace("/", "_").replace(nl, "_").replace(")", "").replace("(", "_").replace(sl, "")})' for g in tmp_gene]))

pubmed_triples_ls.extend(pubmed_gene_triples_ls)
pubmed_triples_ls.extend(pubmed_disease_triples_ls)
pubmed_gene_triples = ", ".join(pubmed_gene_triples_ls)
pubmed_disease_triples = ", ".join(pubmed_disease_triples_ls)
pubmed_triples = ", ".join(pubmed_triples_ls)

In [58]:
print(f"Total number of pubmed nodes: {len(pubmed_ids)}")
print(f"Total number of genes nodes: {len(genes)}")
print(f"Total number of diseases nodes: {len(diseases)}")
print(f"Total number of gene triples: {len(pubmed_gene_triples.split(','))}")
print(f"Total number of disease triples: {len(pubmed_disease_triples.split(','))}")
print(f"Total number of triples: {len(pubmed_triples.split(','))}")
print(f"Total number of entities: {len(genes) + len(diseases) + len(pubmed_ids)}")

Total number of pubmed nodes: 8987
Total number of genes nodes: 297
Total number of diseases nodes: 130
Total number of gene triples: 27493
Total number of disease triples: 19254
Total number of triples: 46747
Total number of entities: 9414


In [23]:
cqlCreate = f""" 
CREATE
{", ".join([f'(pubmed_{p}:pubmed_id {{name: "{p}"}})' for p in pubmed_ids])},
{", ".join([f'({g.replace("-", "_").replace(" ", "_").replace("/", "_").replace(nl, "_").replace(")", "").replace("(", "_").replace(sl, "") }:gene {{name: "{g.replace(nl, " ").replace(sl, "")}"}})' for g in genes])},
{", ".join([f'({d.replace("-", "_").replace(" ", "_").replace("/", "_").replace(nl, "_").replace(")", "").replace("(", "_").replace(sl, "") }:disease {{name: "{d.replace(nl, " ").replace(sl, "")}"}})' for d in diseases])},
{pubmed_relationships}
"""

In [16]:
from neo4j import GraphDatabase

# Database Credentials

uri             = "bolt://localhost:7687"
userName        = "neo4j"
password = "3eGAD963pZEbFNesk6ZUD15iUmQgZEWdjwZn4PG8gXk"

# Connect to the neo4j database server
graphDB_Driver  = GraphDatabase.driver(uri, auth=(userName, password), max_connection_lifetime=200)


In [213]:
# Execute the CQL query

with graphDB_Driver.session() as graphDB_Session:

    # Create nodes
    graphDB_Session.run(cqlCreate)

Remove `\n` and copy-paste into the Neo4j console

In [1]:
# Neo4j queries
"MATCH (g:gene)<-[:genes_in]-(p:pubmed_id)-[:diseases_in]->(d:disease) WHERE p.name = '9024708' RETURN p, d, g"
"MATCH (g:gene)<-[:genes_in]-(p:pubmed_id)-[:diseases_in]->(d:disease) WHERE d.name = 'Teeth (Benign)' RETURN p, d"
"MATCH (g:gene)<-[:genes_in]-(p:pubmed_id)-[:diseases_in]->(d:disease) WHERE d.name = 'Teeth (Benign)' RETURN p, d, g"
"MATCH (g:gene)<-[:genes_in]-(p:pubmed_id)-[:diseases_in]->(d:disease) WHERE g.name = 'AXIN2' RETURN p, g"
"MATCH (g:gene)<-[:genes_in]-(p:pubmed_id)-[:diseases_in]->(d:disease) WHERE g.name = 'TP53' RETURN p, g"

"MATCH (g:gene)<-[:genes_in]-(p:pubmed_id)-[:diseases_in]->(d:disease) WHERE g.name = 'TP53' RETURN p, g"