### Generate Cypher Projection

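Run the following Cypher in Neo4j before executing the cells below; it creates the in-memory GDS graph `KG1` that the queries in this notebook operate on: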

    CALL gds.graph.create(
    'KG1',
        ["MeSH", "Document", "Protein", "Drug", "Pathway"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'},
            CANDIDATE: {orientation: 'UNDIRECTED'}
        }
    )
    

In [1]:
from neo4j import GraphDatabase 
import pandas as pd
import os
from pathlib import Path

from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx

In [244]:
class driver():
    """Wrapper around a Neo4j driver for community analysis of a GDS graph.

    Runs Louvain community detection on the in-memory graph named by
    ``GRAPH_NAME``, creates one GDS subgraph per community, and runs PageRank
    on those subgraphs. The ``run_*`` / ``create_subgraph`` methods manage the
    session; the ``louvain`` / ``pagerank`` / ``_create_subgraph`` methods are
    transaction functions that receive an open transaction ``tx``.
    """

    # Name of the in-memory GDS graph projection the queries operate on.
    GRAPH_NAME = 'KG1'

    def __init__(self, uri=None, user=None, password=None) -> None:
        """Open the Neo4j driver.

        Each setting is taken from the argument if given, otherwise from the
        environment (``NEO4J_URI``, ``NEO4J_USER``, ``NEO4J_PASSWORD``),
        otherwise from the previous hard-coded value so ``driver()`` keeps
        working unchanged.

        @param uri is the connection URI
        @param user is the database user name
        @param password is the database password
        """
        uri = uri or os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
        user = user or os.environ.get("NEO4J_USER", "neo4j")
        # NOTE(review): the literal fallback below is a credential committed to
        # the notebook; prefer NEO4J_PASSWORD and remove/rotate this default.
        password = password or os.environ.get(
            "NEO4J_PASSWORD", "bishop-detect-tahiti-distant-pixel-3737"
        )
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self) -> None:
        """Close the underlying Neo4j driver."""
        self.driver.close()

    @staticmethod
    def _subgraph_name(community) -> str:
        """Return the GDS graph name used for the subgraph of ``community``."""
        return 'subgraph_' + str(community)

    @classmethod
    def louvain(cls, tx) -> list:
        """Transaction function: run Louvain and return one record per node.

        @param cls is the class
        @param tx is the transaction
        @return list of dicts with keys Type, ID, Name and Community
        """
        # NOTE(review): `write` mode is documented as persisting the property to
        # the Neo4j store, while the stream query below (and the subgraph filter
        # in `_create_subgraph`) read `community` from the in-memory graph.
        # Confirm the projection really carries this property (e.g. via
        # `gds.louvain.mutate`) when starting from a fresh projection.
        louvain_query = (
            f"CALL gds.louvain.write('{cls.GRAPH_NAME}', "
            "{ writeProperty : 'community'})"
        )
        tx.run(louvain_query)

        query = (f"""
        CALL gds.graph.streamNodeProperty('{cls.GRAPH_NAME}', 'community')
        YIELD nodeId, propertyValue
        RETURN LABELS(gds.util.asNode(nodeId)) AS Type, nodeId as ID, gds.util.asNode(nodeId).name as Name, propertyValue AS Community
        """)
        result = tx.run(query)
        return result.data()

    def run_louvain(self) -> pd.DataFrame:
        """Run Louvain in its own session and return the records as a DataFrame.

        @param self
        @return result is the dataframe built from the Louvain records
        """
        # The context manager closes the session even if the transaction fails.
        with self.driver.session() as session:
            records = session.write_transaction(self.louvain)
        return pd.DataFrame(records)

    @classmethod
    def pagerank(cls, tx, community) -> list:
        """Transaction function: stream PageRank scores for one community subgraph.

        @param cls is the class
        @param tx is the transaction
        @param community is the community id whose subgraph is ranked
        @return result.data() is the data of the pagerank
        """
        graph_name = cls._subgraph_name(community)
        query = (
            f"CALL gds.pageRank.stream('{graph_name}') YIELD nodeId, score "
            "RETURN LABELS(gds.util.asNode(nodeId)) AS Type, nodeId as ID, "
            "gds.util.asNode(nodeId).name as Name, score AS pagerank"
        )
        result = tx.run(query)
        return result.data()

    def run_pagerank(self, community) -> pd.DataFrame:
        """Run PageRank for one community and return the records as a DataFrame.

        @param self
        @param community is the community id whose subgraph is ranked
        @return result is the dataframe from the pagerank
        """
        with self.driver.session() as session:
            records = session.write_transaction(self.pagerank, community)
        return pd.DataFrame(records)

    def create_subgraph(self, community) -> list:
        """(Re)create the GDS subgraph holding the nodes of one community.

        @param self
        @param community is the community id to project
        @return the records returned by the subgraph-creation procedure
        """
        with self.driver.session() as session:
            return session.write_transaction(self._create_subgraph, community)

    @classmethod
    def _create_subgraph(cls, tx, community) -> list:
        """Transaction function: drop any old subgraph, then create a fresh one.

        @param cls is the class
        @param tx is the transaction
        @param community is the community id to project
        @return list of dicts returned by the subgraph-creation procedure
        """
        graph_name = cls._subgraph_name(community)

        # Second argument `false` is passed so a missing graph is not an error.
        remove_query = "CALL gds.graph.drop('" + graph_name + "', false)"
        tx.run(remove_query)

        query = (
            "CALL gds.beta.graph.create.subgraph('" + graph_name + "','"
            + cls.GRAPH_NAME + "','" + "n.community = " + str(community) + "','*')"
        )
        result = tx.run(query)
        # Materialise the records inside the transaction; the Result object
        # itself is not meant to be read after the transaction has finished.
        return result.data()

In [245]:
# Open the Neo4j connection shared by the cells below.
# NOTE(review): KG.close() is never called in the visible cells.
KG = driver()

In [246]:
# Run Louvain and keep only the first label of each node's label list as its Type.
results = KG.run_louvain().assign(Type=lambda frame: frame["Type"].str[0])

# Frequency = number of nodes that share the node's community.
community_sizes = results["Community"].value_counts()
results = results.assign(Frequency=results["Community"].map(community_sizes))
results.head()

Unnamed: 0,Type,ID,Name,Community,Frequency
0,Protein,0,Amyloid beta A4 protein,11811,676
1,Protein,1,Matrix metalloproteinase-9,13764,1601
2,Protein,2,Voltage-dependent P/Q-type calcium channel sub...,13735,1809
3,Protein,3,Extracellular calcium-sensing receptor,13643,995
4,Protein,4,"Nitric oxide synthase, brain",13642,757


In [263]:
# Keep the rows (nodes of any type) whose community has at least 10 members.
is_large_enough = results["Frequency"].ge(10)
actual_communities = results[is_large_enough]

# Distinct ids of the retained communities.
# https://moonbooks.org/Articles/How-to-slice-split-a-dataframe-by-column-value-with-pandas-in-python-/
community_ids = actual_communities["Community"].unique()

In [264]:
# Maps community id -> PageRank table of that community's Protein/MeSH nodes,
# highest score first.
community_pageranks = {}

# Project each community into its own subgraph and rank its nodes locally.
for community in community_ids:
    KG.create_subgraph(community)

    # Collapse the label list to its first label so it can be compared directly.
    pagerank = KG.run_pagerank(community).assign(
        Type=lambda frame: frame["Type"].str[0]
    )

    # Only proteins and MeSH terms are reported.
    pagerank = pagerank[pagerank["Type"].isin(["Protein", "MeSH"])]
    community_pageranks[community] = pagerank.sort_values(by="pagerank", ascending=False)

In [265]:
# Make the CSVs/ subdirectory of the current working directory and clear out
# files left over from a previous run.
path = str(Path().absolute()) + '/CSVs'
csv_dir = Path(path)
csv_dir.mkdir(parents=True, exist_ok=True)
for entry in csv_dir.iterdir():
    # Only regular files are removed; unlinking a subdirectory
    # (e.g. .ipynb_checkpoints) would raise and abort the cell.
    if entry.is_file():
        entry.unlink()

# Write one file per community, named CSVs/[communityId].csv.
for community in community_ids:
    community_pageranks[community][['Type', 'Name', "pagerank"]].to_csv(
        path + '/' + str(community) + '.csv'
    )

In [262]:
# Show the ranked Protein/MeSH nodes of one example community.
# NOTE(review): 55 is a hard-coded community id; this raises KeyError unless the
# current Louvain run produced a community 55 with at least 10 members —
# confirm the ids are stable across runs.
community_pageranks[55]

Unnamed: 0,Type,ID,Name,pagerank
147,Protein,18,Ryanodine receptor 2,128.947991
132,MeSH,13687,"tachycardia, ventricular",77.61513
136,MeSH,13788,"death, sudden, cardiac",56.70059
130,MeSH,13669,long qt syndrome,35.468294
148,Protein,45,Kv channel-interacting protein 2,31.634226
134,MeSH,13690,ventricular fibrillation,30.957462
149,Protein,50,Triadin,21.843049
146,Protein,17,Ryanodine receptor 1,20.111406
152,Protein,59,Calsequestrin-2,16.779119
137,MeSH,13794,ventricular dysfunction,15.873695
