In [1]:
from neo4j import GraphDatabase 
import pandas as pd
import os
from pathlib import Path

from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx

In [2]:
class driver():
    """Thin wrapper around a Neo4j driver that runs Louvain community detection.

    The Louvain query streams results for the GDS graph referenced as "KG"
    and returns, per node, its labels, id, name and community id.
    """
    def __init__(self, uri=None, user=None, password=None) -> None:
        """Open the underlying Neo4j driver.

        Args:
            uri: Bolt/neo4j URI. Falls back to $NEO4J_URI, then to
                "neo4j://localhost:7687".
            user: User name. Falls back to $NEO4J_USER, then to "neo4j".
            password: Password. Falls back to $NEO4J_PASSWORD, then to the
                value this notebook originally hard-coded.
        """
        uri = uri or os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
        user = user or os.environ.get("NEO4J_USER", "neo4j")
        # NOTE(review): the literal fallback is kept only so `driver()` keeps
        # working unchanged; a password committed to a notebook should be
        # rotated and supplied through NEO4J_PASSWORD instead.
        password = password or os.environ.get(
            "NEO4J_PASSWORD", "bishop-detect-tahiti-distant-pixel-3737"
        )
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self) -> None:
        """Close the underlying Neo4j driver."""
        self.driver.close()

    @classmethod
    def louvain(cls, tx) -> list:
        """Transaction function: run the Louvain stream query on `tx`.

        Args:
            tx: Transaction object exposing `run(query)`.

        Returns:
            The value of `result.data()`; each row carries the keys
            `Type` (the node's labels), `ID`, `Name` and `Community`.
        """
        query = ("""
        CALL gds.louvain.stream("KG")
        YIELD nodeId, communityId
        RETURN LABELS(gds.util.asNode(nodeId)) AS Type, nodeId as ID, gds.util.asNode(nodeId).name AS Name, communityId as Community
        """)
        result = tx.run(query)
        return result.data()

    def run_louvain(self) -> pd.DataFrame:
        """Run the Louvain query and return its rows as a DataFrame.

        Returns:
            A DataFrame built from the rows returned by `louvain`.
        """
        # Open the session in a `with` block so it is released even when the
        # query raises; the original never closed the session it created.
        with self.driver.session() as session:
            # Newer drivers name this `execute_write`; older ones only have
            # `write_transaction`. Both take a transaction function.
            run_tx = getattr(session, "execute_write", None) or session.write_transaction
            rows = run_tx(self.louvain)
        return pd.DataFrame(rows)

In [3]:
# Open the Neo4j connection wrapper; the Louvain query in the next cell runs through it.
KG = driver()

In [4]:
# Pull one row per node with its Louvain community, then derive two columns:
#   Type      -> first entry of the node's label list returned by the query
#   Frequency -> number of rows that share the row's Community id
results = KG.run_louvain().assign(
    Type=lambda frame: frame["Type"].str[0],
    Frequency=lambda frame: frame["Community"].map(frame["Community"].value_counts()),
)
results.head()

Unnamed: 0,Type,ID,Name,Community,Frequency
0,Protein,0,Amyloid beta A4 protein,13645,695
1,Protein,1,Matrix metalloproteinase-9,13764,1604
2,Protein,2,Voltage-dependent P/Q-type calcium channel sub...,13735,1778
3,Protein,3,Extracellular calcium-sensing receptor,13643,833
4,Protein,4,"Nitric oxide synthase, brain",13642,1152


In [5]:
# Smallest community size (in nodes) that is still kept.
MIN_COMMUNITY_SIZE = 10
# Node types kept for the per-community slices.
KEPT_TYPES = ["Protein", "MeSH"]

# Keep proteins and MeSH terms that sit in a community of at least
# MIN_COMMUNITY_SIZE nodes. Frequency was computed over *all* node types,
# so a kept community may contain fewer than MIN_COMMUNITY_SIZE kept rows.
proteins_and_mesh = results.loc[
    results["Type"].isin(KEPT_TYPES) & (results["Frequency"] >= MIN_COMMUNITY_SIZE)
]

# Community ids in order of first appearance.
community_ids = proteins_and_mesh["Community"].unique()

# Group once instead of re-filtering the whole frame for every community
# (the boolean-mask-per-community loop scans all rows once per id).
by_community = proteins_and_mesh.groupby("Community", sort=False)
df_sliced_dict = {
    community: by_community.get_group(community) for community in community_ids
}

In [6]:
# Create ./CSVs under the current working directory and clear files left by
# earlier runs.
csv_dir = Path().absolute() / 'CSVs'
path = str(csv_dir)  # kept as a plain string, as in the original cell
csv_dir.mkdir(parents=True, exist_ok=True)
for entry in csv_dir.iterdir():
    # Only unlink files/symlinks: removing a directory entry this way raises
    # (e.g. a .ipynb_checkpoints folder created by Jupyter), which used to
    # abort the whole cell before any CSV was written.
    if entry.is_file() or entry.is_symlink():
        entry.unlink()

# Write one CSV per community, named <communityId>.csv, holding the Type and
# Name columns (the DataFrame index is written as the first column).
for community in community_ids:
    df_sliced_dict[community][['Type', 'Name']].to_csv(csv_dir / f"{community}.csv")