# Neo4j to Network X

This notebook contains code for querying the neo4j knowledge graph and transforming the results of that query into a network x graph.

In [1]:
import networkx as nx
import os
import pandas as pd
from tqdm.notebook import tqdm
from neo4j import GraphDatabase

You can visit this G Drive folder to get the data for this project: https://drive.google.com/drive/folders/1E-pxqkJDrS-jd_MNq6IbS-v7gGNNmrYd?usp=sharing. Insert this data into ../data/processed/

## Functions for coercing knowledge graph into NetworkX

In [None]:
def getSubgraph(q, parameters=None):

    '''
    Run a Cypher query against the knowledge graph and return the matched
    nodes and relationships as a networkx MultiDiGraph.

    E.g. getSubgraph(r'MATCH (u:Cid)-[r:HYPERLINKS_TO]->(v:Cid) RETURN *')
         returns the structural graph.

    Parameters
    ----------
    q : str
        Cypher query to run. It should return nodes and/or relationships
        (e.g. `RETURN *`), since the graph is built from those.
    parameters : dict, optional
        Query parameters, allowing Python variables to be integrated into
        the Cypher query q via `$name` placeholders.

    Returns
    -------
    networkx.MultiDiGraph
        One node per returned Neo4j node, keyed by the Neo4j node id, with
        attributes `labels` and `properties`. One edge per returned
        relationship, keyed by the relationship id, with attributes `type`
        and `properties`.

    Raises
    ------
    RuntimeError
        If the KG_PWD environment variable is not set.

    E.g.
        parameters = {}
        parameters['pages'] = ['a','list','of','stuff']
        q7 = """
        MATCH (u:Cid)-[r]-(v:Cid)
        WHERE u.name IN $pages AND v.name in $pages
        RETURN *
        """

        g7 = getSubgraph(q7, parameters)
    '''

    # get credentials
    # add to .secrets: export KG_PWD="<PASSWORD>"
    KG_PWD = os.getenv("KG_PWD")
    if not KG_PWD:
        # fail early with a clear message instead of an opaque auth error
        raise RuntimeError(
            "KG_PWD environment variable is not set; "
            "cannot authenticate against the knowledge graph"
        )

    # create connection to knowledge graph; the context managers make sure the
    # session and the driver's connection pool are closed even if the query fails
    with GraphDatabase.driver(
        "bolt+s://knowledge-graph.integration.govuk.digital:7687",
        auth=("neo4j", KG_PWD),
    ) as driver:
        with driver.session() as session:
            # run query on knowledge graph and hydrate the result graph once;
            # the result must be consumed while the session is still open
            graph = session.run(q, parameters).graph()
            nodes = list(graph.nodes)
            rels = list(graph.relationships)

    # create networkx graph object
    G = nx.MultiDiGraph()

    # add nodes into networkx graph object
    print("Adding nodes\n")
    for node in tqdm(nodes):
        G.add_node(node.id, labels=node.labels, properties=dict(node.items()))

    # add edges into networkx graph object
    print("Adding edges\n")
    for rel in tqdm(rels):
        G.add_edge(
            rel.start_node.id,
            rel.end_node.id,
            key=rel.id,
            type=rel.type,
            properties=dict(rel.items()),
        )

    return G


def showGraph(g):
    """
    Given a networkx graph g, this function prints a one-line summary
    (graph class, node count, edge count) and visualises the graph.
    Do not use for a large g.
    """
    # nx.info() was removed in NetworkX 3.0, so build the summary from
    # methods that exist in every NetworkX version
    print(
        f"{type(g).__name__} with {g.number_of_nodes()} nodes "
        f"and {g.number_of_edges()} edges"
    )
    nx.draw(g)

## Defining subgraph based on mainstream content

In [None]:
# Hyperlink (structural) edges where both endpoints are Mainstream nodes.

q3 = r"""
MATCH (u:Mainstream)-[r:HYPERLINKS_TO]->(v:Mainstream)
RETURN *
"""

# Build the networkx graph, then print its summary and draw it.
g3 = getSubgraph(q=q3)
showGraph(g=g3)

## Defining subgraph based on functional graph

In [None]:
# Functional graph: all USER_MOVEMENT relationships between Cid nodes.

q5 = r"""
MATCH (u:Cid)-[r:USER_MOVEMENT]->(v:Cid)
RETURN *
"""

g5 = getSubgraph(q=q5)

# Display (node count, edge count) as the cell output.
(g5.number_of_nodes(), g5.number_of_edges())

In [None]:
# write the functional graph to disk
# nx.write_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
# pickle.dump, so writing with pickle directly produces the same file format
with open("../data/processed/functional_graph.gpickle", "wb") as gpickle_file:
    pickle.dump(g5, gpickle_file, pickle.HIGHEST_PROTOCOL)

## Defining subgraph based on structural graph

In [None]:
# Structural graph: all HYPERLINKS_TO relationships between Cid nodes.

q6 = r"""
MATCH (u:Cid)-[r:HYPERLINKS_TO]->(v:Cid)
RETURN *
"""

g6 = getSubgraph(q=q6)

# Display (node count, edge count) as the cell output.
(g6.number_of_nodes(), g6.number_of_edges())

In [None]:
# write structural graph to disk
# nx.write_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
# pickle.dump, so writing with pickle directly produces the same file format
with open("../data/processed/structural_graph.gpickle", "wb") as gpickle_file:
    pickle.dump(g6, gpickle_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# visualising nodes in the structural graph that mention 'start a business'
nodes = list(g6.nodes(data=True))
sabNodes = [
    node
    for node in nodes
    # a node may have no text property (or a null one); treat that as empty
    # text so the filter skips it instead of raising
    if "start a business" in (node[1]["properties"].get("text") or "").lower()
]
# draw only the subgraph induced by the matching node ids
nx.draw(g6.subgraph([node[0] for node in sabNodes]))

## Defining subgraph based on page hits from Big Query GA data

### By page path

In [5]:
# Load the page-path table and preview its first five rows.
page_paths = pd.read_csv(filepath_or_buffer='../data/processed/page_paths.csv')
page_paths.head(n=5)

Unnamed: 0,pagePath,noOfSessions
0,/report-blocked-drain/harlow,5
1,/government/publications/weekly-statistics-for...,5
2,/student-finance-calculator/y/[date]/uk-full-t...,5
3,/government/statistics/suicide-deaths-in-north...,5
4,/government/publications/psv-speed-limiter-exe...,5


In [None]:
# keep only relationships whose two endpoints both appear in page_paths
parameters = {"pages": page_paths["pagePath"].tolist()}

# plain string rather than an f-string: $pages is bound by the Neo4j driver
# from `parameters`, not interpolated by Python
q7 = """
MATCH (u:Cid)-[r:HYPERLINKS_TO|USER_MOVEMENT]->(v:Cid)
WHERE u.name IN $pages AND v.name in $pages
RETURN *
"""

g7 = getSubgraph(q7, parameters)

# nx.info() was removed in NetworkX 3.0; report (node count, edge count)
# the same way the functional/structural graph cells do
g7.number_of_nodes(), g7.number_of_edges()

In [None]:
# write graph to disk
# nx.write_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
# pickle.dump, so writing with pickle directly produces the same file format
with open("../data/processed/5_hits_graph.gpickle", "wb") as gpickle_file:
    pickle.dump(g7, gpickle_file, pickle.HIGHEST_PROTOCOL)

#### Running checks to ensure all SaB pages are in this subgraph

In [6]:
# Load the SaB page list and preview its first five rows.
sab_pages = pd.read_csv(filepath_or_buffer='../data/processed/sab_pages.csv')
sab_pages.head(n=5)

Unnamed: 0,pagePath
0,/creative-works-licence
1,/get-uncertified-electronic-copy-patent
2,/unincorporated-associations
3,/get-information-about-a-company
4,/goods-sent-from-abroad


In [None]:
# Share (in %) of distinct SaB page paths that occur as a node name in g7.
g7nodes = list(g7.nodes(data=True))
g7names = {attrs["properties"]["name"] for _node_id, attrs in g7nodes}
len(g7names.intersection(set(sab_pages.pagePath))) / len(set(sab_pages.pagePath)) * 100

99.67% of SaB nodes are in this subgraph. The missing node relates to a withdrawn page, therefore, effectively 100% of the SaB nodes are in this subgraph:

In [None]:
# SaB page paths that have no matching node name in g7.
set(sab_pages.pagePath).difference(g7names)

### By content ID

In [7]:
# Load the content-ID table and preview its first five rows.
content_ids = pd.read_csv(filepath_or_buffer='../data/processed/content_ids.csv')
content_ids.head(n=5)

Unnamed: 0,contentID,noOfSessions
0,ee03eb40-34b3-4930-8715-9cab858421ef,5533
1,f3bbdec2-0e62-4520-a7fd-6ffd5d36e03a,4840078
2,774cee22-d896-44c1-a611-e3109cce8eae,980732
3,86f14e34-ba09-4e35-913e-af9e213cff2e,110257
4,e41bd8f3-148c-4285-ad16-131c716bc067,76841


In [None]:
# keep only relationships whose two endpoints both appear in content_ids
parameters = {"pages": content_ids["contentID"].tolist()}

# plain string rather than an f-string: $pages is bound by the Neo4j driver
# from `parameters`, not interpolated by Python
q8 = """
MATCH (u:Cid)-[r:HYPERLINKS_TO|USER_MOVEMENT]->(v:Cid)
WHERE u.contentID IN $pages AND v.contentID in $pages
RETURN *
"""

g8 = getSubgraph(q8, parameters)

# nx.info() was removed in NetworkX 3.0; report (node count, edge count)
# the same way the functional/structural graph cells do
g8.number_of_nodes(), g8.number_of_edges()

In [None]:
# write graph to disk
# nx.write_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
# pickle.dump, so writing with pickle directly produces the same file format
with open("../data/processed/5_hits_per_contentID_graph.gpickle", "wb") as gpickle_file:
    pickle.dump(g8, gpickle_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Share (in %) of distinct SaB page paths that occur as a node name in g8.
g8nodes = list(g8.nodes(data=True))
g8names = {attrs["properties"]["name"] for _node_id, attrs in g8nodes}
len(g8names.intersection(set(sab_pages.pagePath))) / len(set(sab_pages.pagePath)) * 100