# Community detection for identifying WUJ pages

### Read graph

In [1]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

In [2]:
# Load the pickled networkx graph that the rest of the notebook analyses.
# NOTE(review): nx.read_gpickle was deprecated in networkx 2.6 and removed in
# 3.0 - confirm the pinned networkx version before re-running this cell.
G = nx.read_gpickle("../data/processed/5_hits_per_contentID_graph.gpickle")

### Helper functions

In [19]:
def evaluate(true_pages, predicted_pages):
    '''
    Score a set of predicted WUJ pages against the known WUJ pages.

    true_pages is a list of all the pages known to belong to a WUJ
    predicted_pages is a list of pages predicted to belong to a WUJ

    Both inputs are converted to sets, so duplicates are ignored.

    returns a tuple (precision, recall). A score whose denominator would be
    zero (no predicted pages for precision, no true pages for recall) is
    reported as 0.0 instead of raising ZeroDivisionError.
    '''

    true_pages = set(true_pages)
    predicted_pages = set(predicted_pages)

    # pages that are both known to be in the WUJ and predicted to be in it
    n_correct = len(true_pages & predicted_pages)

    # what proportion of true pages were correctly predicted?
    recall = n_correct / len(true_pages) if true_pages else 0.0

    # what proportion of predicted pages are true pages?
    precision = n_correct / len(predicted_pages) if predicted_pages else 0.0

    return (precision, recall)

def getSlugs(G):
    '''
    Collect the page slug of every node in a networkx graph G.

    The slug is read from each node's data dict at ['properties']['name'].
    Returns the slugs as a list, in the graph's node iteration order.
    '''
    slugs = []
    for _node_id, attributes in G.nodes(data=True):
        properties = attributes['properties']
        slugs.append(properties['name'])
    return slugs

## Random walks

In [31]:
def random_walk(G, steps, seed):
    '''
    Perform an unweighted random walk over a networkx graph.

    G is a networkx graph whose nodes carry a ['properties']['name'] slug.
    steps is the maximum number of steps to take in the random walk.
    seed is a page slug for your starting node in the random walk. E.g. "/set-up-business"

    At every step the next node is drawn uniformly (np.random.choice) from the
    current node's neighbours (successors for a directed graph); edge weights
    are ignored. If the current node has no neighbours the walk stops early.

    returns a numpy array of node ids visited during the random walk, starting
    with the seed node. The same type is returned when the walk stops early.

    raises ValueError if no node in G has the slug given by seed.
    '''

    # positional index <-> node id lookups, in the graph's node iteration order
    nodes = list(G.nodes())
    index_of = {node: index for index, node in enumerate(nodes)}

    # locate the seed node; fail loudly rather than silently starting the walk
    # from whichever node happened to be iterated last
    current_node_index = None
    for i, (_node_id, attributes) in enumerate(G.nodes(data=True)):
        if attributes["properties"]["name"] == seed:
            current_node_index = i
            break
    if current_node_index is None:
        raise ValueError("seed slug not found in graph: {!r}".format(seed))

    # list of node indices visited during the random walk
    visited = [current_node_index]

    for step in range(steps):

        # identify neighbours of current node; sorted by positional index so
        # the candidates are in the same order as a row of the adjacency matrix
        neighbours = np.array(
            sorted(index_of[n] for n in G.neighbors(nodes[current_node_index])),
            dtype=int,
        )

        # if reached an absorbing state, i.e. no neighbours, terminate the random walk
        if neighbours.size == 0:
            print("Reached absorbing state after",step,"steps")
            break

        # randomly select the index of next node to transition to
        current_node_index = np.random.choice(neighbours)

        # maintain record of the path taken by the random walk
        visited.append(current_node_index)

    # translate positional indices back into node ids
    return np.array(nodes)[visited]

## Naive random walk

In [34]:
# Score a single 500-step random walk seeded at '/set-up-business': the visited
# nodes are turned into slugs via the subgraph they induce, then compared with
# the known pages in the pagePath column of sab_pages.csv.
# The displayed result is the (precision, recall) tuple returned by evaluate.
evaluate(pd.read_csv("../data/processed/sab_pages.csv").pagePath.tolist(),
         getSlugs(G.subgraph(random_walk(G, 500, '/set-up-business'))))

(0.12903225806451613, 0.013029315960912053)