In [None]:
import pandas as pd

In [1]:
def getNodesEdges(filename):
    """
    Read a whitespace-separated edge list and return its nodes and edges.

    Every non-blank line must contain exactly two tokens, "src dst",
    separated by any run of spaces or tabs. Blank lines are skipped. Node
    identifiers are kept as the strings read from the file, and repeated
    lines produce repeated entries in the edge list.

    The node and edge counts are printed before returning.

    Args:
        filename: Path of the edge-list file to read.

    Returns:
        A tuple (nodes, edges): `nodes` is the set of all identifiers seen
        and `edges` is the list of (src, dst) tuples in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly two tokens.
    """
    nodes = set()
    edges = []
    with open(filename, 'r') as f:
        for lineNumber, line in enumerate(f, start=1):
            # split() with no argument collapses any whitespace run, so tabs,
            # repeated spaces and the trailing newline are all handled.
            fields = line.split()
            if not fields:
                # A blank line (commonly the last one) carries no edge.
                continue
            if len(fields) != 2:
                raise ValueError(
                    "{}:{}: expected 'src dst', got {!r}".format(
                        filename, lineNumber, line
                    )
                )
            src, dst = fields
            nodes.add(src)
            nodes.add(dst)
            edges.append((src, dst))
    print("Number of Nodes:", len(nodes))
    print("Number of Edges:", len(edges))
    return nodes, edges
    
    
# Parse the combined Twitter edge list once; the resulting `nodes` set and
# `edges` list are the inputs to the ranking functions in the cells below.
nodes, edges = getNodesEdges(filename="../Data/twitter_combined.txt")

Number of Nodes: 81306
Number of Edges: 2420766


In [2]:
def calculatePageRank(nodes, edges, dampingFactor=0.85, iterations=10):
    """
    Calculate the PageRank of each node by power iteration.

    Each iteration computes, from the previous iteration's ranks,

        PR(v) = (1 - d) / N
                + d * (sum over edges u -> v of PR(u) / outDegree(u))
                + d * danglingMass / N

    where d is `dampingFactor`, N is the number of nodes and danglingMass is
    the total rank held by nodes without outgoing edges. Spreading that mass
    evenly keeps the ranks summing to 1.

    Args:
        nodes: Collection of node identifiers. Every endpoint that appears in
            `edges` must be present, otherwise a KeyError is raised.
        edges: List of (src, dst) pairs describing directed links. It is
            traversed several times, so it must be re-iterable. A pair that
            appears k times counts as k parallel links.
        dampingFactor: Probability of following a link rather than jumping to
            a uniformly chosen node.
        iterations: Number of update passes to run.

    Returns:
        A dict mapping each node to its rank; empty when `nodes` is empty.
    """
    numNodes = len(nodes)
    if numNodes == 0:
        return {}

    # Out-degrees depend only on the graph, so count them once up front.
    outDegree = {node: 0 for node in nodes}
    for src, _ in edges:
        outDegree[src] += 1
    danglingNodes = [node for node in nodes if outDegree[node] == 0]

    # Start from the uniform distribution.
    pageRank = {node: 1 / numNodes for node in nodes}
    for i in range(iterations):
        print("Iteration", i)
        # Rank sitting on nodes with no out-links would otherwise vanish.
        danglingMass = sum(pageRank[node] for node in danglingNodes)
        baseRank = (1 - dampingFactor) / numNodes \
            + dampingFactor * danglingMass / numNodes
        # Build the new ranks in a separate dict so every update in this pass
        # reads the previous pass's values only.
        nextRank = {node: baseRank for node in nodes}
        # Push each source's rank along its out-links: one pass over the
        # edges instead of a scan of all nodes for every node.
        for src, dst in edges:
            nextRank[dst] += dampingFactor * pageRank[src] / outDegree[src]
        pageRank = nextRank
    return pageRank

def calculateAuthorityScore(nodes, edges, dampingFactor=0.85, iterations=10):
    """
    Calculate the Authority Score of each node with the HITS update rule.

    Hub and authority scores are refined together. In every iteration

        authority(v) = sum of hub(u)       over edges u -> v
        hub(u)       = sum of authority(v) over edges u -> v

    and each score vector is rescaled to sum to 1 after it is updated. Both
    vectors start at 1/N.

    Args:
        nodes: Collection of node identifiers. Every endpoint that appears in
            `edges` must be present, otherwise a KeyError is raised.
        edges: List of (src, dst) pairs describing directed links. It is
            traversed several times, so it must be re-iterable. A pair that
            appears k times counts as k parallel links.
        dampingFactor: Not used. HITS has no teleport term; the parameter is
            accepted only so existing calls keep working.
        iterations: Number of update passes to run.

    Returns:
        A dict mapping each node to its authority score. The scores sum to 1
        when the graph has edges; with no edges every node keeps 1/N. Empty
        when `nodes` is empty.
    """
    numNodes = len(nodes)
    if numNodes == 0:
        return {}

    authorityScore = {node: 1 / numNodes for node in nodes}
    hubScore = dict(authorityScore)
    for i in range(iterations):
        print("Iteration", i)
        # A node is a good authority when good hubs point to it.
        nextAuthority = {node: 0.0 for node in nodes}
        for src, dst in edges:
            nextAuthority[dst] += hubScore[src]
        authorityTotal = sum(nextAuthority.values())
        if authorityTotal == 0:
            # No edges contribute anything; keep the current scores rather
            # than dividing by zero.
            break
        authorityScore = {
            node: value / authorityTotal
            for node, value in nextAuthority.items()
        }

        # A node is a good hub when it points to good authorities.
        nextHub = {node: 0.0 for node in nodes}
        for src, dst in edges:
            nextHub[src] += authorityScore[dst]
        # Positive whenever authorityTotal is: the edge that gave some dst a
        # positive authority also feeds that value back to its src.
        hubTotal = sum(nextHub.values())
        hubScore = {node: value / hubTotal for node, value in nextHub.items()}
    return authorityScore

def calculateHubScore(nodes, edges, dampingFactor=0.85, iterations=10):
    """
    Calculate the Hub Score of each node with the HITS update rule.

    In every iteration the authority scores are derived from the current hub
    scores and the hub scores are then recomputed from them:

        authority(v) = sum of hub(u)       over edges u -> v
        hub(u)       = sum of authority(v) over edges u -> v

    The hub vector is rescaled to sum to 1 after each update. It starts at
    1/N for every node.

    Args:
        nodes: Collection of node identifiers. Every endpoint that appears in
            `edges` must be present, otherwise a KeyError is raised.
        edges: List of (src, dst) pairs describing directed links. It is
            traversed several times, so it must be re-iterable. A pair that
            appears k times counts as k parallel links.
        dampingFactor: Not used. HITS has no teleport term; the parameter is
            accepted only so existing calls keep working.
        iterations: Number of update passes to run.

    Returns:
        A dict mapping each node to its hub score. The scores sum to 1 when
        the graph has edges; with no edges every node keeps 1/N. Empty when
        `nodes` is empty.
    """
    numNodes = len(nodes)
    if numNodes == 0:
        return {}

    hubScore = {node: 1 / numNodes for node in nodes}
    for i in range(iterations):
        print("Iteration", i)
        # Authorities collect the hub score of everything pointing at them.
        # They are left unscaled here because the hub vector is rescaled
        # below, which fixes the overall magnitude.
        authorityScore = {node: 0.0 for node in nodes}
        for src, dst in edges:
            authorityScore[dst] += hubScore[src]
        if sum(authorityScore.values()) == 0:
            # No edges contribute anything; keep the current scores rather
            # than dividing by zero.
            break

        # Hubs collect the authority score of everything they point at.
        nextHub = {node: 0.0 for node in nodes}
        for src, dst in edges:
            nextHub[src] += authorityScore[dst]
        # Positive whenever the authority total is: the edge that gave some
        # dst a positive authority also feeds that value back to its src.
        hubTotal = sum(nextHub.values())
        hubScore = {node: value / hubTotal for node, value in nextHub.items()}
    return hubScore

In [None]:
# Keep the result in a variable so later cells can reuse it without
# recomputing, and display only the ten highest-ranked nodes instead of
# echoing one entry per node into the cell output.
pageRank = calculatePageRank(nodes, edges)
sorted(pageRank.items(), key=lambda item: item[1], reverse=True)[:10]