Task 2.1: PageRank Algorithm

In [None]:
import random
import networkx as nx

# Function to generate random link data
def generate_link_data(num_nodes):
    """Build a random directed link structure and draw it.

    Every node gets between 1 and ``min(10, num_nodes - 1)`` distinct
    outgoing links, none of which points back at the node itself. When
    there is no other node to link to (``num_nodes <= 1``) the node gets
    an empty link list instead of raising.

    Side effect: the resulting graph is drawn with
    ``networkx.draw_circular``, each node labelled with its number.

    Args:
        num_nodes: Number of nodes; nodes are numbered ``0 .. num_nodes - 1``.

    Returns:
        dict mapping each node number to the list of node numbers it links to.
    """
    link_data = {}
    max_links = min(10, num_nodes - 1)
    for node in range(num_nodes):
        if max_links < 1:
            # A lone node has nobody to link to; randint(1, 0) would raise.
            link_data[node] = []
            continue
        num_links = random.randint(1, max_links)
        # Sample from the num_nodes - 1 *other* nodes: draw from a range
        # that is one shorter and shift every value at or above `node` up
        # by one. This never yields a self-link, so the node really gets
        # `num_links` targets (filtering after sampling could drop one).
        picks = random.sample(range(num_nodes - 1), num_links)
        link_data[node] = [pick if pick < node else pick + 1 for pick in picks]

    link_graph = nx.DiGraph(link_data)
    labels = {n: str(n) for n in link_graph.nodes()}
    nx.draw_circular(link_graph, labels=labels)
    return link_data

# Function to write link data to a text file
def write_link_data_to_file(link_data, filename):
    """Dump *link_data* to *filename*, one ``node: [links]`` line per node.

    Args:
        link_data: Mapping of node -> list of linked nodes.
        filename: Path of the text file to create (overwritten if present).
    """
    with open(filename, 'w') as out:
        out.writelines(
            f"{source}: {targets}\n" for source, targets in link_data.items()
        )

# Main function
def main(num_nodes=100, filename='pagelinks.txt'):
    """Generate random link data and save it to a text file.

    Args:
        num_nodes: Number of nodes to generate links for (default 100).
        filename: Output file path (default ``'pagelinks.txt'``, the file
            the Spark cells further down read).
    """
    link_data = generate_link_data(num_nodes)

    # Write link data to a file
    write_link_data_to_file(link_data, filename)

    print(f"File '{filename}' created successfully.")


if __name__ == "__main__":
    main()


In [None]:
from pyspark import SparkContext

In [None]:
# Spark context with master "local" and application name "PageRank".
sc = SparkContext(master="local", appName="PageRank")

In [None]:
# Load the link file produced by main() above; each line has the form
# "node: [link, link, ...]" (see write_link_data_to_file).
lines = sc.textFile("pagelinks.txt")

In [None]:
def _parse_link_line(line):
    """Split one line of the link file into ``[page, dest, dest, ...]``.

    The file is written as ``node: [a, b, c]``. Splitting that on
    whitespace alone leaves tokens such as ``"0:"``, ``"[3,"`` and ``"7]"``,
    so source ids never equal destination ids and the rank join comes up
    empty. The punctuation is therefore turned into spaces first; a plain
    whitespace-separated ``node a b c`` line parses to the same result.
    """
    return line.translate(str.maketrans(":[],", "    ")).split()


# (page, dest) pairs; lines without any destination are dropped.
links = (
    lines.map(_parse_link_line)
         .filter(lambda parts: len(parts) > 1)
         .flatMap(lambda parts: [(parts[0], dest) for dest in parts[1:]])
)
# (page, [dest, ...]); cached because it is joined again in every iteration.
links = links.groupByKey().mapValues(list).cache()

In [None]:
# Initial rank of 1.0 for every page that appears as a key in `links`
# (i.e. every page with at least one outgoing link).
ranks = links.mapValues(lambda _: 1.0)

In [None]:
# Damping factor d used by the rank update (1 - d) + d * (summed contributions).
damping = 0.85
# Number of times the rank update loop is run.
num_iterations = 10

In [None]:
# One 0.0 contribution per source page. Without it, a page that nobody
# links to receives no contribution, disappears from `ranks` after the
# first iteration, and then silently stops passing rank to its neighbours.
# With it, such a page keeps the base rank (1 - damping).
zero_contributions = links.mapValues(lambda _: 0.0)

for _ in range(num_iterations):
    # Join links and ranks: (page, ([neighbors], rank)); every neighbor
    # receives an equal share of the page's current rank.
    contributions = links.join(ranks).flatMap(
        lambda page_neighbors_rank: [
            (neighbor, page_neighbors_rank[1][1] / len(page_neighbors_rank[1][0]))
            for neighbor in page_neighbors_rank[1][0]
        ]
    )

    # Sum the shares per page and update the rank with damping.
    ranks = (
        contributions.union(zero_contributions)
        .reduceByKey(lambda x, y: x + y)
        .mapValues(lambda rank: (1 - damping) + damping * rank)
    )


In [None]:
# Pull the (page, rank) pairs from the RDD back to the driver as a list.
final_ranks = ranks.collect()

In [None]:
# Report the pages from highest to lowest rank, four decimals each.
print("\nFinal Page Ranks:")
ranked_pages = sorted(final_ranks, key=lambda entry: entry[1], reverse=True)
for page, rank in ranked_pages:
    print(f"{page}: {rank:.4f}")