In [25]:
pip install pyspark



In [26]:
from pyspark import SparkContext
from pyspark import SparkConf

In [41]:
# Build the Spark configuration and obtain the shared SparkContext.
# The conf must be handed to getOrCreate, otherwise the app name set here is
# never applied. If a context already exists in this kernel it is returned
# unchanged and the conf is not applied to it.
conf = SparkConf().setAppName("PageRank")
sc = SparkContext.getOrCreate(conf=conf)

In [50]:
# Read the small graph as an RDD of raw text lines (one edge per line).
small_graph_path = "drive/MyDrive/A3-data/graph-small.txt"
edges = sc.textFile(small_graph_path)

def parse_edge(edge):
    """Parse one edge-list line into a ``(source, destination)`` pair of ints.

    The line is split on any run of whitespace, so tab-separated and
    space-separated files are both accepted. Only the first two fields are
    used; any further fields on the line are ignored.

    Args:
        edge: One line of the edge-list file.

    Returns:
        A tuple ``(source, destination)`` of ints.

    Raises:
        ValueError: If the line has fewer than two fields, or a field is not
            an integer.
    """
    fields = edge.split()
    if len(fields) < 2:
        raise ValueError(f"expected 'source destination', got {edge!r}")
    return (int(fields[0]), int(fields[1]))

# Parse the raw lines into (source, destination) pairs. Duplicate edges are
# dropped so a repeated line does not inflate a node's out-degree.
edges = edges.map(parse_edge).distinct()

# Every node id that appears as a source or a destination; n is their count.
nodes = edges.flatMap(lambda edge: edge).distinct().cache()
n = nodes.count()

# Adjacency lists: (source, iterable of destinations). Cached because the
# same RDD is joined against the ranks on every iteration.
edges = edges.groupByKey().cache()

# Uniform starting rank, keyed by the ids actually present in the graph
# rather than assuming they are exactly 1..n.
ranks = nodes.map(lambda node: (node, 1.0 / n))

def compute_contribs(neighbors, rank):
    """Split a node's rank evenly among the nodes it links to.

    Args:
        neighbors: Sized iterable of destination node ids (``len`` is taken).
        rank: The current rank of the source node.

    Yields:
        ``(destination, rank / len(neighbors))`` for each destination, in
        iteration order. Nothing is yielded when ``neighbors`` is empty.
    """
    out_degree = len(neighbors)
    yield from ((target, rank / out_degree) for target in neighbors)

b = 0.8          # damping factor: weight of link-following vs. teleporting
iterations = 40  # number of power-iteration steps

# A zero contribution for every node in the graph. Unioning this into the
# per-iteration contributions keeps nodes that receive no in-link
# contributions in `ranks` (with rank (1 - b) / n) instead of letting them
# disappear after the first reduceByKey.
zero_contribs = (
    edges.flatMap(lambda adjacency: [adjacency[0], *adjacency[1]])
    .distinct()
    .map(lambda node: (node, 0.0))
    .cache()
)

# NOTE: a node with no out-links has no entry in `edges`, so it passes no
# rank on; on a graph with such nodes the ranks will sum to less than 1.
for iteration in range(iterations):
    # Each source node sends rank / out_degree to every node it links to.
    contribs = edges.join(ranks).flatMap(
        lambda edge_rank: compute_contribs(edge_rank[1][0], edge_rank[1][1])
    )

    # New rank = teleport share + damped sum of incoming contributions.
    ranks = (
        contribs.union(zero_contribs)
        .reduceByKey(lambda x, y: x + y)
        .mapValues(lambda contrib: (1 - b) / n + b * contrib)
    )

# Bring the result to the driver and order it from highest to lowest rank.
final_ranks = ranks.collect()
final_ranks.sort(key=lambda x: x[1], reverse=True)

# Slices (rather than fixed indices) so graphs with fewer than 5 nodes do not
# raise IndexError.
print("Top 5 nodes by PageRank:")
for node, rank in final_ranks[:5]:
    print(f"Node {node} has a PageRank of {rank}")

print("Bottom 5 nodes by PageRank:")
for node, rank in reversed(final_ranks[-5:]):  # lowest rank first
    print(f"Node {node} has a PageRank of {rank}")

Top 5 nodes by PageRank:
Node 53 has a PageRank of 0.03573120223267161
Node 14 has a PageRank of 0.03417090697259137
Node 40 has a PageRank of 0.033630087189743904
Node 1 has a PageRank of 0.03000597947978861
Node 27 has a PageRank of 0.029720144201405382
Bottom 5 nodes by PageRank:
Node 85 has a PageRank of 0.003409694077402821
Node 59 has a PageRank of 0.0036698606601272845
Node 81 has a PageRank of 0.0036953517493609916
Node 37 has a PageRank of 0.0038082042916114515
Node 89 has a PageRank of 0.003922466019802269


In [51]:
# Read the full graph as an RDD of raw text lines (one edge per line).
full_graph_path = "drive/MyDrive/A3-data/graph-full.txt"
edges = sc.textFile(full_graph_path)

def parse_edge(edge):
    """Parse one edge-list line into a ``(source, destination)`` pair of ints.

    The line is split on any run of whitespace, so tab-separated and
    space-separated files are both accepted. Only the first two fields are
    used; any further fields on the line are ignored.

    Args:
        edge: One line of the edge-list file.

    Returns:
        A tuple ``(source, destination)`` of ints.

    Raises:
        ValueError: If the line has fewer than two fields, or a field is not
            an integer.
    """
    fields = edge.split()
    if len(fields) < 2:
        raise ValueError(f"expected 'source destination', got {edge!r}")
    return (int(fields[0]), int(fields[1]))

# Parse the raw lines into (source, destination) pairs. Duplicate edges are
# dropped so a repeated line does not inflate a node's out-degree.
edges = edges.map(parse_edge).distinct()

# Every node id that appears as a source or a destination; n is their count.
nodes = edges.flatMap(lambda edge: edge).distinct().cache()
n = nodes.count()

# Adjacency lists: (source, iterable of destinations). Cached because the
# same RDD is joined against the ranks on every iteration.
edges = edges.groupByKey().cache()

# Uniform starting rank, keyed by the ids actually present in the graph
# rather than assuming they are exactly 1..n.
ranks = nodes.map(lambda node: (node, 1.0 / n))

def compute_contribs(neighbors, rank):
    """Split a node's rank evenly among the nodes it links to.

    Args:
        neighbors: Sized iterable of destination node ids (``len`` is taken).
        rank: The current rank of the source node.

    Yields:
        ``(destination, rank / len(neighbors))`` for each destination, in
        iteration order. Nothing is yielded when ``neighbors`` is empty.
    """
    out_degree = len(neighbors)
    yield from ((target, rank / out_degree) for target in neighbors)

b = 0.8          # damping factor: weight of link-following vs. teleporting
iterations = 40  # number of power-iteration steps

# A zero contribution for every node in the graph. Unioning this into the
# per-iteration contributions keeps nodes that receive no in-link
# contributions in `ranks` (with rank (1 - b) / n) instead of letting them
# disappear after the first reduceByKey.
zero_contribs = (
    edges.flatMap(lambda adjacency: [adjacency[0], *adjacency[1]])
    .distinct()
    .map(lambda node: (node, 0.0))
    .cache()
)

# NOTE: a node with no out-links has no entry in `edges`, so it passes no
# rank on; on a graph with such nodes the ranks will sum to less than 1.
for iteration in range(iterations):
    # Each source node sends rank / out_degree to every node it links to.
    contribs = edges.join(ranks).flatMap(
        lambda edge_rank: compute_contribs(edge_rank[1][0], edge_rank[1][1])
    )

    # New rank = teleport share + damped sum of incoming contributions.
    ranks = (
        contribs.union(zero_contribs)
        .reduceByKey(lambda x, y: x + y)
        .mapValues(lambda contrib: (1 - b) / n + b * contrib)
    )

# Bring the result to the driver and order it from highest to lowest rank.
final_ranks = ranks.collect()
final_ranks.sort(key=lambda x: x[1], reverse=True)

# Slices (rather than fixed indices) so graphs with fewer than 5 nodes do not
# raise IndexError.
print("Top 5 nodes by PageRank:")
for node, rank in final_ranks[:5]:
    print(f"Node {node} has a PageRank of {rank}")

print("Bottom 5 nodes by PageRank:")
for node, rank in reversed(final_ranks[-5:]):  # lowest rank first
    print(f"Node {node} has a PageRank of {rank}")

Top 5 nodes by PageRank:
Node 263 has a PageRank of 0.002020291181518219
Node 537 has a PageRank of 0.0019433415714531492
Node 965 has a PageRank of 0.0019254478071662631
Node 243 has a PageRank of 0.001852634016241731
Node 285 has a PageRank of 0.0018273721700645144
Bottom 5 nodes by PageRank:
Node 558 has a PageRank of 0.0003286018525215297
Node 93 has a PageRank of 0.0003513568937516577
Node 62 has a PageRank of 0.00035314810510596274
Node 424 has a PageRank of 0.00035481538649301454
Node 408 has a PageRank of 0.00038779848719291705
