In [2]:
from pyspark.sql import SparkSession
# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SimpleApp")
    .getOrCreate()
)
# SparkContext handle used by the RDD-based cells.
sc = spark.sparkContext

In [8]:
# Number of rank-update rounds executed by each implementation's loop.
iters = 10
# NOTE(review): not referenced in any visible cell; presumably the number of
# nodes in dgraph.txt — confirm or remove.
N = 6012
# Damping factor: every update computes (1 - d) + d * (sum of contributions).
d = 0.85

In [11]:
# Load the edge list and split every line into its whitespace-separated tokens.
data = sc.textFile("dgraph.txt").map(lambda row: row.split())

# Group the records by their first token, giving one entry per source node
# with all of its destination tokens.
links = data.groupByKey()
# Every key present in `links` starts with a rank of 1.0.
ranks = links.map(lambda entry: (entry[0], 1.0))

def computeContribs(dests, rank):
    """Split ``rank`` evenly across ``dests``.

    Args:
        dests: sized iterable of destination node ids.
        rank: current rank of the source node.

    Returns:
        A list of ``(dest, share)`` tuples, one per destination, where
        ``share`` is ``rank / len(dests)``.

    Raises:
        ZeroDivisionError: if ``dests`` is empty.
    """
    share = rank / len(dests)
    out = []
    for target in dests:
        out.append((target, share))
    return out

# Run `iters` rounds of rank propagation.
# The join is an inner join and reduceByKey only emits keys that received a
# contribution, so a node that never appears as a destination has no entry in
# `ranks` after the first round and stops contributing from then on.
for i in range(iters):
    # (node, (dests, rank)) -> one (dest, share) record per outgoing link.
    contribs = (
        links.join(ranks)
        .flatMap(lambda entry: computeContribs(entry[1][0], entry[1][1]))
    )
    # Sum the shares each node received and apply the damping formula.
    ranks = (
        contribs
        .reduceByKey(lambda a, b: a + b)
        .map(lambda entry: (entry[0], (1 - d) + d * entry[1]))
    )

# Show the 100 highest-ranked nodes (descending via a negated sort key).
print(ranks.sortBy(lambda entry: -entry[1]).take(100))

[('2', 46.10691105759518), ('37', 24.468079296370252), ('38', 22.670161588010362), ('61', 21.431960076485367), ('52', 21.285059139778394), ('43', 18.854924698385023), ('425', 16.398342226316267), ('27', 15.768031674906283), ('28', 14.511095330628715), ('29', 11.558321636408442), ('4023', 11.542984997498607), ('5254', 9.781698440058241), ('3227', 9.692790976859241), ('40', 9.141386877616593), ('822', 8.997012868719548), ('3834', 8.81609538333864), ('73', 8.80114929637671), ('4075', 8.63330419779759), ('132', 8.535326945424675), ('81', 8.363206361612374), ('3941', 7.557524242555665), ('593', 7.551678219723799), ('5072', 7.443596419422556), ('3220', 7.3826920545684835), ('1379', 7.230737038039191), ('3875', 6.928235700273627), ('42', 6.859956829817239), ('3873', 6.617490910838404), ('5175', 6.36441874436328), ('44', 6.339981283268827), ('3224', 6.316383515491608), ('3994', 6.298944139293149), ('1370', 6.054223596856954), ('5112', 5.824808437598615), ('80', 5.813008639172718), ('3221', 5.7

In [17]:
from pyspark.sql.functions import *
from pyspark.sql.types import Row

# Read the space-delimited edge list as a two-column DataFrame (src, dest).
data = (
    spark.read
    .option("delimiter", " ")
    .csv("dgraph.txt")
    .toDF("src", "dest")
)

# One row per source node with the set of its distinct destinations.
links = (
    data.groupBy("src")
    .agg(collect_set("dest").alias("dests"))
    .withColumnRenamed("src", "url_id")
    .repartition(2)
)
# Every source node starts with a rank of 1.0.
ranks = links.select(col("url_id"), lit(1.0).alias("rank"))

def computeContribs(dests, rank):
    """Split ``rank`` evenly across ``dests`` and wrap each share in a Row.

    Args:
        dests: sized iterable of destination node ids.
        rank: current rank of the source node.

    Returns:
        A list of ``Row(url_id=dest, given=share)`` objects, one per
        destination, where ``share`` is ``rank / len(dests)``.

    Raises:
        ZeroDivisionError: if ``dests`` is empty.
    """
    share = rank / len(dests)
    rows = []
    for target in dests:
        rows.append(Row(url_id=target, given=share))
    return rows

# Run `iters` rounds of rank propagation entirely inside the DataFrame API.
# The previous version dropped to `.rdd.flatMap(...)` with a Python function
# and rebuilt a DataFrame with `.toDF()` on every round, which serialises each
# row through Python and re-infers the schema. `explode` + `size` express the
# same "split the rank evenly over the destinations" step as native column
# expressions.
for i in range(iters):
    # One output row per (source, destination) link: the destination id and
    # the share of the source's rank it receives.
    contribs = links.join(ranks, ["url_id"]).select(
        explode("dests").alias("url_id"),
        (col("rank") / size("dests")).alias("given"),
    )
    # Sum the shares per destination and apply the damping formula.
    # NOTE: `sum` here is pyspark.sql.functions.sum (brought in by the star
    # import above), not the Python builtin.
    ranks = contribs.groupBy("url_id").agg(sum("given").alias("sum_of_given")) \
        .withColumn("rank", lit(1-d)+(lit(d)*col("sum_of_given"))) \
        .select("url_id", "rank").repartition(2)

# Display the 20 highest-ranked nodes.
ranks.orderBy(desc("rank")).show()

+------+------------------+
|url_id|              rank|
+------+------------------+
|     2|  46.1069110575952|
|    37|24.468079296370256|
|    38|22.670161588010362|
|    61|21.431960076485367|
|    52|  21.2850591397784|
|    43| 18.85492469838503|
|   425|16.398342226316274|
|    27| 15.76803167490629|
|    28| 14.51109533062871|
|    29|11.558321636408444|
|  4023|11.542984997498602|
|  5254| 9.781698440058237|
|  3227| 9.692790976859236|
|    40|  9.14138687761659|
|   822| 8.997012868719546|
|  3834| 8.816095383338645|
|    73| 8.801149296376716|
|  4075| 8.633304197797587|
|   132| 8.535326945424677|
|    81| 8.363206361612379|
+------+------------------+
only showing top 20 rows



In [20]:
# Efficient implementation: links and ranks are both partitioned with the same custom partition function (fil)

# Load the edge list and split every line into its whitespace-separated tokens;
# the grouping step that follows uses the first token as the key.
data = sc.textFile("dgraph.txt").map(lambda row: row.split())

def fil(x):
    """Return the partition index (0 or 1) for a node id, based on its parity.

    This is used as a PySpark ``partitionFunc``, which is called with the
    record *key* only (here the node-id string), not with the (key, value)
    pair. Indexing the key with ``x[0]`` therefore took just its first
    character and partitioned by the parity of the leading digit; the whole
    id is converted instead.

    Args:
        x: node id, as a string of digits (or an int).

    Returns:
        ``int(x) % 2``.

    Raises:
        ValueError: if ``x`` is not a valid integer literal.
    """
    return int(x) % 2

# Build the adjacency lists and place them with `fil` in a single shuffle:
# groupByKey accepts the partition count and partition function directly, so
# the separate partitionBy pass (a second full shuffle) is not needed.
# `links` is joined in every iteration of the loop, so it is cached to avoid
# rebuilding the grouped adjacency lists each time.
links = data.groupByKey(2, fil).cache()
# Every source node starts with a rank of 1.0; mapValues keeps the
# partitioning of `links`.
ranks = links.mapValues(lambda x: 1.0)

def computeContribs(dests, rank):
    """Split ``rank`` evenly across ``dests``.

    Args:
        dests: sized iterable of destination node ids.
        rank: current rank of the source node.

    Returns:
        A list of two-element ``[dest, share]`` lists, one per destination,
        where ``share`` is ``rank / len(dests)``.

    Raises:
        ZeroDivisionError: if ``dests`` is empty.
    """
    share = rank / len(dests)
    pairs = []
    for target in dests:
        pairs.append([target, share])
    return pairs

# Run `iters` rounds of rank propagation, keeping `ranks` partitioned by `fil`.
# reduceByKey takes the partition count and partition function directly, so
# the summed contributions land in their final partitions in one shuffle.
# The previous form shuffled once by the default hash inside reduceByKey and
# then again in a trailing partitionBy(2, fil). mapValues preserves the
# partitioning. Floating-point sums may differ in the last digits because the
# addition order changes.
# NOTE(review): PySpark's join appears to regroup with its default hash
# partitioner, so a custom partition function may not avoid the join shuffle —
# confirm against the Spark UI.
for i in range(iters):
    # (node, (dests, rank)) -> one [dest, share] record per outgoing link.
    contribs = links.join(ranks).flatMap(lambda x: computeContribs(x[1][0], x[1][1]))
    # Sum the shares each node received and apply the damping formula.
    ranks = contribs.reduceByKey(lambda v1, v2: v1 + v2, 2, fil) \
        .mapValues(lambda rank: (1-d) + d*rank)

# Show the 100 highest-ranked nodes (descending via a negated sort key).
print(ranks.sortBy(lambda x: -x[1]).take(100))

[('2', 46.10691105759518), ('37', 24.468079296370274), ('38', 22.67016158801037), ('61', 21.43196007648538), ('52', 21.28505913977838), ('43', 18.854924698385016), ('425', 16.398342226316274), ('27', 15.768031674906279), ('28', 14.511095330628715), ('29', 11.558321636408438), ('4023', 11.542984997498602), ('5254', 9.781698440058234), ('3227', 9.692790976859232), ('40', 9.141386877616583), ('822', 8.997012868719548), ('3834', 8.81609538333864), ('73', 8.801149296376712), ('4075', 8.633304197797585), ('132', 8.535326945424679), ('81', 8.363206361612374), ('3941', 7.5575242425556635), ('593', 7.551678219723799), ('5072', 7.443596419422556), ('3220', 7.3826920545684835), ('1379', 7.230737038039187), ('3875', 6.928235700273624), ('42', 6.859956829817235), ('3873', 6.617490910838403), ('5175', 6.364418744363281), ('44', 6.339981283268824), ('3224', 6.316383515491606), ('3994', 6.29894413929315), ('1370', 6.054223596856955), ('5112', 5.824808437598615), ('80', 5.813008639172718), ('3221', 5.7