In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc
from graphframes import *
from pyspark.sql.functions import col, asc

In [2]:
# Obtain the Spark session (named 'PageRank') that every later cell uses.
spark = (
    SparkSession.builder
    .appName('PageRank')
    .getOrCreate()
)

In [79]:
# Load the adjacency list as an RDD of text lines; the next cell parses each
# line as a node id followed by the ids it links to.
adjacency_path = '../resources/02AdjacencyList.txt'
rdd = spark.sparkContext.textFile(adjacency_path)

In [80]:
# Parse the adjacency list on the driver.
# Each line is read as "<node> <neighbour> <neighbour> ...": the first token
# becomes a vertex, every following token becomes an edge node -> neighbour.
v_list = []
e_list = []

# split() without an argument collapses runs of whitespace and ignores
# leading/trailing blanks, so a trailing space or a doubled space can no
# longer yield an empty-string vertex id or edge endpoint.
for node in rdd.map(lambda item: item.split()).collect():
    if not node:
        # Blank line in the input file: nothing to record.
        continue

    v_list.append((node[0], 'vertice_'+node[0]))
    e_list.extend((node[0], neighbour) for neighbour in node[1:])

In [81]:
# Turn the parsed Python lists into DataFrames:
#   vertices -> (id, name), edges -> (src, dst).
vertices = spark.createDataFrame(data=v_list, schema=['id', 'name'])
edges = spark.createDataFrame(data=e_list, schema=['src', 'dst'])

In [82]:
# Display the vertex table (id, name) as a sanity check of the parse.
vertices.show()

+---+---------+
| id|     name|
+---+---------+
|  1|vertice_1|
|  2|vertice_2|
|  3|vertice_3|
|  4|vertice_4|
|  5|vertice_5|
+---+---------+



In [83]:
# Display the edge table (src, dst) as a sanity check of the parse.
edges.show()

+---+---+
|src|dst|
+---+---+
|  1|  2|
|  2|  3|
|  2|  4|
|  3|  4|
|  4|  1|
|  4|  5|
|  5|  3|
+---+---+



#### PageRank DataFrame
##### Initial Page Rank = 1/N (N = number of vertices)
____

In [179]:
# Seed iteration 0: every page starts with the same rank, 1 / (number of vertices),
# stored in both the `pagerank` and `perc_rank` columns.
# The ids are collected once and the count is taken from that list, so Spark is
# not asked to re-count the vertices twice for every row being built.
vertex_ids = [vertex[0] for vertex in vertices.select('id').collect()]

# With no vertices no row is built, so the rank is never used; skip the division.
initial_rank = 1/len(vertex_ids) if vertex_ids else 0.0
l_rank = [(vertex_id, 0, initial_rank, initial_rank) for vertex_id in vertex_ids]

pagerank = spark.createDataFrame(l_rank, ['page', 'iteration', 'pagerank', 'perc_rank'])

In [180]:
# Display the iteration-0 rank table.
pagerank.show()

+----+---------+--------+---------+
|page|iteration|pagerank|perc_rank|
+----+---------+--------+---------+
|   1|        0|     0.2|      0.2|
|   2|        0|     0.2|      0.2|
|   3|        0|     0.2|      0.2|
|   4|        0|     0.2|      0.2|
|   5|        0|     0.2|      0.2|
+----+---------+--------+---------+



In [181]:
def calc_page_rank(edge, damping_factor):
    """Compute the next-iteration PageRank of a single page.

    Reads the notebook-level ``edges`` (src, dst) and ``pagerank``
    (page, iteration, pagerank, ...) DataFrames; neither is modified.

    The rank is ``(1 - d) + d * sum(rank(src) / out_degree(src))`` over every
    edge ``src -> edge``, using the ranks stored for the latest iteration
    present in ``pagerank``.

    Args:
        edge: id of the page to rank (despite the name, this is a page id; it
            is matched against ``edges.dst``).
        damping_factor: the damping factor ``d`` in the formula above.

    Returns:
        Tuple ``(page id, latest iteration + 1, new rank)``.
    """
    # Resolve the latest iteration once, before looking at any edge. It used to
    # be assigned only inside the loop below, so a page without inbound links
    # hit an UnboundLocalError on the return statement.
    last_iteration = pagerank.agg({'iteration': 'max'}).collect()[0][0]
    latest_ranks = pagerank.filter(pagerank['iteration'] == last_iteration)

    total_rank = []

    # Column comparisons instead of f-string SQL: ids are passed as values, so
    # an id containing a quote cannot break the filter expression.
    for inbound in edges.filter(edges['dst'] == edge).collect():
        source = inbound['src']
        last_rank = latest_ranks.filter(latest_ranks['page'] == source).select('pagerank').collect()[0][0]
        total_outbound = edges.filter(edges['src'] == source).count()

        # Each source shares its rank equally among its outbound links.
        total_rank.append(last_rank/total_outbound)

    return (edge, last_iteration + 1, (1-damping_factor) + (damping_factor * sum(total_rank)))

In [182]:
damping_factor = 0.85
rows = []
iterations = 29

# NOTE: every pass appends to the notebook-level `pagerank` table, and
# calc_page_rank continues from the highest iteration stored there, so running
# this cell again adds `iterations` further iterations rather than restarting.
for _ in range(iterations):
    # All pages of one iteration are computed against the same `pagerank`
    # table; the table is only extended after the inner loop has finished.
    for page in vertices.select('id').collect():
        rows.append(calc_page_rank(page[0], damping_factor))

    # Name the columns explicitly instead of relying on the auto-generated
    # _1/_2/_3 names and positional `columns[-1]` look-ups.
    newRows = spark.createDataFrame(rows, ['page', 'iteration', 'pagerank'])
    sum_pr = newRows.agg({'pagerank': 'sum'}).collect()[0][0]

    # perc_rank: this iteration's rank expressed as a share of the iteration total.
    newRows = newRows.withColumn("perc_rank", newRows['pagerank']/sum_pr)

    # union() resolves columns by position; newRows has the same column order
    # as pagerank (page, iteration, pagerank, perc_rank).
    pagerank = pagerank.union(newRows)
    rows.clear()


In [183]:
# Find the highest iteration number stored in the rank table ...
max_iteration_row = pagerank.agg({'iteration' : 'max'}).collect()[0]
last_iteration = max_iteration_row[0]

# ... and display only the rows belonging to that iteration.
final_ranks = pagerank.filter(f"iteration == {last_iteration}")
final_ranks.show()

+----+---------+------------------+-------------------+
|page|iteration|          pagerank|          perc_rank|
+----+---------+------------------+-------------------+
|   1|       29|0.7723196821160265|0.15558130445306423|
|   2|       29|0.8056407707925519|0.16229373010028633|
|   3|       29|1.1476149730879313|0.23118332817014126|
|   4|       29|1.4661954349708077| 0.2953603328234441|
|   5|       29|0.7723196821160265|0.15558130445306423|
+----+---------+------------------+-------------------+



In [122]:
# NOTE(review): `df` is not assigned in any visible cell, so this cell only
# runs with leftover kernel state. Presumably it held the rows of a single
# iteration (page, iteration, pagerank) - confirm, or rebuild it from `pagerank`.
# Adds x4 = pagerank / sum_pr, using whatever `sum_pr` currently holds.
df = df.withColumn("x4", df['pagerank'] / sum_pr)

In [123]:
# Display df including the x4 column added in the previous cell.
df.show()

+----+---------+------------------+-------------------+
|page|iteration|          pagerank|                 x4|
+----+---------+------------------+-------------------+
|   1|       30|0.7731330598625933| 0.1555763445194499|
|   2|       30|0.8064717297986226|0.16228503241427328|
|   3|       30|1.1488690573854572|0.23118510584879393|
|   4|       30|1.4678700547115762| 0.2953771726980331|
|   5|       30|0.7731330598625933| 0.1555763445194499|
+----+---------+------------------+-------------------+



In [143]:
# Sum the right-most column of df (the cell's displayed value).
last_column = df.columns[-1]
df.select(last_column).agg({last_column: 'sum'}).collect()[0][0]

1.0

In [148]:
# Adds x5 = pagerank / sum_pr.
# NOTE(review): in the recorded output x5 equals pagerank / 1.6, while x4
# (same formula, earlier cell) sums to 1.0 - so `sum_pr` held a value from a
# different frame when this ran. Recompute sum_pr from `df` before relying on x5.
x5_share = df.pagerank / sum_pr
df = df.withColumn("x5", x5_share)

In [149]:
# Display df with both the x4 and x5 columns for comparison.
df.show()

+----+---------+------------------+-------------------+-------------------+
|page|iteration|          pagerank|                 x4|                 x5|
+----+---------+------------------+-------------------+-------------------+
|   1|       30|0.7731330598625933| 0.1555763445194499|0.48320816241412073|
|   2|       30|0.8064717297986226|0.16228503241427328|  0.504044831124139|
|   3|       30|1.1488690573854572|0.23118510584879393| 0.7180431608659106|
|   4|       30|1.4678700547115762| 0.2953771726980331| 0.9174187841947349|
|   5|       30|0.7731330598625933| 0.1555763445194499|0.48320816241412073|
+----+---------+------------------+-------------------+-------------------+



In [171]:
# Scratch check of the per-iteration normalisation.
# NOTE(review): `tempRows` is only assigned in the commented-out draft below
# (a reference to `newRows` taken before `perc_rank` is added), so this cell
# does not run on a fresh kernel - confirm whether it should be kept.
#    newRows = spark.createDataFrame(rows)
#    tempRows = newRows
#    sum_pr = newRows.select(newRows.columns[-1]).agg({f'{newRows.columns[-1]}' : 'sum'}).collect()[0][0]
#    newRows = newRows.withColumn("perc_rank", (newRows[f'{newRows.columns[-2]}']/sum_pr))

# Keep the column name under its own identifier: assigning it to `col` would
# shadow the `col` function imported from pyspark.sql.functions at the top of
# the notebook.
rank_col = tempRows.columns[-1]

# This reassigns the notebook-level `sum_pr` that other cells also read.
sum_pr = tempRows.select(rank_col).agg({rank_col: 'sum'}).collect()[0][0]

tempRows.withColumn("perc_rank", (tempRows[rank_col]/sum_pr)).show()

+---+---+-------------------+---------+
| _1| _2|                 _3|perc_rank|
+---+---+-------------------+---------+
|  1|  1|0.23500000000000004| 0.146875|
|  2|  1|0.32000000000000006|      0.2|
|  3|  1|              0.405| 0.253125|
|  4|  1|              0.405| 0.253125|
|  5|  1|0.23500000000000004| 0.146875|
+---+---+-------------------+---------+

