#### Dependencies
_____

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc
from graphframes import *

#### Spark Session
____

In [2]:
# Reuse the active Spark session if there is one; otherwise start a new
# session registered under the application name 'PageRank'.
spark = (
    SparkSession.builder
    .appName('PageRank')
    .getOrCreate()
)

#### Load Adjacency List
____

In [3]:
# Read the adjacency list as an RDD with one element per line of the file.
adjacency_path = '../resources/02AdjacencyList.txt'
rdd = spark.sparkContext.textFile(adjacency_path)

# Mark the RDD for caching; persist() returns the RDD itself, so the cell
# still displays its description.
rdd.persist()

../resources/02AdjacencyList.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

#### Create Vertices and Edges DataFrames
____

In [6]:
# Accumulators for the rows of the vertices and edges DataFrames.
v_list = []
e_list = []

# Each line of the adjacency list is "<source> <neighbour> <neighbour> ...".
# split() with no argument splits on any run of whitespace, so trailing or
# repeated spaces no longer yield empty-string ids, and blank lines are
# dropped instead of producing a vertex whose id is ''.
adjacency = (
    rdd.map(lambda item: item.split())
       .filter(lambda node: len(node) > 0)
       .collect()
)

# The parsed lines are brought to the driver and turned into plain tuples:
# (id, name) for every source vertex and (src, dst) for every listed neighbour.
for src, *neighbours in adjacency:
    v_list.append((src, 'vertice_' + src))
    e_list.extend((src, dst) for dst in neighbours)


In [7]:
# Turn the (id, name) tuples into a DataFrame; column types are inferred
# from the data.
vertices = spark.createDataFrame(data=v_list, schema=['id', 'name'])
vertices.show()

+---+---------+
| id|     name|
+---+---------+
|  1|vertice_1|
|  2|vertice_2|
|  3|vertice_3|
|  4|vertice_4|
|  5|vertice_5|
+---+---------+



In [8]:
# Turn the (src, dst) tuples into a DataFrame; column types are inferred
# from the data.
edges = spark.createDataFrame(data=e_list, schema=['src', 'dst'])
edges.show()

+---+---+
|src|dst|
+---+---+
|  1|  2|
|  2|  3|
|  2|  4|
|  3|  4|
|  4|  1|
|  4|  5|
|  5|  3|
+---+---+



#### Construct Graph 
____

In [9]:
# Build the graph from the vertices DataFrame (columns 'id', 'name') and the
# edges DataFrame (columns 'src', 'dst') created above.
graph = GraphFrame(vertices, edges)

#### Verify Vertices, Edges and Degrees of Graph
____

In [10]:
# Display the vertices as held by the graph to confirm they match the input.
graph.vertices.show()

+---+---------+
| id|     name|
+---+---------+
|  1|vertice_1|
|  2|vertice_2|
|  3|vertice_3|
|  4|vertice_4|
|  5|vertice_5|
+---+---------+



In [11]:
# Display the edges as held by the graph to confirm they match the input.
graph.edges.show()

+---+---+
|src|dst|
+---+---+
|  1|  2|
|  2|  3|
|  2|  4|
|  3|  4|
|  4|  1|
|  4|  5|
|  5|  3|
+---+---+



In [12]:
# Display the degree of every vertex; in the output below the count includes
# both incoming and outgoing edges (e.g. vertex 4 has 2 in + 2 out = 4).
graph.degrees.show()

+---+------+
| id|degree|
+---+------+
|  3|     3|
|  5|     2|
|  1|     2|
|  4|     4|
|  2|     3|
+---+------+



#### PageRank
____

In [13]:
# Run PageRank for a fixed 30 iterations with a reset probability of 0.15.
# The result is a graph whose vertices carry a 'pagerank' column.
ranks = graph.pageRank(
    maxIter=30,
    resetProbability=0.15,
)

In [14]:
# Print the column names and types of the PageRank result's vertex DataFrame.
ranks.vertices.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- pagerank: double (nullable = true)



In [15]:
# Sort vertices from highest to lowest PageRank, then show only the id and
# its score.
by_rank_desc = ranks.vertices.orderBy(desc('pagerank'))
by_rank_desc.select('id', 'pagerank').show()

+---+------------------+
| id|          pagerank|
+---+------------------+
|  4|1.4772988621034329|
|  3|1.1559608107498358|
|  2|0.8111910907636817|
|  1|0.7777746181915253|
|  5|0.7777746181915253|
+---+------------------+



In [16]:
# Sum the PageRank scores of all vertices with a global (ungrouped)
# aggregation, and pull the single resulting value back to the driver.
total_row = ranks.vertices.groupBy().sum('pagerank').first()
sum_pr = total_row[0]

# Add each vertex's share of the total PageRank as the 'perc_rank' column.
pagerank_share = ranks.vertices['pagerank'] / sum_pr
ranks.vertices.withColumn("perc_rank", pagerank_share).show()

+---+---------+------------------+-------------------+
| id|     name|          pagerank|          perc_rank|
+---+---------+------------------+-------------------+
|  1|vertice_1|0.7777746181915253|  0.155554923638305|
|  3|vertice_3|1.1559608107498358|0.23119216214996707|
|  2|vertice_2|0.8111910907636817|0.16223821815273629|
|  4|vertice_4|1.4772988621034329| 0.2954597724206865|
|  5|vertice_5|0.7777746181915253|  0.155554923638305|
+---+---------+------------------+-------------------+



#### Save To Disk
______

In [19]:
# Write the ranked vertices (id, name, pagerank) to disk as CSV.
# The writer's default save mode raises an error when the target path already
# exists, which makes this cell fail on every run after the first. Using
# "overwrite" replaces the previous output so the notebook can be re-run
# from a fresh kernel.
(
    ranks.vertices.write
    .format("csv")
    .mode("overwrite")
    .save("../output/graphframe_page_rank")
)