In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import *
from graphframes import *
from pyspark.sql.functions import *

In [2]:
# Build (or reuse, if one is already running) the Spark session used by every
# DataFrame and GraphFrame in this notebook.
#
# Executors must send heartbeats to the driver well within
# spark.network.timeout (120s by default). The previous heartbeat interval of
# "30000s" (~8.3 hours) exceeded that timeout, which Spark either rejects at
# start-up or which causes executors to be declared lost. To tolerate long
# PageRank iterations, raise the network timeout and keep the heartbeat
# interval comfortably below it. Both values are conservative picks; tune
# them for the cluster at hand.
spark = (
    SparkSession.builder
    .appName("PageRank")
    .config("spark.network.timeout", "600s")
    .config("spark.executor.heartbeatInterval", "60s")
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell's result to confirm it was created.
spark

In [5]:
# Read every file matching *pages.csv as CSV. No header or schema options are
# passed, so the columns are addressed later by their default names (_c0, ...).
pages = spark.read.load(
    "../data/w3data/pagerank/*pages.csv",
    format="csv",
)

In [6]:
# Read every file matching *links.csv as CSV. No header or schema options are
# passed, so the columns are addressed later by their default names (_c0, _c1).
links = spark.read.load(
    "../data/w3data/pagerank/*links.csv",
    format="csv",
)

In [7]:
# Give the raw CSV columns the names GraphFrame expects: "src"/"dst" on the
# edge table and "id" on the vertex table.
#
# withColumnRenamed is a no-op when the source column is absent, so this cell
# can be re-run safely even though it rebinds `links` and `pages`. The earlier
# select(col("_c0")...) form raised an AnalysisException on a second run,
# because "_c0"/"_c1" no longer existed. The trailing select keeps only the
# renamed columns, exactly as before.
links = (
    links
    .withColumnRenamed("_c0", "src")
    .withColumnRenamed("_c1", "dst")
    .select("src", "dst")
)
pages = (
    pages
    .withColumnRenamed("_c0", "id")
    .select("id")
)

In [8]:
# Preview the first five vertex rows without truncating the long URLs.
pages.show(n=5, truncate=False)

+---------------------------------------------------------------------------------+
|id                                                                               |
+---------------------------------------------------------------------------------+
|https://w3.ibm.com/w3publisher/google-analytics/education-training/when-to-use-ga|
|https://w3.ibm.com/w3publisher/gts-efc/events/initiatives/balance-sheet-reporting|
|https://w3.ibm.com/w3publisher/patents/research-master-inventor                  |
|https://w3.ibm.com/w3publisher/gts-next/gts-narrative                            |
|https://w3.ibm.com/w3publisher/gbs-eu-pde/archive/consulting-fund-2              |
+---------------------------------------------------------------------------------+
only showing top 5 rows



In [9]:
# Preview the first five edge rows without truncating the long URLs.
links.show(n=5, truncate=False)

+---------------------------------------------------------------+-------------------------------------------------------------+
|src                                                            |dst                                                          |
+---------------------------------------------------------------+-------------------------------------------------------------+
|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/w3publisher/patents/why-we-patent         |
|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/w3publisher/patents/awards                |
|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/                                          |
|https://w3.ibm.com/w3publisher/patents/research-master-inventor|http://w3.ibm.com/w3/info_terms_of_use.html                  |
|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/w3publisher/patents/

### Create a GraphFrame from vertices and edges

In [10]:
# Assemble the graph from the two prepared DataFrames.
w3graph = GraphFrame(
    pages,  # vertices
    links,  # edges
)

### Run the PageRank algorithm

In [13]:
# Run PageRank for a fixed five iterations with a reset probability of 0.15.
w3pagerank = w3graph.pageRank(
    maxIter=5,
    resetProbability=0.15,
)

In [18]:
# List the 50 highest-ranked pages, highest first, with full (untruncated) URLs.
# NOTE(review): the recorded output shows ".../w3publisher" and
# ".../w3publisher/" as separate vertices. Consider whether trailing slashes
# should be normalized before building the graph.
(
    w3pagerank.vertices
    .orderBy("pagerank", ascending=False)
    .show(n=50, truncate=False)
)

+-----------------------------------------------------------------------------------------------------+------------------+
|id                                                                                                   |pagerank          |
+-----------------------------------------------------------------------------------------------------+------------------+
|https://w3.ibm.com/w3publisher                                                                       |918.6350758663948 |
|https://w3.ibm.com/w3publisher/w3-privacy-notice/                                                    |917.9575882531376 |
|https://w3.ibm.com/w3publisher/sales-and-delivery-101-mod-3/consult-to-operate                       |29.492137597197754|
|https://w3.ibm.com/w3publisher/process-portal-benelux/process-portal-benelux/contact                 |28.53062710038013 |
|https://w3.ibm.com/w3publisher/                                                                      |27.380086802454347|
|https://w3.ibm.