In [1]:
#Running on Colab
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=bbaf8fec3b0e1454c5baeabf1386a5230dadee7c5d1af8fc011555744b7fb6e9
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic

In [3]:
#  Importing Required Libraries
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

# Create the Spark context and Spark session used by the rest of the notebook.
# Spark configuration: serve the Spark web UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")
# getOrCreate() returns the already-running context if there is one, so this
# cell can be re-run without "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
# The session builder's getOrCreate() likewise reuses an existing session.
spark = SparkSession.builder.appName("DataFrame").config('spark.ui.port', '4050').getOrCreate()
# Last expression of the cell: display the session summary.
spark

In [7]:
# Toy link graph as (page, [pages it links to]) pairs. It is persisted because
# the loop below joins against it on every iteration.
links = sc.parallelize([['A', ['B', 'C']], ['B', ['A']], ['C', ['B', 'D']], ['D', ['A']]]).persist()
N = links.count()
iterationsToRun = 1

# Every page starts with the same rank, 1/N.
ranks = links.mapValues(lambda x: 1.0 / N)
for iteration in range(iterationsToRun):
    # After the join each record is (page, (out_links, rank)). A page splits its
    # rank evenly among its out-links. It also sends 0.0 to itself so that a page
    # nobody links to still has a key in `contributions`; without that it would
    # silently disappear from `ranks` after reduceByKey.
    contributions = links.join(ranks).flatMap(
        lambda x: [(x[0], 0.0)] + [(dest, x[1][1] / len(x[1][0])) for dest in x[1][0]]
    )
    # Damped update: 0.85 * (sum of received contributions) + 0.15.
    # NOTE(review): ranks start at 1/N but the constant term is 0.15 rather than
    # 0.15/N, so the ranks do not sum to 1 (they sum to 1.45 after the first
    # iteration on this graph) - confirm which normalisation is intended.
    ranks = contributions.reduceByKey(lambda x, y: x + y).mapValues(lambda r: r * 0.85 + 0.15)

print('Graph Structure:')
print(links.collect())
print('Number of Nodes: ', N)
print('Ranks: ')
print(ranks.collect())

Graph Structure:
[['A', ['B', 'C']], ['B', ['A']], ['C', ['B', 'D']], ['D', ['A']]]
Number of Nodes:  4
Ranks: 
[('B', 0.3625), ('A', 0.575), ('C', 0.25625), ('D', 0.25625)]


In [19]:
# Toy link graph as (page, [pages it links to]) pairs. It is persisted because
# the loop below joins against it on every iteration.
links = sc.parallelize([['A', ['B', 'C']], ['B', ['A']], ['C', ['B', 'D']], ['D', ['A']]]).persist()
N = links.count()
iterationsToRun = 2


def handle_dead_ends(contributions, ranks, N, links=None, previous_ranks=None, damping=0.85):
    """Spread the rank held by dead-end pages evenly over all N pages.

    A dead end is a page whose out-link list is empty: it receives rank but
    passes none on, so that rank would otherwise leak out of the graph.

    Args:
        contributions: RDD of (page, contribution) pairs for this iteration.
        ranks: RDD of (page, rank) pairs after this iteration's damped update.
        N: number of pages in the graph.
        links: optional RDD of (page, [out-links]) pairs. When given together
            with `previous_ranks`, dead ends are the pages whose list is empty.
        previous_ranks: optional RDD of (page, rank) pairs from before this
            iteration's update; the rank of dead ends is read from here.
        damping: factor applied to the redistributed dead-end rank.

    Returns:
        RDD of (page, rank) pairs where every rank has been increased by the
        same share of the dead-end rank.
    """
    if links is not None and previous_ranks is not None:
        # Rank that the dead ends held going into this iteration, damped like
        # every other contribution.
        dead_end_sum = damping * (
            links.filter(lambda kv: len(kv[1]) == 0)
            .join(previous_ranks)
            .map(lambda kv: kv[1][1])
            .sum()
        )
    else:
        # Original three-argument behaviour, kept for existing callers: sum the
        # ranks of pages that have no entry in `contributions`. When `ranks`
        # was derived from `contributions` this set is empty and nothing is
        # redistributed.
        dead_end_sum = ranks.subtractByKey(contributions).map(lambda x: x[1]).sum()
    # Guard against an empty graph, where N is 0.
    redistributed_contribution = dead_end_sum / N if N else 0.0
    return ranks.mapValues(lambda rank: rank + redistributed_contribution)


# Every page starts with the same rank, 1/N.
ranks = links.mapValues(lambda x: 1.0 / N)
for iteration in range(iterationsToRun):
    previous_ranks = ranks
    # After the join each record is (page, (out_links, rank)). A page splits its
    # rank evenly among its out-links; a page with an empty list emits nothing to
    # others. Each page also sends 0.0 to itself so that it keeps a key in
    # `contributions` (and therefore in `ranks`) even when nobody links to it.
    # Assumes every page appears as a key in `links`; a page that is only ever a
    # destination is not counted in N and drops out at the next join.
    contributions = links.join(previous_ranks).flatMap(
        lambda x: [(x[0], 0.0)] + [(dest, x[1][1] / len(x[1][0])) for dest in x[1][0]]
    )
    # Damped update: 0.85 * (sum of received contributions) + 0.15.
    # NOTE(review): ranks start at 1/N but the constant term is 0.15 rather than
    # 0.15/N, so the ranks do not sum to 1 - confirm which normalisation is intended.
    ranks = contributions.reduceByKey(lambda x, y: x + y).mapValues(lambda r: r * 0.85 + 0.15)
    ranks = handle_dead_ends(contributions, ranks, N, links=links, previous_ranks=previous_ranks)

print('Graph Structure:')
print(links.collect())
print('Number of Nodes: ', N)
print('Ranks: ')
print(ranks.collect())


Graph Structure:
[['A', ['B', 'C']], ['B', ['A']], ['C', ['B', 'D']], ['D', ['A']]]
Number of Nodes:  4
Ranks: 
[('C', 0.394375), ('A', 0.6759375), ('B', 0.5032812499999999), ('D', 0.25890625)]
