<a href="https://colab.research.google.com/github/blancavazquez/CursoDatosMasivosII/blob/master/notebooks/PageRank_spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Algoritmo de PageRank usando PySpark

In [1]:
# Environment setup. On a fresh Colab runtime, uncomment the install lines
# below to obtain PySpark and a Java 8 runtime before running the notebook.
#!pip install pyspark
#!pip install -U -q PyDrive
#!apt install openjdk-8-jdk-headless -qq
import os

# Presumably the directory where the openjdk-8 package above installs its JDK.
JAVA8_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"

# Point Spark at Java 8 only when that JDK is actually present. Forcing
# JAVA_HOME to a directory that does not exist would override any working
# Java already configured on the machine.
if os.path.isdir(JAVA8_HOME):
    os.environ["JAVA_HOME"] = JAVA8_HOME

In [2]:
import pyspark
# Initialize the Spark context. getOrCreate() reuses an already-running
# context, so re-executing this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(
    conf=pyspark.SparkConf().setAppName("PageRankpySpark"))

In [3]:
def computeContribs(urls, rank):
    """Split a page's rank evenly among the URLs it links to.

    Args:
        urls: sized iterable of neighbor URLs (must support ``len``).
        rank: current rank of the page that owns those links.

    Returns:
        A list of ``(url, contribution)`` tuples, one per entry in ``urls``,
        where every contribution equals ``rank / len(urls)``. An empty
        ``urls`` yields an empty list.
    """
    fan_out = len(urls)
    # The division stays inside the comprehension so that an empty `urls`
    # never evaluates rank / 0.
    return [(target, rank / fan_out) for target in urls]

In [4]:
# Edge list of the toy web graph. Each entry is "source destination",
# i.e. the first page links to the second one.
urls = [
    "Page1 Page3",
    "Page2 Page1",
    "Page3 Page1",
    "Page3 Page4",
    "Page4 Page1",
    "Page4 Page2",
]

In [5]:
# Build an RDD of (URL, neighbors): split each "source destination" string,
# keep it as a (source, destination) pair, drop duplicate edges and group the
# destinations by their source URL.
links = (
    sc.parallelize(urls)
    .map(lambda edge: edge.split())
    .map(lambda tokens: (tokens[0], tokens[1]))
    .distinct()
    .groupByKey()
)

In [6]:
# After groupByKey each value is an iterable of neighbors; turn it into a list.
# `links` is joined with the ranks again in every PageRank iteration, so it is
# cached to avoid rebuilding it from the raw edge list each time.
links = links.mapValues(list).cache()
links.collect()

[('Page2', ['Page1']),
 ('Page4', ['Page1', 'Page2']),
 ('Page1', ['Page3']),
 ('Page3', ['Page1', 'Page4'])]

In [7]:
# Every URL that appears as a key of `links` starts with a rank of 1.0
ranks = links.keys().map(lambda url: (url, 1.0))
ranks.collect()

[('Page2', 1.0), ('Page4', 1.0), ('Page1', 1.0), ('Page3', 1.0)]

In [8]:
# Iteratively updates the URL ranks with the PageRank rule:
#     new_rank = DAMPING_FACTOR * (sum of received contributions) + BASE_RANK
# Note: `ranks` is overwritten on every pass, so re-running this cell continues
# from the current ranks instead of restarting from the initial values.
NUM_ITERATIONS = 10    # number of update rounds
DAMPING_FACTOR = 0.85  # weight applied to the summed neighbor contributions
BASE_RANK = 0.15       # constant term; kept as a literal (not 1 - 0.85) so the
                       # floating-point results stay exactly the same

for iteration in range(NUM_ITERATIONS):

    # RDD of form: (url, (neighbors, rank))
    neighbors_and_rank = links.join(ranks)

    # RDD of form: (neighbor url, contributed rank)
    contribs = neighbors_and_rank.flatMap(
        lambda x: computeContribs(x[1][0], x[1][1]))

    # Sum the contributions received by each url (reduceByKey) and apply the
    # damping rule (mapValues). No collect() here: its result would be thrown
    # away inside the loop and would only trigger an extra Spark job.
    ranks = contribs.reduceByKey(lambda v1, v2: v1 + v2).mapValues(
        lambda rank: rank * DAMPING_FACTOR + BASE_RANK)

In [9]:
# Bring the final (url, rank) pairs to the driver and print one line per URL
for page, score in ranks.collect():
    print(page, "has rank:", score)

Page1 has rank: 1.4313779845858583
Page3 has rank: 1.3758228705372555
Page4 has rank: 0.7294952436130331
Page2 has rank: 0.4633039012638519


In [10]:
# Shut down the Spark context created at the top of the notebook.
sc.stop()