<a href="https://colab.research.google.com/github/blancavazquez/CursoDatosMasivosII/blob/master/notebooks/3d_PageRank_pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Algoritmo de PageRank usando PySpark

In [1]:
# Environment setup: install PySpark and a Java runtime, then expose the JDK via JAVA_HOME.
# The apt command and the JVM path below are Debian/Ubuntu specific (e.g. a Colab VM).

# Install the PySpark package into the current environment.
!pip install pyspark
# Install/upgrade PyDrive quietly (-U upgrade, -q quiet).
# NOTE(review): PyDrive is not referenced by any code visible in this notebook - confirm it is still needed.
!pip install -U -q PyDrive
# Install a headless Java 8 JDK (-qq keeps apt output minimal).
!apt install openjdk-8-jdk-headless -qq
import os
# Point JAVA_HOME at the Java 8 JDK directory; presumably this is what PySpark's JVM launcher picks up.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u292-b10-0ubuntu1~18.04).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [2]:
import pyspark
from pyspark import SparkContext

# Initialize the Spark context (or reuse the one that is already active).
# Calling the SparkContext constructor a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once"; getOrCreate() makes
# this cell safe to re-run. The app name is only applied when a new context is
# actually created.
sc = SparkContext.getOrCreate(pyspark.SparkConf().setAppName("PageRank_pySpark"))

In [3]:
# Spread a URL's rank evenly across the URLs it links to.
def computeContribs(urls, rank):
    """Return one ``(url, share)`` tuple for every entry of ``urls``.

    Args:
        urls: Sized collection of neighbor URLs (``len`` is called on it).
        rank: Numeric rank to divide among the neighbors.

    Returns:
        A list of ``(url, rank / len(urls))`` tuples in the same order as
        ``urls``; an empty list when ``urls`` is empty.
    """
    fan_out = len(urls)
    # The division stays inside the comprehension so that an empty ``urls``
    # yields [] instead of raising ZeroDivisionError.
    return [(target, rank / fan_out) for target in urls]

In [4]:
# Location of the edge-list text file (one whitespace-separated pair per line,
# judging by how the next cell parses it).
# NOTE(review): the path lives under /content/drive, so Google Drive presumably
# has to be mounted before this cell runs; no mount step is visible in this
# notebook - confirm.
EDGE_LIST_PATH = '/content/drive/MyDrive/Colab Notebooks/datos_vinculos/data_hist.txt'

# Read the file as an RDD with one string element per line.
links = sc.textFile(EDGE_LIST_PATH)

In [5]:
# Turn the raw text lines into an RDD of (URL, iterable of neighbor URLs):
#   1. split each line on whitespace,
#   2. keep the first two fields as a (source, target) pair,
#   3. drop duplicate pairs,
#   4. group the targets by source URL.
# NOTE: this rebinds `links`, so re-running this cell without first re-running
# the load cell fails (the elements are then tuples, which have no .split()).
links = (
    links
    .map(lambda line: line.split())
    .map(lambda fields: (fields[0], fields[1]))
    .distinct()
    .groupByKey()
)

In [6]:
# groupByKey yields an iterable per key; materialise each one as a plain list
# so that len() can be taken on it later.
links = links.mapValues(list)
# Bring the whole adjacency list to the driver for display.
links.collect()

[('1', ['4', '2', '3']), ('4', ['2', '3']), ('3', ['5']), ('2', ['1', '4'])]

In [7]:
# Give every URL that appears as a key in `links` a starting rank of 1.0.
ranks = links.keys().map(lambda url: (url, 1.0))
# Display the initial (url, rank) pairs.
ranks.collect()

[('1', 1.0), ('4', 1.0), ('3', 1.0), ('2', 1.0)]

In [8]:
# Iteratively recompute the URL ranks with the PageRank update rule.
NUM_ITERATIONS = 10  # fixed number of update rounds; there is no convergence test
DAMPING = 0.85       # weight applied to the rank a URL receives from its in-links
BASE_RANK = 0.15     # constant term added to every rank (kept as a literal so the
                     # floating-point results are identical to the original cell)

for iteration in range(NUM_ITERATIONS):

    # RDD of the form (url, (neighbors, rank)): pair each URL's out-links with
    # its current rank. join() is an inner join, so a URL missing from either
    # RDD is dropped for this round.
    contribs = links.join(ranks)

    # RDD of the form (neighbor_url, contribution): each URL splits its rank
    # evenly among the URLs it links to.
    contribs = contribs.flatMap(
        lambda x: computeContribs(x[1][0], x[1][1]))  # (neighbor urls, rank)

    # Re-calculate the ranks from the neighbor contributions:
    #   - sum the contributions received by each URL (reduceByKey)
    #   - rank = DAMPING * contributions + BASE_RANK (mapValues)
    # Only URLs that received at least one contribution appear in the new `ranks`.
    # No collect() inside the loop: its result was discarded, yet every call
    # launched a full Spark job and shipped the data to the driver.
    ranks = contribs.reduceByKey(lambda v1, v2: v1 + v2).mapValues(
        lambda rank: rank * DAMPING + BASE_RANK)

In [9]:
# Final output: fetch the (url, rank) pairs to the driver once, then print them.
final_ranks = ranks.collect()
for url, score in final_ranks:
    print(url, "has rank:", score)

2 has rank: 0.42905173875775604
5 has rank: 0.5176120203758106
3 has rank: 0.42905173875775604
1 has rank: 0.33380601018790523
4 has rank: 0.42905173875775604


In [10]:
sc.stop()