<a href="https://colab.research.google.com/github/blancavazquez/CursoDatosMasivosII/blob/master/notebooks/PageRank_spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Algoritmo de PageRank usando PySpark

In [1]:
# One-time environment setup commands, kept commented out; uncomment them on a
# fresh runtime that does not yet have PySpark or a Java 8 JDK installed.
#!pip install pyspark
#!pip install -U -q PyDrive
#!apt install openjdk-8-jdk-headless -qq
import os

# Set JAVA_HOME to the Java 8 OpenJDK directory
# (presumably so the JVM that Spark launches later uses it -- TODO confirm).
os.environ.update(JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64")

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 63 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 54.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=fe91b69810e6c1ee9b6ff6988bc34742bc31ee9b93145034492f41b15c9ff185
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2
The following additional packages will be installed:
  openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra
  fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-microhe

In [2]:
import pyspark
# Initialize the Spark context.
# getOrCreate reuses the already-running context when this cell is executed
# again; calling the SparkContext constructor a second time in the same kernel
# raises an error because only one context may be active at a time.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("PageRankpySpark")
)

In [3]:
# returns list of (url, contribution) tuples to each url 
def computeContribs(urls, rank):
    """Split one page's rank evenly among the pages it links to.

    Args:
        urls: Sized iterable holding the outgoing-link targets of one page.
        rank: Current rank of that page.

    Returns:
        A list of (target_url, rank / len(urls)) tuples, one per entry of
        ``urls`` in iteration order. The list is empty when ``urls`` is empty.
    """
    out_degree = len(urls)
    # The division lives inside the comprehension, so an empty ``urls``
    # never evaluates it and cannot divide by zero.
    return [(target, rank / out_degree) for target in urls]

In [None]:
# Small in-memory edge list, one "source target" pair per string.
# It is only referenced by the commented-out sc.parallelize(urls) alternative
# to loading the edges from a file.
urls = [
    "Page1 Page3",
    "Page2 Page1",
    "Page3 Page1",
    "Page3 Page4",
    "Page4 Page1",
    "Page4 Page2",
]

In [6]:
# Read the edge-list file as an RDD of text lines.
# NOTE(review): the path looks like a Colab Google Drive mount; the mount step
# is not shown in this notebook -- confirm it runs before this cell.
data_path = '/content/drive/MyDrive/Colab Notebooks/datos_vinculos/data_hist.txt'
links = sc.textFile(data_path)

In [7]:
# Build the adjacency RDD in the format (URL, iterable of neighbor URLs).
# To run on the small in-memory sample instead of the file, start from:
#links = sc.parallelize(urls)
links = (
    links.map(lambda line: line.split())
    # Blank or single-token lines carry no (source, target) pair; skip them
    # instead of failing the whole job with an IndexError on parts[1].
    .filter(lambda parts: len(parts) >= 2)
    .map(lambda parts: (parts[0], parts[1]))
    # Repeated edges must count only once toward a page's out-degree.
    .distinct()
    .groupByKey()
)

In [8]:
# After groupByKey each value is an iterable of neighbors; materialize it as a
# plain list so len() and repeated iteration work on it.
# The adjacency RDD is joined against the ranks on every PageRank iteration,
# so cache it instead of re-evaluating its lineage each time.
links = links.mapValues(list).cache()
links.collect()

[('1', ['4', '2', '3']), ('4', ['2', '3']), ('3', ['5']), ('2', ['1', '4'])]

In [9]:
# Every URL that appears as a key of `links` (i.e. has outgoing links)
# starts with a rank of 1.0.
ranks = links.keys().map(lambda url: (url, 1.0))
ranks.collect()

[('1', 1.0), ('4', 1.0), ('3', 1.0), ('2', 1.0)]

In [10]:
# Iteratively recompute the URL ranks with the PageRank update rule.
# No collect() is issued inside the loop: a bare expression in a loop body
# displays nothing in a notebook, yet each collect() would launch a Spark job
# and ship the whole RDD to the driver on every iteration.
for iteration in range(10):

    # links.join(ranks) has the form (url, (neighbors, rank)); flatMap turns
    # each record into one (neighbor_url, contributed_rank) pair per neighbor.
    contribs = links.join(ranks).flatMap(
            lambda x: computeContribs(x[1][0], x[1][1]))

    # Re-calculate URL ranks based on neighbor contributions:
    #     sum the contributions received by each url (reduceByKey), then
    #     rank = 0.85 * contributions + 0.15 (mapValues)
    ranks = contribs.reduceByKey(lambda v1, v2: v1 + v2).mapValues(lambda rank: rank * 0.85 + 0.15)

In [11]:
# Bring the final (url, rank) pairs to the driver and print one line per URL.
for url, score in ranks.collect():
    print(f"{url} has rank: {score}")

2 has rank: 0.42905173875775604
5 has rank: 0.5176120203758106
3 has rank: 0.42905173875775604
1 has rank: 0.33380601018790523
4 has rank: 0.42905173875775604


In [12]:
# Shut down the SparkContext created earlier in the notebook.
sc.stop()