<a href="https://colab.research.google.com/github/blancavazquez/CursoDatosMasivosII/blob/master/notebooks/HIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

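## Algoritmo HITS usando PySpark

Implementación iterativa de HITS (Hyperlink-Induced Topic Search): a partir de una lista de aristas `origen destino`, calcula las puntuaciones de *hub* y de *autoridad* de cada nodo, normalizadas por el valor máximo en cada iteración.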

In [1]:
# Environment setup (left commented out): uncomment to install PySpark,
# PyDrive and a headless Java 8 JDK on a fresh machine.
#!pip install pyspark
#!pip install -U -q PyDrive
#!apt install openjdk-8-jdk-headless -qq
import os

# Point JAVA_HOME at the Java 8 JDK directory.
# NOTE(review): presumably where the apt package above installs — confirm.
os.environ.update(JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64")

In [2]:
import re, sys
from operator import add
from pyspark import SparkContext

In [3]:
def computeAuth(urls, hub):
    """Yield the authority contribution a node makes to each node it links to.

    Args:
        urls: iterable of target node identifiers (any iterable; it does not
            need to support ``len``).
        hub: hub score of the source node; it is passed on unchanged to every
            target.

    Yields:
        ``(url, hub)`` pairs, one per element of ``urls``, in iteration order.
    """
    # HITS passes the full hub score to every target (no division by
    # out-degree), so the number of targets is not needed.
    for url in urls:
        yield (url, hub)

def computeHub(urls, auth):
    """Yield the hub contribution a node makes to each node that links to it.

    Args:
        urls: iterable of source node identifiers (any iterable; it does not
            need to support ``len``).
        auth: authority score of the target node; it is passed on unchanged to
            every source.

    Yields:
        ``(url, auth)`` pairs, one per element of ``urls``, in iteration order.
    """
    # HITS passes the full authority score to every source (no division by
    # in-degree), so the number of sources is not needed.
    for url in urls:
        yield (url, auth)

def outNeighbors(urls):
    """Parse a whitespace-separated edge line into a ``(source, target)`` pair.

    Args:
        urls: text line whose first two whitespace-separated tokens are the
            source and the target node. Leading and trailing whitespace is
            ignored; any tokens after the second are ignored.

    Returns:
        Tuple ``(source, target)`` of strings.

    Raises:
        IndexError: if the line contains fewer than two tokens.
    """
    # str.split() with no separator splits on whitespace runs and drops empty
    # leading/trailing fields, so an indented line no longer yields '' as the
    # first token.
    parts = urls.split()
    return parts[0], parts[1]

def inNeighbors(urls):
    """Parse a whitespace-separated edge line into a ``(target, source)`` pair.

    The pair is reversed with respect to the order in the line, so grouping by
    the first element collects the incoming neighbours of each node.

    Args:
        urls: text line whose first two whitespace-separated tokens are the
            source and the target node. Leading and trailing whitespace is
            ignored; any tokens after the second are ignored.

    Returns:
        Tuple ``(target, source)`` of strings.

    Raises:
        IndexError: if the line contains fewer than two tokens.
    """
    # str.split() with no separator splits on whitespace runs and drops empty
    # leading/trailing fields, so an indented line no longer yields '' as the
    # first token.
    parts = urls.split()
    return parts[1], parts[0]

In [4]:
import pyspark
from pyspark.sql.functions import * 
from pyspark.sql import SparkSession
import re, sys
from operator import add
from pyspark import SparkContext
import time
# Reuse the active SparkContext if one exists so this cell can be re-run
# without raising; otherwise a new context with default settings is created.
sc = SparkContext.getOrCreate()

In [5]:
# Load the input file as an RDD of text lines.
# NOTE(review): the path points into a Google Drive mount, but no mount step
# is visible in this notebook — confirm the drive is mounted before running.
lineas = sc.textFile('/content/drive/MyDrive/Colab Notebooks/datos_vinculos/data_hist.txt')

In [6]:
# Bring every line back to the driver to inspect the raw input.
lineas.collect()

['1 2', '1 3', '1 4', '2 1', '2 4', '3 5', '4 2', '4 3']

In [7]:
# Outgoing adjacency: parse each line as (source, target), drop repeated
# edges, then group the targets under their source node.
out_links = (
    lineas.map(outNeighbors)
    .distinct()
    .groupByKey()
)
out_links.collect()

[('1', <pyspark.resultiterable.ResultIterable at 0x7f4e15e09e10>),
 ('4', <pyspark.resultiterable.ResultIterable at 0x7f4e15e09cd0>),
 ('3', <pyspark.resultiterable.ResultIterable at 0x7f4e15e09e90>),
 ('2', <pyspark.resultiterable.ResultIterable at 0x7f4e15e09d90>)]

In [8]:
# Incoming adjacency: parse each line as (target, source), drop repeated
# edges, then group the sources under their target node.
in_links = (
    lineas.map(inNeighbors)
    .distinct()
    .groupByKey()
)
in_links.collect()

[('4', <pyspark.resultiterable.ResultIterable at 0x7f4e15e2b310>),
 ('1', <pyspark.resultiterable.ResultIterable at 0x7f4e15e2b090>),
 ('5', <pyspark.resultiterable.ResultIterable at 0x7f4e15e2b390>),
 ('2', <pyspark.resultiterable.ResultIterable at 0x7f4e15e2b4d0>),
 ('3', <pyspark.resultiterable.ResultIterable at 0x7f4e15df9bd0>)]

In [9]:
# Initial hub score of 1.0 for every node that has at least one outgoing link.
hubs = out_links.keys().map(lambda node: (node, 1.0))
hubs.collect()

[('1', 1.0), ('4', 1.0), ('3', 1.0), ('2', 1.0)]

In [10]:
# Initial authority score of 1.0 for every node that has at least one incoming link.
auths = in_links.keys().map(lambda node: (node, 1.0))
auths.collect()

[('4', 1.0), ('1', 1.0), ('5', 1.0), ('2', 1.0), ('3', 1.0)]

In [11]:
# Iteratively refine hub and authority scores with the HITS update rules.
NUM_ITERATIONS = 10  # number of hub/authority update rounds


def _normalize_by_max(scores):
    """Rescale a (key, score) RDD so that its largest score becomes 1.0.

    The maximum is held in a local of this call rather than in a notebook
    global that is reassigned later in the same iteration, so the lazily
    evaluated ``mapValues`` closure does not depend on when Spark serializes it.

    Args:
        scores: RDD of ``(key, numeric score)`` pairs; must not be empty.

    Returns:
        RDD with the same keys and each score divided by the maximum score.
    """
    # The RDD feeds both the max() action and the mapValues() below; caching
    # avoids evaluating the reduce twice.
    scores = scores.cache()
    peak = scores.max(key=lambda pair: pair[1])[1]
    return scores.mapValues(lambda score: score / peak)


for iteration in range(NUM_ITERATIONS):
    # Authority update: each node with a hub score sends that score to every
    # node in its outgoing list; contributions are summed per target.
    auth_contribs = out_links.join(hubs).flatMap(
        lambda node_links_hub: computeAuth(node_links_hub[1][0], node_links_hub[1][1])
    )
    auths = _normalize_by_max(auth_contribs.reduceByKey(add))

    # Hub update: each node with an authority score sends that score to every
    # node in its incoming list; contributions are summed per source.
    hub_contribs = in_links.join(auths).flatMap(
        lambda node_links_auth: computeHub(node_links_auth[1][0], node_links_auth[1][1])
    )
    hubs = _normalize_by_max(hub_contribs.reduceByKey(add))

    print("iteración:", iteration)

iteración: 0
iteración: 1
iteración: 2
iteración: 3
iteración: 4
iteración: 5
iteración: 6
iteración: 7
iteración: 8
iteración: 9


In [12]:
# Fetch the final authority scores to the driver and print one line per node.
for auth_entry in auths.collect():
    print(auth_entry[0], "has auth:", (auth_entry[0], auth_entry[1]))

print("**************************************")

# Same for the final hub scores.
for hub_entry in hubs.collect():
    print(hub_entry[0], "has hub:", (hub_entry[0], hub_entry[1]))

1 has auth: ('1', 0.12838756187947561)
2 has auth: ('2', 0.6149109120448921)
4 has auth: ('4', 0.4866324223620656)
3 has auth: ('3', 0.6149109120448921)
5 has auth: ('5', 2.1303163408019257e-07)
**************************************
3 has hub: ('3', 1.2411145506532355e-07)
4 has hub: ('4', 0.7164897209651799)
2 has hub: ('2', 0.3583084055475835)
1 has hub: ('1', 1.0)
