<a href="https://colab.research.google.com/github/blancavazquez/CursoDatosMasivosII/blob/master/notebooks/Topic_sensitive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Algoritmo de PageRank sensible al tópico usando PySpark

In [1]:
# One-time Colab setup: uncomment the lines below to install PySpark, PyDrive
# and a headless Java 8 JDK.
#!pip install pyspark
#!pip install -U -q PyDrive
#!apt install openjdk-8-jdk-headless -qq
import os

# Export the JDK location into this process's environment
# (presumably so the Spark JVM started later can find Java -- confirm).
os.environ.update(JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64")

In [2]:
def computeContribs(urls, rank):
    """Split a page's rank evenly among the pages it links to.

    Args:
        urls: Sized iterable of destination page ids (the page's out-links).
        rank: Current rank of the source page.

    Yields:
        One ``(url, rank / len(urls))`` pair per destination in ``urls``.
        Nothing is yielded (and no division happens) when ``urls`` is empty.
    """
    out_degree = len(urls)
    # The division stays inside the lazy generator so an empty ``urls`` never divides by zero.
    yield from ((target, rank / out_degree) for target in urls)


def parseNeighbors(urls):
    """Parse one edge-list line into a ``(source, destination)`` pair.

    Args:
        urls: Text line holding at least two whitespace-separated page ids.
            Leading/trailing whitespace is ignored; tokens after the second
            are discarded.

    Returns:
        Tuple ``(source, destination)`` made of the first two tokens (strings).

    Raises:
        IndexError: if the line contains fewer than two tokens.
    """
    # str.split() with no separator collapses whitespace runs and never
    # produces empty tokens, unlike re.split(r'\s+', ...), which returned ''
    # as the first token for lines starting with whitespace.
    parts = urls.split()
    if len(parts) < 2:
        raise IndexError(
            "expected a 'source destination' pair, got: {!r}".format(urls)
        )
    return parts[0], parts[1]

In [3]:
import pyspark
from pyspark.sql.functions import * 
from pyspark.sql import SparkSession
import re, sys
from operator import add
from pyspark import SparkContext
import time
# Reuse the active SparkContext when this cell is re-run: constructing a second
# SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [4]:
# Single place to change where the input files live.
# NOTE(review): this is a Google Drive path and no drive.mount(...) call is
# visible in this notebook -- the Drive must already be mounted.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/datos_vinculos'

# Edge list: one "source destination" pair per line.
lines = sc.textFile(DATA_DIR + '/data_hist.txt')
# Topic set: one page id per line.
pages = sc.textFile(DATA_DIR + '/topics.txt')

In [5]:
# Preview only the first few edges: take() bounds how much is pulled to the
# driver, whereas collect() would materialise the whole input file.
lines.take(10)

['1 2', '1 3', '1 4', '2 1', '2 4', '3 5', '4 2', '4 3']

In [6]:
# Preview only the first few topic ids: take() bounds how much is pulled to
# the driver, whereas collect() would materialise the whole input file.
pages.take(10)

['2', '3']

In [7]:
# Adjacency lists: page -> iterable of the distinct pages it links to.
# Cached because every PageRank iteration joins against it.
# Blank lines are dropped first so a trailing newline in the input file does
# not make parseNeighbors fail.
links = (
    lines.filter(lambda line: line.strip())
    .map(parseNeighbors)
    .distinct()
    .groupByKey()
    .cache()
)

# Distinct topic page ids. Each line is stripped and empty lines are dropped so
# stray whitespace cannot silently stop a topic from matching a page id.
topics = (
    pages.map(lambda page: page.strip())
    .filter(lambda page: page)
    .distinct()
)

In [8]:
# Seed the iteration: every page that has at least one outgoing link
# (i.e. every key of `links`) starts with rank 1.0.
ranks = links.keys().map(lambda page: (page, 1.0))
ranks.collect()

[('1', 1.0), ('4', 1.0), ('3', 1.0), ('2', 1.0)]

In [9]:
# Topic-sensitive PageRank: every iteration damps the rank that flows along
# links and then adds teleport mass only to the pages in the topic set.
NUM_ITERATIONS = 10
DAMPING = 0.85   # weight of the rank received through links
TELEPORT = 0.15  # mass added to each topic page per iteration; written as a
                 # literal because 1 - 0.85 is not exactly 0.15 in floating point

# Collect the topic ids once. The previous version called topics.collect()
# (a full Spark job) for every page on every iteration.
topic_set = frozenset(topics.collect())

# Zero-valued contribution for each topic page, so a topic page that no other
# page links to still survives reduceByKey and receives its teleport mass.
topic_seeds = topics.map(lambda page: (page, 0.0)).cache()


def _applyTeleport(page_total):
    """Damp a page's summed contributions and add teleport mass for topic pages."""
    page, total = page_total
    damped = total * DAMPING
    if page in topic_set:
        damped = damped + TELEPORT
    return page, damped


for _ in range(NUM_ITERATIONS):
    # Each page sends rank / out_degree to every page it links to.
    contribs = links.join(ranks).flatMap(
        lambda page_links_rank: computeContribs(page_links_rank[1][0], page_links_rank[1][1])
    )

    # Sum the incoming contributions per page, then damp/teleport. Everything
    # stays distributed: no collect() + parallelize() round-trip per iteration.
    ranks = contribs.union(topic_seeds).reduceByKey(add).map(_applyTeleport)

# Collects all URL ranks and dumps them to the console.
for link, rank in ranks.collect():
    print(link, "has rank:", rank)

4 has rank: 0.13404400023101573
3 has rank: 0.23928692021412215
1 has rank: 0.10374045693056969
2 has rank: 0.23928692021412215
5 has rank: 0.20748091386113937


In [10]:
# Shut down the SparkContext created earlier in this notebook.
sc.stop()