In [20]:
!pip install pyspark



In [1]:
from pyspark import SparkConf, SparkContext
from operator import add
import re

In [2]:
conf = SparkConf().setAppName("PageRank").setMaster("local[*]")
sc = SparkContext(conf=conf)

In [3]:
    input_path = "/content/date/web-Google.txt.gz"
    lines = sc.textFile(input_path)

In [4]:
def parseNeighbors(urls):

    parts = re.split(r'\s+', urls)
    return parts[0], parts[1]


In [5]:
links = lines.map(lambda urls: parseNeighbors(urls)).distinct().groupByKey().cache()

In [6]:
ranks = links.map(lambda url_neighbors: (url_neighbors[0], 1.0))

In [7]:
def computeContribs(urls, rank):
    num_urls = len(urls)
    for url in urls:
        yield (url, rank / num_urls)

In [8]:
    iterations = 10

    for iteration in range(iterations):

        contribs = links.join(ranks).flatMap(
            lambda url_urls_rank: computeContribs(url_urls_rank[1][0], url_urls_rank[1][1])
        )


        ranks = contribs.reduceByKey(add).mapValues(lambda rank: 0.15 + 0.85 * rank)

In [9]:
    for (link, rank) in ranks.collect():
        print("%s has rank: %s." % (link, rank))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
495997 has rank: 0.16149076423704511.
557618 has rank: 0.16149076423704511.
270447 has rank: 0.18498569992218133.
357563 has rank: 0.3755838839642609.
379008 has rank: 0.1798277993689224.
335022 has rank: 0.37685878057487.
825762 has rank: 0.23852784275137667.
571284 has rank: 0.23852784275137667.
305791 has rank: 0.20121633535737823.
307456 has rank: 0.34020435729585474.
167349 has rank: 0.15726092829671928.
231545 has rank: 0.5292659281710997.
640126 has rank: 0.18035328481932045.
868889 has rank: 0.29507557739723966.
54126 has rank: 0.17210140035291008.
486599 has rank: 0.21615445328964455.
281281 has rank: 0.24382474554836953.
179671 has rank: 0.17105956142330242.
604661 has rank: 0.370182114088184.
464629 has rank: 0.1699625730037076.
374877 has rank: 0.2504861148133133.
438988 has rank: 0.1805338196892717.
263737 has rank: 0.17450348164779947.
983 has rank: 0.3605029800855466.
201942 has rank: 0.7800123554568629.
10

In [10]:
sc.stop()