In [1]:
from datetime import datetime

In [2]:
def analyzePartitions(myRDD):
    """Print a short report about how the input RDD is partitioned.

    The report contains, in this order:
      * the partition function of the RDD's partitioner, or a placeholder
        text when the RDD has no partitioner;
      * the number of partitions;
      * the content of every partition, one line per partition.

    The partitions are copied into a local Python list with glom().collect()
    before their content is printed.

    Args:
        myRDD: the RDD to inspect.

    Returns:
        None; the report is written to standard output.
    """
    partitioner = myRDD.partitioner
    if partitioner is not None:
        print("Partioner: "+str(partitioner.partitionFunc))
    else:
        print("Partioner: No partitioner")

    numPartitions = myRDD.getNumPartitions()
    print("Num. partitions: "+ str(numPartitions))

    # Fetch the local copy first, so the header below is printed only after
    # the content of the partitions has been retrieved
    localPartitions = myRDD.glom().collect()

    print("Content of the partitions")
    for partitionContent in localPartitions:
        print(partitionContent)

In [3]:
# Wall-clock timestamp taken before the RDDs are defined; it is subtracted from
# the end timestamp in the cell that prints end-start
start = datetime.now()

In [4]:
# Read the input file with the structure of the web graph (an RDD of text lines).
# NOTE(review): sc is not defined in the visible cells; presumably the
# SparkContext provided by the PySpark kernel - confirm
inputData = sc.textFile("links.txt")

In [5]:
#analyzePartitions(inputData)

In [6]:
def mapToPairPageIDLinks(line):
    """Parse one line of the links file into a (pageID, linked pages) pair.

    The line is split on single spaces: the first token is the page id and
    the second token is the comma-separated list of the pages it links to.
    Any token after the second one is ignored.

    Example:
        "P3 P1,P2,P4,P5" -> ("P3", ["P1", "P2", "P4", "P5"])

    Args:
        line: one line of the input file.

    Returns:
        A tuple (pageID, links), where links is a list of strings.

    Raises:
        IndexError: if the line does not contain a space.
    """
    tokens = line.split(' ')
    return (tokens[0], tokens[1].split(','))

In [7]:
# Build the (pageID, linked pages) pairs, partition them by key into as many
# partitions as the input file has, and cache the result: this RDD is joined
# with the ranks at every iteration of the PageRank loop
links = (
    inputData
    .map(mapToPairPageIDLinks)
    .partitionBy(inputData.getNumPartitions())
    .cache()
)

In [8]:
#analyzePartitions(links)

In [9]:
# Every page starts with rank 1.0. mapValues leaves the keys untouched,
# so ranks keeps the same partitioner as links
ranks = links.mapValues(lambda _linkedPages: 1.0)

In [10]:
#analyzePartitions(ranks)

In [11]:
def computeContributions(pageIDLinksPageRank):
    """Split the current rank of a page evenly among the pages it links to.

    Args:
        pageIDLinksPageRank: a pair
            (pageID, (linked pages, current page rank of pageID)),
            i.e. one element of links.join(ranks).

    Returns:
        A list with one pair per linked page:
            (pageID of the linked page,
             current page rank of the linking page / number of linked pages).
        The list is empty when the page has no linked pages.
    """
    linksAndRank = pageIDLinksPageRank[1]
    linkedPages = linksAndRank[0]
    currentPageRank = linksAndRank[1]

    numLinkedPages = len(linkedPages)
    # A page without outgoing links has no page to pass its rank to; returning
    # early also avoids a ZeroDivisionError in the division below
    if numLinkedPages == 0:
        return []

    # The share is the same for every linked page, so compute it only once
    contribution = currentPageRank/numLinkedPages

    return [(pageidLinkedPage, contribution) for pageidLinkedPage in linkedPages]

In [12]:
# PageRank main loop: 30 rounds of "spread the rank along the links, then sum"
for iteration in range(30):
    # (pageID, (linked pages, current page rank)) for each page that has an
    # entry in both links and ranks
    pageRankLinks = links.join(ranks)
    # Each page hands an equal share of its current rank to every page it links to
    contributions = pageRankLinks.flatMap(computeContributions)
    # The new rank of a page is the sum of the shares it received.
    # NOTE(review): a page that receives no share gets no entry in the new
    # ranks, so the join above leaves it out of the following iterations -
    # confirm this is intended
    ranks = contributions.reduceByKey(lambda total, share: total+share)

In [13]:
# Retrieve the final (pageID, rank) pairs as a local Python list and display them
ranks.collect()

[('P2', 0.4545454229323411),
 ('P5', 1.5909107866251588),
 ('P1', 0.22727216197122857),
 ('P4', 1.8181787621100227),
 ('P3', 0.9090928663612488)]

In [14]:
# Elapsed wall-clock time since start was recorded; subtracting two datetime
# objects yields a timedelta, printed as H:MM:SS.ffffff
end = datetime.now()
print(end-start)

0:00:06.224998
