## Page Ranking (not optimized)

In [1]:
from datetime import datetime

In [2]:
sc

In [3]:
# This function prints some statistics about the input RDD
def analyzePartitions(myRDD):
    """Print diagnostic information about how an RDD is partitioned.

    Three things are written to standard output, in this order:
    the partitioner (its ``partitionFunc``, or a placeholder message when
    the RDD has no partitioner), the number of partitions, and the content
    of every partition (one line per partition).

    The full content of the RDD is collected into local memory, so this
    helper is only suitable for small RDDs.

    Args:
        myRDD: the RDD to inspect. It must expose ``partitioner``,
            ``getNumPartitions()`` and ``glom()``.

    Returns:
        None. The function only prints.
    """
    partitioner = myRDD.partitioner
    if partitioner is not None:
        partitionerInfo = str(partitioner.partitionFunc)
    else:
        partitionerInfo = "No partitioner"
    print("Partioner: " + partitionerInfo)

    print("Num. partitions: {!s}".format(myRDD.getNumPartitions()))

    # glom() groups the elements of each partition together; collect()
    # then brings all of them into a local Python list.
    partitionContents = myRDD.glom().collect()

    print("Content of the partitions")
    for content in partitionContents:
        print(content)

In [4]:
start = datetime.now()

In [5]:
# Read the input file with the structure of the web graph
inputData = sc.textFile("./databases/links.txt")

In [8]:
analyzePartitions(inputData)

Partioner: No partitioner
Num. partitions: 2
Content of the partitions
['P1 P1', 'P2 P5', 'P3 P1,P2,P4,P5']
['P4 P3,P5', 'P5 P4']


In [9]:
# Format of each input line:
# PageId, one space, then the comma-separated list of linked pages
# e.g., P3 P1,P2,P4,P5
def mapToPairPageIDLinks(line):
    """Parse one line of the web-graph file into a (pageID, links) pair.

    The line is split on single spaces: the first token is the page id and
    the second token is the comma-separated list of pages it links to.

    Args:
        line: a string such as ``'P3 P1,P2,P4,P5'``.

    Returns:
        A tuple ``(pageID, links)`` where ``links`` is a list of page ids,
        e.g. ``('P3', ['P1', 'P2', 'P4', 'P5'])``.

    Raises:
        IndexError: if the line contains no space (there is no second token).
    """
    tokens = line.split(' ')
    return (tokens[0], tokens[1].split(','))

In [10]:
# Parse every input line into a (pageID, [linked pageIDs]) pair.
# The result is cached because it is reused by later cells
# (it is joined with the current ranks).
links = inputData.map(mapToPairPageIDLinks).cache()

In [11]:
analyzePartitions(links)

Partioner: No partitioner
Num. partitions: 2
Content of the partitions
[('P1', ['P1']), ('P2', ['P5']), ('P3', ['P1', 'P2', 'P4', 'P5'])]
[('P4', ['P3', 'P5']), ('P5', ['P4'])]


In [12]:
# Initialize each page's rank to 1.0; since we use mapValues, 
# the resulting RDD will have the same partitioner as links
ranks = links.mapValues(lambda v: 1.0)

In [13]:
analyzePartitions(ranks)

Partioner: No partitioner
Num. partitions: 2
Content of the partitions
[('P1', 1.0), ('P2', 1.0), ('P3', 1.0)]
[('P4', 1.0), ('P5', 1.0)]


In [14]:
# Attach to each page its current rank.
# Resulting pairs: (pageID, ([linked pageIDs], rank))
pageRankLinks = links.join(ranks)
# Bring the joined pairs into a local list to display them
pageRankLinks.collect()

[('P1', (['P1'], 1.0)),
 ('P2', (['P5'], 1.0)),
 ('P5', (['P4'], 1.0)),
 ('P3', (['P1', 'P2', 'P4', 'P5'], 1.0)),
 ('P4', (['P3', 'P5'], 1.0))]

In [18]:
# Emit one (linked pageID, contribution) pair for every link of every page.
# NOTE(review): computeContributions is defined in a later cell (In [21]);
# on a fresh kernel that cell must be run first, otherwise this line
# raises NameError.
contributions = pageRankLinks.flatMap(computeContributions)

In [19]:
contributions.collect()

[('P1', 0.04427343384909932),
 ('P2', 0.04427343384909932),
 ('P4', 0.04427343384909932),
 ('P5', 0.04427343384909932),
 ('P5', 0.04643422846430667),
 ('P1', 4.170157775076916),
 ('P3', 0.16667121504943339),
 ('P5', 0.16667121504943339),
 ('P4', 0.2729718309635132)]

In [21]:
# Return a set of pairs from each input pair
# input pair: 
#---- (pageid, (linked pages, current page rank of pageid) )
# output pairs:
# --- one output pair for each linked page
# --- (pageid linked page, current page rank of the linking page pageid / number of linked pages)
def computeContributions(pageIDLinksPageRank):
    """Split the rank of a page evenly among the pages it links to.

    Args:
        pageIDLinksPageRank: a pair ``(pageID, (linkedPages, rank))`` where
            ``linkedPages`` is the list of page ids linked by ``pageID`` and
            ``rank`` is the current page rank of ``pageID``.

    Returns:
        A list with one ``(linkedPageID, contribution)`` pair per linked
        page, where ``contribution = rank / len(linkedPages)``.
        An empty list is returned when the page has no linked pages.
    """
    # Unpack the value part of the tuple ('ID', (['IDs'], rank))
    linkedPages, currentPageRank = pageIDLinksPageRank[1]
    numLinkedPages = len(linkedPages)

    # A page without outgoing links contributes to nobody; returning early
    # also avoids a ZeroDivisionError in the division below.
    if numLinkedPages == 0:
        return []

    contribution = currentPageRank / numLinkedPages

    return [(pageidLinkedPage, contribution) for pageidLinkedPage in linkedPages]

In [24]:
# Number of PageRank iterations to run
NUM_ITERATIONS = 50

# Run NUM_ITERATIONS iterations of PageRank.
# Each iteration rebinds `ranks` to a new RDD computed from the previous one.
for iteration in range(NUM_ITERATIONS):
    # Retrieve for each page its current pagerank and the list of linked pages
    pageRankLinks = links.join(ranks)
    # Compute contributions from linking pages to linked pages for this iteration
    contributions = pageRankLinks.flatMap(computeContributions)
    # Update current pagerank of all pages for this iteration:
    # the new rank of a page is the sum of the contributions it received
    ranks = contributions.reduceByKey(lambda contribution1, contribution2: contribution1+contribution2)

**PAY ATTENTION:** RDDs are immutable, so every one of the 50 iterations creates a new RDD, and at the end `ranks` only references the last one. This is not very smart or efficient. Using `partitionBy` we can do something more efficient.

In [25]:
ranks.collect()

[('P1', 4.990081695195067),
 ('P2', 0.0005571340830127929),
 ('P4', 0.003995583027421031),
 ('P5', 0.0032555756738218465),
 ('P3', 0.002110012020679645)]

In [26]:
end = datetime.now()
print(end-start)

0:11:42.326045
