## Page Ranking (not optimized)

In [1]:
from datetime import datetime

In [2]:
# Display the SparkContext. `sc` is not created anywhere in this notebook, so it is
# presumably injected by the PySpark kernel/shell -- TODO confirm for your environment.
sc

In [3]:
# Diagnostic helper: prints some statistics about the input RDD
def analyzePartitions(myRDD):
    """Print the partitioner, the number of partitions and the content of each partition.

    The content of every partition is copied to the driver (glom + collect),
    so the whole RDD ends up in a local Python list.

    Args:
        myRDD: object exposing `partitioner`, `getNumPartitions()` and `glom()`.

    Returns:
        None. Everything is written to standard output.
    """
    partitioner = myRDD.partitioner
    if partitioner is not None:
        print("Partioner:", str(partitioner.partitionFunc))
    else:
        print("Partioner: No partitioner")

    print("Num. partitions:", myRDD.getNumPartitions())

    # One list per partition, gathered locally before anything else is printed
    contentByPartition = myRDD.glom().collect()

    print("Content of the partitions")
    for partitionContent in contentByPartition:
        print(partitionContent)

In [4]:
# Start of the wall-clock measurement; the elapsed time is later printed as end-start
start = datetime.now()

In [5]:
# Read the input file with the structure of the web graph.
# The result is an RDD of raw text lines; each line describes one page and the
# pages it links to, and is parsed by mapToPairPageIDLinks.
inputData = sc.textFile("./databases/links.txt")

In [6]:
#analyzePartitions(inputData)

In [7]:
# Format of each input line
# PageId LinksToOtherPages (whitespace between the two fields, commas between the links)
# e.g., P3 P1,P2,P4,P5
# NOTE(review): an earlier version of this example showed the link list in square
# brackets; the parser does not strip brackets, so the file presumably does not
# contain them -- confirm against links.txt.
def mapToPairPageIDLinks(line):
    """Parse one line of the web-graph file into a (pageID, links) pair.

    Args:
        line: text line made of a page ID, whitespace, and a comma-separated
            list of the IDs of the pages it links to.

    Returns:
        Tuple (pageID, links) where links is a list of page-ID strings.
        A page with no link list yields an empty list, so code that divides
        by len(links) must handle the empty case.

    Raises:
        IndexError: if the line is empty or contains only whitespace.
    """
    # split() without arguments tolerates tabs, repeated spaces and trailing whitespace
    fields = line.split()
    pageID = fields[0]

    # Page without outgoing links: there is no second field to parse
    if len(fields) < 2:
        return (pageID, [])

    # Skip empty tokens produced by a trailing or doubled comma
    links = [linkedPage for linkedPage in fields[1].split(',') if linkedPage]

    return (pageID, links)

In [8]:
# Parse every line into a (pageID, [linked page IDs]) pair.
# cache() is requested because this RDD is joined with the ranks in every PageRank iteration.
links = inputData.map(mapToPairPageIDLinks).cache()

In [9]:
#analyzePartitions(links)

In [10]:
# Initialize each page's rank to 1.0; since we use mapValues,
# the resulting RDD will have the same partitioner as links.
# In this (not optimized) version links was never partitioned explicitly,
# so there is no partitioner to inherit.
ranks = links.mapValues(lambda v: 1.0)

In [11]:
#analyzePartitions(ranks)

In [12]:
# Preview of a single join: each element is (pageID, ([linked pages], current rank)).
# The same join is recomputed at the beginning of every iteration of the PageRank loop.
pageRankLinks = links.join(ranks)
#pageRankLinks.collect()

In [13]:
#contributions = pageRankLinks.flatMap(computeContributions)

In [14]:
#contributions.collect()

In [15]:
# Return a list of pairs from each input pair
# input pair:
# ---- (pageid, (linked pages, current page rank of pageid))
# output pairs:
# --- one output pair for each linked page
# --- (pageid of the linked page, current page rank of the linking page / number of linked pages)
def computeContributions(pageIDLinksPageRank):
    """Split the current rank of a page evenly among the pages it links to.

    Args:
        pageIDLinksPageRank: tuple ('ID', (['IDs'], rank)), i.e. the page ID,
            the list of linked page IDs and the current rank of the page.

    Returns:
        List of (linked page ID, contribution) pairs, one per linked page,
        where contribution = rank / number of linked pages.
        The list is empty when the page has no linked pages.
    """
    linkedPages = pageIDLinksPageRank[1][0]      # the IDs list from the tuple ('ID', (['IDs'], rank))
    currentPageRank = pageIDLinksPageRank[1][1]  # the rank from the tuple ('ID', (['IDs'], rank))
    numLinkedPages = len(linkedPages)

    # A page without outgoing links has nobody to give rank to;
    # returning early also avoids a division by zero.
    if numLinkedPages == 0:
        return []

    contribution = currentPageRank / numLinkedPages

    return [(pageidLinkedPage, contribution) for pageidLinkedPage in linkedPages]

In [16]:
# Run 50 iterations of PageRank
NUM_ITERATIONS = 50

for iteration in range(NUM_ITERATIONS):
    # Retrieve for each page its current pagerank and the list of linked pages.
    # join() is an inner join: a page that has no entry in ranks is not processed.
    pageRankLinks = links.join(ranks)
    # Compute contributions from linking pages to linked pages for this iteration
    contributions = pageRankLinks.flatMap(computeContributions)
    # Update current pagerank of all pages for this iteration: the new rank of a page
    # is the plain sum of the contributions it received (no damping factor is applied).
    # Pages that received no contribution do not appear in the new ranks.
    ranks = contributions.reduceByKey(lambda contribution1, contribution2: contribution1+contribution2)

**PAY ATTENTION:** RDDs are immutable, so every iteration of the loop creates new RDDs instead of updating the existing ones: we create a new `ranks` RDD 50 times, and at the end we only hold a reference to the last one. This is not very smart or efficient, mainly because `links` has no partitioner, so every `links.join(ranks)` has to shuffle the data of `links` across the network again. Using `partitionBy` we can do something more efficient. If we apply `partitionBy` to `links`, the system creates a new version of this RDD in which the data are already organized by key: the system then knows that one specific server holds all the key/value pairs associated with certain page IDs, another server holds the pairs of some other page IDs, and so on.

In [17]:
# Bring the final ranks to the driver. The previous cells only declared transformations;
# collect() is the action that actually triggers the computation of all the iterations.
ranks.collect()

[('P5', 0.08643208585140724),
 ('P3', 0.0560163446251082),
 ('P1', 4.736656637694964),
 ('P2', 0.014794913519180311),
 ('P4', 0.10610001830934136)]

In [18]:
# Elapsed wall-clock time since `start` was taken
# (it covers reading the input, the PageRank iterations and the final collect)
end = datetime.now()
print(end-start)

0:00:18.301273
