In [1]:
# Jupyter requires this for MRJob to reload classes properly
%load_ext autoreload
%autoreload 2
%matplotlib inline



## HW 9.4: Topic-specific PageRank implementation using MRJob

Modify your PageRank implementation to produce a topic specific PageRank implementation,
as described in:

http://www-cs-students.stanford.edu/~taherh/papers/topic-sensitive-pagerank.pdf

Note in this article that there is a special caveat to ensure that the transition matrix is irreducible.
This caveat lies in footnote 3 on page 3:

	A minor caveat: to ensure that M is irreducible when p
	contains any 0 entries, nodes not reachable from nonzero
	nodes in p should be removed. In practice this is not problematic.

and must be adhered to for convergence to be guaranteed.

Run topic specific PageRank on the following randomly generated network of 100 nodes:

s3://ucb-mids-mls-networks/randNet.txt (also available on Dropbox)

which are organized into ten topics, as described in the file:

s3://ucb-mids-mls-networks/randNet_topics.txt  (also available on Dropbox)

Since there are 10 topics, your result should be 11 PageRank vectors
(one for the vanilla PageRank implementation in 9.1, and one for each topic
with the topic specific implementation). Print out the top ten ranking nodes 
and their topics for each of the 11 versions, and comment on your result. 
Assume a teleportation factor of 0.15 in all your analyses.

One final and important comment here:  please consider the 
requirements for irreducibility with topic-specific PageRank.
In particular, the literature ensures irreducibility by requiring that
nodes not reachable from in-topic nodes be removed from the network.

This is not a small task, especially as it must be performed
separately for each of the (10) topics.

So, instead of using this method for irreducibility, 
please comment on why the literature's method is difficult to implement,
and what extra computation it will require.
Then for your code, please use the alternative, 
non-uniform damping vector:

$v_{ji} = \beta(\frac{1}{|T_{j}|})$; if node i lies in topic $T_j$

$v_{ji} = (1-\beta)\frac{1}{N - |T_{j}|}$; if node i lies outside of topic $T_j$

for beta in (0,1) close to 1. 

With this approach, you will not have to delete any nodes.
If beta > 0.5, PageRank is topic-sensitive, 
and if beta < 0.5, the PageRank is anti-topic-sensitive. 
For any value of beta irreducibility should hold,
so please try beta=0.99, and perhaps some other values locally,
on the smaller networks.

-------------------

This first step transforms the original adjacency list to create a list of the form:

    node \t [page_rank, [topic_ranks], [outlinks]]
    
The `[topic_ranks]` list is the bias vector: each element is initialized to the inverse of the number of nodes classified under that topic, whereas `page_rank` is initialized to 1/N, where N is the number of nodes.


In [2]:
%%writefile MrJobTransform94.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import numpy as np

import re
        
class MrJobTransform94(MRJob):
    """Turn a raw adjacency list into the initial PageRank working file.

    Input lines:   node <TAB> {neighbor: weight, ...}
    Output lines:  node <TAB> [page_rank, [topic_ranks], [outlinks]]

    page_rank starts at 1/N (N = --nodes).  topic_ranks holds one entry per
    topic, each initialised to 1/|T_j| where |T_j| is the number of nodes the
    topics file assigns to topic j.  Every node that appears anywhere in the
    graph, including nodes that only ever appear as a neighbor, gets exactly
    one output line.
    """

    # Topics are numbered 1..NUM_TOPICS in the topics file.
    NUM_TOPICS = 10

    # Kept as the default so existing invocations behave as before; override
    # with --topics instead of editing the source.
    DEFAULT_TOPICS_FILE = '/Users/rcordell/Documents/MIDS/W261/week09/HW9/Data/randNet_topics.txt'

    def configure_options(self):
        """Register the command-line options forwarded to every task."""
        super(MrJobTransform94, self).configure_options()
        self.add_passthrough_option(
            '--nodes', dest='nodes', type='int', default=0,
            help="Number of nodes")
        self.add_passthrough_option(
            '--maptasks', dest='mappers', type='int', default=2,
            help="mapper task instances")
        self.add_passthrough_option(
            '--reducetasks', dest='reducers', type='int', default=1,
            help="reducer task instances")
        self.add_passthrough_option(
            '--topics', dest='topics', type='str',
            default=self.DEFAULT_TOPICS_FILE,
            help="path to the node<TAB>topic file")

    def mapper(self, _, line):
        """Emit each node with its outlinks, plus an empty record per neighbor.

        The empty record guarantees that a node which never appears in the
        first column (a dangling node) still reaches the reducer.
        """
        node, adj_list = line.strip().split('\t', 1)
        node = node.strip('"')
        # NOTE(security): eval() executes whatever is on the input line.  It is
        # only acceptable because the adjacency list is a trusted course data
        # file; do not point this job at untrusted input.
        neighbors = eval(adj_list)
        yield node, list(neighbors)
        for neighbor in neighbors:
            yield neighbor, []

    def reducer_init(self):
        """Compute the initial rank and the per-topic 1/|T_j| vector.

        Raises:
            ValueError: if --nodes is not positive, or the topics file holds a
                topic id outside 1..NUM_TOPICS.
        """
        if self.options.nodes <= 0:
            raise ValueError('--nodes must be a positive integer')
        self.rank = 1.0 / self.options.nodes

        self.topic_count = [0.0] * self.NUM_TOPICS
        with open(self.options.topics, 'r') as topic_file:
            for line in topic_file:
                line = line.strip()
                if not line:
                    continue
                node, topic = line.split('\t', 1)
                topic = int(topic)
                # without this check topic 0 would silently land in the last
                # slot through negative indexing
                if not 1 <= topic <= self.NUM_TOPICS:
                    raise ValueError('topic id out of range: {0}'.format(topic))
                self.topic_count[topic - 1] += 1

        # initialized topic vector is 1/|T_j|; a topic with no nodes gets 0.0
        # instead of dividing by zero
        self.topic_rank = [1.0 / count if count else 0.0
                           for count in self.topic_count]

    def reducer(self, node, outlinks_list):
        """Merge every outlink list seen for `node` into one output record."""
        combined_outlinks = []
        for outlinks in outlinks_list:
            combined_outlinks.extend(outlinks)
        self.increment_counter('transformer', 'nodes', 1)
        yield node, [self.rank, self.topic_rank, combined_outlinks]

    def steps(self):
        """Single map/reduce step; task counts come from the options."""
        return [MRStep(mapper=self.mapper,
                       reducer_init=self.reducer_init,
                       reducer=self.reducer,
                       jobconf={
                           'mapreduce.job.maps': self.options.mappers,
                           'mapreduce.job.reduces': self.options.reducers})]
    
# Only when this file is executed as a script (not when it is imported by the
# notebook): call run() on the job class.
if __name__ == '__main__':
    MrJobTransform94.run()

Overwriting MrJobTransform94.py


In [119]:
from MrJobTransform94 import MrJobTransform94
              
def transform(qfile, outpath='Data/randNet-in.txt', node_count=100):
    """Run MrJobTransform94 locally and save its output as the PageRank input.

    Args:
        qfile: path to the raw adjacency-list file.
        outpath: file that receives the transformed lines; it is overwritten.
        node_count: value passed to the job as --nodes (the job uses it for
            the initial 1/N rank).

    Prints the job's 'transformer'/'nodes' counter once the job has finished.
    """
    mr_job = MrJobTransform94(args=[qfile,
                                    '-r', 'local',
                                    '--nodes', str(node_count),
                                    '--maptasks', '4',
                                    '--reducetasks', '1'])
    with open(outpath, 'w') as outfile:
        with mr_job.make_runner() as runner:
            runner.run()
            # lines are copied verbatim; decoding them first would only cost
            # time because the next job parses the file itself
            for line in runner.stream_output():
                outfile.write(line)
            counters = runner.counters()
            print('Node Count: {0}'.format(counters[0]['transformer']['nodes']))

        
# Build the initial PageRank working file from the raw adjacency list.
if __name__ == '__main__':
    transform('Data/randNet.txt')

Node Count: 100


This second step is the 2-stage map reduce that calculates the page rank and the topic rank vectors over several iterations.

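Page rank is calculated as $\alpha \frac{1}{|N|} + (1-\alpha)(\frac{m}{|N|} + p)$, where $\alpha$ is the teleportation factor (0.15, i.e. one minus the damping factor), $|N|$ is the number of nodes, $m$ is the total rank mass held by dangling nodes (nodes with no outlinks), and $p$ is the page rank accumulated from the contributions of incoming links to that node.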

Topic rank is calculated as $\sum_{i}v_{ij}$ where $v_{ij} = \beta(\frac{1}{|T_j|}) p$ for outlink $i$ that is of topic $j$ and $= (1-\beta)(\frac{1}{N-|T_j|})p$ for the outlink $i$ that is not of topic $j$

I believe that I am calculating the topic rank incorrectly....

This MR job attempts to calculate all the topic vectors at the same time in a single pass

In [120]:
%%writefile MrJob94.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import numpy as np
import types

import re
       
class MrJob94(MRJob):
    """One iteration of vanilla and topic-sensitive PageRank.

    Input and output lines share one layout, so the output of a run can be fed
    straight back in as the input of the next run:

        node <TAB> [page_rank, [topic_rank_1 .. topic_rank_10], [outlinks]]

    Step 1 (pr_mapper / pr_reducer) pushes each node's rank, and each of its
    topic ranks, evenly along its outlinks.  Rank held by dangling nodes (no
    outlinks) is routed to the special key '*'.

    Step 2 (dangling_reducer) applies teleportation and hands the dangling
    mass back out.  With d = --damping, N = --nodes, D the dangling mass of
    the vanilla rank and D_j the dangling mass of topic j:

        page_rank'    = (1 - d) / N + d * (incoming + D / N)
        topic_rank_j' = d * incoming_j + (1 - d + d * D_j) * v_j(node)

    where v_j is the non-uniform teleport vector of topic j:

        v_j(node) = beta / |T_j|               if node belongs to topic j
        v_j(node) = (1 - beta) / (N - |T_j|)   otherwise

    Both updates are contractions for d < 1, so repeated runs converge to the
    same vectors whatever the starting values in the input file are.
    """

    # Topics are numbered 1..NUM_TOPICS in the topics file.
    NUM_TOPICS = 10

    # Key that carries dangling mass.  Its JSON form '"*"' sorts before any
    # '"<digit>..."' key, so the step-2 reducer sees it before any node.
    DANGLING_KEY = '*'

    # Kept as the default so existing invocations behave as before; override
    # with --topics instead of editing the source.
    DEFAULT_TOPICS_FILE = '/Users/rcordell/Documents/MIDS/W261/week09/HW9/Data/randNet_topics.txt'

    def configure_options(self):
        """Register the command-line options forwarded to every task."""
        super(MrJob94, self).configure_options()
        self.add_passthrough_option(
            '--nodes', dest='nodes', type='int', default=0,
            help="Number of nodes")
        self.add_passthrough_option(
            '--damping', dest='damping', type='float', default=0.85,
            help="damping factor")
        self.add_passthrough_option(
            '--beta', dest='beta', type='float', default=0.99,
            help="share of topic teleport mass given to in-topic nodes")
        self.add_passthrough_option(
            '--maptasks', dest='mappers', type='int', default=2,
            help="mapper task instances")
        self.add_passthrough_option(
            '--reducetasks', dest='reducers', type='int', default=1,
            help="reducer task instances")
        self.add_passthrough_option(
            '--topics', dest='topics', type='str',
            default=self.DEFAULT_TOPICS_FILE,
            help="path to the node<TAB>topic file")

    def _load_topics(self):
        """Read the topics file into self.node_topics and self.topic_counts.

        node_topics maps int node id -> 1-based topic id; topic_counts[j] is
        the number of nodes assigned to topic j + 1.

        Raises:
            ValueError: if a topic id falls outside 1..NUM_TOPICS.
        """
        self.node_topics = {}
        self.topic_counts = [0] * self.NUM_TOPICS
        with open(self.options.topics, 'r') as topic_file:
            for line in topic_file:
                line = line.strip()
                if not line:
                    continue
                node, topic = line.split('\t', 1)
                topic = int(topic)
                if not 1 <= topic <= self.NUM_TOPICS:
                    raise ValueError('topic id out of range: {0}'.format(topic))
                self.node_topics[int(node)] = topic
                self.topic_counts[topic - 1] += 1

    def _jobconf(self, reducers):
        """Build a fresh jobconf dict for one step."""
        return {'mapreduce.job.maps': self.options.mappers,
                'mapreduce.job.reduces': reducers}

    def pr_mapper(self, _, line):
        """Pass the graph structure through and push mass along the outlinks.

        Structure records have three elements and carry zero mass; mass
        records have two elements.  pr_reducer tells them apart by length.
        """
        node, value_list = line.strip().split('\t', 1)
        node = node.strip('"')
        # NOTE(security): eval() executes whatever is on the input line.  It is
        # only acceptable because the input is written by the transform job or
        # a previous run of this job; do not use it on untrusted input.
        rank, topic_ranks, outlinks = eval(value_list)

        # yield the graph structure
        yield node, [0.0, [0.0] * len(topic_ranks), outlinks]

        if outlinks:
            # distribute the page rank and topic ranks to all the outlink nodes
            degree = float(len(outlinks))
            share = [rank / degree, [x / degree for x in topic_ranks]]
            for outlink in outlinks:
                yield outlink, share
        else:
            # A dangling node cannot propagate its mass.  Route it to the
            # shared key so step 2 can hand it back out; sending it to the
            # node itself would mix it with the node's incoming mass.
            yield self.DANGLING_KEY, [rank, topic_ranks]

    def pr_reducer_init(self):
        """Start the running total used by the pr_reducer_final diagnostic."""
        self.pr_mass = 0.0

    def pr_reducer(self, node, neighbors):
        """Sum the mass sent to `node` and re-attach its outlink list.

        The dangling key passes through here like any node; it simply has no
        structure record, so its outlink list stays empty.
        """
        outlinks = []
        rank = 0.0
        topic_ranks = [0.0] * self.NUM_TOPICS
        for record in neighbors:
            if len(record) == 3:
                # this is the graph
                outlinks = record[2]
            else:
                rank += record[0]
                topic_ranks = [a + b for a, b in zip(topic_ranks, record[1])]
        self.pr_mass += rank
        yield node, [rank, topic_ranks, outlinks]

    def pr_reducer_final(self):
        """Diagnostic: report the vanilla rank mass seen by this reducer.

        Not wired into steps().  It writes to stderr because anything printed
        on stdout would be read back as job output.
        """
        import sys
        sys.stderr.write('- {0}\n'.format(self.pr_mass))

    def dangling_reducer_init(self):
        """Load the topics and precompute each topic's teleport weights.

        Raises:
            ValueError: if --nodes is not positive.
        """
        nodes = self.options.nodes
        if nodes <= 0:
            raise ValueError('--nodes must be a positive integer')

        self.conserved_mass = 0.0
        self.conserved_topic_mass = [0.0] * self.NUM_TOPICS
        self.mass = 0.0
        self._load_topics()

        beta = self.options.beta
        self.in_topic_weight = []
        self.out_topic_weight = []
        for count in self.topic_counts:
            outside = nodes - count
            if count == 0 or outside <= 0:
                # Degenerate topic (empty, or covering every node): the two
                # weights could not sum to 1, so teleport uniformly instead.
                self.in_topic_weight.append(1.0 / nodes)
                self.out_topic_weight.append(1.0 / nodes)
            else:
                self.in_topic_weight.append(beta / count)
                self.out_topic_weight.append((1.0 - beta) / outside)

    def dangling_reducer(self, key, mass):
        """Apply teleportation and dangling-mass redistribution to one node.

        Relies on the dangling key reaching this reducer instance before any
        node key, which is why steps() pins step 2 to a single reduce task.
        """
        if key == self.DANGLING_KEY:
            # sum the dangling node mass
            for rank, topic_ranks, _ in mass:
                self.conserved_mass += rank
                self.conserved_topic_mass = [
                    a + b for a, b in zip(self.conserved_topic_mass, topic_ranks)]
            return

        nodes = self.options.nodes
        damping = self.options.damping
        teleport_prob = (1.0 - damping) / nodes
        distributed_mass = self.conserved_mass / nodes
        # 1-based topic of this node; None when it is missing from the file,
        # in which case it counts as outside every topic
        node_topic = self.node_topics.get(int(key))

        for rank, topic_ranks, outlinks in mass:
            # compute the basic page rank
            new_rank = teleport_prob + damping * (rank + distributed_mass)

            # compute the topic rank vector
            new_topic_ranks = []
            for j, incoming in enumerate(topic_ranks):
                # j is 0-based while topic ids are 1-based
                if node_topic == j + 1:
                    weight = self.in_topic_weight[j]
                else:
                    weight = self.out_topic_weight[j]
                restart = 1.0 - damping + damping * self.conserved_topic_mass[j]
                new_topic_ranks.append(damping * incoming + restart * weight)

            self.mass += new_rank
            yield key, [new_rank, new_topic_ranks, outlinks]

    def dangling_reducer_final(self):
        """Diagnostic: report the total vanilla rank emitted by this reducer.

        Not wired into steps().  It writes to stderr because anything printed
        on stdout would be read back as job output.
        """
        import sys
        sys.stderr.write('* {0}\n'.format(self.mass))

    def steps(self):
        """Two steps: distribute mass, then teleport and redistribute.

        The *_reducer_final diagnostics are deliberately left out; pass them
        as reducer_final=... to log mass totals while debugging.
        """
        return [MRStep(mapper=self.pr_mapper,
                       reducer_init=self.pr_reducer_init,
                       reducer=self.pr_reducer,
                       jobconf=self._jobconf(self.options.reducers)),
                # No mapper: step 1 already emits the records step 2 needs.
                # A single reduce task is required so the one reducer instance
                # that receives '*' also handles every node.
                MRStep(reducer_init=self.dangling_reducer_init,
                       reducer=self.dangling_reducer,
                       jobconf=self._jobconf(1))]
    
# Only when this file is executed as a script (not when it is imported by the
# notebook): call run() on the job class.
if __name__ == '__main__':
    MrJob94.run()

Overwriting MrJob94.py


In [124]:
from MrJob94 import MrJob94
from shutil import copyfile
import sys
    
def page_rank(infile, outfile, node_count, damping_factor, beta):
    """Run MrJob94 once over `infile` and write its raw output to `outfile`.

    Args:
        infile: path handed to the job as its input file.
        outfile: path opened for writing; receives every output line as-is.
        node_count: forwarded to the job as --nodes.
        damping_factor: forwarded to the job as --damping.
        beta: forwarded to the job as --beta.
    """
    job_args = [infile,
                '--nodes', str(node_count),
                '--damping', str(damping_factor),
                '--beta', str(beta)]
    job = MrJob94(args=job_args)

    with open(outfile, 'w') as sink:
        with job.make_runner() as runner:
            runner.run()
            # copy the job's output lines through without decoding them
            sink.writelines(runner.stream_output())

def driver(infile, outfile, damping_factor, beta, max_iterations=100, node_count=100):
    """Iterate page_rank a fixed number of times, feeding each result back in.

    Args:
        infile: working file read by every iteration.  It is OVERWRITTEN with
            the result after each iteration, so re-running the driver resumes
            from the previous state instead of starting over.
        outfile: file that receives the result of each iteration.
        damping_factor: passed through to page_rank.
        beta: passed through to page_rank.
        max_iterations: number of iterations to run (no convergence test).
        node_count: number of nodes in the graph, passed through to page_rank.
    """
    for _ in range(max_iterations):
        page_rank(infile, outfile, node_count, damping_factor, beta)
        # this iteration's output becomes the next iteration's input
        copyfile(outfile, infile)
        
# Run the iterations on the transformed graph with damping 0.85 and beta 0.99.
if __name__ == '__main__':
    driver('Data/randNet-in.txt','Data/randNet-out.txt', 0.85, 0.99)