>### HW 7.0: Shortest path graph distances (toy networks)

>In this part of your assignment you will develop the base of your code for the week.

>Write MRJob classes to find shortest path graph distances, 
as described in the lectures. In addition to finding the distances, 
your code should also output a distance-minimizing path between the source and target.
Work locally for this part of the assignment, and use 
both the undirected and the directed toy networks.

>To prove that your code works, run the following jobs:

>- shortest path in the undirected network from node 1 to node 4  
Solution: 1,5,4 

>- shortest path in the directed network from node 1 to node 5  
Solution: 1,2,4,5

>and report your output---make sure it is correct!

In [88]:
%%writefile shortest_path.py
from mrjob.job import MRJob

class ShortestPathBFS(MRJob):
    ''' One breadth-first-search pass over a graph stored as text records.

        Each input line has the form
            nodeID <TAB> links <TAB> distance <TAB> state
        where links is the text of a dict literal keyed by neighbour id,
        distance is an int (-1 means "no distance known yet") and state
        is 'U' (not yet reached), 'Q' (queued / frontier) or 'V' (visited).

        Every hop counts as distance 1; the values stored in the links
        dict are not used when computing distances.
    '''

    # Precedence used when several records for the same node are merged:
    # a visited node stays visited, and a queued record beats an
    # untouched one, regardless of the order the records arrive in.
    STATE_RANK = {'U': 0, 'Q': 1, 'V': 2}

    class Node:
        ''' In-memory form of one graph record. '''

        def __init__(self, nodeid, links='{}', distance=-1, state='U'):
            ''' Build a node; links is the *text* of a dict literal. '''
            # NOTE(review): eval() executes whatever the input file
            # contains. Acceptable for trusted local toy data only;
            # ast.literal_eval would be the safe replacement if the
            # link field is always a plain dict literal -- confirm.
            self.links = eval(links)
            self.distance = distance
            self.STATE = state
            self.ID = nodeid

        def setDistance(self, distance):
            ''' Overwrite the stored distance. '''
            self.distance = distance

        def setVisited(self):
            ''' Mark the node as visited ('V'). '''
            self.STATE = 'V'

        def setQueued(self):
            ''' Mark the node as queued ('Q'). '''
            self.STATE = 'Q'

        def sendQueuedNodes(self):
            ''' Yield (neighbour_id, record) for every outgoing link.

                The emitted record carries no links ('{}'), a distance
                one larger than this node's, and state 'Q'.
            '''
            for link_id in self.links:
                yield link_id, '\t'.join(['{}', str(self.distance + 1), 'Q'])

        def makeNode(self):
            ''' Serialise as "links<TAB>distance<TAB>state" (no node id). '''
            return '\t'.join([str(self.links), str(self.distance), self.STATE])

    def process_node_occurances(self, nodeID, nodeinfo):
        ''' Parse one "links<TAB>distance<TAB>state" value into a Node.
        '''
        links, distance, state = nodeinfo.split('\t')
        return self.Node(nodeID, links, int(distance), state)

    def mapper(self, _, line):
        ''' Read one node record and emit it, expanding the frontier.

            A queued node first emits a 'Q' record for each neighbour
            and is then re-emitted itself as visited; any other node is
            passed through unchanged.
        '''
        # read line as a node
        nodeID, links, distance, state = line.strip().split('\t')
        current_node = self.Node(nodeID, links, int(distance), state)

        # send queued nodes
        if current_node.STATE == 'Q':
            for node_id, node in current_node.sendQueuedNodes():
                yield node_id, node
            current_node.setVisited()

        # send current node
        yield current_node.ID, current_node.makeNode()

    def reducer(self, nodeID, occurances):
        ''' Merge every record emitted for one node into a single record.

            The merge does not depend on the order of the incoming
            values: links are unioned, the distance is the smallest
            *known* distance (-1 only if none is known), and the state
            is the highest-ranked one seen (V > Q > U).
        '''
        known_distances = []
        node_links = {}
        node_state = 'U'
        for occurance in occurances:
            n = self.process_node_occurances(nodeID, occurance)

            # -1 is the "unknown" sentinel, not a real distance; it must
            # never win a minimum against a distance that is known.
            if n.distance != -1:
                known_distances.append(n.distance)

            node_links.update(n.links)

            # Only ever move up the precedence ladder, so a late 'U'
            # record cannot undo an earlier 'Q' or 'V'.
            if self.STATE_RANK.get(n.STATE, 0) > self.STATE_RANK.get(node_state, 0):
                node_state = n.STATE

        node_distance = min(known_distances) if known_distances else -1
        current_node = self.Node(nodeID, str(node_links), node_distance, node_state)

        # send node
        yield current_node.ID, current_node.makeNode()

# Run the job when this file is executed as a script.
if __name__ == '__main__':
    ShortestPathBFS.run()

Overwriting shortest_path.py


In [100]:
import shortest_path, os
reload(shortest_path)
from shortest_path import ShortestPathBFS

def find_distance(file_name, node_num):
    ''' Return the distance stored for node_num in a graph file.

        Each line of file_name must hold exactly four tab-separated
        fields: node id, links, distance, state. The distance of the
        first line whose node id equals node_num (compared as a string)
        is returned as an int; None is returned when no line matches.
    '''
    with open(file_name, 'r') as graph_file:
        for record in graph_file:
            current_id, _links, raw_distance, _state = record.split('\t')
            if current_id != node_num:
                continue
            return int(raw_distance)
    return None

# NOTE(review): the assignment text asks for node 1 -> node 5 on the
# directed network (and 1 -> 4 on the undirected one); this run targets
# node 4 on the directed file -- confirm this is the intended pairing.
SOURCE_FILE = 'Data/directed_toy.txt'
TEMP_FILE = 'Data/graph_tmp.txt'
START_NODE = '1'
END_NODE = '4'
# Safety cap on the number of BFS passes, so a bad input cannot loop forever.
MAX_ITERATIONS = 10

# Seed the working file: copy every "nodeid<TAB>links" record and append
# the initial distance/state. Only the start node is queued at distance 0.
# The source is opened first so a missing source does not truncate TEMP_FILE.
with open(SOURCE_FILE, 'r') as r, open(TEMP_FILE, 'w') as w:
    for line in r:
        line = line.strip()
        nodeid, links = line.split('\t')
        if nodeid == START_NODE:
            distance = 0
            state = 'Q'
        else:
            distance = -1
            state = 'U'
        w.write('\t'.join((nodeid, links, str(distance), state)) + '\n')

args = [TEMP_FILE, '--strict-protocols']
mrjob = ShortestPathBFS(args=args)

# Run one MapReduce pass per BFS level, feeding each pass's output back in
# as the next pass's input, until the target has a distance, the frontier
# is exhausted, or the iteration cap is hit.
i = 0
target_found = False
queue_empty = False
while not (target_found or queue_empty) and i < MAX_ITERATIONS:
    i += 1
    # Assume the frontier is exhausted until a queued node shows up in
    # this pass's output; with no 'Q' node a further pass changes nothing.
    queue_empty = True
    with mrjob.make_runner() as runner, open(TEMP_FILE + '.running', 'w') as f:
        runner.run()

        for line in runner.stream_output():
            # write line to temp file
            nodeid, node = mrjob.parse_output_line(line)
            f.write('\t'.join((nodeid, node)) + '\n')

            _, distance, state = node.split('\t')
            if state == 'Q':
                queue_empty = False

            # the target's first known distance is final: every hop costs 1
            if nodeid == END_NODE and distance != '-1':
                target_found = True

    # swap the freshly written graph in as the next pass's input
    os.remove(TEMP_FILE)
    os.rename(TEMP_FILE + '.running', TEMP_FILE)

print('End at iteration {}'.format(i))
print('Distance: {}'.format(find_distance(TEMP_FILE, END_NODE)))

End at iteration 2
Distance: 2
