# Make a Tree Out of a Graph

<img src="images/Screen Shot 2019-02-10 at 3.31.26 PM.png" />





In [9]:
from itertools import combinations

marvel_bipartite = {}

# Builds bipartite graph from data file
# Keys are comic books and values are dicts of characters
with open("data/marvel.tsv") as f:
    for line in f:
        # Drop the line terminator so the comic id is not stored with a
        # trailing newline; otherwise a final line without a terminator
        # would be filed under a different key than the same comic elsewhere.
        line = line.rstrip("\r\n")
        if not line:
            continue  # tolerate blank lines (e.g. at the end of the file)

        # Fields are split verbatim on the tab, so any quote characters in
        # the file stay part of the names.
        character, comic = line.split("\t")
        marvel_bipartite.setdefault(comic, {})[character] = 1

marvel_graph = {}

# Builds character to character graph from bipartite graph
# Nodes are characters; an edge weight is the number of comics shared
for comic, character_dict in marvel_bipartite.items():
    # combinations() yields each unordered pair exactly once, in the same
    # order as the nested index loops would, without rebuilding the key
    # list for every pair.
    for pair in combinations(list(character_dict), 2):
        # Store every edge under its alphabetically smaller endpoint so a
        # pair is never split across two entries.
        character_1, character_2 = sorted(pair)
        neighbors = marvel_graph.setdefault(character_1, {})
        neighbors[character_2] = neighbors.get(character_2, 0) + 1

max_edge = 0
max_nodes = ()

# Finds highest weighted edge in graph (first one encountered wins ties)
for character_1, character_1_dict in marvel_graph.items():
    for character_2, edge_value in character_1_dict.items():
        if edge_value > max_edge:
            max_edge = edge_value
            max_nodes = (character_1, character_2)

print("max_edge:", max_edge)
print("max_nodes:", max_nodes)
            
        

max_edge: 744
max_nodes: ('"HUMAN TORCH/JOHNNY S"', '"THING/BENJAMIN J. GR"')


# Matrix Multiplication

<img src="images/Screen Shot 2019-02-11 at 11.00.35 AM.png" />





# Weighted Social Networks

<img src="images/Screen Shot 2019-02-11 at 11.03.04 AM.png" />

<img src="images/Screen Shot 2019-02-11 at 11.04.26 AM.png" />





# How to Find the Shortest Path

<img src="images/Screen Shot 2019-02-11 at 11.42.27 AM.png" />



# Dijkstra's Shortest Path Algorithm

One way to conceptualize Dijkstra's is to use a breadth-first search while keeping track of the shortest path to each node in a dictionary. One key difference is that each time you come across a node that has already been visited, you need to compare that path length to the shortest length seen so far. If the path is shorter, you need to update the shortest path dictionary and traverse the connecting edges to see if there are any other paths that should be updated.

One question to be answered is whether the lock-down step is necessary. If you're keeping track of the shortest so far, you may not need to do this.

<img src="images/Screen Shot 2019-02-12 at 3.05.20 PM.png" />



# Dijkstra's Using Heaps

<img src="images/Screen Shot 2019-02-12 at 3.48.21 PM.png" />

<img src="images/Screen Shot 2019-02-12 at 3.49.50 PM.png" />


# All Pairs Shortest Paths

<img src="images/Screen Shot 2019-02-12 at 4.39.56 PM.png" />






# Floyd-Warshall

<img src="images/Screen Shot 2019-02-12 at 4.43.13 PM.png" />

<img src="images/Screen Shot 2019-02-12 at 4.47.57 PM.png" />

<img src="images/Screen Shot 2019-02-12 at 6.10.22 PM.png" />

# Randomizing Clustering Coefficient

<img src="images/Screen Shot 2019-02-12 at 6.14.03 PM.png" />

<img src="images/Screen Shot 2019-02-12 at 6.14.38 PM.png" />


# Bounds on the Estimate

<img src="images/Screen Shot 2019-02-12 at 6.16.01 PM.png" />


In [6]:
import math
# Scratch check of the rounding used in get_parent's index formula:
# ceil(1/2.0) evaluates to 1 (see the recorded output below).
int(math.ceil(1/2.0))

1

In [None]:
#
# The code below uses a linear
# scan to find the unfinished node
# with the smallest distance from
# the source.
#
# Modify it to use a heap instead
# 

import math

def insert_node(heap, node, value):
    """Add the pair ``(node, value)`` to the heap.

    The pair is appended at the end of the list and then bubbled up with
    ``up_heapify``.  ``heap`` is modified in place and is also returned.
    """
    heap.append((node, value))
    return up_heapify(heap, len(heap) - 1)
    
def get_parent(heap, index):
    """Return the index of the parent of the entry at ``index``.

    The root (index 0) is reported as its own parent.  ``heap`` itself is
    not inspected.
    """
    if index == 0:
        return index

    # Children of slot p live at 2p + 1 and 2p + 2, so integer division
    # recovers p from either child.
    return (index - 1) // 2

def get_left_child(heap, index):
    """Return the index of the left child of ``index``.

    Returns ``None`` when that slot lies beyond the end of ``heap``.
    """
    candidate = 2 * index + 1
    return candidate if candidate < len(heap) else None
    
def get_right_child(heap, index):
    """Return the index of the right child of ``index``.

    Returns ``None`` when that slot lies beyond the end of ``heap``.
    """
    candidate = 2 * index + 2
    return candidate if candidate < len(heap) else None
    
def down_heapify(heap, index):
    """Bubble the entry at ``index`` down until heap order holds below it.

    Entries are ``(node, value)`` pairs ordered by ``value`` (smallest at
    the root).  ``heap`` is modified in place and is also returned.
    """
    while True:
        left = get_left_child(heap, index)
        if not left:
            # No children at all: the entry is a leaf.
            return heap

        right = get_right_child(heap, index)
        if not right:
            # Only a left child exists, and it must be a leaf, so a single
            # conditional swap finishes the job.
            if heap[left][1] < heap[index][1]:
                heap[index], heap[left] = heap[left], heap[index]
            return heap

        # Pick the child with the smaller value (the right one on a tie).
        smaller = left if heap[left][1] < heap[right][1] else right
        if heap[index][1] < heap[smaller][1]:
            return heap

        heap[index], heap[smaller] = heap[smaller], heap[index]
        index = smaller

def up_heapify(heap, index):
    """Bubble the entry at ``index`` up while its parent has a larger value.

    ``heap`` is modified in place and is also returned.
    """
    while True:
        parent = get_parent(heap, index)
        # get_parent reports the root as its own parent, which ends the climb.
        if index == parent or not heap[parent][1] > heap[index][1]:
            return heap

        heap[parent], heap[index] = heap[index], heap[parent]
        index = parent
    
def update_heap(heap, index):
    """Restore heap order after the value stored at ``index`` was changed.

    Runs ``up_heapify`` and then ``down_heapify``, both starting from the
    same ``index``.  ``heap`` is modified in place and is also returned.
    """
    return down_heapify(up_heapify(heap, index), index)
    
def remove_smallest_node(heap):
    """Pop the root entry and return ``(min_node, heap)``.

    The heap is printed first (debug trace).  The root is swapped with the
    last entry, popped off the end, and the new root is bubbled down.
    """
    print(heap)

    heap[0], heap[-1] = heap[-1], heap[0]
    smallest = heap.pop()
    return smallest, down_heapify(heap, 0)
    

def shortest_dist_node(dist):
    """Return the key of ``dist`` with the smallest value.

    ``dist`` maps node -> distance.  The first key seen wins a tie.
    Returns the placeholder ``'undefined'`` when ``dist`` is empty.

    Note: the heap-based ``dijkstra`` in this cell no longer calls this
    helper (the call is commented out there).
    """
    best_node = 'undefined'
    # Start from infinity rather than a fixed large number so that
    # arbitrarily large finite distances can still be selected.
    best_value = float('inf')
    for v in dist:
        if dist[v] < best_value:
            (best_node, best_value) = (v, dist[v])
    return best_node

def dijkstra(G, v):
    """Return shortest-path distances from ``v`` to the nodes reachable from it.

    ``G`` maps node -> {neighbor: edge weight}.  ``dist_so_far`` is a heap
    stored as a list of ``(node, distance)`` tuples.  The result maps each
    settled node to its final distance; nodes that cannot be reached from
    ``v`` are left out instead of crashing on an empty heap.
    """
    dist_so_far = [(v, 0)]
    final_dist = {}

    # Stop as soon as the frontier is empty: that happens before every node
    # is settled when part of the graph is unreachable.
    while dist_so_far and len(final_dist) < len(G):
        w, dist_so_far = remove_smallest_node(dist_so_far)
        node, node_dist = w
        # lock it down!
        final_dist[node] = node_dist

        for x in G[node]:
            if x in final_dist:
                continue

            candidate = node_dist + G[node][x]

            # Locate x in the heap.  This is a linear scan because this
            # version keeps no node -> position index.
            heap_index = None
            for i, entry in enumerate(dist_so_far):
                if entry[0] == x:
                    heap_index = i
                    break

            if heap_index is None:
                dist_so_far = insert_node(dist_so_far, x, candidate)
            elif candidate < dist_so_far[heap_index][1]:
                dist_so_far[heap_index] = (x, candidate)
                dist_so_far = update_heap(dist_so_far, heap_index)
    return final_dist

############
# 
# Test

def make_link(G, node1, node2, w):
    """Add weight ``w`` to the undirected edge between ``node1`` and ``node2``.

    ``G`` maps node -> {neighbor: weight}.  Missing nodes and edges are
    created on demand, the weight is accumulated in both directions, and
    ``G`` is returned.
    """
    for source, target in ((node1, node2), (node2, node1)):
        neighbors = G.setdefault(source, {})
        neighbors[target] = neighbors.get(target, 0) + w
    return G


def test():
    """Check ``dijkstra`` against two known distances on a 7-node graph."""
    a, b, c, d, e, f, g = 'A', 'B', 'C', 'D', 'E', 'F', 'G'
    edges = [
        (a, c, 3), (c, b, 10), (a, b, 15), (d, b, 9), (a, d, 4), (d, f, 7),
        (d, e, 3), (e, g, 1), (e, f, 5), (f, g, 2), (b, f, 1),
    ]

    G = {}
    for node1, node2, weight in edges:
        make_link(G, node1, node2, weight)

    dist = dijkstra(G, a)
    assert dist[g] == 8  # a -> d -> e -> g
    assert dist[b] == 11  # a -> d -> e -> g -> f -> b

In [39]:
#
# The code below uses a linear
# scan to find the unfinished node
# with the smallest distance from
# the source.
#
# Modify it to use a heap instead
# 

import math

def insert_node(heap, index_dict, node, value):
    """Add ``(node, value)`` to the heap and record its position.

    ``index_dict`` maps node -> position in ``heap``.  The new pair is
    appended, registered, and bubbled up.  Both structures are modified in
    place and returned as ``(heap, index_dict)``.
    """
    position = len(heap)
    heap.append((node, value))
    index_dict[node] = position
    return up_heapify(heap, index_dict, position)
    
def get_parent(heap, index):
    """Return the parent position of ``index``; the root is its own parent.

    ``heap`` is accepted for symmetry with the child helpers but is not read.
    """
    if index == 0:
        return index

    # Slot p has its children at 2p + 1 and 2p + 2.
    return (index - 1) // 2

def get_left_child(heap, index):
    """Return the left-child position of ``index``, or ``None`` if absent."""
    position = index * 2 + 1
    if position >= len(heap):
        return None
    return position
    
def get_right_child(heap, index):
    """Return the right-child position of ``index``, or ``None`` if absent."""
    position = index * 2 + 2
    if position >= len(heap):
        return None
    return position

def swap_heap_values(heap, index_dict, index_a, index_b):
    """Swap two heap entries and update their recorded positions.

    ``index_dict`` maps node -> position.  The positions are stored exactly
    as passed in (a negative ``index_b`` is recorded as-is).  Returns
    ``(heap, index_dict)``; both are modified in place.
    """
    entry_a, entry_b = heap[index_a], heap[index_b]
    heap[index_a], heap[index_b] = entry_b, entry_a

    # Keep this order: when both slots hold the same node, the second
    # assignment is the one that sticks.
    index_dict[entry_a[0]] = index_b
    index_dict[entry_b[0]] = index_a

    return heap, index_dict
    
def down_heapify(heap, index_dict, index):
    """Bubble the entry at ``index`` down, keeping ``index_dict`` in sync.

    Entries are ``(node, value)`` pairs ordered by ``value``.  Every move
    goes through ``swap_heap_values`` so recorded positions stay correct.
    Returns ``(heap, index_dict)``.
    """
    while True:
        left = get_left_child(heap, index)
        if not left:
            # The entry has become a leaf.
            break

        right = get_right_child(heap, index)
        if not right:
            # A lone left child is necessarily a leaf: one conditional
            # swap is all that can be needed.
            if heap[left][1] < heap[index][1]:
                heap, index_dict = swap_heap_values(heap, index_dict, index, left)
            break

        # Pick the child with the smaller value (the right one on a tie).
        smaller = left if heap[left][1] < heap[right][1] else right
        if heap[index][1] < heap[smaller][1]:
            break

        heap, index_dict = swap_heap_values(heap, index_dict, index, smaller)
        index = smaller

    return heap, index_dict

def up_heapify(heap, index_dict, index):
    """Bubble the entry at ``index`` up while its parent has a larger value.

    Moves are made with ``swap_heap_values`` so ``index_dict`` stays
    accurate.  Returns ``(heap, index_dict)``.
    """
    while True:
        parent = get_parent(heap, index)
        # The root is reported as its own parent, which ends the climb.
        if index == parent or not heap[parent][1] > heap[index][1]:
            break

        heap, index_dict = swap_heap_values(heap, index_dict, index, parent)
        index = parent

    return heap, index_dict
    
def update_heap(heap, index_dict, index):
    """Restore heap order after the value stored at ``index`` was changed.

    Runs ``up_heapify`` and then ``down_heapify``, both starting from the
    same ``index``.  Returns ``(heap, index_dict)``.
    """
    heap, index_dict = up_heapify(heap, index_dict, index)
    return down_heapify(heap, index_dict, index)
    
def remove_smallest_node(heap, index_dict):
    """Pop the root entry and return ``(min_node, heap, index_dict)``.

    The heap is printed first (debug trace).  The root is swapped with the
    last entry, popped, dropped from ``index_dict``, and the new root is
    bubbled down.
    """
    print(heap)

    heap, index_dict = swap_heap_values(heap, index_dict, 0, -1)
    smallest = heap.pop()
    # The swap recorded position -1 for the removed node; forget it entirely.
    del index_dict[smallest[0]]

    heap, index_dict = down_heapify(heap, index_dict, 0)
    return smallest, heap, index_dict
    

def shortest_dist_node(dist):
    """Return the key of ``dist`` with the smallest value.

    ``dist`` maps node -> distance.  The first key seen wins a tie.
    Returns the placeholder ``'undefined'`` when ``dist`` is empty.

    Note: the heap-based ``dijkstra`` in this cell no longer calls this
    helper (the call is commented out there).
    """
    best_node = 'undefined'
    # Infinity instead of a fixed large number: distances of 1000000 or
    # more must still be selectable.
    best_value = float('inf')
    for v in dist:
        if dist[v] < best_value:
            (best_node, best_value) = (v, dist[v])
    return best_node

def dijkstra(G, v):
    """Return shortest-path distances from ``v`` to the nodes reachable from it.

    ``G`` maps node -> {neighbor: edge weight}.  ``dist_so_far`` is a heap
    of ``(node, distance)`` tuples and ``index_dict`` maps each queued node
    to its position in that heap, so queued nodes are found without a scan.
    Nodes that cannot be reached from ``v`` are left out of the result
    instead of crashing on an empty heap.
    """
    dist_so_far = [(v, 0)]
    index_dict = {v: 0}
    final_dist = {}

    # Stop as soon as the frontier is empty: that happens before every node
    # is settled when part of the graph is unreachable.
    while dist_so_far and len(final_dist) < len(G):
        w, dist_so_far, index_dict = remove_smallest_node(dist_so_far, index_dict)
        node, node_dist = w

        # lock it down!
        final_dist[node] = node_dist

        for x in G[node]:
            if x in final_dist:
                continue

            candidate = node_dist + G[node][x]
            if x not in index_dict:
                # Debug trace; the recorded cell output depends on it.
                print("dist_so_far:", dist_so_far)
                dist_so_far, index_dict = insert_node(dist_so_far, index_dict, x, candidate)
            elif candidate < dist_so_far[index_dict[x]][1]:
                position = index_dict[x]
                dist_so_far[position] = (x, candidate)
                dist_so_far, index_dict = update_heap(dist_so_far, index_dict, position)
    return final_dist

############
# 
# Test

def make_link(G, node1, node2, w):
    """Accumulate weight ``w`` on the undirected edge ``node1`` -- ``node2``.

    ``G`` maps node -> {neighbor: weight}.  Nodes and edges are created
    when missing; the same ``G`` object is returned.
    """
    forward = G.setdefault(node1, {})
    forward[node2] = forward.get(node2, 0) + w

    backward = G.setdefault(node2, {})
    backward[node1] = backward.get(node1, 0) + w

    return G


def test():
    """Check the heap-based ``dijkstra`` on a small weighted graph."""
    a, b, c, d, e, f, g = 'A', 'B', 'C', 'D', 'E', 'F', 'G'
    edges = [
        (a, c, 3), (c, b, 10), (a, b, 15), (d, b, 9), (a, d, 4), (d, f, 7),
        (d, e, 3), (e, g, 1), (e, f, 5), (f, g, 2), (b, f, 1),
    ]

    G = {}
    for node1, node2, weight in edges:
        make_link(G, node1, node2, weight)

    dist = dijkstra(G, a)
    assert dist[g] == 8  # a -> d -> e -> g
    assert dist[b] == 11  # a -> d -> e -> g -> f -> b

    


# Run the check above; an AssertionError means dijkstra returned a wrong distance.
test()



[('A', 0)]
('dist_so_far:', [])
('dist_so_far:', [('C', 3)])
('dist_so_far:', [('C', 3), ('B', 15)])
[('C', 3), ('B', 15), ('D', 4)]
[('D', 4), ('B', 13)]
('dist_so_far:', [('B', 13)])
('dist_so_far:', [('E', 7), ('B', 13)])
[('E', 7), ('B', 13), ('F', 11)]
('dist_so_far:', [('F', 11), ('B', 13)])
[('G', 8), ('B', 13), ('F', 11)]
[('F', 10), ('B', 13)]
[('B', 11)]


In [38]:
# Builds the same 7-node graph used in test() and displays every distance from 'A'.
# NOTE(review): the recorded output below reports 'B': 13, while test()
# asserts 11 for that node -- this output presumably predates a fix; confirm
# by re-running the cell.
(a,b,c,d,e,f,g) = ('A', 'B', 'C', 'D', 'E', 'F', 'G')
triples = ((a,c,3),(c,b,10),(a,b,15),(d,b,9),(a,d,4),(d,f,7),(d,e,3), 
           (e,g,1),(e,f,5),(f,g,2),(b,f,1))
G = {}
for (i,j,k) in triples:
    make_link(G, i, j, k)

dist = dijkstra(G, a)
dist

[('A', 0)]
('dist_so_far:', [])
('dist_so_far:', [('C', 3)])
('dist_so_far:', [('C', 3), ('B', 15)])
[('C', 3), ('B', 15), ('D', 4)]
('dist_so_far:', [('C', 3), ('B', 13)])
('dist_so_far:', [('C', 3), ('B', 13), ('E', 7)])
[('C', 3), ('F', 11), ('E', 7), ('B', 13)]
[('C', 3), ('F', 11), ('E', 7)]
('dist_so_far:', [('C', 3), ('F', 11)])
[('C', 3), ('F', 11), ('G', 8)]
[('C', 3), ('F', 10)]
[('C', 3)]


{'A': 0, 'B': 13, 'C': 3, 'D': 4, 'E': 7, 'F': 10, 'G': 8}

In [12]:
#
# The code below uses a linear
# scan to find the unfinished node
# with the smallest distance from
# the source.
#
# Modify it to use a heap instead
# 

def shortest_dist_node(dist):
    """Return the key of ``dist`` with the smallest value (linear scan).

    ``dist`` maps node -> tentative distance.  The first key seen wins a
    tie.  Returns the placeholder ``'undefined'`` when ``dist`` is empty.
    """
    best_node = 'undefined'
    # Infinity instead of a fixed large number: distances of 1000000 or
    # more must still be selectable.
    best_value = float('inf')
    for v in dist:
        if dist[v] < best_value:
            (best_node, best_value) = (v, dist[v])
    return best_node

def dijkstra(G, v):
    """Return shortest-path distances from ``v`` to the nodes reachable from it.

    ``G`` maps node -> {neighbor: edge weight}.  ``dist_so_far`` holds the
    tentative distances of discovered but unsettled nodes; the closest one
    is found with a linear scan (``shortest_dist_node``).  Nodes that
    cannot be reached from ``v`` are left out of the result.
    """
    dist_so_far = {v: 0}
    final_dist = {}

    # An empty frontier with nodes still unsettled means the rest of the
    # graph is unreachable; stop instead of looking up a placeholder key.
    while dist_so_far and len(final_dist) < len(G):
        w = shortest_dist_node(dist_so_far)
        # lock it down!
        final_dist[w] = dist_so_far.pop(w)

        for x in G[w]:
            if x in final_dist:
                continue
            candidate = final_dist[w] + G[w][x]
            if x not in dist_so_far or candidate < dist_so_far[x]:
                dist_so_far[x] = candidate
    return final_dist

############
# 
# Test

def make_link(G, node1, node2, w):
    """Add ``w`` to the weight of the undirected edge ``node1`` -- ``node2``.

    ``G`` maps node -> {neighbor: weight}; missing nodes and edges are
    created first.  Returns ``G``.
    """
    for here, there in ((node1, node2), (node2, node1)):
        row = G.setdefault(here, {})
        row[there] = row.get(there, 0) + w
    return G


def test():
    """Check the linear-scan ``dijkstra`` on a small weighted graph."""
    a, b, c, d, e, f, g = 'A', 'B', 'C', 'D', 'E', 'F', 'G'
    edges = [
        (a, c, 3), (c, b, 10), (a, b, 15), (d, b, 9), (a, d, 4), (d, f, 7),
        (d, e, 3), (e, g, 1), (e, f, 5), (f, g, 2), (b, f, 1),
    ]

    G = {}
    for node1, node2, weight in edges:
        make_link(G, node1, node2, weight)

    dist = dijkstra(G, a)
    assert dist[g] == 8  # a -> d -> e -> g
    assert dist[b] == 11  # a -> d -> e -> g -> f -> b
    
# Run the check above; it is silent on success and raises AssertionError otherwise.
test()

In [1]:
import pandas as pd

# Load the tab-separated character/comic pairs; the file has no header row,
# so the two column names are supplied here.
df = pd.read_csv("data/marvel.tsv", sep="\t", header=None, names=["character", "comic"])
# Preview the first rows.
df.head()

Unnamed: 0,character,comic
0,24-HOUR MAN/EMMANUEL,AA2 35
1,3-D MAN/CHARLES CHAN,AVF 4
2,3-D MAN/CHARLES CHAN,AVF 5
3,3-D MAN/CHARLES CHAN,COC 1
4,3-D MAN/CHARLES CHAN,H2 251


In [2]:
from collections import Counter
from fractions import Fraction

In [90]:
def create_character_graph(df):
    """Build a weighted character graph from a character/comic table.

    ``df`` needs ``character`` and ``comic`` columns.  For each character
    the result maps every co-appearing character to ``1.0 / n``, where
    ``n`` is the number of rows in which that co-character appears in one
    of the character's comics.  Characters without any co-character are
    reported with a print and left out of the graph.
    """
    character_graph = {}

    for character in sorted(pd.unique(df.character)):
        own_rows = df.character == character
        comics = df.loc[own_rows, "comic"].tolist()

        # Rows of other characters that share at least one of those comics.
        shares_comic = df.comic.isin(comics)
        co_characters = df.loc[shares_comic & ~own_rows, "character"].tolist()

        if not co_characters:
            print("No co_characters for:", character)
            continue

        # More shared appearances -> smaller weight.
        character_graph[character] = {
            name: 1.0 / count for name, count in Counter(co_characters).items()
        }

    return character_graph

# Build the weighted co-appearance graph from the character/comic table `df`.
character_graph = create_character_graph(df)

('No co_characters for:', 'BERSERKER II')
('No co_characters for:', 'BLARE/')
('No co_characters for:', 'CALLAHAN, DANNY')
('No co_characters for:', 'CLUMSY FOULUP')
('No co_characters for:', 'DEATHCHARGE')
('No co_characters for:', 'FENRIS')
('No co_characters for:', 'GERVASE, LADY ALYSSA')
('No co_characters for:', 'GIURESCU, RADU')
('No co_characters for:', 'JOHNSON, LYNDON BAIN')
('No co_characters for:', 'KULL')
('No co_characters for:', 'LUNATIK II')
('No co_characters for:', 'MARVEL BOY II/MARTIN')
('No co_characters for:', 'RANDAK')
('No co_characters for:', 'RED WOLF II')
('No co_characters for:', 'RUNE')
('No co_characters for:', 'SEA LEOPARD')
('No co_characters for:', 'SHARKSKIN')
('No co_characters for:', 'ZANTOR')


In [91]:
def bfs(graph, node):
    """Breadth-first search from ``node`` over ``graph``.

    ``graph`` maps node -> {neighbor: edge weight}.  Returns a dict mapping
    every reached node to ``{'weight': ..., 'hops': ...}``: the hop count
    of the path on which the node was first discovered and the summed edge
    weights along that same path.
    """
    simple_paths = {node: {'hops': 0, 'weight': 0}}
    queue = [node]
    head = 0  # position of the next node to expand; the list only grows

    while head < len(queue):
        current_node = queue[head]
        head += 1
        reached = simple_paths[current_node]

        for co_character, edge_weight in graph[current_node].items():
            if co_character in simple_paths:
                continue
            queue.append(co_character)
            simple_paths[co_character] = {
                'weight': reached['weight'] + edge_weight,
                'hops': reached['hops'] + 1,
            }

    return simple_paths

# Start nodes for the comparison between hop-count and weighted paths.
# NOTE(review): several names end with a trailing space, presumably to match
# the names exactly as stored in the data file -- confirm before editing.
characters_of_interest = [
    'SPIDER-MAN/PETER PAR',
    'GREEN GOBLIN/NORMAN ',
    'WOLVERINE/LOGAN ',
    'PROFESSOR X/CHARLES ',
    'CAPTAIN AMERICA'
]

# Maps each start character to the bfs() result computed from it.
character_simple_paths = {}

for character in characters_of_interest:
    character_simple_paths[character] = bfs(character_graph, character)

In [92]:
def get_smallest(dist_so_far):
    """Return the node in ``dist_so_far`` with the smallest ``'weight'``.

    ``dist_so_far`` maps node -> {'hops': ..., 'weight': ...}.  The first
    node seen wins a tie.  Returns ``''`` when ``dist_so_far`` is empty.
    """
    smallest_node = ''
    # Infinity instead of a fixed large number: weights of 1000000 or more
    # must still be selectable.
    smallest_weight = float('inf')
    for node, value in dist_so_far.items():
        if value['weight'] < smallest_weight:
            smallest_node = node
            smallest_weight = value['weight']

    return smallest_node

def dijkstra(character_graph, simple_paths, node):
    """Weighted shortest paths from ``node``, tracking hop counts as well.

    ``character_graph`` maps node -> {neighbor: edge weight}.  The search
    runs until as many nodes are settled as there are entries in
    ``simple_paths``.  Returns a dict mapping each settled node to
    ``{'hops': ..., 'weight': ...}`` for its minimum-weight path.
    """
    final_distances = {}
    dist_so_far = {node: {'hops': 0, 'weight': 0}}

    while len(final_distances) < len(simple_paths):
        w = get_smallest(dist_so_far)
        settled = dist_so_far.pop(w)
        final_distances[w] = settled

        for co_character, edge_weight in character_graph[w].items():
            if co_character in final_distances:
                continue

            new_weight = settled['weight'] + edge_weight
            known = dist_so_far.get(co_character)
            # Record the path if it is the first one found or a lighter one.
            if known is None or new_weight < known['weight']:
                dist_so_far[co_character] = {
                    'hops': settled['hops'] + 1,
                    'weight': new_weight,
                }

    return final_distances
    
# Running total, over all start characters, of the nodes whose hop count on
# the minimum-weight path differs from the hop count recorded by bfs().
weighted_count = 0
    
for character, simple_paths in character_simple_paths.items():
    
    weighted_paths = dijkstra(character_graph, simple_paths, character)
    
    # Each True in the list counts as 1 when summed.
    weighted_count += sum([weighted_paths[c]['hops'] != simple_paths[c]['hops'] for c in simple_paths])
    print("weighted_count:", weighted_count)
    
    
weighted_count          

('weighted_count:', 4408)
('weighted_count:', 10183)
('weighted_count:', 15582)
('weighted_count:', 19747)
('weighted_count:', 23289)


23289

In [3]:
#
# Another way of thinking of a path in the Kevin Bacon game 
# is not about finding *short* paths, but by finding paths 
# that don’t use obscure movies.  We will give you a 
# list of movies along with their obscureness score.  
#
# For this assignment, we'll approximate obscurity 
# based on the multiplicative inverse of the amount of 
# money the movie made.  Though, it's not really important where
# the obscurity score came from.
#
# Use the imdb-1.tsv and imdb-weights.tsv files to find
# the obscurity of the “least obscure” 
# path from a given actor to another.  
# The obscurity of a path is the maximum obscurity of 
# any of the movies used along the path.
#
# You will have to do the processing in your local environment
# and then copy in your answer.
#
# Hint: A variation of Dijkstra can be used to solve this problem.
#

# Change the `None` values in this dictionary to be the obscurity score
# of the least obscure path between the two actors.
# Keys are (start actor, end actor) pairs; a later cell overwrites each
# value with the score it computes.
answer = {(u'Boone Junior, Mark', u'Del Toro, Benicio'): None,
          (u'Braine, Richard', u'Coogan, Will'): None,
          (u'Byrne, Michael (I)', u'Quinn, Al (I)'): None,
          (u'Cartwright, Veronica', u'Edelstein, Lisa'): None,
          (u'Curry, Jon (II)', u'Wise, Ray (I)'): None,
          (u'Di Benedetto, John', u'Hallgrey, Johnathan'): None,
          (u'Hochendoner, Jeff', u'Cross, Kendall'): None,
          (u'Izquierdo, Ty', u'Kimball, Donna'): None,
          (u'Jace, Michael', u'Snell, Don'): None,
          (u'James, Charity', u'Tuerpe, Paul'): None,
          (u'Kay, Dominic Scott', u'Cathey, Reg E.'): None,
          (u'McCabe, Richard', u'Washington, Denzel'): None,
          (u'Reid, Kevin (I)', u'Affleck, Rab'): None,
          (u'Reid, R.D.', u'Boston, David (IV)'): None,
          (u'Restivo, Steve', u'Preston, Carrie (I)'): None,
          (u'Rodriguez, Ramon (II)', u'Mulrooney, Kelsey'): None,
          (u'Rooker, Michael (I)', u'Grady, Kevin (I)'): None,
          (u'Ruscoe, Alan', u'Thornton, Cooper'): None,
          (u'Sloan, Tina', u'Dever, James D.'): None,
          (u'Wasserman, Jerry', u'Sizemore, Tom'): None}

# Here are some test cases.
# For example, the obscurity score of the least obscure path
# between 'Ali, Tony' and 'Allen, Woody' is 0.5657
# NOTE: this rebinds the name `test`, which earlier cells define as a
# function; after this cell runs, `test` is this dictionary.
test = {(u'Ali, Tony', u'Allen, Woody'): 0.5657,
        (u'Auberjonois, Rene', u'MacInnes, Angus'): 0.0814,
        (u'Avery, Shondrella', u'Dorsey, Kimberly (I)'): 0.7837,
        (u'Bollo, Lou', u'Jeremy, Ron'): 0.4763,
        (u'Byrne, P.J.', u'Clarke, Larry'): 0.109,
        (u'Couturier, Sandra-Jessica', u'Jean-Louis, Jimmy'): 0.3649,
        (u'Crawford, Eve (I)', u'Cutler, Tom'): 0.2052,
        (u'Flemyng, Jason', u'Newman, Laraine'): 0.139,
        (u'French, Dawn', u'Smallwood, Tucker'): 0.2979,
        (u'Gunton, Bob', u'Nagra, Joti'): 0.2136,
        (u'Hoffman, Jake (I)', u'Shook, Carol'): 0.6073,
        (u'Kamiki, Ry\xfbnosuke', u'Thor, Cameron'): 0.3644,
        (u'Roache, Linus', u'Dreyfuss, Richard'): 0.6731,
        (u'Sanchez, Phillip (I)', u'Wiest, Dianne'): 0.5083,
        (u'Sheppard, William Morgan', u'Crook, Mackenzie'): 0.0849,
        (u'Stan, Sebastian', u'Malahide, Patrick'): 0.2857,
        (u'Tessiero, Michael A.', u'Molen, Gerald R.'): 0.2056,
        (u'Thomas, Ken (I)', u'Bell, Jamie (I)'): 0.3941,
        (u'Thompson, Sophie (I)', u'Foley, Dave (I)'): 0.1095,
        (u'Tzur, Mira', u'Heston, Charlton'): 0.3642}

# Load the actor/movie/year rows and the per-movie obscurity scores; neither
# file has a header row, so column names are supplied here.
movie_actor_df = pd.read_csv("data/imdb.tsv", sep="\t", header=None, names=["actor", "movie", "year"])
movie_obscurity_df = pd.read_csv("data/imdb_obscurity.tsv", sep="\t", header=None, names=["movie", "year", "obscurity"])

# Left join on (movie, year): every actor/movie row is kept and gains an
# "obscurity" column.
movie_df = pd.merge(movie_actor_df, movie_obscurity_df, on=["movie", "year"], how="left")
# Display the rows that found no obscurity score (the recorded output lists none).
movie_df[movie_df.obscurity.isnull()]

Unnamed: 0,actor,movie,year,obscurity


In [4]:
def create_movie_graph(df):
    """Build an actor graph weighted by movie obscurity.

    ``df`` needs ``actor``, ``movie``, ``year`` and ``obscurity`` columns.
    The result maps every actor to ``{costar: obscurity}``, where the value
    is the smallest obscurity among the (movie, year) entries the two
    actors share.  An actor without co-stars maps to an empty dict.
    """
    movie_graph = {}

    for actor in sorted(pd.unique(df.actor)):
        appearances = df.loc[df.actor == actor, ["movie", "year"]]
        not_actor = df.actor != actor

        # Gather the co-star rows of each film and concatenate them once.
        # DataFrame.append was removed in pandas 2.0, and growing a frame
        # inside the loop copied it on every iteration.
        frames = []
        for movie, year in zip(appearances["movie"].tolist(), appearances["year"].tolist()):
            costars = df.loc[not_actor & (df.year == year) & (df.movie == movie), ["actor", "obscurity"]]
            if not costars.empty:
                frames.append(costars)

        if frames:
            # Keep the least obscure shared movie for every co-star.
            costar_dict = pd.concat(frames).groupby("actor")["obscurity"].min().to_dict()
        else:
            costar_dict = {}

        movie_graph[actor] = costar_dict

    return movie_graph

# Build the actor graph from the merged actor/movie/obscurity table.
movie_graph = create_movie_graph(movie_df)

In [5]:
def get_smallest_node(weights_so_far):
    """Return the key of ``weights_so_far`` with the smallest value.

    ``weights_so_far`` maps node -> weight.  The first node seen wins a
    tie.  Returns ``""`` when ``weights_so_far`` is empty.
    """
    # Infinity instead of a fixed large number: weights of 1000000 or more
    # must still be selectable.
    smallest_weight = float("inf")
    smallest_node = ""

    for node, weight in weights_so_far.items():
        if weight < smallest_weight:
            smallest_weight = weight
            smallest_node = node

    return smallest_node

def dijkstra_search(start_node, end_node, graph):
    """Find the least obscure path score from ``start_node`` to ``end_node``.

    ``graph`` maps node -> {neighbor: obscurity}.  A path's score is the
    largest obscurity of any edge on it, so this Dijkstra variant combines
    edges with ``max`` instead of ``+``.  The search stops once
    ``end_node`` is settled.

    Returns a dict mapping every settled node to its score.  ``end_node``
    is missing from that dict when it cannot be reached from
    ``start_node``.
    """
    paths_so_far = {start_node: 0}
    final_paths = {}

    # An empty frontier means end_node is unreachable: stop instead of
    # looking up the placeholder key that get_smallest_node returns for an
    # empty dict.
    while paths_so_far and end_node not in final_paths:
        w = get_smallest_node(paths_so_far)
        final_paths[w] = paths_so_far.pop(w)

        for neighbor, edge_obscurity in graph[w].items():
            if neighbor in final_paths:
                continue
            # Score of the path through w: the worst edge seen so far.
            new_path = max(final_paths[w], edge_obscurity)
            if neighbor not in paths_so_far or new_path < paths_so_far[neighbor]:
                paths_so_far[neighbor] = new_path

    return final_paths
                    

In [8]:
# Fill in `answer`: for each (start, end) pair run the search and store the
# score of the end node. Only values of existing keys are reassigned, so
# changing the dict while iterating over it is safe here.
for tup in answer:
    start_node, end_node = tup
    smallest_paths = dijkstra_search(start_node, end_node, movie_graph)
    answer[tup] = smallest_paths[end_node]
    print("answer[{}]:".format(tup), answer[tup])
    
answer    
    

answer[('Boone Junior, Mark', 'Del Toro, Benicio')]: 0.2979
answer[('Braine, Richard', 'Coogan, Will')]: 1.1345
answer[('Byrne, Michael (I)', 'Quinn, Al (I)')]: 0.1736
answer[('Cartwright, Veronica', 'Edelstein, Lisa')]: 0.7161
answer[('Curry, Jon (II)', 'Wise, Ray (I)')]: 0.2872
answer[('Di Benedetto, John', 'Hallgrey, Johnathan')]: 0.8361
answer[('Hochendoner, Jeff', 'Cross, Kendall')]: 0.6228
answer[('Izquierdo, Ty', 'Kimball, Donna')]: 0.2616
answer[('Jace, Michael', 'Snell, Don')]: 0.6758
answer[('James, Charity', 'Tuerpe, Paul')]: 0.5079
answer[('Kay, Dominic Scott', 'Cathey, Reg E.')]: 0.2184
answer[('McCabe, Richard', 'Washington, Denzel')]: 0.4031
answer[('Reid, Kevin (I)', 'Affleck, Rab')]: 0.5147
answer[('Reid, R.D.', 'Boston, David (IV)')]: 0.5768
answer[('Restivo, Steve', 'Preston, Carrie (I)')]: 0.3628
answer[('Rodriguez, Ramon (II)', 'Mulrooney, Kelsey')]: 0.2394
answer[('Rooker, Michael (I)', 'Grady, Kevin (I)')]: 0.3693
answer[('Ruscoe, Alan', 'Thornton, Cooper')]: 0.4

{('Boone Junior, Mark', 'Del Toro, Benicio'): 0.2979,
 ('Braine, Richard', 'Coogan, Will'): 1.1345,
 ('Byrne, Michael (I)', 'Quinn, Al (I)'): 0.1736,
 ('Cartwright, Veronica', 'Edelstein, Lisa'): 0.7161,
 ('Curry, Jon (II)', 'Wise, Ray (I)'): 0.2872,
 ('Di Benedetto, John', 'Hallgrey, Johnathan'): 0.8361,
 ('Hochendoner, Jeff', 'Cross, Kendall'): 0.6228,
 ('Izquierdo, Ty', 'Kimball, Donna'): 0.2616,
 ('Jace, Michael', 'Snell, Don'): 0.6758,
 ('James, Charity', 'Tuerpe, Paul'): 0.5079,
 ('Kay, Dominic Scott', 'Cathey, Reg E.'): 0.2184,
 ('McCabe, Richard', 'Washington, Denzel'): 0.4031,
 ('Reid, Kevin (I)', 'Affleck, Rab'): 0.5147,
 ('Reid, R.D.', 'Boston, David (IV)'): 0.5768,
 ('Restivo, Steve', 'Preston, Carrie (I)'): 0.3628,
 ('Rodriguez, Ramon (II)', 'Mulrooney, Kelsey'): 0.2394,
 ('Rooker, Michael (I)', 'Grady, Kevin (I)'): 0.3693,
 ('Ruscoe, Alan', 'Thornton, Cooper'): 0.4072,
 ('Sloan, Tina', 'Dever, James D.'): 0.5636,
 ('Wasserman, Jerry', 'Sizemore, Tom'): 0.1999}

In [6]:
# Known (start actor, end actor) -> expected obscurity scores, used to check
# dijkstra_search.
# NOTE: this rebinds the name `test`, which earlier cells define as a function.
test = {(u'Ali, Tony', u'Allen, Woody'): 0.5657,
        (u'Auberjonois, Rene', u'MacInnes, Angus'): 0.0814,
        (u'Avery, Shondrella', u'Dorsey, Kimberly (I)'): 0.7837,
        (u'Bollo, Lou', u'Jeremy, Ron'): 0.4763,
        (u'Byrne, P.J.', u'Clarke, Larry'): 0.109,
        (u'Couturier, Sandra-Jessica', u'Jean-Louis, Jimmy'): 0.3649,
        (u'Crawford, Eve (I)', u'Cutler, Tom'): 0.2052,
        (u'Flemyng, Jason', u'Newman, Laraine'): 0.139,
        (u'French, Dawn', u'Smallwood, Tucker'): 0.2979,
        (u'Gunton, Bob', u'Nagra, Joti'): 0.2136,
        (u'Hoffman, Jake (I)', u'Shook, Carol'): 0.6073,
        (u'Kamiki, Ry\xfbnosuke', u'Thor, Cameron'): 0.3644,
        (u'Roache, Linus', u'Dreyfuss, Richard'): 0.6731,
        (u'Sanchez, Phillip (I)', u'Wiest, Dianne'): 0.5083,
        (u'Sheppard, William Morgan', u'Crook, Mackenzie'): 0.0849,
        (u'Stan, Sebastian', u'Malahide, Patrick'): 0.2857,
        (u'Tessiero, Michael A.', u'Molen, Gerald R.'): 0.2056,
        (u'Thomas, Ken (I)', u'Bell, Jamie (I)'): 0.3941,
        (u'Thompson, Sophie (I)', u'Foley, Dave (I)'): 0.1095,
        (u'Tzur, Mira', u'Heston, Charlton'): 0.3642}

# Display the `answer` dict.
answer

In [7]:

# Re-run the search for every known pair and print whether the computed
# score equals the expected one exactly, followed by both values.
for tup in test:
    start_node, end_node = tup
    smallest_paths = dijkstra_search(start_node, end_node, movie_graph)
    print(test[tup] == smallest_paths[end_node])
    print("expected answer[{}]:".format(tup), test[tup])
    print("answer[{}]:".format(tup), smallest_paths[end_node])    
    
test    
    

True
expected answer[('Ali, Tony', 'Allen, Woody')]: 0.5657
answer[('Ali, Tony', 'Allen, Woody')]: 0.5657
True
expected answer[('Auberjonois, Rene', 'MacInnes, Angus')]: 0.0814
answer[('Auberjonois, Rene', 'MacInnes, Angus')]: 0.0814
True
expected answer[('Avery, Shondrella', 'Dorsey, Kimberly (I)')]: 0.7837
answer[('Avery, Shondrella', 'Dorsey, Kimberly (I)')]: 0.7837
True
expected answer[('Bollo, Lou', 'Jeremy, Ron')]: 0.4763
answer[('Bollo, Lou', 'Jeremy, Ron')]: 0.4763
True
expected answer[('Byrne, P.J.', 'Clarke, Larry')]: 0.109
answer[('Byrne, P.J.', 'Clarke, Larry')]: 0.109
True
expected answer[('Couturier, Sandra-Jessica', 'Jean-Louis, Jimmy')]: 0.3649
answer[('Couturier, Sandra-Jessica', 'Jean-Louis, Jimmy')]: 0.3649
True
expected answer[('Crawford, Eve (I)', 'Cutler, Tom')]: 0.2052
answer[('Crawford, Eve (I)', 'Cutler, Tom')]: 0.2052
True
expected answer[('Flemyng, Jason', 'Newman, Laraine')]: 0.139
answer[('Flemyng, Jason', 'Newman, Laraine')]: 0.139
True
expected answer[('F

{('Ali, Tony', 'Allen, Woody'): 0.5657,
 ('Auberjonois, Rene', 'MacInnes, Angus'): 0.0814,
 ('Avery, Shondrella', 'Dorsey, Kimberly (I)'): 0.7837,
 ('Bollo, Lou', 'Jeremy, Ron'): 0.4763,
 ('Byrne, P.J.', 'Clarke, Larry'): 0.109,
 ('Couturier, Sandra-Jessica', 'Jean-Louis, Jimmy'): 0.3649,
 ('Crawford, Eve (I)', 'Cutler, Tom'): 0.2052,
 ('Flemyng, Jason', 'Newman, Laraine'): 0.139,
 ('French, Dawn', 'Smallwood, Tucker'): 0.2979,
 ('Gunton, Bob', 'Nagra, Joti'): 0.2136,
 ('Hoffman, Jake (I)', 'Shook, Carol'): 0.6073,
 ('Kamiki, Ryûnosuke', 'Thor, Cameron'): 0.3644,
 ('Roache, Linus', 'Dreyfuss, Richard'): 0.6731,
 ('Sanchez, Phillip (I)', 'Wiest, Dianne'): 0.5083,
 ('Sheppard, William Morgan', 'Crook, Mackenzie'): 0.0849,
 ('Stan, Sebastian', 'Malahide, Patrick'): 0.2857,
 ('Tessiero, Michael A.', 'Molen, Gerald R.'): 0.2056,
 ('Thomas, Ken (I)', 'Bell, Jamie (I)'): 0.3941,
 ('Thompson, Sophie (I)', 'Foley, Dave (I)'): 0.1095,
 ('Tzur, Mira', 'Heston, Charlton'): 0.3642}