# PageRank Across Hamilton Data

In [1]:
import urllib
import csv

def read_data(path):
    """
    Read link data from a .csv file and return it as a list of tuples.

    Each returned tuple is ``(row[0], row[1])``: the first two fields of one
    CSV row, representing a link from the first state to the second. Any
    further columns are ignored and completely blank lines are skipped.

    Parameters
    ----------
    path : path of the .csv file to read.

    Returns
    -------
    A list of 2-tuples of strings, in file order.

    Raises
    ------
    IndexError if a non-blank row has fewer than two fields.
    """
    # newline="" lets the csv module do its own line-ending handling, which
    # is what its documentation asks for when passing a file object.
    with open(path, "r", newline="") as f:
        # csv.reader yields [] for a blank line; skip those rather than
        # failing on row[1]. The reader is consumed lazily, no list() copy.
        return [(row[0], row[1]) for row in csv.reader(f) if row]
    
# Load the link pairs from the CSV file (path is relative to the working directory).
data = read_data('Hamilton_Data.csv')

In [2]:
def describe(n):
    """
    Print a one-sentence explanation of row ``n`` of the global ``data`` list.

    The sentence shows the index, the raw tuple stored at that index, and the
    two names of the tuple (title-cased) as "A mentions B in a song".
    Nothing is returned.
    """
    link = data[n]
    speaker = link[0].title()
    mentioned = link[1].title()

    template = ('"Element {} of the Hamilton data set is {}. '
                'This means that {} mentions {} in a song."')
    print(template.format(n, link, speaker, mentioned))


In [3]:
# Example: print the description of row 5 of the data set.
describe(5)

"Element 5 of the Hamilton data set is ('burr', 'betsy'). This means that Burr mentions Betsy in a song."


In [4]:
def data_to_dictionary(data):
    """
    Group link targets by their source.

    Parameters
    ----------
    data : iterable of indexable pairs; element 0 is the source of a link and
        element 1 is its target. One-shot iterables (e.g. generators) are
        fine because the input is traversed only once.

    Returns
    -------
    A plain dict mapping each source to the list of its targets, in input
    order. Repeated links are kept as repeated list entries. Keys appear in
    order of first occurrence; names that only ever occur as targets get no
    key.
    """
    my_dict = {}
    for link in data:
        # setdefault creates the list on first sight of a source and appends
        # in the same pass. A plain dict (not a defaultdict) is returned on
        # purpose so that looking up an unknown key still raises KeyError.
        my_dict.setdefault(link[0], []).append(link[1])
    return my_dict
    
        

In [5]:
class PR_DiGraph:
    """
    Directed graph of one-way relationships, used as the source of a random
    walk.

    Attributes
    ----------
    link_dict : dict built by ``data_to_dictionary(data)``; maps each source
        to the list of its targets.
    iteration_limit : stored as given; the iterator returned by ``__iter__``
        reads it to decide when to stop.
    """

    def __init__(self, data, iteration_limit):
        """
        Build the link dictionary from ``data`` and remember the limit.

        Parameters
        ----------
        data : link pairs, passed straight to ``data_to_dictionary``.
        iteration_limit : stopping point for iteration over this graph.
        """
        self.iteration_limit = iteration_limit
        self.link_dict = data_to_dictionary(data)

    def linked_by(self, x):
        """
        Return the list of targets stored under key ``x``.

        Raises KeyError when ``x`` is not a key of ``link_dict``.
        """
        return self.link_dict[x]

    def __iter__(self):
        """
        Return a new PR_Iterator over this graph.
        """
        return PR_Iterator(self)
        
        

In [6]:
# Build the link graph from the loaded pairs, with an iteration limit of 10000.
D = PR_DiGraph(data, iteration_limit = 10000)

In [7]:
# List every key of the link dictionary, i.e. every name that appears as the
# first element of at least one pair in the Hamilton data set.
list(D.link_dict.keys())

['burr',
 'hamilton',
 'ensemble',
 'company',
 'men',
 'women',
 'angelica',
 'eliza',
 'washington',
 'mulligan',
 'lafayette',
 'laurens',
 'kingGeorge',
 'jefferson',
 'madison',
 'philipH',
 'lee',
 'peggy',
 'seabury',
 'reynolds',
 'doctor']

In [8]:
import random

class PR_Iterator:
    """
    Random-walk iterator over a PR_DiGraph-like object.

    The graph object ``D`` must provide ``link_dict`` (a dict of source ->
    list of targets), ``linked_by(x)`` (raising KeyError for unknown ``x``)
    and ``iteration_limit``.

    Each call to ``__next__`` moves the walk one step and returns the new
    state: with probability ``damping`` it follows a random outgoing link,
    otherwise it teleports to a random key of ``link_dict``.
    """

    def __init__(self, D, damping=0.85, start="hamilton"):
        """
        Parameters
        ----------
        D : the graph to walk over.
        damping : probability of following a link on a step (default 0.85);
            the remaining probability is spent on teleporting.
        start : initial state (default "hamilton", an arbitrary choice). If
            it has no entry in the graph, the first link-following step
            teleports instead.
        """
        self.D = D
        self.i = 0  # number of states returned so far
        self.damping = damping
        self.current_state = start

    def __iter__(self):
        """
        Return self, so the iterator can be used directly in ``for``/``list``.
        """
        return self

    def follow_link(self):
        """
        Move to a randomly chosen target of the current state.

        * If the current state has no entry in the graph (``linked_by``
          raises KeyError), teleport instead.
        * If the chosen target equals the current state (a self-link),
          teleport instead of staying put.
        * Otherwise move to the target, even when the target itself has no
          outgoing links; the following link-step from there teleports.
        """
        # Only the lookup can raise KeyError, so keep the try body minimal.
        try:
            targets = self.D.linked_by(self.current_state)
        except KeyError:
            self.teleport()
            return

        next_state = random.choice(targets)
        if next_state == self.current_state:
            self.teleport()
        else:
            # No extra teleport for targets that are not keys: the walk steps
            # onto them regardless, so such a teleport would be overwritten.
            self.current_state = next_state

    def teleport(self):
        """
        Set the current state to a key of the link dict chosen at random.
        """
        self.current_state = random.choice(list(self.D.link_dict))

    def __next__(self):
        """
        Advance the walk by one step and return the new current state.

        Raises StopIteration once ``iteration_limit`` states have been
        returned.
        """
        # Check the limit before moving, so an exhausted iterator no longer
        # changes state; ">=" also terminates for negative or non-integer
        # limits, where an equality test would never fire.
        if self.i >= self.D.iteration_limit:
            raise StopIteration

        # A single random draw decides between following a link and
        # teleporting.
        if random.random() < self.damping:
            self.follow_link()
        else:
            self.teleport()

        self.i += 1
        return self.current_state
        



In [9]:
# PR_DiGraph defines __iter__, so list() can collect the whole random walk
obj = PR_DiGraph(data, iteration_limit=1000000)
L = list(obj)

# Tally how often each state was visited in a single pass over L.
# (Calling L.count(state) per state would rescan the whole list every time.)
my_dict = {}
for state in L:
    my_dict[state] = my_dict.get(state, 0) + 1
    


In [10]:
# Number of times each state was visited during the random walk; these raw
# counts (not normalised) serve as the PageRank estimate for each state.
my_dict

{'madison': 37294,
 'jAdams': 31276,
 'reynolds': 28802,
 'mulligan': 21388,
 'washington': 92482,
 'hamilton': 166199,
 'peggy': 20704,
 'company': 17165,
 'men': 17352,
 'burr': 99492,
 'schuylerSis': 19246,
 'jefferson': 72446,
 'eliza': 51500,
 'philipH': 26285,
 'women': 17025,
 'lafayette': 33931,
 'lee': 33654,
 'theodosiaMother': 1706,
 'angelica': 48168,
 'seabury': 17035,
 'ensemble': 16929,
 'marthaWashington': 1743,
 'eacker': 6220,
 'kingGeorge': 28800,
 'philipS': 7717,
 'laurens': 27423,
 'green': 3876,
 'doctor': 16722,
 'betsy': 1716,
 'knox': 3983,
 'generalMercer': 1694,
 'sAdams': 3412,
 'rochambeau': 3820,
 'maria': 1735,
 'paine': 1951,
 'generalMontgomery': 1769,
 'weeks': 1647,
 'jay': 1637,
 'franklin': 1904,
 'sally': 2809,
 'conway': 1762,
 'pendleton': 1726,
 'ness': 1698,
 'kingLouis': 1778,
 'theodosiaDaughter': 1628,
 'admiralHowe': 751}

### Top 10 'states' ranked by highest PageRank. These are the characters who are introduced and/or discussed most often across the entirety of the play.

In [11]:
# rank the states by visit count, highest first, and keep the top ten
ranked = sorted(my_dict.items(), key=lambda item: item[1], reverse=True)
sorted_dict = ranked[:10]
sorted_dict

[('hamilton', 166199),
 ('burr', 99492),
 ('washington', 92482),
 ('jefferson', 72446),
 ('eliza', 51500),
 ('angelica', 48168),
 ('madison', 37294),
 ('lafayette', 33931),
 ('lee', 33654),
 ('jAdams', 31276)]