In [8]:
from YakGrabber import *

import numpy as np

In [6]:
# Read the campus coordinates. Every row of latlong.csv is split on ", "
# into three fields: a name followed by two numbers (latitude, longitude).
# The file is opened in a `with` block so the handle is closed as soon as
# the rows have been read instead of lingering until garbage collection.
with open('latlong.csv') as latlong_file:
    lines = [line.rstrip('\n').split(", ") for line in latlong_file]

# Normalise each row to (lower-cased name, float, float).
locations = [(line[0].lower(), float(line[1]), float(line[2])) for line in lines]
locations

[('princeton', 40.3487, -74.6593),
 ('harvard', 42.3744, -71.1169),
 ('yale', 41.3111, -72.9267),
 ('columbia', 40.8075, -73.9619),
 ('stanford', 37.43, -122.17),
 ('chicago', 41.7897, -87.5997),
 ('mit', 42.3598, -71.0921),
 ('duke', 36.0011, -78.9389),
 ('penn', 39.95, -75.19),
 ('caltech', 34.1386, -118.1255),
 ('dartmouth', 43.7033, -72.2883),
 ('johns hopkins', 39.3289, -76.6203),
 ('northwestern', 42.05598, -87.6752),
 ('wustl', 38.648, -90.305),
 ('cornell', 42.48, -76.4511),
 ('brown', 41.8262, -71.4032),
 ('notre dame', 41.703, -86.239),
 ('vanderbilt', 36.1486, -86.805),
 ('rice', 29.7169, -95.4028),
 ('uc berkeley', 37.87, -122.259)]

In [58]:
# YakGrabber takes care of retrieving Yaks and does no preprocessing.
# yg.fetch_yaks returns all the yaks a user would currently see
# (usually on the order of the 100 most recent yaks).
# The location is meant to be Stanford's lat/long coords, because that is
# the location we're interested in (no coordinates are passed here, so
# presumably this comes from YakGrabber's own defaults -- TODO confirm).
yg = YakGrabber()

# Design note: we considered giving yaks a fixed amount of time to collect
# upvotes, but that is not objective, because it depends on how active
# yik yak is when the yak is posted: a yak posted at 2am might need more
# time to be upvoted properly than a yak posted at 6pm.
# Waiting until a yak has left the "new" section avoids the problem: that
# section has a fixed size, so yaks do not drop out after a specific
# amount of time.

# All yaks that are 'old' and no longer actively voted on: either voted
# off, or no longer present in the recents section. Stored as a list of
# tuples of the form
# (messageID, handle, message, numberOfLikes)
# For a yak that was voted off, numberOfLikes is float("-inf").
completed_yaks = list()

# Yaks that are still being voted on.
# Dict of type: messageID -> [handle, message, numberOfLikes]
current_yaks = dict()

In [59]:
# if completed_yaks becomes super big, just make it a numpy array and pickle it or something
def handle_new_yak_set(completed_yaks, current_yaks, new_yaks_dict):
    """Fold a freshly fetched snapshot of yaks into the running state.

    completed_yaks and current_yaks are updated in place; new_yaks_dict is
    only read and is never modified, not even by later calls.

    Args:
        completed_yaks: list that receives one
            (messageID, handle, message, numberOfLikes) tuple for every yak
            that was in current_yaks but is missing from new_yaks_dict.
        current_yaks: dict messageID -> [handle, message, numberOfLikes]
            holding the yaks tracked so far.
        new_yaks_dict: dict of the same shape with the latest snapshot.
            Records may be any indexable sequence (list or tuple).

    Returns:
        None.
    """
    # MOVE ALL OLD YAKS INTO completed_yaks
    # Iterate over a copy of the keys because entries are deleted on the way.
    for yak_id in list(current_yaks):
        if yak_id in new_yaks_dict:
            continue
        # The yak is no longer in circulation. Either it was voted off or it
        # is too old; we cannot know which, so we use the heuristic that a
        # negative like count at the time it was last seen means "voted off"
        # and record that case as -inf.
        record = current_yaks[yak_id]
        likes = record[2]
        final_likes = float("-inf") if likes < 0 else float(likes)
        completed_yaks.append((yak_id, record[0], record[1], final_likes))
        del current_yaks[yak_id]

    # UPDATE ALL CURRENT YAKS THAT ARE LEFT
    for yak_id in new_yaks_dict:
        fresh = new_yaks_dict[yak_id]
        if yak_id in current_yaks:
            # Only the like count is refreshed; handle and message are kept.
            current_yaks[yak_id][2] = fresh[2]
        else:
            # Store a private copy: sharing the caller's list would let the
            # like-count update above write through into an older snapshot,
            # and a tuple record could not be updated at all.
            current_yaks[yak_id] = list(fresh)

In [60]:
# TEST CODE FOR THE ABOVE FUNCTION

# completed_yaks = []

# current_yaks = {}

# test_yaks_1 = {"1": ["", "message 1", 2],
#                "2": ["", "message 2", -2],
#                "3": ["", "message 3", 2],
#                "4": ["", "message 4", 2]}

# handle_new_yak_set(completed_yaks, current_yaks, test_yaks_1)
# print completed_yaks
# print current_yaks

# test_yaks_2 = {"1": ["", "message 1", 4],
#                "6": ["", "message 2", -2],
#                "3": ["", "message 3", -1],
#                "5": ["", "message 5", 10]}

# handle_new_yak_set(completed_yaks, current_yaks, test_yaks_2)
# print completed_yaks
# print current_yaks

In [95]:
def make_dict(yaks_arr):
    """Index raw yak records by their message ID.

    Each element of yaks_arr is looked up by the keys 'messageID',
    'message' and 'numberOfLikes'; 'handle' is optional.

    Returns a dict messageID -> [handle, message, numberOfLikes]. A handle
    that is absent or falsy (None, empty string) is stored as "". If the
    same messageID occurs more than once, the last record wins.
    """
    yaks_by_id = {}
    for yak in yaks_arr:
        handle = ""
        if 'handle' in yak and yak['handle']:
            handle = yak['handle']
        record = [handle, yak['message'], yak['numberOfLikes']]
        yaks_by_id[yak["messageID"]] = record
    return yaks_by_id
# Take one snapshot from the grabber and index it by message ID, producing
# the messageID -> [handle, message, numberOfLikes] dict that
# handle_new_yak_set expects as its third argument.
most_recent_yaks = yg.fetch_yaks()
new_yaks_dict = make_dict(most_recent_yaks)

In [96]:
# Fold the latest snapshot into the running state. This mutates both
# completed_yaks and current_yaks in place, so re-running the cell with a
# newer snapshot advances the state rather than recomputing it.
handle_new_yak_set(completed_yaks, current_yaks, new_yaks_dict)

In [97]:
# Inspect the state after the update: every retired yak, and how many yaks
# are still being tracked. The parenthesised single-argument form prints
# exactly the same text under Python 2 and is also valid Python 3.
print(completed_yaks)
print(len(current_yaks))

[(u'R/55637c257f4242ec0f0179af18e1e', '', u'Happiness at this school is preceded by drive and the guilt that comes from not being able to provide for that drive all the time.', 14.0), (u'R/55637c46150304482e56ac6a6b55c', '', u'Only money buys you time ', 1.0), (u'R/55637c4eac96c74004e22d7291af4', '', u'Money is the power ', 2.0), (u'R/55637d3e3846af4578aefe2dd24fd', u'Uncrushed', u"Stanford Crushes is ending and I've still never been mentioned.", 62.0), (u'R/55637eae3832f1cd781b4b2ff0e8b', '', u"I get why people don't appreciate them, but I personally feel my humanities major is preparing me well for life outside of Stanford", 83.0), (u'R/55637e958450b6c2f5aab21d0805e', '', u'Is the library open today?', 3.0), (u'R/5563871a60a4461a8a766b7bc5889', '', u'Breaking a mirror is 7 years of bad luck. Breaking a condom is 18 years of bad luck. ', 231.0), (u'R/556386f37ca6188e4bbe53e0b82e6', '', u'Can somebody suggest a good place to get a tattoo around here?', 5.0), (u'R/55637f7dc893da0356bf71

In [98]:
import datetime
import os

# np.save does not create missing directories, so make sure the output
# folder exists before writing; otherwise the save fails with an IOError.
if not os.path.isdir("data"):
    os.makedirs("data")

# Snapshot the completed yaks under a timestamped name so successive grabs
# do not overwrite each other. np.save appends the ".npy" suffix itself.
# NOTE: the mixed (text, text, text, float) tuples are coerced to a single
# unicode array on save, so like counts come back from np.load as strings
# such as u'14.0' and have to be converted with float() before use.
np.save(("data/yak_grab_" + str(datetime.datetime.now())), completed_yaks)

In [100]:
# Read one previously saved grab back to check the round trip. The file name
# is hard-coded to a specific earlier run. Every column, including the like
# count, is returned as a unicode string (see the displayed array).
np.load("data/yak_grab_2015-05-25 19:24:11.720222.npy")

array([[u'R/55637c257f4242ec0f0179af18e1e', u'',
        u'Happiness at this school is preceded by drive and the guilt that comes from not being able to provide for that drive all the time.',
        u'14.0'],
       [u'R/55637c46150304482e56ac6a6b55c', u'',
        u'Only money buys you time ', u'1.0'],
       [u'R/55637c4eac96c74004e22d7291af4', u'', u'Money is the power ',
        u'2.0'],
       [u'R/55637d3e3846af4578aefe2dd24fd', u'Uncrushed',
        u"Stanford Crushes is ending and I've still never been mentioned.",
        u'62.0'],
       [u'R/55637eae3832f1cd781b4b2ff0e8b', u'',
        u"I get why people don't appreciate them, but I personally feel my humanities major is preparing me well for life outside of Stanford",
        u'83.0'],
       [u'R/55637e958450b6c2f5aab21d0805e', u'',
        u'Is the library open today?', u'3.0'],
       [u'R/5563871a60a4461a8a766b7bc5889', u'',
        u'Breaking a mirror is 7 years of bad luck. Breaking a condom is 18 years of bad luck. 

In [101]:
# Number of yaks that have been retired into completed_yaks so far.
len(completed_yaks)

18

In [9]:
# Load the grab saved by a later run (file name hard-coded to that run's
# timestamp). X is the array of completed-yak rows stored in that file.
X = np.load("data/yak_grab_2015-05-26 17:12:38.840583.npy")

In [11]:
# Number of rows (completed yaks) in the loaded grab.
len(X)

101