In [147]:
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
import json
from nltk.tokenize import word_tokenize
import re
from collections import Counter
from nltk.corpus import stopwords
import string
import pandas as pd
import pytz
vincent.core.initialize_notebook()

In [25]:
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [32]:
def to_json(data_array):
        # #initialize geo_data json (just a dict here) to feed in to the maps
        geo_data = {
            "type": "FeatureCollection",
            "features": []
        }

        #populate the json file
        for d in data_array:
            geo_json_feature = {
                    "type": "Feature",
                    "geometry": {"type" : "Point", "coordinates" : d['coordinates']},
                    "properties": {
                        "text": d['text'],
                        "created_at": d['created_at']
                    }
                }
            geo_data['features'].append(geo_json_feature)

        #write the json out to a file
        with open('geo_data.json', 'w') as fout:
            fout.write(json.dumps(geo_data, indent=4))

In [29]:
def vector_add(v,w):
    return[v_i + w_i for v_i, w_i in zip(v,w)]

def vector_sum(vectors):
    result = vectors[0]
    for vector in vectors[1:]:
        result = vector_add(result, vector)
    return result

def vector_subtract(v, w):
    return [v_i - w_i for v_i, w_i in zip(v,w)]

def scalar_multiply(c, v):
    return[c * v_i for v_i in v]

def vector_mean(vectors):
    n = len(vectors)
    return scalar_multiply(1/n, vector_sum(vectors))

def dot(v, w):
    return sum(v_i * w_i for v_i, w_i in zip(v,w))

def sum_of_squares(v):
    return dot(v,v)

def squared_distance(v,w):
    return sum_of_squares(vector_subtract(v,w))

In [30]:
class KMeans(object):
    """performs k-means clustering"""

    def __init__(self, k):
        self.k = k          # number of clusters
        self.means = None   # means of clusters

    def classify(self, input):
        """return the index of the cluster closest to the input"""
        return min(range(self.k),
                   key=lambda i: squared_distance(input, self.means[i]))

    def train(self, inputs):

        self.means = random.sample(inputs, self.k)
        assignments = None

        while True:
            # Find new assignments
            new_assignments = list(map(self.classify, inputs))

            # If no assignments have changed, we're done.
            if assignments == new_assignments:
                return

            # Otherwise keep the new assignments,
            assignments = new_assignments

            for i in range(self.k):
                i_points = [p for p, a in zip(inputs, assignments) if a == i]
                # avoid divide-by-zero if i_points is empty
                if i_points:
                    self.means[i] = vector_mean(i_points)


In [None]:
f = open('/Users/calvin/Documents/Lehigh/English/Research/data/cap1.pkl', 'rb')
data_array = []
count = 0
x_sum = 0
y_sum = 0
#pull out the first 10000 tweets, note this is easy to change, but speed and space
#concerns make this limited. I think that doing a random sample would be better
for x in range(0,700000):
    try:
        dd = pkl.load(f)
    except EOFError:
        break
    except Exception:
        print(count)
        count += 1
        unpickler.load()
        continue
    else:
        #right now we just take the first coordinate in the bounding box as the actual
        #we could average to find the middle, but this seems good enough for now
        if dd['coordinates'] == None:
            if dd['place'] == None:
                continue
            dd['coordinates'] = dd['place']['bounding_box']['coordinates'][0][0]
        else:
            #account for edge case where coordinates are wrapped
            dd['coordinates'] = dd['coordinates']['coordinates']
            
        #count how many samples we take
#         count += 1
#         # print(dd.keys())
#         # print(dd)

#         #sum up the coordinate values
#         x_sum += dd['coordinates'][0]
#         y_sum += dd['coordinates'][1]

        #append the data point to the data array
        data_array.append(dd)

In [143]:
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via', 'I\'m', 'I', '😂', 'like','get','don\'t',]

count_all = Counter()
for d in data_array:
#     tokens = preprocess(d['text'])
    terms_stop = [term for term in preprocess(d['text']) if term not in stop]
    terms_hash = [term for term in preprocess(d['text']) 
              if term.startswith('#')]
    count_all.update(terms_hash)
    
print(count_all.most_common(100))


[('#Hiring', 11231), ('#job', 8833), ('#Job', 7615), ('#Jobs', 6855), ('#CareerArc', 5815), ('#hiring', 3224), ('#Retail', 2071), ('#MakeTVShowsEvil', 2024), ('#Hospitality', 1794), ('#', 1785), ('#Nursing', 1588), ('#Healthcare', 1440), ('#Veterans', 1378), ('#420', 1283), ('#Sales', 994), ('#NYPrimary', 853), ('#traffic', 806), ('#IT', 709), ('#CustomerService', 544), ('#Transportation', 485), ('#ImWithHer', 469), ('#trndnl', 460), ('#Blackhawks', 423), ('#FeelTheBern', 387), ('#BusinessMgmt', 372), ('#Manufacturing', 357), ('#orlpol', 325), ('#TSOUWithoutYou', 317), ('#Accounting', 311), ('#Finance', 280), ('#Engineering', 280), ('#SoundCloud', 272), ('#Clerical', 269), ('#Trump', 264), ('#NewYork', 257), ('#TweetMe', 255), ('#Toronto', 251), ('#stlblues', 250), ('#SONIC', 247), ('#Physician', 246), ('#realestate', 244), ('#Houston', 241), ('#np', 241), ('#SkilledTrade', 234), ('#BernieOrBust', 224), ('#Boston', 220), ('#Banking', 212), ('#Cosmetology', 212), ('#Repost', 208), ('#Ke

In [164]:
san_fran = [-123.2556,36.8489,-120.3223,38.4235]
if (p.x > x && p.y > y && p.x < x+width && p.y < y+height)

In [None]:
def checkBox(d, box):
    x = d['coordinates'][0]
    y = d['coordinates'][0]
    if(x < box[0]) return false;
    if(point.y < rect.y) return false;
    if(point.x >= rect.x + rect.width) return false;
    if(point.y >= rect.y + rect.height) return false;
return true;

In [165]:
filtered = []
for d in data_array:

    if "#Boston" in d['text']:
        filtered.append(d)
print(len(filtered))
        

242


In [138]:
to_json(filtered)

In [162]:
dates = []
eastern = pytz.timezone('US/Eastern')
for d in filtered:
    dates.append(d['created_at'])
ones = [1]*len(dates)
idx = pd.DatetimeIndex(dates).tz_localize(pytz.utc).tz_convert(eastern)
dates_series = pd.Series(ones, index=idx)
# Resampling / bucketing
per_minuite = dates_series.resample('1Min', how='sum').fillna(0)


the new syntax is .resample(...).sum()


In [163]:
time_chart = vincent.Line(per_minuite)
time_chart.axis_titles(x='Time', y='Freq')

In [174]:
max_rt = data_array[0]['retweet_count']
tweet = data_array[0]
for d in data_array:
    if d['retweet_count'] > max_rt:
        max_rt = d['retweet_count']
        tweet = d
print(max_rt)
print(d)

0
{'geo': None, 'in_reply_to_user_id': None, 'lang': 'en', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'favorited': False, 'id_str': '722788938310483968', 'in_reply_to_screen_name': None, 'id': 722788938310483968, 'is_quote_status': False, 'place': {'name': 'Wichita Falls', 'place_type': 'city', 'country_code': 'US', 'attributes': {}, 'url': 'https://api.twitter.com/1.1/geo/id/b980515f617707a9.json', 'id': 'b980515f617707a9', 'country': 'United States', 'full_name': 'Wichita Falls, TX', 'bounding_box': {'coordinates': [[[-98.614411, 33.835461], [-98.614411, 34.017379], [-98.425702, 34.017379], [-98.425702, 33.835461]]], 'type': 'Polygon'}}, 'contributors': None, 'retweeted': False, 'text': "&amp; if that doesn't describe our generation &amp; what's going wrong &amp; causing so much heartache then idk man.", 'in_reply_to_status_id_str': None, 'timestamp_ms': '1461161286319', 'in_reply_to_user_id_str': None, 'retweet_count': 0, 'entitie

In [None]:
print(data_array[-1])