In [1]:
import json
import os

from collections import defaultdict
from time import time

from emoji import UNICODE_EMOJI
from IPython.core.display import display, HTML
from tweepy import OAuthHandler, Stream, StreamListener, API as TwApi

from util.misc import SKIN_TONES, STATE_LOOKUP, STATES
from util.tfidf import tfidf

In [2]:
# Twitter API credentials are read from environment variables rather than
# written into the notebook; os.environ.get() yields None for any unset variable.
CONSUMER_KEY = os.environ.get('TW_CONSUMER_KEY')
CONSUMER_SECRET = os.environ.get('TW_CONSUMER_SECRET')
ACCESS_TOKEN_KEY = os.environ.get('TW_ACCESS_TOKEN_KEY')
ACCESS_TOKEN_SECRET = os.environ.get('TW_ACCESS_TOKEN_SECRET')

# directory (relative path) that the notebook's data files are read from and written to
DATA_DIR = 'data2'
# bounding box passed as the stream's `locations` filter
# NOTE(review): values look like [west lon, south lat, east lon, north lat] -- confirm
USA_BBOX = [-175.1, 22.4, -59.8, 72.3]  # via http://boundingbox.klokantech.com/

In [33]:
# a few helper functions

def extract_emojis(txt):
    """Return, in order, every character of `txt` that is a key of UNICODE_EMOJI.

    The text is examined one character at a time and repeated characters are
    kept, so the result has one entry per matching character.
    """
    found = []
    for ch in txt:
        if ch in UNICODE_EMOJI:
            found.append(ch)
    return found

def sort_values(data):
    """Return the (key, value) pairs of `data` as a list, largest value first.

    The sort is stable, so pairs with equal values keep the order in which
    they appear in `data`.
    """
    pairs = list(data.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

def get_json(fname):
    """Load and return the parsed JSON content of the file `fname` in DATA_DIR.

    The file is decoded as UTF-8 explicitly: the JSON files this notebook
    saves contain raw (non-escaped) emoji characters, and the platform's
    default text encoding is not guaranteed to be UTF-8.
    """
    with open('{}/{}'.format(DATA_DIR, fname), encoding='utf-8') as f:
        return json.load(f)
    
def save_to_json(data, fname):
    """Serialize `data` as JSON into the file `fname` in DATA_DIR (overwriting it).

    `ensure_ascii=False` makes json.dump emit emoji as raw characters, so the
    file is opened as UTF-8 explicitly; with the platform default encoding
    (e.g. cp1252) the write would fail with UnicodeEncodeError.
    """
    with open('{}/{}'.format(DATA_DIR, fname), 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)

**step 1: fetch tweets (within the USA)**

In [7]:
# this uses Twitter's streaming API
# for demo purposes, this only fetches 1k tweets
# (in reality, this ran over several days and collected millions of tweets)

class MyListener(StreamListener):
    """Stream listener that appends incoming statuses to DATA_DIR/tweets.json.

    Every accepted status is written as one JSON document per line. Once
    `limit` statuses have been counted, `on_status` returns False to ask the
    stream to stop.
    """

    def __init__(self, limit=1000):
        """Reset the counter and record the start time.

        limit: number of statuses to count before stopping (default 1000).
        """
        super().__init__()
        self.limit = limit
        self.ct = 0
        self.started = time()

    def on_status(self, data):
        """Store one status and report progress; return False when done."""
        # statuses that carry a `retweeted_status` attribute are not stored
        if hasattr(data, 'retweeted_status'):
            return

        # best effort: a failed write is reported but does not stop the stream
        try:
            with open('{}/tweets.json'.format(DATA_DIR), 'a') as f:
                f.write('{}\n'.format(json.dumps(data._json)))
        except Exception as e:
            print('error: {}'.format(str(e)))

        self.ct += 1
        if self.ct % 100 == 0:
            print('🚨 {} tweets... ({} secs elapsed)'.format(
                self.ct,
                int((time() - self.started))
            ))

        # `>=` stops after exactly `limit` statuses; `>` let one extra through
        if self.ct >= self.limit:
            return False

    def on_error(self, status):
        """Report a stream error; disconnect when rate limited (HTTP 420)."""
        print('uh-oh! ({})'.format(status))

        # returning False disconnects the stream; continuing to reconnect
        # after a 420 only lengthens the rate-limit wait
        if status == 420:
            return False

In [8]:
# build the OAuth handler from the credentials loaded from the environment
auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET)
# NOTE(review): `api` is not referenced again in the visible cells
api = TwApi(auth)

# start streaming with a fresh MyListener, restricted to the USA_BBOX locations filter
stream = Stream(auth, MyListener())
stream.filter(locations=USA_BBOX)

🚨 100 tweets... (4 secs elapsed)
🚨 200 tweets... (8 secs elapsed)
🚨 300 tweets... (13 secs elapsed)
🚨 400 tweets... (17 secs elapsed)
🚨 500 tweets... (21 secs elapsed)
🚨 600 tweets... (26 secs elapsed)
🚨 700 tweets... (29 secs elapsed)
🚨 800 tweets... (34 secs elapsed)
🚨 900 tweets... (38 secs elapsed)
🚨 1000 tweets... (42 secs elapsed)


**step 2: filter tweets to ones containing emojis**

In [9]:
# one trimmed-down record per stored tweet whose text has at least one emoji
filtered = []

with open('{}/tweets.json'.format(DATA_DIR)) as tweets_file:
    for line_no, raw_line in enumerate(tweets_file):
        tweet = json.loads(raw_line)
        found = extract_emojis(tweet['text'])

        # progress marker every 10,000 lines, printed before the emoji check
        if line_no % 10000 == 0:
            print('done with {}...'.format(line_no))

        if found:
            filtered.append(dict(
                id=tweet['id_str'],
                time=tweet['created_at'],
                user=tweet['user']['screen_name'],
                text=tweet['text'],
                coordinates=tweet['coordinates'],
                place=tweet['place'],
                emojis=found,
                emojis_names=[UNICODE_EMOJI[ch] for ch in found],
            ))

print('{} tweets with emojis'.format(len(filtered)))

done with 0...
770 tweets with emojis


In [12]:
# persist the filtered records to DATA_DIR/tweets-w-emojis.json
save_to_json(filtered, 'tweets-w-emojis.json')

**step 3: aggregate by emoji and state**

In [14]:
# reload the filtered records from DATA_DIR/tweets-w-emojis.json
data = get_json('tweets-w-emojis.json')

# show the first record as a sample of the structure
data[0]

{'coordinates': None,
 'emojis': ['💘'],
 'emojis_names': [':heart_with_arrow:'],
 'id': '872474708570144769',
 'place': {'attributes': {},
  'bounding_box': {'coordinates': [[[-75.503507, 43.170051],
     [-75.503507, 43.279981],
     [-75.382773, 43.279981],
     [-75.382773, 43.170051]]],
   'type': 'Polygon'},
  'country': 'United States',
  'country_code': 'US',
  'full_name': 'Rome, NY',
  'id': '00228ed265b1139e',
  'name': 'Rome',
  'place_type': 'city',
  'url': 'https://api.twitter.com/1.1/geo/id/00228ed265b1139e.json'},
 'text': 'I loved it AGAIN 💘thanks to grandma. https://t.co/2c3e4ARSMI',
 'time': 'Wed Jun 07 15:25:54 +0000 2017',
 'user': 'chas_x0'}

In [16]:
# counters: tweets per state, occurrences per emoji name, and occurrences
# per emoji name within each state
state_cts, emoji_cts = defaultdict(int), defaultdict(int)
state_emoji_cts = defaultdict(lambda: defaultdict(int))

for d in data:
    place = d['place']

    # tweets without place information cannot be assigned to a state
    if not place:
        continue

    country, ptype = place['country_code'], place['place_type']
    if country != 'US' or ptype not in ('city', 'admin'):
        continue

    if ptype == 'city':
        # the last two characters of `full_name` are used as the state
        # abbreviation (e.g. 'Rome, NY'). `.get()` turns an unrecognised
        # suffix into None, which the STATES check below skips, instead of
        # aborting the whole aggregation with a KeyError.
        # NOTE(review): assumes STATE_LOOKUP is a dict-like mapping -- confirm
        state = STATE_LOOKUP.get(place['full_name'][-2:].upper())
    else:
        # for 'admin' places the place name itself is taken as the state
        state = place['name']

    if state not in STATES:
        continue

    state_cts[state] += 1
    for e in d['emojis_names']:
        # names listed in SKIN_TONES are not counted
        if e not in SKIN_TONES:
            emoji_cts[e] += 1
            state_emoji_cts[state][e] += 1

In [17]:
# the ten states with the highest tweet counts
sort_values(state_cts)[:10]

[('Texas', 94),
 ('California', 90),
 ('Florida', 59),
 ('Ohio', 44),
 ('New York', 38),
 ('Georgia', 33),
 ('Virginia', 25),
 ('Louisiana', 23),
 ('Pennsylvania', 22),
 ('Tennessee', 19)]

In [18]:
# the ten emoji names with the highest overall counts
sort_values(emoji_cts)[:10]

[(':face_with_tears_of_joy:', 274),
 (':red_heart:', 65),
 (':loudly_crying_face:', 63),
 (':smiling_face_with_heart-eyes:', 57),
 (':face_with_rolling_eyes:', 36),
 (':weary_face:', 35),
 (':raising_hands:', 32),
 (':fire:', 31),
 (':female_sign:', 29),
 (':smiling_face_with_smiling_eyes:', 29)]

In [32]:
# show most popular emojis by state 

# one summary record per state, ordered by state name: the full emoji counts
# plus the ten highest-count emojis
results = [
    {
        'state': name,
        'emojis': dict(per_emoji),
        'top_emojis': sort_values(per_emoji)[:10],
    }
    for name, per_emoji in sorted(state_emoji_cts.items())
]

# preview: three leading emojis for the first five states
for entry in results[:5]:
    print('{}:\n{}'.format(entry['state'], entry['top_emojis'][:3]))

Alabama:
[(':hundred_points:', 4), (':face_with_tears_of_joy:', 4), (':red_heart:', 2)]
Alaska:
[(':crying_face:', 1), (':sneezing_face:', 1), (':sparkling_heart:', 1)]
Arizona:
[(':thumbs_up:', 4), (':down_button:', 4), (':heavy_check_mark:', 3)]
Arkansas:
[(':face_with_tears_of_joy:', 2), (':raising_hands:', 2), (':person_facepalming:', 1)]
California:
[(':white_heavy_check_mark:', 21), (':face_with_tears_of_joy:', 20), (':white_medium_star:', 20)]


In [20]:
# persist the per-state summaries to DATA_DIR/emojis-by-state.json
save_to_json(results, 'emojis-by-state.json')

**step 4: add tf-idf (term frequency–inverse document frequency)**

In [42]:
data = get_json('emojis-by-state.json')

# the emoji counts of every state; passed to tfidf() as its third argument
counts_list = []
for record in data:
    counts_list.append(record['emojis'])

data_w_tfidf = []

for record in data:
    emoji_counts = record['emojis']

    # score each emoji of this state against the counts of all states
    scored = {}
    for name in emoji_counts:
        scored[name] = tfidf(name, emoji_counts, counts_list)
    ranked = sort_values(scored)

    # keep the state's existing top list alongside its ten best tf-idf scores
    data_w_tfidf.append({
        'state': record['state'],
        'top_emojis': record['top_emojis'],
        'tfidf': ranked[:10],
    })

In [46]:
# preview results

# first five states, each with its three highest tf-idf emojis (rank is 1-based)
for entry in data_w_tfidf[:5]:
    print('{}:'.format(entry['state']))

    top_three = entry['tfidf'][:3]
    for rank, (word, score) in enumerate(top_three, start=1):
        print("\t{}. {} tf-idf: {}".format(rank, word, round(score, 5)))

Alabama:
	1. :airplane: tf-idf: 0.23226
	2. :hundred_points: tf-idf: 0.21196
	3. :angry_face: tf-idf: 0.10111
Alaska:
	1. :sneezing_face: tf-idf: 0.6271
	2. :crying_face: tf-idf: 0.54601
	3. :sparkling_heart: tf-idf: 0.40738
Arizona:
	1. :down_button: tf-idf: 0.40458
	2. :heavy_check_mark: tf-idf: 0.30343
	3. :thumbs_up: tf-idf: 0.21051
Arkansas:
	1. :smirking_face: tf-idf: 0.24818
	2. :rolling_on_the_floor_laughing: tf-idf: 0.18517
	3. :raising_hands: tf-idf: 0.17059
California:
	1. :white_heavy_check_mark: tf-idf: 0.27563
	2. :white_medium_star: tf-idf: 0.2625
	3. :smiling_face_with_smiling_eyes: tf-idf: 0.07291


In [47]:
# persist the per-state tf-idf results to DATA_DIR/emojis-by-state-more.json
save_to_json(data_w_tfidf, 'emojis-by-state-more.json')