A notebook for cleaning celebrity chef Tweet data and putting it in a local MongoDB collection.

# 0. Setup

In [215]:
import json
from nltk.corpus import stopwords
from collections import defaultdict
from tqdm import tqdm
import pymongo
import pickle

Load in Tweets that have been dumped into JSON from EC2 Mongo:

In [8]:
# Load the tweet dump: one JSON document per line, exported from the EC2 Mongo instance.
tweets_path = '/Users/Ben/ds/metis/Kojak/chefs/Data/cheftweets_so_far.json'
tweets_data = []
skipped_lines = 0  # lines that could not be parsed as JSON
# `with` guarantees the file handle is closed even if reading fails part-way.
with open(tweets_path, 'r') as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except ValueError:
            # json.loads signals malformed input with ValueError (or a subclass).
            # Catching only that keeps the best-effort skip of bad lines without
            # also swallowing KeyboardInterrupt or genuine programming errors.
            skipped_lines += 1

In [9]:
# Number of lines from the dump that parsed successfully as JSON.
len(tweets_data)

16198

Chefs' Twitter handles:

In [157]:
# Maps each tracked Twitter handle to the chef it belongs to.
# Several chefs have more than one handle, so the values are not unique.
handles_and_chefs = {
    'Wolfiesmom': 'Valerie Bertinelli',
    'inagarten': 'Ina Garten',
    'thepioneerwoman': 'Ree Drummond',
    'GDeLaurentiis': 'Giada De Laurentiis',
    'trishayearwood': 'Trisha Yearwood',
    'GuyFieri': 'Guy Fieri',
    'GuyFieri_ebooks': 'Guy Fieri',
    'RobertIrvine': 'Robert Irvine',
    'altonbrown': 'Alton Brown',
    'AltonBrownNews': 'Alton Brown',
    'bflay': 'Bobby Flay',
    'Duff_Goldman': 'Duff Goldman',
    'DuffsCakemix': 'Duff Goldman',
    'SunnyAnderson': 'Sunny Anderson',
    'chefmarcela': 'Marcela Valladolid',
    'CasaMarcela': 'Marcela Valladolid',
}

# Set of handles for fast membership tests (iterating a dict yields its keys).
handles = set(handles_and_chefs)

Filter out tweets that don't mention chefs' handles:

In [152]:
# Keep only tweets that mention at least one tracked chef handle, and tally
# how many kept tweets are attributed to each handle.
tweets_by_chef = defaultdict(int)  # keyed by Twitter handle, not by chef name
filtered_tweets = []
for tweet in tqdm(tweets_data):
    # Attribute the tweet to the first tracked handle in its mention list, the
    # same rule the document-building cell uses to choose a tweet's chef.
    # Indexing into a set intersection picked an arbitrary handle whenever a
    # tweet mentioned more than one chef.
    chef_handle = next((mention['screen_name']
                        for mention in tweet['entities']['user_mentions']
                        if mention['screen_name'] in handles), None)
    if chef_handle is not None:
        tweets_by_chef[chef_handle] += 1
        filtered_tweets.append(tweet)



In [153]:
# Number of tweets that mention at least one tracked chef handle.
len(filtered_tweets)

13940

Local Mongo initialization:

In [188]:
# MongoClient() is called with no arguments, so pymongo's default connection
# settings apply.
client = pymongo.MongoClient()
# Database handle, and the collection inside it that will hold the cleaned tweets.
chefs = client['chefs_db']
tweet_collection = chefs['celebrity_tweets']

# 1. Implementation
- Document setup, including NLP for Tweet text; put docs into Mongo `tweet_collection`
- Track capitalized words as the `terms` associated with each chef

In [219]:
# Start from an empty collection so re-running this cell does not duplicate documents.
tweet_collection.drop()
stops = stopwords.words('english')
stop_set = set(stops)  # set membership is O(1); the list was scanned once per word

for tweet in tqdm(filtered_tweets):
    entities = tweet['entities']
    text = tweet['text']

    # The chef is the owner of the first tracked handle mentioned in the tweet.
    # Every tweet in filtered_tweets mentions at least one tracked handle.
    chef_handle = next(mention['screen_name']
                       for mention in entities['user_mentions']
                       if mention['screen_name'] in handles)

    # Cleaned tweet text: lower-cased, with mentions, hashtags, links, the
    # retweet marker and English stop words removed.
    # - 'rt' is compared for equality: the previous substring test
    #   ('rt' not in word) also discarded every word merely containing "rt",
    #   such as "party", "dessert" or "heart".
    # - both http:// and https:// links are dropped, not only https://.
    words = [word for word in text.lower().split()
             if '@' not in word and '#' not in word
             and 'http://' not in word and 'https://' not in word
             and word != 'rt' and word not in stop_set]

    doc = {
        'hashtags': [hashtag['text'] for hashtag in entities['hashtags']],
        'chef': handles_and_chefs[chef_handle],
        # "terms" are the whitespace-separated tokens whose first character
        # changes under lower(), i.e. tokens that start with an uppercase letter.
        'terms': [word for word in text.split() if word[0] != word[0].lower()],
        'tweet': ' '.join(words),
    }

    # NOTE: Collection.save() is deprecated as of PyMongo 3.0 and removed in 4.0;
    # insert_one(doc) is the replacement for inserting a new document.
    tweet_collection.save(doc)



In [220]:
# Sanity check: report how many documents the collection holds after the load.
print "Count of number of tweets:", tweet_collection.count()

Count of number of tweets: 13940


In [223]:
# Fetch a single stored document to spot-check its fields.
tweet_collection.find_one()

{u'_id': ObjectId('56eef84bfdb32c01761b1bcc'),
 u'chef': u'Trisha Yearwood',
 u'hashtags': [u'ThePassionLive'],
 u'terms': [u'RT', u'Christian', u'Lifehouse'],
 u'tweet': u'makes first christian appearance lifehouse cover'}