# Create Twitter Database

In [1]:
from election_helper import convert_datetime
import pymongo
from pymongo import MongoClient
import os, io, json, glob, re, pprint

We are going to create a Mongo database to store all of our tweets. This will allow us to easily query and access the tweets. We will create two collections, tweet_collection and user_collection. The tweet_collection will contain all of the tweets, while the user_collection will contain all of the users. Each document in tweet_collection will still contain its user information.

In [2]:
# Connect to the local MongoDB server and expose the handles shared by every cell below
client = MongoClient('mongodb://localhost:27017/')
db = client['election_tweets']
tweet_collection = db['tweets']
user_collection = db['users']

### Minimize Tweets

There is a lot of unnecessary information stored within the tweets. The following function removes the unnecessary keys from the tweet and performs a few modifications on the JSON. A query and root_query field are added to keep track of which query the tweet came from. The root_query field has the place id removed. The tweet JSON contains an entities field which contains information about extra Twitter objects in the tweet, such as mentions and hashtags. This is flattened and any null fields are removed. The value of retweeted_status is either null if the tweet is not a retweet, or it contains the original tweet. If retweeted_status is not null, retweeted is set to true, otherwise false. We then create an object which contains the original tweet if it exists. We then use this object to create a new database entry. The user is also extracted from the tweet and is used to create a separate collection from the tweet collection.

In [3]:
def minimize_tweet(tweet, query):
    '''
    Strip a raw tweet down to the fields that are stored in the database.

    The tweet and its nested user, place and user-mention dicts are modified
    in place; the returned tweet is the same object that was passed in.

    Args:
        tweet: dict decoded from the tweet JSON, or None.
        query: the search query that produced the tweet. A trailing
            " place:<id>" clause is stripped to build ``root_query``.

    Returns:
        None if ``tweet`` is None or has no ``id``. Otherwise a tuple
        ``(tweet, user, retweet_container)`` where:
          * ``tweet`` is the minimized tweet with ``id`` renamed to ``_id``,
          * ``user`` is the minimized user dict (also stored at
            ``tweet['user']``), or None if the tweet has no user,
          * ``retweet_container`` is None unless ``retweeted_status`` is set,
            in which case it is a dict holding the untouched original tweet
            (``'retweet'``), its id (``'id'``) and the ``created_at`` of the
            retweeting tweet passed through ``convert_datetime``.
    '''
    # Check for None first: calling .get on None would raise AttributeError.
    if tweet is None or tweet.get('id', None) is None:
        return None

    # keys that are dropped from the tweet itself
    tweet_keys = ['contributors', 'geo', 'lang', 'id_str', 'metadata', 'in_reply_to_status_id_str',
                  'in_reply_to_user_id_str', 'quoted_status_id_str', 'notifications', 'truncated',
                  'translator_type', 'contributors_enabled', 'default_profile', 'geo_enabled',
                  'has_extended_profile', 'source', 'coordinates', 'filter_level', 'possibly_sensitive',
                  'scopes', 'withheld_copyright']

    # keys that are dropped from the embedded user
    user_keys = ['contributors_enabled', 'default_profile', 'default_profile_image', 'follow_request_sent',
                 'geo_enabled', 'id_str', 'is_translator', 'lang', 'listed_count', 'notifications',
                 'profile_background_color', 'profile_banner_url', 'profile_background_tile',
                 'profile_background_image_url_https', 'profile_background_image_url', 'profile_image_url',
                 'profile_image_url_https', 'profile_link_color',
                 'profile_sidebar_border_color', 'profile_sidebar_fill_color', 'profile_text_color',
                 'profile_use_background_image', 'show_all_inline_media', 'protected', 'url', 'utc_offset',
                 'time_zone', 'withheld_in_countries', 'withheld_scope', 'translator_type', 'entities',
                 'has_extended_profile', 'is_translation_enabled']

    # keys that are dropped from the embedded place
    place_keys = ['attributes', 'bounding_box', 'contained_within', 'place_type', 'url']

    def remove_keys(obj, keys):
        '''Drop ``keys`` from ``obj`` and rename ``id`` to ``_id``; None passes through.'''
        if obj is None:
            return None

        for key in keys:
            obj.pop(key, None)

        # Mongo uses _id as the primary key. Only rename when an id is
        # actually present so objects without one do not raise KeyError.
        if 'id' in obj:
            obj['_id'] = obj.pop('id')

        return obj

    # remove keys
    tweet = remove_keys(tweet, tweet_keys)

    # remove the place clause from the query
    pattern = '^(.*?)(?= place:|$)'
    root_query = re.findall(pattern, query)[0]

    # record which query (with and without the place clause) found the tweet
    tweet[u'query'] = query
    tweet[u'root_query'] = root_query

    # flatten entities onto the tweet; the entities key itself is always removed
    entities = tweet.pop('entities', None)
    if entities:
        # urls are not kept
        entities.pop('urls', None)

        # .items() works on both Python 2 and Python 3
        for entity, items in entities.items():
            # reduce keys from user mentions
            if entity == 'user_mentions' and items:
                for mention in items:
                    mention.pop('id_str', None)
                    mention.pop('name', None)
                    if 'id' in mention:
                        mention['_id'] = mention.pop('id')

            # empty (or null) entity lists are stored as None
            tweet[entity] = items if items else None

    # minimize the place field
    tweet['place'] = remove_keys(tweet.get('place', None), place_keys)

    # minimize user field
    user = remove_keys(tweet.get('user', None), user_keys)
    tweet['user'] = user

    # retweeted_status holds the original tweet when this tweet is a retweet
    retweet = tweet.get('retweeted_status', None)

    # Set to None by default
    retweet_container = None

    if retweet:
        # set the retweeted field to true
        tweet['retweeted'] = True

        # grab the id of the original tweet
        retweet_id = retweet.get('id', None)

        # Create retweet container to return
        retweet_container = {
            'retweet': retweet,
            'id': retweet_id,
            # time the retweet was made, can be compared against a tweet saved in the database
            'created_at': convert_datetime(tweet.get('created_at'))
        }

        # replace the embedded original tweet with just its id
        tweet['retweeted_status'] = retweet_id

    return (tweet, user, retweet_container)

Below is an example of a minimized tweet.

In [4]:
query = "#Trump OR #Hilary place:c23"
test_file_path = './queries/#DonaldTrump OR #HillaryClinton OR Trump OR Clinton OR #Trump OR #Clinton/Canada/2016-11-07T19:43:48.144881.json'

# the context manager closes the file handle once the JSON has been read
with io.open(test_file_path) as test:
    tweets = json.loads(test.read())

# only the tweet objects are needed, the keys of the JSON object are not used
min_tweets = [minimize_tweet(tweet, query) for tweet in tweets.values()]
pprint.pprint(min_tweets[:1])

[({'_id': 795670816545456129,
   u'created_at': u'Mon Nov 07 16:54:40 +0000 2016',
   u'favorite_count': 0,
   u'favorited': False,
   u'hashtags': None,
   u'in_reply_to_screen_name': None,
   u'in_reply_to_status_id': None,
   u'in_reply_to_user_id': None,
   u'is_quote_status': False,
   u'place': {'_id': u'0064529d5fb32582',
              u'country': u'Canada',
              u'country_code': u'CA',
              u'full_name': u'Georgian Bluffs, Ontario',
              u'name': u'Georgian Bluffs'},
   u'query': '#Trump OR #Hilary place:c23',
   u'retweet_count': 1,
   u'retweeted': False,
   u'root_query': '#Trump OR #Hilary',
   u'symbols': None,
   u'text': u'Trump interrupts summation by re-looping back into the full, rambling glory of the body of his speech.',
   u'user': {'_id': 167535120,
             u'created_at': u'Fri Jul 16 21:00:58 +0000 2010',
             u'description': u'I write columns for the National Post and Postmedia/Sun Media newspapers.',
             u'favour

The insert_many function will open a json file and insert all the objects inside into the database. If the object already exists in the database, check the creation times and update based on the most recent tweet. The tweet is minimized and then added. If the tweet is retweeted, the original tweet is added to the database and also minimized.

In [5]:
def insert_many(filename, query):
    '''
    Load a JSON file of tweets and store the tweets and their users in Mongo.

    Every tweet in the file is minimized with ``minimize_tweet`` and written
    to ``tweet_collection``; its user is written to ``user_collection``. When
    the tweet is a retweet, the original tweet and its user are minimized and
    stored as well. Tweets that cannot be minimized (no id) are skipped.

    Args:
        filename: path of a JSON file holding an object whose values are tweets.
        query: the search query the tweets in this file were collected with.

    Returns:
        None
    '''

    def insert(document, collection):
        '''
        Insert ``document`` into ``collection``; if a document with the same
        ``_id`` is already stored, replace it only when the incoming document
        has a more recent ``created_at``.
        '''
        # nothing to store (e.g. a tweet without a user)
        if not document or document.get('_id') is None:
            return None

        _id = document['_id']
        stored = collection.find_one({'_id': _id})

        #
        # if finding the most updated document is not required
        # replace_one(..., upsert=True) could be used instead
        #

        # make a new entry
        if stored is None:
            collection.insert_one(document)
            return None

        incoming_created = document.get('created_at')
        stored_created = stored.get('created_at')
        if incoming_created is None or stored_created is None:
            # cannot compare, keep what is already stored
            return None

        # if the incoming document is more recent than the stored one, replace it
        if convert_datetime(incoming_created) > convert_datetime(stored_created):
            collection.replace_one({'_id': _id}, document)

        return None

    # open the file and load as a json object; the handle is closed even on error
    with io.open(filename) as f:
        tweets = json.loads(f.read())

    # for each tweet saved in the json file
    for raw_tweet in tweets.values():

        # minimize the tweet; None means the tweet had no id and is skipped
        minimized = minimize_tweet(raw_tweet, query)
        if minimized is None:
            continue
        tweet, user, retweet_container = minimized

        # insert tweet and user
        insert(tweet, tweet_collection)
        insert(user, user_collection)

        # if there is a retweet
        if retweet_container:

            # minimize the original tweet
            original = minimize_tweet(retweet_container['retweet'], query)
            if original is None:
                continue
            original_tweet, original_user, _ = original

            # insert the original tweet and its user
            insert(original_tweet, tweet_collection)
            insert(original_user, user_collection)

    return None

When the tweets were scraped they were stored in separate directories in separate files. The following function will iterate through these directories and open all the files. It will then call the insert_many function above and add all the tweets to the database.

In [6]:
def create_tweet_db(root, path):
    '''
    Walk the scraped tweet files under ``path`` and load them into the database.

    Expected layout: ``path/<query>/*.json`` and ``path/<query>/<subdir>/*.json``.
    The name of each ``<query>`` directory is passed to ``insert_many`` as the
    query for every file found below it. Hidden entries, entries ending in
    ``.json`` and anything that is not a directory are skipped when looking
    for query directories and sub directories.

    Args:
        root: directory the process is switched to when the function finishes,
            also when an insert raises.
        path: directory holding the query directories; a relative path is
            resolved against the current working directory.

    Returns:
        None
    '''

    def json_files(directory):
        '''Full paths of the non-hidden ``*.json`` files directly inside ``directory``.'''
        found = []
        for name in sorted(os.listdir(directory)):
            full = os.path.join(directory, name)
            if name.endswith('.json') and not name.startswith('.') and os.path.isfile(full):
                found.append(full)
        return found

    def is_data_dir(parent, name):
        '''True for a visible directory whose name does not end in ``.json``.'''
        if name.startswith('.') or name.endswith('.json'):
            return False
        return os.path.isdir(os.path.join(parent, name))

    # Work with absolute paths instead of changing directory for every level,
    # so a failure cannot leave the process inside a nested directory.
    base = os.path.abspath(path)
    queries = sorted(os.listdir(base))

    try:
        for query in queries:
            if not is_data_dir(base, query):
                continue
            query_path = os.path.join(base, query)

            # json files stored directly in the query directory
            for filename in json_files(query_path):
                insert_many(filename, query)

            # json files stored one level down (country, timestamp)
            for subdir in sorted(os.listdir(query_path)):
                if not is_data_dir(query_path, subdir):
                    continue
                for filename in json_files(os.path.join(query_path, subdir)):
                    insert_many(filename, query)
    finally:
        # callers rely on ending up in root
        os.chdir(root)

Tweets leading up to the election

In [7]:
# Load the tweets stored under ./queries (relative to the current working directory).
# `root` is captured here from the current working directory and is reused by the
# election-day cell below, so this cell has to run first.
path = 'queries'
root = os.getcwd()

create_tweet_db(root, path)

Tweets from election day

In [8]:
# Load the tweets stored under ./election_day; `root` was set in the previous cell
# and `path` is relative to the current working directory.
path = 'election_day'
create_tweet_db(root, path)

Verify the database has been created.

In [9]:
# Number of stored tweets and users. The call form of print runs on Python 2 and 3.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases in
# favour of count_documents({}) - confirm against the installed version.
print(tweet_collection.count())
print(user_collection.count())

112717
87501
