In [1]:
import pymongo
import codecs
import os
import json
from pymongo import IndexModel, ASCENDING, DESCENDING
import importlib
from pprint import pprint
import shelve
import time

import mongodb # module for setting up connection with (local) MongoDB database

In [3]:
DB_NAME = 'tweets_ek' # database for tweets collected on expanded keywords
RAW_COLLECTION_NAME = 'c1' # collection for raw data
UPDATED_COLLECTION_NAME = 'c2' # collection for updated data

raw_data = mongodb.initialize(db_name=DB_NAME, collection_name=RAW_COLLECTION_NAME)
updated_data = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COLLECTION_NAME)

MongoDB on localhost:27017/tweets_ek.c1 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!


### Compare tweets in 'c1' and 'c2' collection by 'id' field
Test whether all tweets re-queried consist a legitimate subset of tweets collected from Streaming API

In [3]:
tweets_ek_shl = os.path.join('data', 'tweets_ek.db')
c1_id_shl_key = 'c1_id_set'
c2_id_shl_key = 'c2_id_set'

In [4]:
%%time
# get all tweets id from 'c1' collection, picklize, and write to shelve
if 0 == 1:
    c1_id_set = set()
    cursor = raw_data.find(projection={'_id': 0, 'id': 1}, # minimize IO bandwidth
#                            limit=1000000
                          )
    print('Start buidling id set...')
    for document in cursor:
        tweet_id = int(document['id'])
        c1_id_set.add(tweet_id)
    print('Successfully create set obj for "id" field in "c1" with size: {:,}'.format(len(c1_id_set)))
    
    print('Writing to shelve...')
    with shelve.open(tweets_ek_shl) as s:
        s[c1_id_shl_key] = c1_id_set # store data at key (overwrites old data if using an existing key)
    print('Done')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11.7 µs


In [5]:
%%time
# get all tweets id from 'c2' collection, picklize, and write to shelve
if 0 == 1:
    # get all tweets id from 'c1' collection, picklize, and write to shelve
    c2_id_set = set()
    cursor = updated_data.find(projection={'_id': 0, 'id': 1}, # minimize IO bandwidth
                              )
    print('Start buidling id set...')
    for document in cursor:
        tweet_id = int(document['id'])
        c2_id_set.add(tweet_id)
    print('Successfully create set obj for "id" field in "c2" with size: {:,}'.format(len(c2_id_set)))
    
    print('Writing to shelve...')
    with shelve.open(tweets_ek_shl) as s:
        s[c2_id_shl_key] = c2_id_set # store data at key (overwrites old data if using an existing key)
    print('Done')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.5 µs


In [6]:
%%time
# find out the difference between c2_id_set and c1_id_set
if 0 == 1:
    c1_id_set = set()
    c2_id_set = set()
    with shelve.open(tweets_ek_shl, flag='r') as s:
        if c1_id_shl_key in s:
            c1_id_set = s[c1_id_shl_key]
            print('Check pickle "{}" exists: True'.format(c1_id_shl_key))
        if c2_id_shl_key in s:
            c2_id_set = s[c2_id_shl_key]
            print('Check pickle "{}" exists: True'.format(c2_id_shl_key))


    print('Pickle {} size: {:,}'.format(c1_id_shl_key, len(c1_id_set)))
    print('Pickle {} size: {:,}'.format(c2_id_shl_key, len(c2_id_set)))
    print('Check whether {} is a legitimate subset of {}: {}'.format(c2_id_shl_key, 
                                                                     c1_id_shl_key, 
                                                                     c2_id_set.issubset(c1_id_set)))
    c2_id_diff_c1_id = c2_id_set.difference(c1_id_set)
    print('The length of difference of {} - {} is {}'.format(c2_id_shl_key,
                                                             c1_id_shl_key,
                                                             len(c2_id_diff_c1_id)))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11.2 µs


### Compare tweets in 'c2' and 'c2_parsed_created_at' collection by 'id' field
Test whether all parsed 'created_at' tweets consist a legitimate subset of re-queried tweets

In [7]:
PARSED_CREATED_AT_COLLECTION = 'c2_parsed_created_at'
parsed_created_at_col = mongodb.initialize(db_name=DB_NAME, collection_name=PARSED_CREATED_AT_COLLECTION)
c2_p_id_shl_key = 'c2_p_id_set'

MongoDB on localhost:27017/tweets_ek.c2_parsed_created_at connected successfully!


In [8]:
%%time
# get all tweets id from 'c2_parsed_created_at' collection, picklize, and write to shelve
if 0 == 1:
    c2_p_id_set = set()
    cursor = parsed_created_at_col.find(projection={'_id': 0, 'id': 1}, # minimize IO bandwidth
#                                         limit=1000000
                                       )
    print('Start buidling id set...')
    for document in cursor:
        tweet_id = int(document['id'])
        c2_p_id_set.add(tweet_id)
    print('Successfully create set obj for "id" field in "c2_parsed_created_at" with size: {:,}'.format(len(c2_p_id_set)))
    
    print('Writing to shelve...')
    with shelve.open(tweets_ek_shl) as s:
        s[c2_p_id_shl_key] = c2_p_id_set # store data at key (overwrites old data if using an existing key)
    print('Done')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11.4 µs


In [9]:
%%time
# find out the difference between c2_p_id_set and c2_id_set
if 0 == 1:
    c2_p_id_set = set()
    c2_id_set = set()
    with shelve.open(tweets_ek_shl, flag='r') as s:
        if c2_p_id_shl_key in s:
            c2_p_id_set = s[c2_p_id_shl_key]
            print('Check pickle "{}" exists: True'.format(c2_p_id_shl_key))
        
        if c2_id_shl_key in s:
            c2_id_set = s[c2_id_shl_key]
            print('Check pickle "{}" exists: True'.format(c2_id_shl_key))


    print('Pickle {} size: {:,}'.format(c2_p_id_shl_key, len(c2_p_id_set)))
    print('Pickle {} size: {:,}'.format(c2_id_shl_key, len(c2_id_set)))
    print('Check whether {} is a legitimate subset of {}: {}'.format(c2_p_id_shl_key, 
                                                                     c2_id_shl_key, 
                                                                     c2_p_id_set.issubset(c2_id_set)))
    diff_set = c2_p_id_set.difference(c2_id_set)
    print('The length of difference of {} - {} is {}'.format(c2_p_id_shl_key,
                                                             c2_id_shl_key,
                                                             len(diff_set)))

Check pickle "c2_p_id_set" exists: True
Check pickle "c2_id_set" exists: True
Pickle c2_p_id_set size: 5,042,298
Pickle c2_id_set size: 5,042,298
Check whether c2_p_id_set is a legitimate subset of c2_id_set: True
The length of difference of c2_p_id_set - c2_id_set is 0
CPU times: user 11.5 s, sys: 1.15 s, total: 12.7 s
Wall time: 12.5 s


### Compare unique users in 'c2' and 'c2_users' by 'id' field
Test if 'c2_users' have all unique users contained in 'c2'

In [4]:
USERS_COLLECTION = 'c2_users'
user_col = mongodb.initialize(db_name=DB_NAME, collection_name=USERS_COLLECTION)

unique_user_ids_shl = os.path.join('data', 'unique_user_ids.db')
unique_user_ids_key = 'unique_user_ids'

MongoDB on localhost:27017/tweets_ek.c2_users connected successfully!


In [7]:
%%time
# get user id from 'c2_users' collection
if 1 == 1:
    user_ids_set = set()
    cursor = user_col.find(projection={'_id': 0, 'id': 1})
    print('Start buidling user ids set...')
    for document in cursor:
        user_id = int(document['id'])
        user_ids_set.add(user_id)
    print('Successfully create set obj for user "id" field in "{}" with size: {:,}'
          .format(USERS_COLLECTION, len(user_ids_set)))
    
    unique_user_ids_set = set()
    with shelve.open(unique_user_ids_shl, flag='r') as s:
        if unique_user_ids_key in s:
            unique_user_ids_set = s[unique_user_ids_key]
            print('Check pickle "{}" exists: True'.format(unique_user_ids_key))
    
    print('Set {} size: {:,}'.format('user_ids_set', len(user_ids_set)))
    print('Pickle {} size: {:,}'.format(unique_user_ids_key, len(unique_user_ids_set)))
    
    print('Check whether {} is a legitimate subset of {}: {}'
          .format('user_ids_set', unique_user_ids_key, user_ids_set.issubset(unique_user_ids_set)))
    
    diff_set = user_ids_set.difference(unique_user_ids_set)
    print('The length of difference of {} - {} is {}'.format('user_ids_set',
                                                             unique_user_ids_key,
                                                             len(diff_set)))

Start buidling user ids set...
Successfully create set obj for user "id" field in "c2_users" with size: 844,675
Check pickle "unique_user_ids" exists: True
Set user_ids_set size: 844,675
Pickle unique_user_ids size: 844,675
Check whether user_ids_set is a legitimate subset of unique_user_ids: True
The length of difference of user_ids_set - unique_user_ids is 0
CPU times: user 7.52 s, sys: 536 ms, total: 8.05 s
Wall time: 14.1 s
